mirror of https://github.com/doccano/doccano.git
python datasets active-learning text-annotation dataset natural-language-processing data-labeling machine-learning annotation-tool
You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
460 lines
9.6 KiB
460 lines
9.6 KiB
from collections import defaultdict
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
from typing import Dict, List, Type
|
|
|
|
from pydantic import BaseModel
|
|
from typing_extensions import Literal
|
|
|
|
from .exceptions import FileFormatException
|
|
from projects.models import (
|
|
BOUNDING_BOX,
|
|
DOCUMENT_CLASSIFICATION,
|
|
IMAGE_CAPTIONING,
|
|
IMAGE_CLASSIFICATION,
|
|
INTENT_DETECTION_AND_SLOT_FILLING,
|
|
SEGMENTATION,
|
|
SEQ2SEQ,
|
|
SEQUENCE_LABELING,
|
|
SPEECH2TEXT,
|
|
)
|
|
|
|
# Directories holding the bundled example files. Everything hangs off an
# "examples" folder located next to this module.
EXAMPLE_DIR = Path(__file__).parent.resolve().joinpath("examples")
TASK_AGNOSTIC_DIR = EXAMPLE_DIR.joinpath("task_agnostic")
TEXT_CLASSIFICATION_DIR = EXAMPLE_DIR.joinpath("text_classification")
SEQUENCE_LABELING_DIR = EXAMPLE_DIR.joinpath("sequence_labeling")
RELATION_EXTRACTION_DIR = EXAMPLE_DIR.joinpath("relation_extraction")
SEQ2SEQ_DIR = EXAMPLE_DIR.joinpath("sequence_to_sequence")
INTENT_DETECTION_DIR = EXAMPLE_DIR.joinpath("intent_detection")
IMAGE_CLASSIFICATION_DIR = EXAMPLE_DIR.joinpath("image_classification")
SPEECH_TO_TEXT_DIR = EXAMPLE_DIR.joinpath("speech_to_text")

# Task identifier for relation extraction; unlike the other task constants it
# is declared in this module rather than imported.
RELATION_EXTRACTION = "RelationExtraction"
|
|
|
|
# Closed set of values allowed wherever a field is typed as ``encodings``.
# Member order is kept exactly as originally declared.
# NOTE(review): "Auto" is not a codec name; presumably it requests encoding
# detection -- confirm against the code that consumes this value.
encodings = Literal[
    "Auto",
    "ascii",
    # Big5 family
    "big5", "big5hkscs",
    # Code pages
    "cp037", "cp273", "cp424", "cp437", "cp500",
    "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875",
    "cp932", "cp949", "cp950",
    "cp1006", "cp1026", "cp1125", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp65001",
    # EUC family
    "euc_jp", "euc_jis_2004", "euc_jisx0213", "euc_kr",
    # GB family and HZ
    "gb2312", "gbk", "gb18030", "hz",
    # ISO-2022 family
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    # ISO-8859 family
    "latin_1",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5", "iso8859_6",
    "iso8859_7", "iso8859_8", "iso8859_9", "iso8859_10", "iso8859_11",
    "iso8859_13", "iso8859_14", "iso8859_15", "iso8859_16",
    "johab",
    # KOI8 family and kz1048
    "koi8_r", "koi8_t", "koi8_u", "kz1048",
    # Mac encodings
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2", "mac_roman", "mac_turkish",
    "ptcp154",
    # Shift JIS family
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    # UTF family
    "utf_32", "utf_32_be", "utf_32_le",
    "utf_16", "utf_16_be", "utf_16_le",
    "utf_7", "utf_8", "utf_8_sig",
]
|
|
|
|
|
|
class Format:
    """Base descriptor for an importable file format.

    Subclasses override ``name`` (the identifier matched by lookups on
    ``Format.__subclasses__()``) and ``accept_types`` (a comma-separated
    string of MIME types or patterns).
    """

    name = ""
    accept_types = ""

    @classmethod
    def dict(cls):
        """Return the format's ``name`` and ``accept_types`` as a plain dict."""
        return {"name": cls.name, "accept_types": cls.accept_types}

    @classmethod
    def _accepted_mime_types(cls) -> List[str]:
        """Split ``accept_types`` into its individual, whitespace-stripped entries."""
        return [entry.strip() for entry in cls.accept_types.split(",") if entry.strip()]

    def validate_mime(self, mime: str):
        """Accept any MIME type; formats with a fixed whitelist override this."""
        return True

    @staticmethod
    def is_plain_text():
        """Return False; plain-text formats override this to return True."""
        return False


class CSV(Format):
    """Format entry named "CSV"."""

    name = "CSV"
    accept_types = "text/csv"


class FastText(Format):
    """Format entry named "fastText"."""

    name = "fastText"
    accept_types = "text/plain"


class JSON(Format):
    """Format entry named "JSON"."""

    name = "JSON"
    accept_types = "application/json"


class JSONL(Format):
    """Format entry named "JSONL"; its ``accept_types`` is the wildcard "*"."""

    name = "JSONL"
    accept_types = "*"


class Excel(Format):
    """Format entry named "Excel", listing both Excel MIME types."""

    name = "Excel"
    accept_types = "application/vnd.ms-excel, application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"


class TextFile(Format):
    """Format entry named "TextFile"; reported as plain text."""

    name = "TextFile"
    accept_types = "text/*"

    @staticmethod
    def is_plain_text():
        """Return True: this format is plain text."""
        return True


class TextLine(Format):
    """Format entry named "TextLine"; reported as plain text."""

    name = "TextLine"
    accept_types = "text/*"

    @staticmethod
    def is_plain_text():
        """Return True: this format is plain text."""
        return True


class CoNLL(Format):
    """Format entry named "CoNLL"."""

    name = "CoNLL"
    accept_types = "text/*"


class ImageFile(Format):
    """Format entry named "ImageFile", restricted to the listed image MIME types."""

    name = "ImageFile"
    accept_types = "image/png, image/jpeg, image/bmp, image/gif"

    def validate_mime(self, mime: str):
        """Return True only when ``mime`` is exactly one of the listed image types."""
        # Compare against the parsed entries, not the raw string: a substring
        # test would also accept fragments such as "image", "png" or "".
        return mime in self._accepted_mime_types()


class AudioFile(Format):
    """Format entry named "AudioFile", restricted to the listed audio MIME types."""

    name = "AudioFile"
    accept_types = "audio/ogg, audio/aac, audio/mpeg, audio/wav"

    def validate_mime(self, mime: str):
        """Return True only when ``mime`` is exactly one of the listed audio types."""
        # Compare against the parsed entries, not the raw string: a substring
        # test would also accept fragments such as "audio", "wav" or "".
        return mime in self._accepted_mime_types()
|
|
|
|
|
|
# Import arguments: a file encoding plus the names of the data and label
# columns. Documented with comments on purpose -- a class docstring on a
# pydantic model may surface as "description" in its schema() output.
class ArgColumn(BaseModel):
    encoding: encodings = "utf_8"
    column_data: str = "text"  # default name of the column holding the data
    column_label: str = "label"  # default name of the column holding the label
|
|
|
|
|
|
# ArgColumn plus a field delimiter limited to comma, tab, semicolon, pipe or
# space. Documented with comments on purpose -- a class docstring on a
# pydantic model may surface as "description" in its schema() output.
class ArgDelimiter(ArgColumn):
    encoding: encodings = "utf_8"  # re-declares the inherited field with the same default
    delimiter: Literal[",", "\t", ";", "|", " "] = ","
|
|
|
|
|
|
# Import arguments consisting of the file encoding only. Documented with a
# comment on purpose -- a class docstring on a pydantic model may surface as
# "description" in its schema() output.
class ArgEncoding(BaseModel):
    encoding: encodings = "utf_8"
|
|
|
|
|
|
# Import arguments for CoNLL files: encoding, tagging scheme and delimiter.
# Documented with comments on purpose -- a class docstring on a pydantic model
# may surface as "description" in its schema() output.
class ArgCoNLL(BaseModel):
    encoding: encodings = "utf_8"
    scheme: Literal["IOB2", "IOE2", "IOBES", "BILOU"] = "IOB2"
    delimiter: Literal[" ", ""] = " "  # either a single space or the empty string
|
|
|
|
|
|
# Argument model with no fields, for formats that take no import arguments.
# Documented with a comment on purpose -- a class docstring on a pydantic
# model may surface as "description" in its schema() output.
class ArgNone(BaseModel):
    pass
|
|
|
|
|
|
@dataclass
class Option:
    """One import option: a file format offered for a task, with its example.

    Fields: ``display_name`` and ``task_id`` are plain strings, ``file_format``
    is a ``Format`` class, ``arg`` is a pydantic model class whose ``schema()``
    is merged into the serialized form, and ``file`` is the path of the
    example file.
    """

    display_name: str
    task_id: str
    file_format: Type[Format]
    arg: Type[BaseModel]
    file: Path

    @property
    def example(self) -> str:
        """Return the whole example file, read as UTF-8 text."""
        with open(self.file, encoding="utf-8") as handle:
            contents = handle.read()
        return contents

    def dict(self) -> Dict:
        """Serialize the option into a single flat dict.

        Keys are merged in this order, later ones overriding earlier ones: the
        format's ``dict()``, the argument model's ``schema()``, then
        ``example``, ``task_id`` and ``display_name``.
        """
        merged: Dict = {}
        merged.update(self.file_format.dict())
        merged.update(self.arg.schema())
        merged.update(
            example=self.example,
            task_id=self.task_id,
            display_name=self.display_name,
        )
        return merged
|
|
|
|
|
|
def create_file_format(file_format: str) -> Format:
    """Instantiate the ``Format`` subclass whose ``name`` equals ``file_format``.

    Only direct subclasses of ``Format`` are searched, and the first match in
    ``Format.__subclasses__()`` order is used.

    Raises:
        FileFormatException: If no direct subclass has that name.
    """
    matching_class = next(
        (candidate for candidate in Format.__subclasses__() if candidate.name == file_format),
        None,
    )
    if matching_class is None:
        raise FileFormatException(file_format)
    return matching_class()
|
|
|
|
|
|
class Options:
    """Registry of import options, grouped by task identifier."""

    # Class-level registry shared by every caller: task id -> registered
    # options, in registration order.
    options: Dict[str, List] = defaultdict(list)

    @classmethod
    def filter_by_task(cls, task_name: str, use_relation: bool = False):
        """Return the serialized options registered for ``task_name``.

        Args:
            task_name: Task identifier to look up.
            use_relation: When true, the options registered under
                ``RELATION_EXTRACTION`` are appended after the task's own.

        Returns:
            A list holding the ``dict()`` of each matching option, in
            registration order; an empty list for an unknown task.
        """
        # ``.get`` is deliberate: indexing the defaultdict would insert an
        # empty entry into the shared registry for every unknown task name
        # that is merely queried.
        selected = list(cls.options.get(task_name, ()))
        if use_relation:
            selected.extend(cls.options.get(RELATION_EXTRACTION, ()))
        return [option.dict() for option in selected]

    @classmethod
    def register(cls, option: "Option"):
        """Append ``option`` to the list kept for its ``task_id``."""
        cls.options[option.task_id].append(option)
|
|
|
|
|
|
# Text tasks: every one of them offers the two task-agnostic plain-text
# formats, registered as TextFile first and TextLine second.
text_tasks = [DOCUMENT_CLASSIFICATION, SEQUENCE_LABELING, SEQ2SEQ, INTENT_DETECTION_AND_SLOT_FILLING]
for task_id in text_tasks:
    for _plain_format, _sample in ((TextFile, "text_files.txt"), (TextLine, "text_lines.txt")):
        Options.register(
            Option(
                display_name=_plain_format.name,
                task_id=task_id,
                file_format=_plain_format,
                arg=ArgEncoding,
                file=TASK_AGNOSTIC_DIR / _sample,
            )
        )
# Inner-loop helpers only; drop them so the module namespace stays unchanged.
del _plain_format, _sample
|
|
|
|
# Text Classification: one row per option, in registration order.
# Excel reuses the CSV example file.
for _format, _arg, _sample in (
    (CSV, ArgDelimiter, "example.csv"),
    (FastText, ArgEncoding, "example.txt"),
    (JSON, ArgColumn, "example.json"),
    (JSONL, ArgColumn, "example.jsonl"),
    (Excel, ArgColumn, "example.csv"),
):
    Options.register(
        Option(
            display_name=_format.name,
            task_id=DOCUMENT_CLASSIFICATION,
            file_format=_format,
            arg=_arg,
            file=TEXT_CLASSIFICATION_DIR / _sample,
        )
    )
# Loop helpers only; drop them so the module namespace stays unchanged.
del _format, _arg, _sample
|
|
|
|
# Sequence Labelling: JSONL first, then CoNLL with its own argument model.
for _format, _arg, _sample in (
    (JSONL, ArgColumn, "example.jsonl"),
    (CoNLL, ArgCoNLL, "example.txt"),
):
    Options.register(
        Option(
            display_name=_format.name,
            task_id=SEQUENCE_LABELING,
            file_format=_format,
            arg=_arg,
            file=SEQUENCE_LABELING_DIR / _sample,
        )
    )
# Loop helpers only; drop them so the module namespace stays unchanged.
del _format, _arg, _sample
|
|
|
|
# Relation Extraction: a JSONL option with its own display name and no
# import arguments, registered under the RELATION_EXTRACTION identifier.
Options.register(
    Option(
        display_name="JSONL(Relation)",
        task_id=RELATION_EXTRACTION,
        file_format=JSONL,
        arg=ArgNone,
        file=RELATION_EXTRACTION_DIR / "example.jsonl",
    )
)
|
|
|
|
# Seq2seq: one row per option, in registration order.
# Excel reuses the CSV example file.
for _format, _arg, _sample in (
    (CSV, ArgDelimiter, "example.csv"),
    (JSON, ArgColumn, "example.json"),
    (JSONL, ArgColumn, "example.jsonl"),
    (Excel, ArgColumn, "example.csv"),
):
    Options.register(
        Option(
            display_name=_format.name,
            task_id=SEQ2SEQ,
            file_format=_format,
            arg=_arg,
            file=SEQ2SEQ_DIR / _sample,
        )
    )
# Loop helpers only; drop them so the module namespace stays unchanged.
del _format, _arg, _sample
|
|
|
|
# Intent detection: a single JSONL option that takes no import arguments.
Options.register(
    Option(
        display_name=JSONL.name,
        task_id=INTENT_DETECTION_AND_SLOT_FILLING,
        file_format=JSONL,
        arg=ArgNone,
        file=INTENT_DETECTION_DIR / "example.jsonl",
    )
)
|
|
|
|
# Image tasks: every image task offers the same ImageFile option, and all of
# them share the example file kept in the image-classification directory.
image_tasks = [IMAGE_CLASSIFICATION, IMAGE_CAPTIONING, BOUNDING_BOX, SEGMENTATION]
for task_name in image_tasks:
    Options.register(
        Option(
            display_name=ImageFile.name,
            task_id=task_name,
            file_format=ImageFile,
            arg=ArgNone,
            file=IMAGE_CLASSIFICATION_DIR / "image_files.txt",
        )
    )
|
|
|
|
# Speech to Text: a single AudioFile option that takes no import arguments.
Options.register(
    Option(
        display_name=AudioFile.name,
        task_id=SPEECH2TEXT,
        file_format=AudioFile,
        arg=ArgNone,
        file=SPEECH_TO_TEXT_DIR / "audio_files.txt",
    )
)
|