You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

460 lines
9.6 KiB

from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Type
from pydantic import BaseModel
from typing_extensions import Literal
from .exceptions import FileFormatException
from projects.models import (
BOUNDING_BOX,
DOCUMENT_CLASSIFICATION,
IMAGE_CAPTIONING,
IMAGE_CLASSIFICATION,
INTENT_DETECTION_AND_SLOT_FILLING,
SEGMENTATION,
SEQ2SEQ,
SEQUENCE_LABELING,
SPEECH2TEXT,
)
# Directories holding the example files, all rooted next to this module.
EXAMPLE_DIR = Path(__file__).parent.resolve().joinpath("examples")
TASK_AGNOSTIC_DIR = EXAMPLE_DIR.joinpath("task_agnostic")
TEXT_CLASSIFICATION_DIR = EXAMPLE_DIR.joinpath("text_classification")
SEQUENCE_LABELING_DIR = EXAMPLE_DIR.joinpath("sequence_labeling")
RELATION_EXTRACTION_DIR = EXAMPLE_DIR.joinpath("relation_extraction")
SEQ2SEQ_DIR = EXAMPLE_DIR.joinpath("sequence_to_sequence")
INTENT_DETECTION_DIR = EXAMPLE_DIR.joinpath("intent_detection")
IMAGE_CLASSIFICATION_DIR = EXAMPLE_DIR.joinpath("image_classification")
SPEECH_TO_TEXT_DIR = EXAMPLE_DIR.joinpath("speech_to_text")

# Task identifier declared in this module; the other task identifiers are
# imported from projects.models.
RELATION_EXTRACTION = "RelationExtraction"
# Closed set of encoding names accepted by the argument models below.
# The member order is kept exactly as before because it is visible in any
# schema generated from this type.
encodings = Literal[
    # "Auto" is not a codec name; presumably it requests automatic detection
    # by whatever consumes this value (TODO confirm against the reader code).
    "Auto",
    "ascii",
    "big5", "big5hkscs",
    # Code pages
    "cp037", "cp273", "cp424", "cp437", "cp500", "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875", "cp932", "cp949", "cp950",
    "cp1006", "cp1026", "cp1125", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp65001",
    # EUC family
    "euc_jp", "euc_jis_2004", "euc_jisx0213", "euc_kr",
    # GB family and HZ
    "gb2312", "gbk", "gb18030", "hz",
    # ISO-2022 family
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    # ISO-8859 family (latin_1 first)
    "latin_1",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5", "iso8859_6",
    "iso8859_7", "iso8859_8", "iso8859_9", "iso8859_10", "iso8859_11",
    "iso8859_13", "iso8859_14", "iso8859_15", "iso8859_16",
    "johab",
    "koi8_r", "koi8_t", "koi8_u",
    "kz1048",
    # Mac code pages
    "mac_cyrillic", "mac_greek", "mac_iceland",
    "mac_latin2", "mac_roman", "mac_turkish",
    "ptcp154",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    # Unicode transformation formats
    "utf_32", "utf_32_be", "utf_32_le",
    "utf_16", "utf_16_be", "utf_16_le",
    "utf_7", "utf_8", "utf_8_sig",
]
class Format:
    """Base descriptor of a file format: a name plus its accepted MIME types.

    Subclasses override ``name`` and ``accept_types`` and may override
    ``validate_mime`` or ``is_plain_text``.
    """

    name = ""
    accept_types = ""

    @classmethod
    def dict(cls):
        """Return the class-level ``name`` and ``accept_types`` as a dict."""
        return {key: getattr(cls, key) for key in ("name", "accept_types")}

    def validate_mime(self, mime: str):
        """Accept any MIME type; the base format imposes no restriction."""
        return True

    @staticmethod
    def is_plain_text():
        """Tell whether the format is plain text; ``False`` unless overridden."""
        return False
class CSV(Format):
    """Format entry named "CSV"; accepts the ``text/csv`` MIME type."""

    name = "CSV"
    accept_types = "text/csv"
class FastText(Format):
    """Format entry named "fastText"; accepts the ``text/plain`` MIME type."""

    name = "fastText"
    accept_types = "text/plain"
class JSON(Format):
    """Format entry named "JSON"; accepts ``application/json``."""

    name = "JSON"
    accept_types = "application/json"
class JSONL(Format):
    """Format entry named "JSONL"; its accepted types are the wildcard ``*``."""

    name = "JSONL"
    accept_types = "*"
class Excel(Format):
    """Format entry named "Excel".

    Accepts both ``application/vnd.ms-excel`` and the OpenXML spreadsheet
    MIME type, listed as one comma-separated string.
    """

    name = "Excel"
    accept_types = "application/vnd.ms-excel, application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
class TextFile(Format):
    """Format entry named "TextFile"; accepts ``text/*`` and is plain text."""

    name = "TextFile"
    accept_types = "text/*"

    @staticmethod
    def is_plain_text():
        """Report this format as plain text."""
        return True
class TextLine(Format):
    """Format entry named "TextLine"; accepts ``text/*`` and is plain text."""

    name = "TextLine"
    accept_types = "text/*"

    @staticmethod
    def is_plain_text():
        """Report this format as plain text."""
        return True
class CoNLL(Format):
    """Format entry named "CoNLL"; accepts ``text/*``."""

    name = "CoNLL"
    accept_types = "text/*"
class ImageFile(Format):
    """Format entry named "ImageFile"; accepts PNG, JPEG, BMP and GIF images."""

    name = "ImageFile"
    accept_types = "image/png, image/jpeg, image/bmp, image/gif"

    def validate_mime(self, mime: str):
        """Return True only when ``mime`` is exactly one of the accepted types.

        ``accept_types`` is a single comma-separated string, so testing
        ``mime in self.accept_types`` directly would be a substring match and
        would accept fragments such as ``""``, ``"image"`` or ``"png"``.
        The string is therefore split into its individual entries first.
        """
        allowed = {entry.strip() for entry in self.accept_types.split(",")}
        return mime in allowed
class AudioFile(Format):
    """Format entry named "AudioFile"; accepts OGG, AAC, MPEG and WAV audio."""

    name = "AudioFile"
    accept_types = "audio/ogg, audio/aac, audio/mpeg, audio/wav"

    def validate_mime(self, mime: str):
        """Return True only when ``mime`` is exactly one of the accepted types.

        ``accept_types`` is a single comma-separated string, so testing
        ``mime in self.accept_types`` directly would be a substring match and
        would accept fragments such as ``""``, ``"audio"`` or ``"wav"``.
        The string is therefore split into its individual entries first.
        """
        allowed = {entry.strip() for entry in self.accept_types.split(",")}
        return mime in allowed
# Arguments for options that take an encoding plus two column names.
# Documented with comments rather than a docstring: Option.dict() merges this
# model's .schema() into its result, and a docstring may show up there.
class ArgColumn(BaseModel):
    encoding: encodings = "utf_8"
    # presumably the column/key holding the example data -- confirm with reader code
    column_data: str = "text"
    # presumably the column/key holding the label(s) -- confirm with reader code
    column_label: str = "label"
# ArgColumn plus a field delimiter restricted to a fixed set of characters.
# Documented with comments rather than a docstring: Option.dict() merges this
# model's .schema() into its result, and a docstring may show up there.
class ArgDelimiter(ArgColumn):
    # Re-declares the field inherited from ArgColumn with the same type and default.
    encoding: encodings = "utf_8"
    # Allowed delimiters: comma, tab, semicolon, pipe, single space.
    delimiter: Literal[",", "\t", ";", "|", " "] = ","
# Arguments for options that only need a text encoding.
# Documented with comments rather than a docstring: Option.dict() merges this
# model's .schema() into its result, and a docstring may show up there.
class ArgEncoding(BaseModel):
    encoding: encodings = "utf_8"
# Arguments for the CoNLL option: encoding, tagging scheme and delimiter.
# Documented with comments rather than a docstring: Option.dict() merges this
# model's .schema() into its result, and a docstring may show up there.
class ArgCoNLL(BaseModel):
    encoding: encodings = "utf_8"
    # Tagging scheme, one of the four listed names; defaults to IOB2.
    scheme: Literal["IOB2", "IOE2", "IOBES", "BILOU"] = "IOB2"
    # Either a single space or the empty string.
    # NOTE(review): how the empty string is interpreted is not visible here.
    delimiter: Literal[" ", ""] = " "
# Argument model with no fields, used by options that take no arguments.
# Documented with comments rather than a docstring: Option.dict() merges this
# model's .schema() into its result, and a docstring may show up there.
class ArgNone(BaseModel):
    pass
@dataclass
class Option:
    """One catalog entry: a file format offered for a task, with an example.

    ``file_format`` and ``arg`` are classes, not instances; ``dict()`` calls
    their class-level ``dict()`` and ``schema()`` respectively.
    """

    display_name: str
    task_id: str
    file_format: Type[Format]
    arg: Type[BaseModel]
    file: Path  # path of the example file read by ``example``

    @property
    def example(self) -> str:
        """Return the full text of the example file, decoded as UTF-8."""
        return Path(self.file).read_text(encoding="utf-8")

    def dict(self) -> Dict:
        """Merge format info, argument schema and option metadata into a dict.

        Later entries win on key collisions: the argument schema overrides
        the format info, and the three option keys override both.
        """
        payload: Dict = {}
        payload.update(self.file_format.dict())
        payload.update(self.arg.schema())
        payload["example"] = self.example
        payload["task_id"] = self.task_id
        payload["display_name"] = self.display_name
        return payload
def create_file_format(file_format: str) -> Format:
    """Instantiate the ``Format`` subclass whose ``name`` equals ``file_format``.

    Only direct subclasses of ``Format`` are searched, and the first match in
    ``Format.__subclasses__()`` order is used.

    Raises:
        FileFormatException: if no subclass carries that name.
    """
    candidates = (klass for klass in Format.__subclasses__() if klass.name == file_format)
    matched = next(candidates, None)
    if matched is None:
        raise FileFormatException(file_format)
    return matched()
class Options:
    """Class-level registry of options, grouped by task identifier."""

    # Shared by the whole class: task id -> list of registered options.
    # Being a defaultdict, looking up an unknown task id creates an empty list.
    options: Dict[str, List] = defaultdict(list)

    @classmethod
    def filter_by_task(cls, task_name: str, use_relation: bool = False):
        """Return the ``dict()`` of every option registered for ``task_name``.

        When ``use_relation`` is true, the options registered under
        ``RELATION_EXTRACTION`` are appended after the task's own options.
        """
        selected = list(cls.options[task_name])
        if use_relation:
            selected.extend(cls.options[RELATION_EXTRACTION])
        return [entry.dict() for entry in selected]

    @classmethod
    def register(cls, option: Option):
        """Append ``option`` to the list kept for its ``task_id``."""
        bucket = cls.options[option.task_id]
        bucket.append(option)
# Text tasks: each one gets a TextFile option followed by a TextLine option,
# both taking only an encoding argument.
text_tasks = [DOCUMENT_CLASSIFICATION, SEQUENCE_LABELING, SEQ2SEQ, INTENT_DETECTION_AND_SLOT_FILLING]
for task_id in text_tasks:
    for _text_format, _example_name in (
        (TextFile, "text_files.txt"),
        (TextLine, "text_lines.txt"),
    ):
        Options.register(
            Option(
                display_name=_text_format.name,
                task_id=task_id,
                file_format=_text_format,
                arg=ArgEncoding,
                file=TASK_AGNOSTIC_DIR / _example_name,
            )
        )
# Text Classification: (format, argument model, example file), registered in
# this order.
for _format_cls, _arg_cls, _example_name in (
    (CSV, ArgDelimiter, "example.csv"),
    (FastText, ArgEncoding, "example.txt"),
    (JSON, ArgColumn, "example.json"),
    (JSONL, ArgColumn, "example.jsonl"),
    # The Excel option points at the same example file as CSV.
    (Excel, ArgColumn, "example.csv"),
):
    Options.register(
        Option(
            display_name=_format_cls.name,
            task_id=DOCUMENT_CLASSIFICATION,
            file_format=_format_cls,
            arg=_arg_cls,
            file=TEXT_CLASSIFICATION_DIR / _example_name,
        )
    )
# Sequence Labelling: (format, argument model, example file), registered in
# this order.
for _format_cls, _arg_cls, _example_name in (
    (JSONL, ArgColumn, "example.jsonl"),
    (CoNLL, ArgCoNLL, "example.txt"),
):
    Options.register(
        Option(
            display_name=_format_cls.name,
            task_id=SEQUENCE_LABELING,
            file_format=_format_cls,
            arg=_arg_cls,
            file=SEQUENCE_LABELING_DIR / _example_name,
        )
    )
# Relation Extraction: a JSONL option with its own display name and no
# arguments. Options.filter_by_task appends it when use_relation is set.
_relation_option = Option(
    task_id=RELATION_EXTRACTION,
    display_name="JSONL(Relation)",
    file_format=JSONL,
    arg=ArgNone,
    file=RELATION_EXTRACTION_DIR / "example.jsonl",
)
Options.register(_relation_option)
# Seq2seq: (format, argument model, example file), registered in this order.
for _format_cls, _arg_cls, _example_name in (
    (CSV, ArgDelimiter, "example.csv"),
    (JSON, ArgColumn, "example.json"),
    (JSONL, ArgColumn, "example.jsonl"),
    # The Excel option points at the same example file as CSV.
    (Excel, ArgColumn, "example.csv"),
):
    Options.register(
        Option(
            display_name=_format_cls.name,
            task_id=SEQ2SEQ,
            file_format=_format_cls,
            arg=_arg_cls,
            file=SEQ2SEQ_DIR / _example_name,
        )
    )
# Intent detection: a single JSONL option that takes no arguments.
_intent_option = Option(
    task_id=INTENT_DETECTION_AND_SLOT_FILLING,
    display_name=JSONL.name,
    file_format=JSONL,
    arg=ArgNone,
    file=INTENT_DETECTION_DIR / "example.jsonl",
)
Options.register(_intent_option)
# Image tasks: every image task gets the same ImageFile option, and all of
# them share the example file from the image classification directory.
image_tasks = [IMAGE_CLASSIFICATION, IMAGE_CAPTIONING, BOUNDING_BOX, SEGMENTATION]
_image_example = IMAGE_CLASSIFICATION_DIR / "image_files.txt"
for task_name in image_tasks:
    Options.register(
        Option(
            task_id=task_name,
            display_name=ImageFile.name,
            file_format=ImageFile,
            arg=ArgNone,
            file=_image_example,
        )
    )
# Speech to Text: a single AudioFile option that takes no arguments.
_speech_option = Option(
    task_id=SPEECH2TEXT,
    display_name=AudioFile.name,
    file_format=AudioFile,
    arg=ArgNone,
    file=SPEECH_TO_TEXT_DIR / "audio_files.txt",
)
Options.register(_speech_option)