You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

426 lines
8.8 KiB

from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Type
from pydantic import BaseModel
from typing_extensions import Literal
from projects.models import (
DOCUMENT_CLASSIFICATION,
IMAGE_CLASSIFICATION,
INTENT_DETECTION_AND_SLOT_FILLING,
SEQ2SEQ,
SEQUENCE_LABELING,
SPEECH2TEXT,
)
# Root of the bundled example files: an "examples" directory located next to
# this module (the module's parent directory is resolved to an absolute path).
EXAMPLE_DIR = Path(__file__).parent.resolve().joinpath("examples")
# One sub-directory per task family, plus one for examples shared across tasks.
TASK_AGNOSTIC_DIR = EXAMPLE_DIR.joinpath("task_agnostic")
TEXT_CLASSIFICATION_DIR = EXAMPLE_DIR.joinpath("text_classification")
SEQUENCE_LABELING_DIR = EXAMPLE_DIR.joinpath("sequence_labeling")
RELATION_EXTRACTION_DIR = EXAMPLE_DIR.joinpath("relation_extraction")
SEQ2SEQ_DIR = EXAMPLE_DIR.joinpath("sequence_to_sequence")
INTENT_DETECTION_DIR = EXAMPLE_DIR.joinpath("intent_detection")
IMAGE_CLASSIFICATION_DIR = EXAMPLE_DIR.joinpath("image_classification")
SPEECH_TO_TEXT_DIR = EXAMPLE_DIR.joinpath("speech_to_text")
# Task identifier for relation extraction. Unlike the other task identifiers,
# which are imported from projects.models, this one is defined locally.
RELATION_EXTRACTION = "RelationExtraction"
# Closed set of values allowed for the ``encoding`` field of the Arg* models.
# Apart from the leading "Auto" entry the names follow Python's codec naming
# ("Auto" presumably requests encoding detection - confirm with the consumer).
# The order of the entries is preserved exactly, since it is observable in any
# schema generated from this type.
encodings = Literal[
    "Auto",
    # ASCII and Big5
    "ascii", "big5", "big5hkscs",
    # cp* code pages
    "cp037", "cp273", "cp424", "cp437", "cp500", "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858", "cp860", "cp861",
    "cp862", "cp863", "cp864", "cp865", "cp866", "cp869", "cp874", "cp875",
    "cp932", "cp949", "cp950", "cp1006", "cp1026", "cp1125", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254", "cp1255", "cp1256",
    "cp1257", "cp1258", "cp65001",
    # EUC, GB and HZ
    "euc_jp", "euc_jis_2004", "euc_jisx0213", "euc_kr",
    "gb2312", "gbk", "gb18030", "hz",
    # ISO-2022
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    # ISO-8859 ("latin_1" takes the place of iso8859_1)
    "latin_1", "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5",
    "iso8859_6", "iso8859_7", "iso8859_8", "iso8859_9", "iso8859_10",
    "iso8859_11", "iso8859_13", "iso8859_14", "iso8859_15", "iso8859_16",
    # Johab, KOI8 and KZ-1048
    "johab", "koi8_r", "koi8_t", "koi8_u", "kz1048",
    # Macintosh
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2", "mac_roman",
    "mac_turkish",
    # PTCP154 and Shift JIS
    "ptcp154", "shift_jis", "shift_jis_2004", "shift_jisx0213",
    # Unicode transformation formats
    "utf_32", "utf_32_be", "utf_32_le",
    "utf_16", "utf_16_be", "utf_16_le",
    "utf_7", "utf_8", "utf_8_sig",
]
class Format:
    """Base descriptor of an importable file format.

    Subclasses override the two class attributes; the class itself (not an
    instance) is what gets passed around and serialized.
    """

    name = ""
    accept_types = ""

    @classmethod
    def dict(cls):
        """Return the descriptor as ``{"name": ..., "accept_types": ...}``."""
        exported = ("name", "accept_types")
        return {attribute: getattr(cls, attribute) for attribute in exported}
class CSV(Format):
    """Format descriptor for CSV files (MIME type ``text/csv``)."""

    name = "CSV"
    accept_types = "text/csv"
class FastText(Format):
    """Format descriptor named "fastText"; accepts ``text/plain``."""

    name = "fastText"
    accept_types = "text/plain"
class JSON(Format):
    """Format descriptor for JSON files (MIME type ``application/json``)."""

    name = "JSON"
    accept_types = "application/json"
class JSONL(Format):
    """Format descriptor for JSONL files.

    ``accept_types`` is the wildcard ``*`` rather than a specific MIME type.
    """

    name = "JSONL"
    accept_types = "*"
class Excel(Format):
    """Format descriptor for Excel workbooks.

    ``accept_types`` lists two MIME types, comma-separated: the
    ``application/vnd.ms-excel`` type and the OpenXML spreadsheet type.
    """

    name = "Excel"
    accept_types = "application/vnd.ms-excel, application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
class TextFile(Format):
    """Format descriptor named "TextFile"; accepts any ``text/*`` type.

    NOTE(review): presumably one example per uploaded file, in contrast to
    TextLine - confirm against the parser that consumes this descriptor.
    """

    name = "TextFile"
    accept_types = "text/*"
class TextLine(Format):
    """Format descriptor named "TextLine"; accepts any ``text/*`` type.

    NOTE(review): presumably one example per line of the uploaded file -
    confirm against the parser that consumes this descriptor.
    """

    name = "TextLine"
    accept_types = "text/*"
class CoNLL(Format):
    """Format descriptor for CoNLL files; accepts any ``text/*`` type."""

    name = "CoNLL"
    accept_types = "text/*"
class ImageFile(Format):
    """Format descriptor for image uploads (PNG, JPEG, BMP and GIF MIME types)."""

    name = "ImageFile"
    accept_types = "image/png, image/jpeg, image/bmp, image/gif"
class AudioFile(Format):
    """Format descriptor for audio uploads (OGG, AAC, MPEG and WAV MIME types)."""

    name = "AudioFile"
    accept_types = "audio/ogg, audio/aac, audio/mpeg, audio/wav"
# Argument model with a text encoding and two string fields, column_data and
# column_label (presumably the names of the columns/keys that hold the example
# text and its label - confirm against the reader that consumes them).
# NOTE(review): documented with comments rather than a class docstring because
# pydantic can surface a model's docstring as "description" in schema(), and
# Option.dict() spreads schema() into its output - confirm before converting.
class ArgColumn(BaseModel):
    encoding: encodings = "utf_8"
    column_data: str = "text"
    column_label: str = "label"
# ArgColumn plus a field delimiter restricted to comma, tab, semicolon, pipe
# or a single space (default: comma).
class ArgDelimiter(ArgColumn):
    # Re-declared with the same type and default as in ArgColumn.
    encoding: encodings = "utf_8"
    delimiter: Literal[",", "\t", ";", "|", " "] = ","
# Argument model whose only option is the text encoding (default "utf_8").
class ArgEncoding(BaseModel):
    encoding: encodings = "utf_8"
# Argument model used with the CoNLL format: text encoding, tagging scheme and
# delimiter.
class ArgCoNLL(BaseModel):
    encoding: encodings = "utf_8"
    # One of four tagging schemes; IOB2 is the default.
    scheme: Literal["IOB2", "IOE2", "IOBES", "BILOU"] = "IOB2"
    # Either a single space (default) or the empty string.
    # NOTE(review): the meaning of the empty-string choice is not visible
    # here - confirm with the CoNLL reader.
    delimiter: Literal[" ", ""] = " "
# Argument model with no fields, used by options that take no arguments.
class ArgNone(BaseModel):
    pass
@dataclass
class Option:
    """One selectable import option: a file format bound to a task.

    Attributes:
        display_name: Label reported under the "display_name" key.
        task_id: Identifier of the task this option is registered for.
        file_format: ``Format`` subclass contributing "name"/"accept_types".
        arg: pydantic model class whose ``schema()`` is merged into the output.
        file: Path of the example file returned by ``example``.
    """

    display_name: str
    task_id: str
    file_format: Type[Format]
    arg: Type[BaseModel]
    file: Path

    @property
    def example(self) -> str:
        """Return the full content of the example file, decoded as UTF-8."""
        return self.file.read_text(encoding="utf-8")

    def dict(self) -> Dict:
        """Serialize the option into a single flat dictionary.

        Keys are added in this order, later ones overriding earlier ones on a
        name clash: the format descriptor, the argument schema, then
        "example", "task_id" and "display_name".
        """
        payload: Dict = {}
        payload.update(self.file_format.dict())
        payload.update(self.arg.schema())
        payload["example"] = self.example
        payload["task_id"] = self.task_id
        payload["display_name"] = self.display_name
        return payload
class Options:
    """Registry of import options, grouped by task identifier.

    The registry is a class-level mapping shared by the whole process; it is
    filled through ``register`` and read through ``filter_by_task``.
    """

    # Shared on purpose: every caller sees the same registry.
    options: Dict[str, List] = defaultdict(list)

    @classmethod
    def filter_by_task(cls, task_name: str, use_relation: bool = False) -> List[Dict]:
        """Return the serialized options registered for ``task_name``.

        Args:
            task_name: Task identifier to look up. An unknown identifier
                yields an empty list.
            use_relation: When true, the options registered under
                ``RELATION_EXTRACTION`` are appended after the task's own.

        Returns:
            A new list with the ``dict()`` form of each matching option, in
            registration order.
        """
        # Use ``.get`` instead of indexing: ``options`` is a defaultdict, so
        # ``cls.options[task_name]`` would insert an empty entry into the
        # shared registry for every unknown name that is merely queried.
        selected = list(cls.options.get(task_name, ()))
        if use_relation:
            selected.extend(cls.options.get(RELATION_EXTRACTION, ()))
        return [option.dict() for option in selected]

    @classmethod
    def register(cls, option: "Option") -> None:
        """Append ``option`` to the list kept for its ``task_id``."""
        cls.options[option.task_id].append(option)
# Text tasks: each of them gets the two plain-text formats, both using
# ArgEncoding and example files from the task-agnostic directory.
text_tasks = [DOCUMENT_CLASSIFICATION, SEQUENCE_LABELING, SEQ2SEQ, INTENT_DETECTION_AND_SLOT_FILLING]
for task_id in text_tasks:
    # Within a task, TextFile is registered before TextLine.
    for _text_format, _example_name in (
        (TextFile, "text_files.txt"),
        (TextLine, "text_lines.txt"),
    ):
        Options.register(
            Option(
                display_name=_text_format.name,
                task_id=task_id,
                file_format=_text_format,
                arg=ArgEncoding,
                file=TASK_AGNOSTIC_DIR / _example_name,
            )
        )
# Drop the helper loop variables so the module namespace stays as before.
del _text_format, _example_name
# Text Classification: (format, argument model, example file), registered in
# the order listed.
for _file_format, _arg_model, _example_name in (
    (CSV, ArgDelimiter, "example.csv"),
    (FastText, ArgEncoding, "example.txt"),
    (JSON, ArgColumn, "example.json"),
    (JSONL, ArgColumn, "example.jsonl"),
    # Excel points at the same example file as CSV.
    (Excel, ArgColumn, "example.csv"),
):
    Options.register(
        Option(
            display_name=_file_format.name,
            task_id=DOCUMENT_CLASSIFICATION,
            file_format=_file_format,
            arg=_arg_model,
            file=TEXT_CLASSIFICATION_DIR / _example_name,
        )
    )
# Drop the helper loop variables so the module namespace stays as before.
del _file_format, _arg_model, _example_name
# Sequence Labelling: (format, argument model, example file), registered in
# the order listed.
for _file_format, _arg_model, _example_name in (
    (JSONL, ArgColumn, "example.jsonl"),
    (CoNLL, ArgCoNLL, "example.txt"),
):
    Options.register(
        Option(
            display_name=_file_format.name,
            task_id=SEQUENCE_LABELING,
            file_format=_file_format,
            arg=_arg_model,
            file=SEQUENCE_LABELING_DIR / _example_name,
        )
    )
# Drop the helper loop variables so the module namespace stays as before.
del _file_format, _arg_model, _example_name
# Relation Extraction: registered under its own RELATION_EXTRACTION key, which
# Options.filter_by_task appends to a task's options when use_relation is true.
# It reuses the JSONL format but carries a distinct display name and takes no
# arguments.
Options.register(
    Option(
        display_name="JSONL(Relation)",
        task_id=RELATION_EXTRACTION,
        file_format=JSONL,
        arg=ArgNone,
        file=RELATION_EXTRACTION_DIR / "example.jsonl",
    )
)
# Seq2seq: (format, argument model, example file), registered in the order
# listed.
for _file_format, _arg_model, _example_name in (
    (CSV, ArgDelimiter, "example.csv"),
    (JSON, ArgColumn, "example.json"),
    (JSONL, ArgColumn, "example.jsonl"),
    # Excel points at the same example file as CSV.
    (Excel, ArgColumn, "example.csv"),
):
    Options.register(
        Option(
            display_name=_file_format.name,
            task_id=SEQ2SEQ,
            file_format=_file_format,
            arg=_arg_model,
            file=SEQ2SEQ_DIR / _example_name,
        )
    )
# Drop the helper loop variables so the module namespace stays as before.
del _file_format, _arg_model, _example_name
# Intent detection: a JSONL option that takes no arguments (ArgNone). It is
# registered in addition to the TextFile/TextLine options that the text-task
# loop adds for the same task identifier.
Options.register(
    Option(
        display_name=JSONL.name,
        task_id=INTENT_DETECTION_AND_SLOT_FILLING,
        file_format=JSONL,
        arg=ArgNone,
        file=INTENT_DETECTION_DIR / "example.jsonl",
    )
)
# Image Classification: a single image-file option without arguments. The
# example is a text file (image_files.txt), read as UTF-8 by Option.example.
Options.register(
    Option(
        display_name=ImageFile.name,
        task_id=IMAGE_CLASSIFICATION,
        file_format=ImageFile,
        arg=ArgNone,
        file=IMAGE_CLASSIFICATION_DIR / "image_files.txt",
    )
)
# Speech to Text: a single audio-file option without arguments. The example
# is a text file (audio_files.txt), read as UTF-8 by Option.example.
Options.register(
    Option(
        display_name=AudioFile.name,
        task_id=SPEECH2TEXT,
        file_format=AudioFile,
        arg=ArgNone,
        file=SPEECH_TO_TEXT_DIR / "audio_files.txt",
    )
)