"""Catalog of file formats and the per-task options built from them.

Each ``Option`` ties a task identifier to a ``Format`` class, a pydantic
model describing the format's arguments, and an example file.  Options are
registered on ``Options`` at import time and looked up by task name.
"""
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Type

from pydantic import BaseModel
from typing_extensions import Literal

from .exceptions import FileFormatException
from projects.models import (
    BOUNDING_BOX,
    DOCUMENT_CLASSIFICATION,
    IMAGE_CAPTIONING,
    IMAGE_CLASSIFICATION,
    INTENT_DETECTION_AND_SLOT_FILLING,
    SEGMENTATION,
    SEQ2SEQ,
    SEQUENCE_LABELING,
    SPEECH2TEXT,
)

# Define the example directories
EXAMPLE_DIR = Path(__file__).parent.resolve() / "examples"
TASK_AGNOSTIC_DIR = EXAMPLE_DIR / "task_agnostic"
TEXT_CLASSIFICATION_DIR = EXAMPLE_DIR / "text_classification"
SEQUENCE_LABELING_DIR = EXAMPLE_DIR / "sequence_labeling"
RELATION_EXTRACTION_DIR = EXAMPLE_DIR / "relation_extraction"
SEQ2SEQ_DIR = EXAMPLE_DIR / "sequence_to_sequence"
INTENT_DETECTION_DIR = EXAMPLE_DIR / "intent_detection"
IMAGE_CLASSIFICATION_DIR = EXAMPLE_DIR / "image_classification"
SPEECH_TO_TEXT_DIR = EXAMPLE_DIR / "speech_to_text"

# Define the task identifiers
RELATION_EXTRACTION = "RelationExtraction"

# Closed set of values accepted by the ``encoding`` field of the Arg* models.
encodings = Literal[
    "Auto",
    "ascii",
    "big5", "big5hkscs",
    "cp037", "cp273", "cp424", "cp437", "cp500", "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858", "cp860", "cp861",
    "cp862", "cp863", "cp864", "cp865", "cp866", "cp869", "cp874", "cp875",
    "cp932", "cp949", "cp950", "cp1006", "cp1026", "cp1125", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254", "cp1255", "cp1256",
    "cp1257", "cp1258", "cp65001",
    "euc_jp", "euc_jis_2004", "euc_jisx0213", "euc_kr",
    "gb2312", "gbk", "gb18030", "hz",
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    "latin_1",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5", "iso8859_6",
    "iso8859_7", "iso8859_8", "iso8859_9", "iso8859_10", "iso8859_11",
    "iso8859_13", "iso8859_14", "iso8859_15", "iso8859_16",
    "johab",
    "koi8_r", "koi8_t", "koi8_u",
    "kz1048",
    "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2", "mac_roman",
    "mac_turkish",
    "ptcp154",
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    "utf_32", "utf_32_be", "utf_32_le",
    "utf_16", "utf_16_be", "utf_16_le",
    "utf_7", "utf_8", "utf_8_sig",
]


class Format:
    """Base class for a file format.

    Subclasses set ``name`` (the identifier matched by
    ``create_file_format``) and ``accept_types`` (a comma-separated string
    of MIME types or MIME patterns).
    """

    name = ""
    accept_types = ""

    @classmethod
    def dict(cls):
        """Return the format's ``name`` and ``accept_types`` as a dict."""
        return {"name": cls.name, "accept_types": cls.accept_types}

    @classmethod
    def _accepted_mime_types(cls):
        """Return the entries of ``accept_types`` as a set of stripped strings."""
        return {entry.strip() for entry in cls.accept_types.split(",") if entry.strip()}

    def validate_mime(self, mime: str):
        """Return True if ``mime`` is acceptable; the base class accepts everything."""
        return True

    @staticmethod
    def is_plain_text():
        """Return whether the format is plain text; False unless overridden."""
        return False


class CSV(Format):
    name = "CSV"
    accept_types = "text/csv"


class FastText(Format):
    name = "fastText"
    accept_types = "text/plain"


class JSON(Format):
    name = "JSON"
    accept_types = "application/json"


class JSONL(Format):
    name = "JSONL"
    accept_types = "*"


class Excel(Format):
    name = "Excel"
    accept_types = "application/vnd.ms-excel, application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"


class TextFile(Format):
    name = "TextFile"
    accept_types = "text/*"

    @staticmethod
    def is_plain_text():
        return True


class TextLine(Format):
    name = "TextLine"
    accept_types = "text/*"

    @staticmethod
    def is_plain_text():
        return True


class CoNLL(Format):
    name = "CoNLL"
    accept_types = "text/*"


class ImageFile(Format):
    name = "ImageFile"
    accept_types = "image/png, image/jpeg, image/bmp, image/gif"

    def validate_mime(self, mime: str):
        """Return True only if ``mime`` is exactly one of the listed image types."""
        # Compare against the parsed entries: ``mime in self.accept_types`` on
        # the raw string is a substring test and would accept "", "image", etc.
        return mime in self._accepted_mime_types()


class AudioFile(Format):
    name = "AudioFile"
    accept_types = "audio/ogg, audio/aac, audio/mpeg, audio/wav"

    def validate_mime(self, mime: str):
        """Return True only if ``mime`` is exactly one of the listed audio types."""
        # Exact membership, not a substring test on the comma-separated string.
        return mime in self._accepted_mime_types()


# NOTE: the Arg* models below are documented with comments rather than
# docstrings on purpose: ``Option.dict`` merges ``arg.schema()`` into its
# output, and pydantic copies a model's docstring into that schema.


# Arguments for formats that name a data column and a label column.
class ArgColumn(BaseModel):
    encoding: encodings = "utf_8"
    column_data: str = "text"
    column_label: str = "label"


# ArgColumn plus the field delimiter.
class ArgDelimiter(ArgColumn):
    encoding: encodings = "utf_8"
    delimiter: Literal[",", "\t", ";", "|", " "] = ","


# Arguments for formats that only need an encoding.
class ArgEncoding(BaseModel):
    encoding: encodings = "utf_8"


# Arguments for the CoNLL format: encoding, tagging scheme and delimiter.
class ArgCoNLL(BaseModel):
    encoding: encodings = "utf_8"
    scheme: Literal["IOB2", "IOE2", "IOBES", "BILOU"] = "IOB2"
    delimiter: Literal[" ", ""] = " "


# Arguments for formats that take none.
class ArgNone(BaseModel):
    pass


@dataclass
class Option:
    """One selectable (task, file format) combination and its example file."""

    display_name: str
    task_id: str
    file_format: Type[Format]
    arg: Type[BaseModel]
    file: Path

    @property
    def example(self) -> str:
        """Return the content of ``file``, read as UTF-8 on every access."""
        with open(self.file, "r", encoding="utf-8") as f:
            return f.read()

    def dict(self) -> Dict:
        """Return the option as a flat dict.

        Merges the format's ``dict()`` and the argument model's ``schema()``,
        then adds ``example``, ``task_id`` and ``display_name``; later keys
        win on collision.
        """
        return {
            **self.file_format.dict(),
            **self.arg.schema(),
            "example": self.example,
            "task_id": self.task_id,
            "display_name": self.display_name,
        }


def create_file_format(file_format: str) -> Format:
    """Instantiate the ``Format`` subclass whose ``name`` equals ``file_format``.

    Only direct subclasses of ``Format`` are searched.

    Raises:
        FileFormatException: if no subclass has a matching name.
    """
    for format_class in Format.__subclasses__():
        if format_class.name == file_format:
            return format_class()
    raise FileFormatException(file_format)


class Options:
    """Class-level registry mapping a task identifier to its ``Option`` list."""

    options: Dict[str, List] = defaultdict(list)

    @classmethod
    def filter_by_task(cls, task_name: str, use_relation: bool = False):
        """Return the dict form of every option registered for ``task_name``.

        When ``use_relation`` is true, the options registered under
        ``RELATION_EXTRACTION`` are appended.  An unknown task name yields
        an empty list.
        """
        # ``.get`` instead of indexing: indexing the defaultdict would insert
        # an empty entry into the shared registry for every unknown task name.
        options = cls.options.get(task_name, [])
        if use_relation:
            options = options + cls.options.get(RELATION_EXTRACTION, [])
        return [option.dict() for option in options]

    @classmethod
    def register(cls, option: Option):
        """Append ``option`` to the list kept for ``option.task_id``."""
        cls.options[option.task_id].append(option)


# Text tasks
text_tasks = [DOCUMENT_CLASSIFICATION, SEQUENCE_LABELING, SEQ2SEQ, INTENT_DETECTION_AND_SLOT_FILLING]
for task_id in text_tasks:
    Options.register(
        Option(
            display_name=TextFile.name,
            task_id=task_id,
            file_format=TextFile,
            arg=ArgEncoding,
            file=TASK_AGNOSTIC_DIR / "text_files.txt",
        )
    )
    Options.register(
        Option(
            display_name=TextLine.name,
            task_id=task_id,
            file_format=TextLine,
            arg=ArgEncoding,
            file=TASK_AGNOSTIC_DIR / "text_lines.txt",
        )
    )

# Text Classification
Options.register(
    Option(
        display_name=CSV.name,
        task_id=DOCUMENT_CLASSIFICATION,
        file_format=CSV,
        arg=ArgDelimiter,
        file=TEXT_CLASSIFICATION_DIR / "example.csv",
    )
)
Options.register(
    Option(
        display_name=FastText.name,
        task_id=DOCUMENT_CLASSIFICATION,
        file_format=FastText,
        arg=ArgEncoding,
        file=TEXT_CLASSIFICATION_DIR / "example.txt",
    )
)
Options.register(
    Option(
        display_name=JSON.name,
        task_id=DOCUMENT_CLASSIFICATION,
        file_format=JSON,
        arg=ArgColumn,
        file=TEXT_CLASSIFICATION_DIR / "example.json",
    )
)
Options.register(
    Option(
        display_name=JSONL.name,
        task_id=DOCUMENT_CLASSIFICATION,
        file_format=JSONL,
        arg=ArgColumn,
        file=TEXT_CLASSIFICATION_DIR / "example.jsonl",
    )
)
Options.register(
    Option(
        display_name=Excel.name,
        task_id=DOCUMENT_CLASSIFICATION,
        file_format=Excel,
        arg=ArgColumn,
        file=TEXT_CLASSIFICATION_DIR / "example.csv",
    )
)

# Sequence Labelling
Options.register(
    Option(
        display_name=JSONL.name,
        task_id=SEQUENCE_LABELING,
        file_format=JSONL,
        arg=ArgColumn,
        file=SEQUENCE_LABELING_DIR / "example.jsonl",
    )
)
Options.register(
    Option(
        display_name=CoNLL.name,
        task_id=SEQUENCE_LABELING,
        file_format=CoNLL,
        arg=ArgCoNLL,
        file=SEQUENCE_LABELING_DIR / "example.txt",
    )
)

# Relation Extraction
Options.register(
    Option(
        display_name="JSONL(Relation)",
        task_id=RELATION_EXTRACTION,
        file_format=JSONL,
        arg=ArgNone,
        file=RELATION_EXTRACTION_DIR / "example.jsonl",
    )
)

# Seq2seq
Options.register(
    Option(
        display_name=CSV.name,
        task_id=SEQ2SEQ,
        file_format=CSV,
        arg=ArgDelimiter,
        file=SEQ2SEQ_DIR / "example.csv",
    )
)
Options.register(
    Option(
        display_name=JSON.name,
        task_id=SEQ2SEQ,
        file_format=JSON,
        arg=ArgColumn,
        file=SEQ2SEQ_DIR / "example.json",
    )
)
Options.register(
    Option(
        display_name=JSONL.name,
        task_id=SEQ2SEQ,
        file_format=JSONL,
        arg=ArgColumn,
        file=SEQ2SEQ_DIR / "example.jsonl",
    )
)
Options.register(
    Option(
        display_name=Excel.name,
        task_id=SEQ2SEQ,
        file_format=Excel,
        arg=ArgColumn,
        file=SEQ2SEQ_DIR / "example.csv",
    )
)

# Intent detection
Options.register(
    Option(
        display_name=JSONL.name,
        task_id=INTENT_DETECTION_AND_SLOT_FILLING,
        file_format=JSONL,
        arg=ArgNone,
        file=INTENT_DETECTION_DIR / "example.jsonl",
    )
)

# Image tasks
image_tasks = [IMAGE_CLASSIFICATION, IMAGE_CAPTIONING, BOUNDING_BOX, SEGMENTATION]
for task_name in image_tasks:
    Options.register(
        Option(
            display_name=ImageFile.name,
            task_id=task_name,
            file_format=ImageFile,
            arg=ArgNone,
            file=IMAGE_CLASSIFICATION_DIR / "image_files.txt",
        )
    )

# Speech to Text
Options.register(
    Option(
        display_name=AudioFile.name,
        task_id=SPEECH2TEXT,
        file_format=AudioFile,
        arg=ArgNone,
        file=SPEECH_TO_TEXT_DIR / "audio_files.txt",
    )
)