doccano/backend/data_import/pipeline/catalog.py


								from collections import defaultdict

								from dataclasses import dataclass

								from pathlib import Path

								from typing import Dict, List, Type


								from pydantic import BaseModel

								from typing_extensions import Literal


								from .exceptions import FileFormatException

								from projects.models import ProjectType


								# Example files live in an "examples" directory next to this module,
								# with one sub-directory per group of import options.
								EXAMPLE_DIR = Path(__file__).parent.resolve().joinpath("examples")
								TASK_AGNOSTIC_DIR = EXAMPLE_DIR.joinpath("task_agnostic")
								TEXT_CLASSIFICATION_DIR = EXAMPLE_DIR.joinpath("text_classification")
								SEQUENCE_LABELING_DIR = EXAMPLE_DIR.joinpath("sequence_labeling")
								RELATION_EXTRACTION_DIR = EXAMPLE_DIR.joinpath("relation_extraction")
								SEQ2SEQ_DIR = EXAMPLE_DIR.joinpath("sequence_to_sequence")
								INTENT_DETECTION_DIR = EXAMPLE_DIR.joinpath("intent_detection")
								IMAGE_CLASSIFICATION_DIR = EXAMPLE_DIR.joinpath("image_classification")
								SPEECH_TO_TEXT_DIR = EXAMPLE_DIR.joinpath("speech_to_text")

								# Registry key for the relation-extraction options; unlike the other
								# task ids in this module it is a plain string, not a ProjectType member.
								RELATION_EXTRACTION = "RelationExtraction"


								# Closed set of values accepted by the `encoding` field of the Arg*
								# option models defined in this module.
								# Apart from "Auto", the entries look like Python codec names.
								# NOTE(review): "Auto" presumably asks the reader to detect the encoding;
								# confirm against the code that consumes this value.
								# Keep the order stable: Literal preserves argument order, and it may
								# surface in schemas generated from the models that use this type.
								encodings = Literal[

								    "Auto",

								    "ascii",

								    "big5",

								    "big5hkscs",

								    "cp037",

								    "cp273",

								    "cp424",

								    "cp437",

								    "cp500",

								    "cp720",

								    "cp737",

								    "cp775",

								    "cp850",

								    "cp852",

								    "cp855",

								    "cp856",

								    "cp857",

								    "cp858",

								    "cp860",

								    "cp861",

								    "cp862",

								    "cp863",

								    "cp864",

								    "cp865",

								    "cp866",

								    "cp869",

								    "cp874",

								    "cp875",

								    "cp932",

								    "cp949",

								    "cp950",

								    "cp1006",

								    "cp1026",

								    "cp1125",

								    "cp1140",

								    "cp1250",

								    "cp1251",

								    "cp1252",

								    "cp1253",

								    "cp1254",

								    "cp1255",

								    "cp1256",

								    "cp1257",

								    "cp1258",

								    "cp65001",

								    "euc_jp",

								    "euc_jis_2004",

								    "euc_jisx0213",

								    "euc_kr",

								    "gb2312",

								    "gbk",

								    "gb18030",

								    "hz",

								    "iso2022_jp",

								    "iso2022_jp_1",

								    "iso2022_jp_2",

								    "iso2022_jp_2004",

								    "iso2022_jp_3",

								    "iso2022_jp_ext",

								    "iso2022_kr",

								    "latin_1",

								    "iso8859_2",

								    "iso8859_3",

								    "iso8859_4",

								    "iso8859_5",

								    "iso8859_6",

								    "iso8859_7",

								    "iso8859_8",

								    "iso8859_9",

								    "iso8859_10",

								    "iso8859_11",

								    "iso8859_13",

								    "iso8859_14",

								    "iso8859_15",

								    "iso8859_16",

								    "johab",

								    "koi8_r",

								    "koi8_t",

								    "koi8_u",

								    "kz1048",

								    "mac_cyrillic",

								    "mac_greek",

								    "mac_iceland",

								    "mac_latin2",

								    "mac_roman",

								    "mac_turkish",

								    "ptcp154",

								    "shift_jis",

								    "shift_jis_2004",

								    "shift_jisx0213",

								    "utf_32",

								    "utf_32_be",

								    "utf_32_le",

								    "utf_16",

								    "utf_16_be",

								    "utf_16_le",

								    "utf_7",

								    "utf_8",

								    "utf_8_sig",

								]


								class Format:
								    """Base descriptor for an importable file format.

								    Subclasses override ``name`` and ``accept_types``. ``create_file_format``
								    looks a subclass up by ``name`` and returns an instance of it.
								    """

								    name = ""
								    accept_types = ""

								    @classmethod
								    def dict(cls):
								        """Return ``name`` and ``accept_types`` of this class as a dictionary."""
								        return {field: getattr(cls, field) for field in ("name", "accept_types")}

								    def validate_mime(self, mime: str):
								        """Accept any MIME type; subclasses may override to restrict it."""
								        return True

								    @staticmethod
								    def is_plain_text():
								        """Return ``False``; plain-text formats override this to return ``True``."""
								        return False


								class CSV(Format):
								    """Format descriptor named ``"CSV"``; advertises the ``text/csv`` MIME type."""

								    accept_types = "text/csv"
								    name = "CSV"


								class FastText(Format):
								    """Format descriptor named ``"fastText"``; advertises ``text/plain``."""

								    accept_types = "text/plain"
								    name = "fastText"


								class JSON(Format):
								    """Format descriptor named ``"JSON"``; advertises ``application/json``."""

								    accept_types = "application/json"
								    name = "JSON"


								class JSONL(Format):
								    """Format descriptor named ``"JSONL"``.

								    ``accept_types`` is the wildcard ``"*"``, so no specific MIME type is
								    advertised for this format.
								    """

								    accept_types = "*"
								    name = "JSONL"


								class Excel(Format):
								    """Format descriptor named ``"Excel"``; advertises two spreadsheet MIME types."""

								    name = "Excel"
								    # One comma-separated string, split across two literals for line length.
								    accept_types = (
								        "application/vnd.ms-excel, "
								        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
								    )


								class TextFile(Format):
								    """Format descriptor named ``"TextFile"``; advertises ``text/*``.

								    Reports itself as plain text through ``is_plain_text``.
								    """

								    accept_types = "text/*"
								    name = "TextFile"

								    @staticmethod
								    def is_plain_text():
								        """Return ``True``: this format is plain text."""
								        return True


								class TextLine(Format):
								    """Format descriptor named ``"TextLine"``; advertises ``text/*``.

								    Reports itself as plain text through ``is_plain_text``.
								    """

								    accept_types = "text/*"
								    name = "TextLine"

								    @staticmethod
								    def is_plain_text():
								        """Return ``True``: this format is plain text."""
								        return True


								class CoNLL(Format):
								    """Format descriptor named ``"CoNLL"``; advertises ``text/*``.

								    It does not override ``is_plain_text``, so it inherits the base result.
								    """

								    accept_types = "text/*"
								    name = "CoNLL"


								class ImageFile(Format):
								    """Format descriptor named ``"ImageFile"`` that restricts uploads by MIME type."""

								    name = "ImageFile"
								    accept_types = "image/png, image/jpeg, image/bmp, image/gif"

								    def validate_mime(self, mime: str):
								        """Return whether ``mime`` is exactly one of the accepted MIME types.

								        Args:
								            mime: MIME type to check, e.g. ``"image/png"``.

								        Returns:
								            ``True`` when ``mime`` equals one of the comma-separated entries
								            of ``accept_types``, ``False`` otherwise.
								        """
								        # ``accept_types`` is one comma-separated string. Using ``in`` on it
								        # directly is a substring test, which would also accept "", "image"
								        # or "png, image". Compare against the individual entries instead.
								        allowed = {entry.strip() for entry in self.accept_types.split(",")}
								        return mime in allowed


								class AudioFile(Format):
								    """Format descriptor named ``"AudioFile"`` that restricts uploads by MIME type."""

								    name = "AudioFile"
								    accept_types = "audio/ogg, audio/aac, audio/mpeg, audio/wav"

								    def validate_mime(self, mime: str):
								        """Return whether ``mime`` is exactly one of the accepted MIME types.

								        Args:
								            mime: MIME type to check, e.g. ``"audio/wav"``.

								        Returns:
								            ``True`` when ``mime`` equals one of the comma-separated entries
								            of ``accept_types``, ``False`` otherwise.
								        """
								        # ``accept_types`` is one comma-separated string. Using ``in`` on it
								        # directly is a substring test, which would also accept "", "audio"
								        # or "ogg, audio". Compare against the individual entries instead.
								        allowed = {entry.strip() for entry in self.accept_types.split(",")}
								        return mime in allowed


								# Import parameters: the file encoding plus two names, `column_data`
								# (default "text") and `column_label` (default "label").
								# NOTE(review): these presumably name the column/key holding the data
								# and the one holding the label; confirm against the readers.
								# Documented with comments rather than a docstring: Option.dict() merges
								# this model's schema() into its output, and pydantic is expected to copy
								# a class docstring into that schema as "description" (verify for the
								# pinned pydantic version before adding one).
								class ArgColumn(BaseModel):

								    encoding: encodings = "utf_8"

								    column_data: str = "text"

								    column_label: str = "label"


								# ArgColumn parameters plus a `delimiter`, limited to comma (default),
								# tab, semicolon, pipe or space.
								# `encoding` is re-declared with the same type and default as in ArgColumn.
								# Kept as comments (no docstring) so the model's schema() output, which
								# Option.dict() merges into its result, is not altered.
								class ArgDelimiter(ArgColumn):

								    encoding: encodings = "utf_8"

								    delimiter: Literal[",", "\t", ";", "|", " "] = ","


								# Import parameters consisting only of the file `encoding` (default "utf_8").
								# Kept as comments (no docstring) so the model's schema() output, which
								# Option.dict() merges into its result, is not altered.
								class ArgEncoding(BaseModel):

								    encoding: encodings = "utf_8"


								# Import parameters for the CoNLL option: file `encoding`, a tagging
								# `scheme` (IOB2 by default) and a `delimiter`.
								# `delimiter` may be a single space (default) or the empty string.
								# NOTE(review): how an empty delimiter is interpreted is decided by the
								# code that consumes this model, not here; confirm there.
								# Kept as comments (no docstring) so the model's schema() output, which
								# Option.dict() merges into its result, is not altered.
								class ArgCoNLL(BaseModel):

								    encoding: encodings = "utf_8"

								    scheme: Literal["IOB2", "IOE2", "IOBES", "BILOU"] = "IOB2"

								    delimiter: Literal[" ", ""] = " "


								# Parameter model with no fields, used by options that take no arguments.
								# Kept as comments (no docstring) so the model's schema() output, which
								# Option.dict() merges into its result, is not altered.
								class ArgNone(BaseModel):

								    pass


								@dataclass
								class Option:
								    """One import option: a file format offered for a task.

								    Attributes:
								        display_name: Value reported under ``"display_name"`` by ``dict()``.
								        task_id: Key the option is stored under by ``Options.register``.
								        file_format: ``Format`` subclass providing ``name`` and ``accept_types``.
								        arg: Pydantic model class whose ``schema()`` is merged into ``dict()``.
								        file: Path of the example file returned by ``example``.
								    """

								    display_name: str
								    task_id: str
								    file_format: Type[Format]
								    arg: Type[BaseModel]
								    file: Path

								    @property
								    def example(self) -> str:
								        """Return the example file's text, decoded as UTF-8.

								        The file is read on every access; nothing is cached.
								        """
								        return Path(self.file).read_text(encoding="utf-8")

								    def dict(self) -> Dict:
								        """Serialize the option into a single flat dictionary.

								        Entries are added in this order, later ones overriding earlier ones
								        on a key clash: the format's ``dict()``, the argument model's
								        ``schema()``, then ``example``, ``task_id`` and ``display_name``.
								        """
								        merged: Dict = {}
								        merged.update(self.file_format.dict())
								        merged.update(self.arg.schema())
								        merged.update(
								            example=self.example,
								            task_id=self.task_id,
								            display_name=self.display_name,
								        )
								        return merged


								def create_file_format(file_format: str) -> Format:
								    """Instantiate the ``Format`` subclass whose ``name`` equals ``file_format``.

								    Only direct subclasses of ``Format`` are considered, and the first match
								    in ``Format.__subclasses__()`` order wins.

								    Raises:
								        FileFormatException: If no subclass has that name.
								    """
								    candidates = (klass for klass in Format.__subclasses__() if klass.name == file_format)
								    matched = next(candidates, None)
								    if matched is None:
								        raise FileFormatException(file_format)
								    return matched()


								class Options:
								    """Class-level registry of import options, grouped by task id.

								    ``options`` is a class attribute, so the registry is shared by every
								    user of the class: ``register`` appends to it and ``filter_by_task``
								    reads from it.
								    """

								    options: Dict[str, List] = defaultdict(list)

								    @classmethod
								    def filter_by_task(cls, task_name: str, use_relation: bool = False):
								        """Return the serialized options registered for ``task_name``.

								        Args:
								            task_name: Task id the options were registered under.
								            use_relation: When true, the options registered under
								                ``RELATION_EXTRACTION`` are appended after the task's own.

								        Returns:
								            A list holding ``option.dict()`` for each selected option, in
								            registration order. Empty when nothing is registered.
								        """
								        # Read with .get() instead of indexing: indexing a defaultdict
								        # inserts an empty list for every unknown key, so a plain lookup
								        # would keep growing the shared registry.
								        selected = cls.options.get(task_name, [])
								        if use_relation:
								            selected = selected + cls.options.get(RELATION_EXTRACTION, [])
								        return [option.dict() for option in selected]

								    @classmethod
								    def register(cls, option: "Option"):
								        """Append ``option`` to the list kept for ``option.task_id``."""
								        cls.options[option.task_id].append(option)


								# Text tasks: the plain-text formats are offered for every project type
								# listed here.
								text_tasks = [
								    ProjectType.DOCUMENT_CLASSIFICATION,
								    ProjectType.SEQUENCE_LABELING,
								    ProjectType.SEQ2SEQ,
								    ProjectType.INTENT_DETECTION_AND_SLOT_FILLING,
								]
								# Registration order per task: TextFile, then TextLine.
								# The inner loop names are underscore-prefixed so star-imports skip them.
								for task_id in text_tasks:
								    for _fmt, _sample in ((TextFile, "text_files.txt"), (TextLine, "text_lines.txt")):
								        Options.register(
								            Option(
								                display_name=_fmt.name,
								                task_id=task_id,
								                file_format=_fmt,
								                arg=ArgEncoding,
								                file=TASK_AGNOSTIC_DIR / _sample,
								            )
								        )


								# Text Classification: (format, argument model, example file), registered
								# in the order listed. The Excel entry uses the CSV example file.
								# The loop names are underscore-prefixed so star-imports skip them.
								for _fmt, _arg, _sample in (
								    (CSV, ArgDelimiter, "example.csv"),
								    (FastText, ArgEncoding, "example.txt"),
								    (JSON, ArgColumn, "example.json"),
								    (JSONL, ArgColumn, "example.jsonl"),
								    (Excel, ArgColumn, "example.csv"),
								):
								    Options.register(
								        Option(
								            display_name=_fmt.name,
								            task_id=ProjectType.DOCUMENT_CLASSIFICATION,
								            file_format=_fmt,
								            arg=_arg,
								            file=TEXT_CLASSIFICATION_DIR / _sample,
								        )
								    )


								# Sequence Labelling: (format, argument model, example file), registered
								# in the order listed.
								# The loop names are underscore-prefixed so star-imports skip them.
								for _fmt, _arg, _sample in (
								    (JSONL, ArgColumn, "example.jsonl"),
								    (CoNLL, ArgCoNLL, "example.txt"),
								):
								    Options.register(
								        Option(
								            display_name=_fmt.name,
								            task_id=ProjectType.SEQUENCE_LABELING,
								            file_format=_fmt,
								            arg=_arg,
								            file=SEQUENCE_LABELING_DIR / _sample,
								        )
								    )


								# Relation Extraction: stored under its own registry key.
								# Options.filter_by_task appends these options when called with
								# use_relation=True.
								Options.register(
								    Option(
								        task_id=RELATION_EXTRACTION,
								        display_name="JSONL(Relation)",
								        file_format=JSONL,
								        arg=ArgNone,
								        file=RELATION_EXTRACTION_DIR.joinpath("example.jsonl"),
								    )
								)


								# Seq2seq: (format, argument model, example file), registered in the
								# order listed. The Excel entry uses the CSV example file.
								# The loop names are underscore-prefixed so star-imports skip them.
								for _fmt, _arg, _sample in (
								    (CSV, ArgDelimiter, "example.csv"),
								    (JSON, ArgColumn, "example.json"),
								    (JSONL, ArgColumn, "example.jsonl"),
								    (Excel, ArgColumn, "example.csv"),
								):
								    Options.register(
								        Option(
								            display_name=_fmt.name,
								            task_id=ProjectType.SEQ2SEQ,
								            file_format=_fmt,
								            arg=_arg,
								            file=SEQ2SEQ_DIR / _sample,
								        )
								    )


								# Intent detection and slot filling: a single JSONL option that takes no
								# arguments.
								Options.register(
								    Option(
								        task_id=ProjectType.INTENT_DETECTION_AND_SLOT_FILLING,
								        display_name=JSONL.name,
								        file_format=JSONL,
								        arg=ArgNone,
								        file=INTENT_DETECTION_DIR.joinpath("example.jsonl"),
								    )
								)


								# Image tasks: every project type listed here gets the same ImageFile
								# option, using the example file from the image_classification directory.
								image_tasks = [
								    ProjectType.IMAGE_CLASSIFICATION,
								    ProjectType.IMAGE_CAPTIONING,
								    ProjectType.BOUNDING_BOX,
								    ProjectType.SEGMENTATION,
								]
								for task_name in image_tasks:
								    Options.register(
								        Option(
								            task_id=task_name,
								            display_name=ImageFile.name,
								            file_format=ImageFile,
								            arg=ArgNone,
								            file=IMAGE_CLASSIFICATION_DIR.joinpath("image_files.txt"),
								        )
								    )


								# Speech to Text: a single AudioFile option that takes no arguments.
								Options.register(
								    Option(
								        task_id=ProjectType.SPEECH2TEXT,
								        display_name=AudioFile.name,
								        file_format=AudioFile,
								        arg=ArgNone,
								        file=SPEECH_TO_TEXT_DIR.joinpath("audio_files.txt"),
								    )
								)