You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

249 lines
5.8 KiB

from collections import defaultdict
from typing import Dict, List, Type
from pydantic import BaseModel
from typing_extensions import Literal
from . import examples
from projects.models import (
DOCUMENT_CLASSIFICATION,
IMAGE_CLASSIFICATION,
INTENT_DETECTION_AND_SLOT_FILLING,
SEQ2SEQ,
SEQUENCE_LABELING,
SPEECH2TEXT,
)
encodings = Literal[
"Auto",
"ascii",
"big5",
"big5hkscs",
"cp037",
"cp273",
"cp424",
"cp437",
"cp500",
"cp720",
"cp737",
"cp775",
"cp850",
"cp852",
"cp855",
"cp856",
"cp857",
"cp858",
"cp860",
"cp861",
"cp862",
"cp863",
"cp864",
"cp865",
"cp866",
"cp869",
"cp874",
"cp875",
"cp932",
"cp949",
"cp950",
"cp1006",
"cp1026",
"cp1125",
"cp1140",
"cp1250",
"cp1251",
"cp1252",
"cp1253",
"cp1254",
"cp1255",
"cp1256",
"cp1257",
"cp1258",
"cp65001",
"euc_jp",
"euc_jis_2004",
"euc_jisx0213",
"euc_kr",
"gb2312",
"gbk",
"gb18030",
"hz",
"iso2022_jp",
"iso2022_jp_1",
"iso2022_jp_2",
"iso2022_jp_2004",
"iso2022_jp_3",
"iso2022_jp_ext",
"iso2022_kr",
"latin_1",
"iso8859_2",
"iso8859_3",
"iso8859_4",
"iso8859_5",
"iso8859_6",
"iso8859_7",
"iso8859_8",
"iso8859_9",
"iso8859_10",
"iso8859_11",
"iso8859_13",
"iso8859_14",
"iso8859_15",
"iso8859_16",
"johab",
"koi8_r",
"koi8_t",
"koi8_u",
"kz1048",
"mac_cyrillic",
"mac_greek",
"mac_iceland",
"mac_latin2",
"mac_roman",
"mac_turkish",
"ptcp154",
"shift_jis",
"shift_jis_2004",
"shift_jisx0213",
"utf_32",
"utf_32_be",
"utf_32_le",
"utf_16",
"utf_16_be",
"utf_16_le",
"utf_7",
"utf_8",
"utf_8_sig",
]
class Format:
name = ""
accept_types = ""
@classmethod
def dict(cls):
return {"name": cls.name, "accept_types": cls.accept_types}
class CSV(Format):
name = "CSV"
accept_types = "text/csv"
class FastText(Format):
name = "fastText"
accept_types = "text/plain"
class JSON(Format):
name = "JSON"
accept_types = "application/json"
class JSONL(Format):
name = "JSONL"
accept_types = "*"
class Excel(Format):
name = "Excel"
accept_types = "application/vnd.ms-excel, application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
class TextFile(Format):
name = "TextFile"
accept_types = "text/*"
class TextLine(Format):
name = "TextLine"
accept_types = "text/*"
class CoNLL(Format):
name = "CoNLL"
accept_types = "text/*"
class ImageFile(Format):
name = "ImageFile"
accept_types = "image/png, image/jpeg, image/bmp, image/gif"
class AudioFile(Format):
name = "AudioFile"
accept_types = "audio/ogg, audio/aac, audio/mpeg, audio/wav"
class OptionColumn(BaseModel):
encoding: encodings = "utf_8"
column_data: str = "text"
column_label: str = "label"
class OptionDelimiter(OptionColumn):
encoding: encodings = "utf_8"
delimiter: Literal[",", "\t", ";", "|", " "] = ","
class OptionEncoding(BaseModel):
encoding: encodings = "utf_8"
class OptionCoNLL(BaseModel):
encoding: encodings = "utf_8"
scheme: Literal["IOB2", "IOE2", "IOBES", "BILOU"] = "IOB2"
delimiter: Literal[" ", ""] = " "
class OptionNone(BaseModel):
pass
class Options:
options: Dict[str, List] = defaultdict(list)
@classmethod
def filter_by_task(cls, task_name: str):
options = cls.options[task_name]
return [{**format.dict(), **option.schema(), "example": example} for format, option, example in options]
@classmethod
def register(cls, task: str, format: Type[Format], option: Type[BaseModel], example: str):
cls.options[task].append((format, option, example))
# Text Classification
Options.register(DOCUMENT_CLASSIFICATION, TextFile, OptionEncoding, examples.Generic_TextFile)
Options.register(DOCUMENT_CLASSIFICATION, TextLine, OptionEncoding, examples.Generic_TextLine)
Options.register(DOCUMENT_CLASSIFICATION, CSV, OptionDelimiter, examples.Category_CSV)
Options.register(DOCUMENT_CLASSIFICATION, FastText, OptionEncoding, examples.Category_fastText)
Options.register(DOCUMENT_CLASSIFICATION, JSON, OptionColumn, examples.Category_JSON)
Options.register(DOCUMENT_CLASSIFICATION, JSONL, OptionColumn, examples.Category_JSONL)
Options.register(DOCUMENT_CLASSIFICATION, Excel, OptionColumn, examples.Category_CSV)
# Sequence Labeling
Options.register(SEQUENCE_LABELING, TextFile, OptionEncoding, examples.Generic_TextFile)
Options.register(SEQUENCE_LABELING, TextLine, OptionEncoding, examples.Generic_TextLine)
Options.register(SEQUENCE_LABELING, JSONL, OptionColumn, examples.Offset_JSONL)
Options.register(SEQUENCE_LABELING, CoNLL, OptionCoNLL, examples.Offset_CoNLL)
# Sequence to sequence
Options.register(SEQ2SEQ, TextFile, OptionEncoding, examples.Generic_TextFile)
Options.register(SEQ2SEQ, TextLine, OptionEncoding, examples.Generic_TextLine)
Options.register(SEQ2SEQ, CSV, OptionDelimiter, examples.Text_CSV)
Options.register(SEQ2SEQ, JSON, OptionColumn, examples.Text_JSON)
Options.register(SEQ2SEQ, JSONL, OptionColumn, examples.Text_JSONL)
Options.register(SEQ2SEQ, Excel, OptionColumn, examples.Text_CSV)
# Intent detection and slof filling
Options.register(INTENT_DETECTION_AND_SLOT_FILLING, TextFile, OptionEncoding, examples.Generic_TextFile)
Options.register(INTENT_DETECTION_AND_SLOT_FILLING, TextLine, OptionEncoding, examples.Generic_TextLine)
Options.register(INTENT_DETECTION_AND_SLOT_FILLING, JSONL, OptionNone, examples.IDSF_JSONL)
# Image classification
Options.register(IMAGE_CLASSIFICATION, ImageFile, OptionNone, examples.Generic_ImageFile)
# Speech to Text
Options.register(SPEECH2TEXT, AudioFile, OptionNone, examples.Generic_AudioFile)