mirror of https://github.com/doccano/doccano.git
python annotation-tool datasets active-learning text-annotation dataset natural-language-processing data-labeling machine-learning
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
249 lines
5.8 KiB
249 lines
5.8 KiB
from collections import defaultdict
|
|
from typing import Dict, List, Type
|
|
|
|
from pydantic import BaseModel
|
|
from typing_extensions import Literal
|
|
|
|
from . import examples
|
|
from projects.models import (
|
|
DOCUMENT_CLASSIFICATION,
|
|
IMAGE_CLASSIFICATION,
|
|
INTENT_DETECTION_AND_SLOT_FILLING,
|
|
SEQ2SEQ,
|
|
SEQUENCE_LABELING,
|
|
SPEECH2TEXT,
|
|
)
|
|
|
|
# Closed set of values accepted by the ``encoding`` field of the option models.
# "Auto" comes first; every other entry is spelled like a Python codec name.
# NOTE(review): presumably "Auto" asks the consumer to detect the encoding --
# confirm against the code that reads this option.
encodings = Literal[
    "Auto",
    "ascii",
    # Big5 family
    "big5", "big5hkscs",
    # cp* code pages
    "cp037", "cp273", "cp424", "cp437", "cp500",
    "cp720", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857", "cp858",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875",
    "cp932", "cp949", "cp950",
    "cp1006", "cp1026", "cp1125", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    "cp65001",
    # EUC / GB / HZ
    "euc_jp", "euc_jis_2004", "euc_jisx0213", "euc_kr",
    "gb2312", "gbk", "gb18030",
    "hz",
    # ISO-2022
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    # ISO-8859 (latin_1 is the codec name for ISO-8859-1)
    "latin_1",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5", "iso8859_6",
    "iso8859_7", "iso8859_8", "iso8859_9", "iso8859_10", "iso8859_11",
    "iso8859_13", "iso8859_14", "iso8859_15", "iso8859_16",
    # Johab / KOI8 / KZ-1048
    "johab",
    "koi8_r", "koi8_t", "koi8_u",
    "kz1048",
    # mac_* code pages
    "mac_cyrillic", "mac_greek", "mac_iceland",
    "mac_latin2", "mac_roman", "mac_turkish",
    "ptcp154",
    # Shift JIS family
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    # Unicode transformation formats
    "utf_32", "utf_32_be", "utf_32_le",
    "utf_16", "utf_16_be", "utf_16_le",
    "utf_7",
    "utf_8", "utf_8_sig",
]
|
|
|
|
|
|
class Format:
    """Base descriptor for an upload file format.

    Subclasses override the two class attributes below. The visible code
    passes the subclasses around as classes and reads them through the
    ``dict`` classmethod.
    """

    # Display name of the format.
    name = ""
    # Accepted content types, written as a single string.
    accept_types = ""

    @classmethod
    def dict(cls):
        """Return the format as ``{"name": ..., "accept_types": ...}``."""
        return {key: getattr(cls, key) for key in ("name", "accept_types")}
|
|
|
|
|
|
class CSV(Format):
    """CSV upload format; accepts ``text/csv``."""

    accept_types = "text/csv"
    name = "CSV"
|
|
|
|
|
|
class FastText(Format):
    """fastText upload format; accepts ``text/plain``."""

    accept_types = "text/plain"
    name = "fastText"
|
|
|
|
|
|
class JSON(Format):
    """JSON upload format; accepts ``application/json``."""

    accept_types = "application/json"
    name = "JSON"
|
|
|
|
|
|
class JSONL(Format):
    """JSONL upload format; the accepted type is the wildcard ``*``."""

    accept_types = "*"
    name = "JSONL"
|
|
|
|
|
|
class Excel(Format):
    """Excel upload format; accepts both content types listed below."""

    accept_types = "application/vnd.ms-excel, application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    name = "Excel"
|
|
|
|
|
|
class TextFile(Format):
    """TextFile upload format; accepts ``text/*``."""

    accept_types = "text/*"
    name = "TextFile"
|
|
|
|
|
|
class TextLine(Format):
    """TextLine upload format; accepts ``text/*``."""

    accept_types = "text/*"
    name = "TextLine"
|
|
|
|
|
|
class CoNLL(Format):
    """CoNLL upload format; accepts ``text/*``."""

    accept_types = "text/*"
    name = "CoNLL"
|
|
|
|
|
|
class ImageFile(Format):
    """Image upload format; accepts PNG, JPEG, BMP and GIF content types."""

    accept_types = "image/png, image/jpeg, image/bmp, image/gif"
    name = "ImageFile"
|
|
|
|
|
|
class AudioFile(Format):
    """Audio upload format; accepts OGG, AAC, MPEG and WAV content types."""

    accept_types = "audio/ogg, audio/aac, audio/mpeg, audio/wav"
    name = "AudioFile"
|
|
|
|
|
|
# Option model with an encoding plus the names of the data and label columns.
# Documented with comments rather than a class docstring so that the schema
# generated from the model is left unchanged.
class OptionColumn(BaseModel):
    # File encoding; must be one of the `encodings` values.
    encoding: encodings = "utf_8"
    # Name of the column holding the data; defaults to "text".
    column_data: str = "text"
    # Name of the column holding the label; defaults to "label".
    column_label: str = "label"
|
|
|
|
|
|
# Column options extended with a field delimiter.
# Documented with comments rather than a class docstring so that the schema
# generated from the model is left unchanged.
class OptionDelimiter(OptionColumn):
    # Re-declared with the same type and default as on OptionColumn.
    encoding: encodings = "utf_8"
    # Field separator: comma (default), tab, semicolon, pipe or space.
    delimiter: Literal[",", "\t", ";", "|", " "] = ","
|
|
|
|
|
|
# Option model whose only setting is the file encoding.
# Documented with comments rather than a class docstring so that the schema
# generated from the model is left unchanged.
class OptionEncoding(BaseModel):
    # File encoding; must be one of the `encodings` values.
    encoding: encodings = "utf_8"
|
|
|
|
|
|
# Option model for CoNLL input: encoding, tagging scheme and token delimiter.
# Documented with comments rather than a class docstring so that the schema
# generated from the model is left unchanged.
class OptionCoNLL(BaseModel):
    # File encoding; must be one of the `encodings` values.
    encoding: encodings = "utf_8"
    # Tagging scheme name; defaults to "IOB2".
    scheme: Literal["IOB2", "IOE2", "IOBES", "BILOU"] = "IOB2"
    # Either a single space (default) or the empty string.
    delimiter: Literal[" ", ""] = " "
|
|
|
|
|
|
# Option model with no fields, registered for formats that take no settings.
# Documented with comments rather than a class docstring so that the schema
# generated from the model is left unchanged.
class OptionNone(BaseModel):
    pass
|
|
|
|
|
|
class Options:
    """Class-level registry mapping a task name to its upload options.

    Each registered entry is a ``(format, option, example)`` tuple. The
    registry lives on the class, so it is shared by every caller.
    """

    # task name -> list of (format class, option model class, example text)
    options: Dict[str, List] = defaultdict(list)

    @classmethod
    def filter_by_task(cls, task_name: str):
        """Return the options registered for ``task_name``.

        Args:
            task_name: Task identifier previously passed to ``register``.

        Returns:
            A list with one dict per registered entry, merging
            ``format.dict()``, ``option.schema()`` and an ``"example"`` key,
            in registration order. An unknown task yields an empty list.
        """
        # Use .get() rather than indexing: indexing a defaultdict inserts an
        # empty list for every unknown task name, growing the shared registry
        # as a side effect of a read.
        registered = cls.options.get(task_name, [])
        return [
            {**fmt.dict(), **option.schema(), "example": example}
            for fmt, option, example in registered
        ]

    @classmethod
    def register(cls, task: str, format: Type[Format], option: Type[BaseModel], example: str):
        """Append a ``(format, option, example)`` entry to ``task``'s list.

        Args:
            task: Task identifier the entry belongs to.
            format: Format class describing the file type.
            option: Option model class describing the settings.
            example: Example content stored alongside the entry.
        """
        cls.options[task].append((format, option, example))
|
|
|
|
|
|
# Registration table: one row per (task, format, option model, example).
# Rows are registered top to bottom, so each task keeps the order listed here.
_REGISTRATIONS = (
    # Text Classification
    (DOCUMENT_CLASSIFICATION, TextFile, OptionEncoding, examples.Generic_TextFile),
    (DOCUMENT_CLASSIFICATION, TextLine, OptionEncoding, examples.Generic_TextLine),
    (DOCUMENT_CLASSIFICATION, CSV, OptionDelimiter, examples.Category_CSV),
    (DOCUMENT_CLASSIFICATION, FastText, OptionEncoding, examples.Category_fastText),
    (DOCUMENT_CLASSIFICATION, JSON, OptionColumn, examples.Category_JSON),
    (DOCUMENT_CLASSIFICATION, JSONL, OptionColumn, examples.Category_JSONL),
    (DOCUMENT_CLASSIFICATION, Excel, OptionColumn, examples.Category_CSV),
    # Sequence Labeling
    (SEQUENCE_LABELING, TextFile, OptionEncoding, examples.Generic_TextFile),
    (SEQUENCE_LABELING, TextLine, OptionEncoding, examples.Generic_TextLine),
    (SEQUENCE_LABELING, JSONL, OptionColumn, examples.Offset_JSONL),
    (SEQUENCE_LABELING, CoNLL, OptionCoNLL, examples.Offset_CoNLL),
    # Sequence to sequence
    (SEQ2SEQ, TextFile, OptionEncoding, examples.Generic_TextFile),
    (SEQ2SEQ, TextLine, OptionEncoding, examples.Generic_TextLine),
    (SEQ2SEQ, CSV, OptionDelimiter, examples.Text_CSV),
    (SEQ2SEQ, JSON, OptionColumn, examples.Text_JSON),
    (SEQ2SEQ, JSONL, OptionColumn, examples.Text_JSONL),
    (SEQ2SEQ, Excel, OptionColumn, examples.Text_CSV),
    # Intent detection and slot filling
    (INTENT_DETECTION_AND_SLOT_FILLING, TextFile, OptionEncoding, examples.Generic_TextFile),
    (INTENT_DETECTION_AND_SLOT_FILLING, TextLine, OptionEncoding, examples.Generic_TextLine),
    (INTENT_DETECTION_AND_SLOT_FILLING, JSONL, OptionNone, examples.IDSF_JSONL),
    # Image classification
    (IMAGE_CLASSIFICATION, ImageFile, OptionNone, examples.Generic_ImageFile),
    # Speech to Text
    (SPEECH2TEXT, AudioFile, OptionNone, examples.Generic_AudioFile),
)

for _row in _REGISTRATIONS:
    Options.register(*_row)
|