"""Catalogue of file formats, their options and examples, grouped by task."""
from collections import defaultdict
from typing import Dict, List, Type

from pydantic import BaseModel
from typing_extensions import Literal

from ...models import (DOCUMENT_CLASSIFICATION, IMAGE_CLASSIFICATION, SEQ2SEQ,
                       SEQUENCE_LABELING, SPEECH2TEXT)
from . import examples


class Format:
    """Base class for a file format; subclasses set ``name`` (and ``extension``)."""

    name = ''

    @classmethod
    def dict(cls) -> Dict[str, str]:
        """Return the format description as a plain dict (only ``name``)."""
        return {
            'name': cls.name,
        }


class CSV(Format):
    name = 'CSV'
    extension = 'csv'


class FastText(Format):
    name = 'fastText'
    extension = 'txt'


class JSON(Format):
    name = 'JSON'
    extension = 'json'


class JSONL(Format):
    name = 'JSONL'
    extension = 'jsonl'


class OptionDelimiter(BaseModel):
    """Option model with a single ``delimiter`` field, defaulting to a comma."""

    delimiter: Literal[',', '\t', ';', '|', ' '] = ','


class OptionNone(BaseModel):
    """Option model for formats that take no options."""


class Options:
    """Registry of ``(format, option, example)`` tuples keyed by task name."""

    # Shared, class-level registry: every ``register`` call appends here.
    options: Dict[str, List] = defaultdict(list)

    @classmethod
    def filter_by_task(cls, task_name: str) -> List[Dict]:
        """Return one dict per entry registered for ``task_name``.

        Each dict merges ``format.dict()``, ``option.schema()`` and an
        ``'example'`` key, in that order, so later keys win on a collision.
        An unknown task yields an empty list.
        """
        # Use .get() rather than indexing: indexing a defaultdict would insert
        # an empty entry for every unknown task name that is merely queried.
        registered = cls.options.get(task_name, [])
        return [
            {
                **fmt.dict(),
                **option.schema(),
                'example': example
            }
            for fmt, option, example in registered
        ]

    @classmethod
    def register(cls,
                 task: str,
                 format: Type[Format],
                 option: Type[BaseModel],
                 example: str):
        """Append a ``(format, option, example)`` tuple to the entries of ``task``.

        The ``format`` parameter name shadows the builtin but is kept so that
        existing keyword callers continue to work.
        """
        cls.options[task].append((format, option, example))


# Text Classification
Options.register(DOCUMENT_CLASSIFICATION, CSV, OptionDelimiter, examples.Category_CSV)
Options.register(DOCUMENT_CLASSIFICATION, FastText, OptionNone, examples.Category_fastText)
Options.register(DOCUMENT_CLASSIFICATION, JSON, OptionNone, examples.Category_JSON)
Options.register(DOCUMENT_CLASSIFICATION, JSONL, OptionNone, examples.Category_JSONL)

# Sequence Labeling
Options.register(SEQUENCE_LABELING, JSONL, OptionNone, examples.Offset_JSONL)

# Sequence to sequence
Options.register(SEQ2SEQ, CSV, OptionDelimiter, examples.Text_CSV)
Options.register(SEQ2SEQ, JSON, OptionNone, examples.Text_JSON)
Options.register(SEQ2SEQ, JSONL, OptionNone, examples.Text_JSONL)

# Image Classification
Options.register(IMAGE_CLASSIFICATION, JSONL, OptionNone, examples.CategoryImageClassification)

# Speech to Text
Options.register(SPEECH2TEXT, JSONL, OptionNone, examples.Speech2Text)