You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

108 lines
3.2 KiB

from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Type
from pydantic import BaseModel
from typing_extensions import Literal
from projects.models import (
DOCUMENT_CLASSIFICATION,
IMAGE_CLASSIFICATION,
INTENT_DETECTION_AND_SLOT_FILLING,
SEQ2SEQ,
SEQUENCE_LABELING,
SPEECH2TEXT,
)
EXAMPLE_DIR = Path(__file__).parent.resolve() / "examples"
class Format:
name = ""
@classmethod
def dict(cls):
return {
"name": cls.name,
}
class CSV(Format):
name = "CSV"
extension = "csv"
class FastText(Format):
name = "fastText"
extension = "txt"
class JSON(Format):
name = "JSON"
extension = "json"
class JSONL(Format):
name = "JSONL"
extension = "jsonl"
class OptionDelimiter(BaseModel):
delimiter: Literal[",", "\t", ";", "|", " "] = ","
class OptionNone(BaseModel):
pass
class Options:
options: Dict[str, List] = defaultdict(list)
@classmethod
def filter_by_task(cls, task_name: str):
options = cls.options[task_name]
return [
{**file_format.dict(), **option.schema(), "example": example} for file_format, option, example in options
]
@classmethod
def register(cls, task: str, file_format: Type[Format], option: Type[BaseModel], file: Path):
example = cls.load_example(file)
cls.options[task].append((file_format, option, example))
@staticmethod
def load_example(file):
with open(file, encoding="utf-8") as f:
return f.read()
# Text Classification
TEXT_CLASSIFICATION_DIR = EXAMPLE_DIR / "text_classification"
Options.register(DOCUMENT_CLASSIFICATION, CSV, OptionDelimiter, TEXT_CLASSIFICATION_DIR / "example.csv")
Options.register(DOCUMENT_CLASSIFICATION, FastText, OptionNone, TEXT_CLASSIFICATION_DIR / "example.txt")
Options.register(DOCUMENT_CLASSIFICATION, JSON, OptionNone, TEXT_CLASSIFICATION_DIR / "example.json")
Options.register(DOCUMENT_CLASSIFICATION, JSONL, OptionNone, TEXT_CLASSIFICATION_DIR / "example.jsonl")
# Sequence Labeling
SEQUENCE_LABELING_DIR = EXAMPLE_DIR / "sequence_labeling"
RELATION_EXTRACTION_DIR = EXAMPLE_DIR / "relation_extraction"
Options.register(SEQUENCE_LABELING, JSONL, OptionNone, SEQUENCE_LABELING_DIR / "example.jsonl")
Options.register(SEQUENCE_LABELING, JSONL, OptionNone, RELATION_EXTRACTION_DIR / "example.jsonl")
# Sequence to sequence
SEQ2SEQ_DIR = EXAMPLE_DIR / "sequence_to_sequence"
Options.register(SEQ2SEQ, CSV, OptionDelimiter, SEQ2SEQ_DIR / "example.csv")
Options.register(SEQ2SEQ, JSON, OptionNone, SEQ2SEQ_DIR / "example.json")
Options.register(SEQ2SEQ, JSONL, OptionNone, SEQ2SEQ_DIR / "example.jsonl")
# Intent detection and slot filling
INTENT_DETECTION_DIR = EXAMPLE_DIR / "intent_detection"
Options.register(INTENT_DETECTION_AND_SLOT_FILLING, JSONL, OptionNone, INTENT_DETECTION_DIR / "example.jsonl")
# Image Classification
IMAGE_CLASSIFICATION_DIR = EXAMPLE_DIR / "image_classification"
Options.register(IMAGE_CLASSIFICATION, JSONL, OptionNone, IMAGE_CLASSIFICATION_DIR / "example.jsonl")
# Speech to Text
SPEECH2TEXT_DIR = EXAMPLE_DIR / "speech_to_text"
Options.register(SPEECH2TEXT, JSONL, OptionNone, SPEECH2TEXT_DIR / "example.jsonl")