# doccano/backend/data_export/pipeline/catalog.py

from collections import defaultdict
from typing import Dict, List, Type

from pydantic import BaseModel
from typing_extensions import Literal

from . import examples
from projects.models import (
    DOCUMENT_CLASSIFICATION,
    IMAGE_CLASSIFICATION,
    INTENT_DETECTION_AND_SLOT_FILLING,
    SEQ2SEQ,
    SEQUENCE_LABELING,
    SPEECH2TEXT,
)


class Format:
    """Base descriptor for an export format.

    Subclasses override ``name`` with the label that identifies the format.
    """

    name = ""

    @classmethod
    def dict(cls):
        """Return the format's metadata as a plain dictionary.

        Returns:
            A new dict with a single ``"name"`` key holding ``cls.name``.
        """
        metadata = {}
        metadata["name"] = cls.name
        return metadata


class CSV(Format):
    """CSV format descriptor."""

    # File extension associated with the format (not read within this module).
    extension = "csv"
    # Label reported by ``Format.dict()``.
    name = "CSV"


class FastText(Format):
    """fastText format descriptor; associated with the ``txt`` extension."""

    # File extension associated with the format (not read within this module).
    extension = "txt"
    # Label reported by ``Format.dict()``.
    name = "fastText"


class JSON(Format):
    """JSON format descriptor."""

    # File extension associated with the format (not read within this module).
    extension = "json"
    # Label reported by ``Format.dict()``.
    name = "JSON"


class JSONL(Format):
    """JSONL format descriptor."""

    # File extension associated with the format (not read within this module).
    extension = "jsonl"
    # Label reported by ``Format.dict()``.
    name = "JSONL"


class IntentAndSlot(Format):
    """JSONL format descriptor labelled for intent and slot data.

    Shares the ``jsonl`` extension with ``JSONL`` but reports a distinct name.
    """

    # File extension associated with the format (not read within this module).
    extension = "jsonl"
    # Label reported by ``Format.dict()``.
    name = "JSONL(intent and slot)"


# Option model with a single `delimiter` field. The field accepts only a
# comma, tab, semicolon, pipe, or space and defaults to a comma. In this module
# it is registered alongside the CSV format only.
#
# NOTE: documented with `#` comments rather than a docstring on purpose.
# Options.filter_by_task merges this model's schema() into its result, and
# pydantic is understood to copy a model docstring into that schema as
# "description" -- confirm against the pinned pydantic version before adding one.
class OptionDelimiter(BaseModel):
    delimiter: Literal[",", "\t", ";", "|", " "] = ","


# Option model that declares no fields; registered for formats that take no
# export options.
#
# NOTE: documented with `#` comments rather than a docstring on purpose.
# Options.filter_by_task merges this model's schema() into its result, and
# pydantic is understood to copy a model docstring into that schema as
# "description" -- confirm against the pinned pydantic version before adding one.
class OptionNone(BaseModel):
    pass


class Options:
    """Registry of the export choices available for each task.

    Entries are stored on the class itself, so every caller shares one
    registry that is filled via ``register`` and read via ``filter_by_task``.
    """

    # Shared class-level registry: task name -> list of
    # (format, option, example) tuples in registration order.
    options: Dict[str, List] = defaultdict(list)

    @classmethod
    def filter_by_task(cls, task_name: str):
        """Return the export choices registered for ``task_name``.

        Args:
            task_name: Task identifier that was used as the key in ``register``.

        Returns:
            A list with one dict per registered entry, in registration order.
            Each dict merges ``format.dict()``, ``option.schema()`` and an
            ``"example"`` key; on a key collision the later source wins.
            An empty list is returned when nothing is registered for the task.
        """
        # Look up with .get() instead of indexing: ``options`` is a
        # defaultdict, so indexing an unknown task would insert an empty entry
        # into the shared registry on every such read.
        registered = cls.options.get(task_name, [])
        return [
            {**fmt.dict(), **option.schema(), "example": example}
            for fmt, option, example in registered
        ]

    @classmethod
    def register(cls, task: str, format: "Type[Format]", option: "Type[BaseModel]", example: str):
        """Add an export choice for ``task``.

        Args:
            task: Task identifier to file the entry under.
            format: Format class; its ``dict()`` is called by ``filter_by_task``.
            option: Option model class; its ``schema()`` is called by
                ``filter_by_task``.
            example: Example value exposed under the ``"example"`` key.
        """
        # Indexing is intentional here: the defaultdict creates the list for a
        # task the first time something is registered under it.
        cls.options[task].append((format, option, example))


# Built-in export catalog. Each row is (task, format, option model, example);
# rows are registered in the order listed.
for _task, _format, _option, _example in (
    # Text Classification
    (DOCUMENT_CLASSIFICATION, CSV, OptionDelimiter, examples.Category_CSV),
    (DOCUMENT_CLASSIFICATION, FastText, OptionNone, examples.Category_fastText),
    (DOCUMENT_CLASSIFICATION, JSON, OptionNone, examples.Category_JSON),
    (DOCUMENT_CLASSIFICATION, JSONL, OptionNone, examples.Category_JSONL),
    # Sequence Labeling
    (SEQUENCE_LABELING, JSONL, OptionNone, examples.Offset_JSONL),
    # Sequence to sequence
    (SEQ2SEQ, CSV, OptionDelimiter, examples.Text_CSV),
    (SEQ2SEQ, JSON, OptionNone, examples.Text_JSON),
    (SEQ2SEQ, JSONL, OptionNone, examples.Text_JSONL),
    # Intent detection and slot filling
    (INTENT_DETECTION_AND_SLOT_FILLING, IntentAndSlot, OptionNone, examples.INTENT_JSONL),
    # Image Classification
    (IMAGE_CLASSIFICATION, JSONL, OptionNone, examples.CategoryImageClassification),
    # Speech to Text
    (SPEECH2TEXT, JSONL, OptionNone, examples.Speech2Text),
):
    Options.register(_task, _format, _option, _example)
# Drop the loop variables so the module namespace is left as it was.
del _task, _format, _option, _example