mirror of https://github.com/doccano/doccano.git
pythondatasetsactive-learningtext-annotationdatasetnatural-language-processingdata-labelingmachine-learningannotation-tool
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
107 lines
3.3 KiB
107 lines
3.3 KiB
from collections import defaultdict
|
|
from pathlib import Path
|
|
from typing import Dict, List, Type
|
|
|
|
from projects.models import (
|
|
BOUNDING_BOX,
|
|
DOCUMENT_CLASSIFICATION,
|
|
IMAGE_CAPTIONING,
|
|
IMAGE_CLASSIFICATION,
|
|
INTENT_DETECTION_AND_SLOT_FILLING,
|
|
SEGMENTATION,
|
|
SEQ2SEQ,
|
|
SEQUENCE_LABELING,
|
|
SPEECH2TEXT,
|
|
)
|
|
|
|
# Absolute path of the "examples" directory located next to this module; the
# per-task example directories used further down are built from it.
EXAMPLE_DIR = Path(__file__).parent.resolve() / "examples"
|
|
|
|
|
|
class Format:
    """Base class for a file format; subclasses override ``name``."""

    # Display name of the format; empty on the base class.
    name = ""

    @classmethod
    def dict(cls):
        """Return the format as a plain dict with a single ``"name"`` key."""
        description = {}
        description["name"] = cls.name
        return description
|
|
|
|
|
|
class CSV(Format):
    """Format entry whose name is "CSV"."""

    name = "CSV"
|
|
|
|
|
|
class FastText(Format):
    """Format entry whose name is "fastText"."""

    name = "fastText"
|
|
|
|
|
|
class JSON(Format):
    """Format entry whose name is "JSON"."""

    name = "JSON"
|
|
|
|
|
|
class JSONL(Format):
    """Format entry whose name is "JSONL"."""

    name = "JSONL"
|
|
|
|
|
|
class Options:
    """Registry of the file formats, with example content, registered per task.

    Entries live in the class-level ``options`` mapping, keyed by task name.
    Each entry is a ``(file_format, example, use_relation)`` tuple appended by
    ``register``.
    """

    # Kept as a defaultdict so ``register`` can append to a task that has not
    # been seen before without an explicit existence check.
    options: Dict[str, List] = defaultdict(list)

    @classmethod
    def filter_by_task(cls, task_name: str, use_relation: bool = False):
        """Return the formats registered for ``task_name`` as a list of dicts.

        Args:
            task_name: Task whose registered formats are wanted.
            use_relation: Only entries registered with an equal flag are kept.

        Returns:
            One dict per matching entry, in registration order: the format's
            ``dict()`` merged with an ``"example"`` key holding the example
            text. An empty list when nothing is registered for the task.
        """
        # Look up with .get() instead of indexing: indexing a defaultdict
        # inserts an empty list for every unknown key, so a read-only query
        # would otherwise grow the shared registry.
        registered = cls.options.get(task_name, [])
        return [
            {**file_format.dict(), "example": example}
            for file_format, example, use_rel in registered
            if use_rel == use_relation
        ]

    @classmethod
    def register(cls, task: str, file_format: Type["Format"], file: Path, use_relation: bool = False):
        """Read the example at ``file`` and record ``file_format`` for ``task``.

        Args:
            task: Task name the format is registered under.
            file_format: Format class; its ``dict()`` is used by ``filter_by_task``.
            file: Path of the example file, read immediately as UTF-8 text.
            use_relation: Flag stored with the entry and matched by ``filter_by_task``.

        Raises:
            OSError: If the example file cannot be opened.
        """
        example = cls.load_example(file)
        cls.options[task].append((file_format, example, use_relation))

    @staticmethod
    def load_example(file):
        """Return the whole content of ``file`` decoded as UTF-8 text."""
        with open(file, encoding="utf-8") as f:
            return f.read()
|
|
|
|
|
|
# Text Classification: CSV, fastText, JSON and JSONL examples.
TEXT_CLASSIFICATION_DIR = EXAMPLE_DIR / "text_classification"
Options.register(
    task=DOCUMENT_CLASSIFICATION,
    file_format=CSV,
    file=TEXT_CLASSIFICATION_DIR / "example.csv",
)
Options.register(
    task=DOCUMENT_CLASSIFICATION,
    file_format=FastText,
    file=TEXT_CLASSIFICATION_DIR / "example.txt",
)
Options.register(
    task=DOCUMENT_CLASSIFICATION,
    file_format=JSON,
    file=TEXT_CLASSIFICATION_DIR / "example.json",
)
Options.register(
    task=DOCUMENT_CLASSIFICATION,
    file_format=JSONL,
    file=TEXT_CLASSIFICATION_DIR / "example.jsonl",
)
|
|
|
|
# Sequence Labeling: one JSONL example without relations and one registered
# with use_relation=True, read from the relation extraction directory.
SEQUENCE_LABELING_DIR = EXAMPLE_DIR / "sequence_labeling"
RELATION_EXTRACTION_DIR = EXAMPLE_DIR / "relation_extraction"
Options.register(
    task=SEQUENCE_LABELING,
    file_format=JSONL,
    file=SEQUENCE_LABELING_DIR / "example.jsonl",
)
Options.register(
    task=SEQUENCE_LABELING,
    file_format=JSONL,
    file=RELATION_EXTRACTION_DIR / "example.jsonl",
    use_relation=True,
)
|
|
|
|
# Sequence to sequence: CSV, JSON and JSONL examples.
SEQ2SEQ_DIR = EXAMPLE_DIR / "sequence_to_sequence"
Options.register(
    task=SEQ2SEQ,
    file_format=CSV,
    file=SEQ2SEQ_DIR / "example.csv",
)
Options.register(
    task=SEQ2SEQ,
    file_format=JSON,
    file=SEQ2SEQ_DIR / "example.json",
)
Options.register(
    task=SEQ2SEQ,
    file_format=JSONL,
    file=SEQ2SEQ_DIR / "example.jsonl",
)
|
|
|
|
# Intent detection and slot filling: JSONL example only.
INTENT_DETECTION_DIR = EXAMPLE_DIR / "intent_detection"
Options.register(
    task=INTENT_DETECTION_AND_SLOT_FILLING,
    file_format=JSONL,
    file=INTENT_DETECTION_DIR / "example.jsonl",
)
|
|
|
|
# Image Classification: JSONL example only.
IMAGE_CLASSIFICATION_DIR = EXAMPLE_DIR / "image_classification"
Options.register(
    task=IMAGE_CLASSIFICATION,
    file_format=JSONL,
    file=IMAGE_CLASSIFICATION_DIR / "example.jsonl",
)
|
|
|
|
# Bounding Box: JSONL example only.
BOUNDING_BOX_DIR = EXAMPLE_DIR / "bounding_box"
Options.register(
    task=BOUNDING_BOX,
    file_format=JSONL,
    file=BOUNDING_BOX_DIR / "example.jsonl",
)
|
|
|
|
# Segmentation: JSONL example only.
SEGMENTATION_DIR = EXAMPLE_DIR / "segmentation"
Options.register(
    task=SEGMENTATION,
    file_format=JSONL,
    file=SEGMENTATION_DIR / "example.jsonl",
)
|
|
|
|
# Image Captioning: JSONL example only.
IMAGE_CAPTIONING_DIR = EXAMPLE_DIR / "image_captioning"
Options.register(
    task=IMAGE_CAPTIONING,
    file_format=JSONL,
    file=IMAGE_CAPTIONING_DIR / "example.jsonl",
)
|
|
|
|
# Speech to Text: JSONL example only.
SPEECH2TEXT_DIR = EXAMPLE_DIR / "speech_to_text"
Options.register(
    task=SPEECH2TEXT,
    file_format=JSONL,
    file=SPEECH2TEXT_DIR / "example.jsonl",
)
|