Browse Source

Support data export for image captioning

pull/1899/head
Hironsan 2 years ago
parent
commit
2443365ed7
6 changed files with 97 additions and 2 deletions
  1. 4
      backend/data_export/pipeline/catalog.py
  2. 3
      backend/data_export/pipeline/examples/image_captioning/example.jsonl
  3. 16
      backend/data_export/pipeline/factories.py
  4. 4
      backend/data_export/tests/test_catalog.py
  5. 70
      backend/data_export/tests/test_task.py
  6. 2
      backend/projects/tests/utils.py

4
backend/data_export/pipeline/catalog.py

@ -5,6 +5,7 @@ from typing import Dict, List, Type
from projects.models import (
BOUNDING_BOX,
DOCUMENT_CLASSIFICATION,
IMAGE_CAPTIONING,
IMAGE_CLASSIFICATION,
INTENT_DETECTION_AND_SLOT_FILLING,
SEGMENTATION,
@ -98,6 +99,9 @@ Options.register(BOUNDING_BOX, JSONL, BOUNDING_BOX_DIR / "example.jsonl")
SEGMENTATION_DIR = EXAMPLE_DIR / "segmentation"
Options.register(SEGMENTATION, JSONL, SEGMENTATION_DIR / "example.jsonl")
# Image Captioning
IMAGE_CAPTIONING_DIR = EXAMPLE_DIR / "image_captioning"
Options.register(IMAGE_CAPTIONING, JSONL, IMAGE_CAPTIONING_DIR / "example.jsonl")
# Speech to Text
SPEECH2TEXT_DIR = EXAMPLE_DIR / "speech_to_text"
Options.register(SPEECH2TEXT, JSONL, SPEECH2TEXT_DIR / "example.jsonl")

3
backend/data_export/pipeline/examples/image_captioning/example.jsonl

@ -0,0 +1,3 @@
{"filename": "cat.jpg", "label": ["meow"]}
{"filename": "dog.jpg", "label": ["bow"]}
{"filename": "pig.jpg", "label": ["wee"]}

16
backend/data_export/pipeline/factories.py

@ -18,6 +18,7 @@ from data_export.models import DATA, ExportedExample
from projects.models import (
BOUNDING_BOX,
DOCUMENT_CLASSIFICATION,
IMAGE_CAPTIONING,
IMAGE_CLASSIFICATION,
INTENT_DETECTION_AND_SLOT_FILLING,
SEGMENTATION,
@ -42,15 +43,22 @@ def create_writer(file_format: str) -> writers.Writer:
def create_formatter(project: Project, file_format: str) -> List[Formatter]:
use_relation = getattr(project, "use_relation", False)
# text tasks
mapper_text_classification = {DATA: "text", Categories.column: "label"}
mapper_sequence_labeling = {DATA: "text", Spans.column: "label"}
mapper_seq2seq = {DATA: "text", Texts.column: "label"}
mapper_intent_detection = {DATA: "text", Categories.column: "cats"}
mapper_relation_extraction = {DATA: "text"}
# image tasks
mapper_image_classification = {DATA: "filename", Categories.column: "label"}
mapper_bounding_box = {DATA: "filename", BoundingBoxes.column: "bbox"}
mapper_segmentation = {DATA: "filename", BoundingBoxes.column: "segmentation"}
mapper_image_captioning = {DATA: "filename", Texts.column: "label"}
# audio tasks
mapper_speech2text = {DATA: "filename", Texts.column: "label"}
mapper_intent_detection = {DATA: "text", Categories.column: "cats"}
mapper_relation_extraction = {DATA: "text"}
mapping: Dict[str, Dict[str, List[Formatter]]] = {
DOCUMENT_CLASSIFICATION: {
CSV.name: [
@ -99,6 +107,9 @@ def create_formatter(project: Project, file_format: str) -> List[Formatter]:
},
BOUNDING_BOX: {JSONL.name: [DictFormatter(BoundingBoxes.column), RenameFormatter(**mapper_bounding_box)]},
SEGMENTATION: {JSONL.name: [DictFormatter(Segments.column), RenameFormatter(**mapper_segmentation)]},
IMAGE_CAPTIONING: {
JSONL.name: [ListedCategoryFormatter(Texts.column), RenameFormatter(**mapper_image_captioning)]
},
}
return mapping[project.project_type][file_format]
@ -114,6 +125,7 @@ def select_label_collection(project: Project) -> List[Type[Labels]]:
INTENT_DETECTION_AND_SLOT_FILLING: [Categories, Spans],
BOUNDING_BOX: [BoundingBoxes],
SEGMENTATION: [Segments],
IMAGE_CAPTIONING: [Texts],
}
return mapping[project.project_type]

4
backend/data_export/tests/test_catalog.py

@ -4,8 +4,10 @@ from ..pipeline.catalog import Options
from projects.models import (
BOUNDING_BOX,
DOCUMENT_CLASSIFICATION,
IMAGE_CAPTIONING,
IMAGE_CLASSIFICATION,
INTENT_DETECTION_AND_SLOT_FILLING,
SEGMENTATION,
SEQ2SEQ,
SEQUENCE_LABELING,
SPEECH2TEXT,
@ -17,8 +19,10 @@ class TestOptions(unittest.TestCase):
tasks = [
BOUNDING_BOX,
DOCUMENT_CLASSIFICATION,
IMAGE_CAPTIONING,
IMAGE_CLASSIFICATION,
INTENT_DETECTION_AND_SLOT_FILLING,
SEGMENTATION,
SEQ2SEQ,
SEQUENCE_LABELING,
SPEECH2TEXT,

70
backend/data_export/tests/test_task.py

@ -10,6 +10,7 @@ from data_export.models import DATA
from projects.models import (
BOUNDING_BOX,
DOCUMENT_CLASSIFICATION,
IMAGE_CAPTIONING,
IMAGE_CLASSIFICATION,
INTENT_DETECTION_AND_SLOT_FILLING,
SEGMENTATION,
@ -622,6 +623,75 @@ class TestExportSegmentation(TestExport):
self.assertEqual(dataset, expected_dataset)
class TestExportImageCaptioning(TestExport):
    """Export tests for an image captioning project.

    Each exported record maps an image filename to the list of caption
    texts under the "label" key. Covers the four export modes:
    confirmed-only on/off crossed with collaborative annotation on/off.
    """

    def prepare_data(self, collaborative=False):
        """Build a project with one captioned+confirmed example and one untouched example."""
        self.project = prepare_project(IMAGE_CAPTIONING, collaborative_annotation=collaborative)
        # Only example1 receives captions (one per user) and a confirmation.
        self.example1 = mommy.make("ExportedExample", project=self.project.item, text="confirmed")
        self.example2 = mommy.make("ExportedExample", project=self.project.item, text="unconfirmed")
        self.text1 = mommy.make("TextLabel", example=self.example1, user=self.project.admin)
        self.text2 = mommy.make("TextLabel", example=self.example1, user=self.project.annotator)
        mommy.make("ExampleState", example=self.example1, confirmed_by=self.project.admin)
        self.data1 = self.data_to_filename(self.example1)
        self.data2 = self.data_to_filename(self.example2)
        self.column = "label"

    def test_unconfirmed_and_non_collaborative(self):
        self.prepare_data()
        datasets = self.export_dataset()
        # Per-user export: each member sees only their own captions on example1.
        captions_by_user = {
            self.project.admin.username: [self.text1.text],
            self.project.approver.username: [],
            self.project.annotator.username: [self.text2.text],
        }
        for username, captions in captions_by_user.items():
            expected = [
                {**self.data1, self.column: captions},
                {**self.data2, self.column: []},
            ]
            self.assertEqual(datasets[username], expected)

    def test_unconfirmed_and_collaborative(self):
        self.prepare_data(collaborative=True)
        dataset = self.export_dataset()
        # Collaborative export merges every user's captions per example.
        merged = sorted([self.text1.text, self.text2.text])
        self.assertEqual(
            dataset,
            [
                {**self.data1, self.column: merged},
                {**self.data2, self.column: []},
            ],
        )

    def test_confirmed_and_non_collaborative(self):
        self.prepare_data()
        datasets = self.export_dataset(confirmed_only=True)
        # Only the admin confirmed example1; the other members export nothing.
        expected_datasets = {
            self.project.admin.username: [{**self.data1, self.column: [self.text1.text]}],
            self.project.approver.username: [],
            self.project.annotator.username: [],
        }
        for username, dataset in datasets.items():
            self.assertEqual(dataset, expected_datasets[username])

    def test_confirmed_and_collaborative(self):
        self.prepare_data(collaborative=True)
        dataset = self.export_dataset(confirmed_only=True)
        # Confirmed-only collaborative export keeps just the confirmed example,
        # with all users' captions merged.
        merged = sorted([self.text1.text, self.text2.text])
        self.assertEqual(dataset, [{**self.data1, self.column: merged}])
class TestExportRelation(TestExport):
def prepare_data(self, collaborative=False):
self.project = prepare_project(SEQUENCE_LABELING, use_relation=True, collaborative_annotation=collaborative)

2
backend/projects/tests/utils.py

@ -6,6 +6,7 @@ from model_mommy import mommy
from projects.models import (
BOUNDING_BOX,
DOCUMENT_CLASSIFICATION,
IMAGE_CAPTIONING,
IMAGE_CLASSIFICATION,
INTENT_DETECTION_AND_SLOT_FILLING,
SEGMENTATION,
@ -72,6 +73,7 @@ def make_project(task: str, users: List[str], roles: List[str], collaborative_an
INTENT_DETECTION_AND_SLOT_FILLING: "IntentDetectionAndSlotFillingProject",
BOUNDING_BOX: "BoundingBoxProject",
SEGMENTATION: "SegmentationProject",
IMAGE_CAPTIONING: "ImageCaptioningProject",
}.get(task, "Project")
project = mommy.make(
_model=project_model,

Loading…
Cancel
Save