From 1e0f5fc1284d3873865b6b6cba08fe60b74f985e Mon Sep 17 00:00:00 2001 From: Hironsan Date: Sun, 24 Apr 2022 09:40:44 +0900 Subject: [PATCH] Refactor test_task.py --- backend/data_export/models.py | 4 +- backend/data_export/pipeline/formatters.py | 4 +- backend/data_export/tests/test_task.py | 834 ++++++++------------- 3 files changed, 339 insertions(+), 503 deletions(-) diff --git a/backend/data_export/models.py b/backend/data_export/models.py index c90aeb7a..fe808bca 100644 --- a/backend/data_export/models.py +++ b/backend/data_export/models.py @@ -5,10 +5,12 @@ from django.db import models from examples.models import Example from labels.models import Category, Relation, Span, TextLabel +DATA = "data" + class ExportedExample(Example): def to_dict(self) -> Dict[str, Any]: - return {"id": self.id, "data": self.text if self.project.is_text_project else self.upload_name, **self.meta} + return {"id": self.id, DATA: self.text if self.project.is_text_project else self.upload_name, **self.meta} class Meta: proxy = True diff --git a/backend/data_export/pipeline/formatters.py b/backend/data_export/pipeline/formatters.py index bfe6e460..5c4d758f 100644 --- a/backend/data_export/pipeline/formatters.py +++ b/backend/data_export/pipeline/formatters.py @@ -5,6 +5,8 @@ import abc import pandas as pd +from data_export.models import DATA + class Formatter(abc.ABC): def __init__(self, target_column: str): @@ -43,7 +45,7 @@ class FastTextCategoryFormatter(Formatter): """Format the label column to `__label__LabelA __label__LabelB` format. Also, drop the columns except for `data` and `self.target_column`. """ - dataset = dataset[["data", self.target_column]] + dataset = dataset[[DATA, self.target_column]] dataset[self.target_column] = dataset[self.target_column].apply( lambda labels: sorted(f"__label__{label.to_string()}" for label in labels) ) diff --git a/backend/data_export/tests/test_task.py b/backend/data_export/tests/test_task.py index 111c6322..8b579290 100644 --- a/backend/data_export/tests/test_task.py +++ b/backend/data_export/tests/test_task.py @@ -1,11 +1,9 @@ import os import zipfile -import numpy as np import pandas as pd from django.test import TestCase, override_settings from model_mommy import mommy -from pandas.testing import assert_frame_equal from ..celery_tasks import export_dataset from projects.models import ( @@ -19,203 +17,165 @@ from projects.models import ( from projects.tests.utils import prepare_project -def read_zip_content(file, file_format="csv"): +def read_zip_content(file): datasets = {} with zipfile.ZipFile(file) as z: for file in z.filelist: username = file.filename.split(".")[0] with z.open(file) as f: try: - if file_format == "csv": - df = pd.read_csv(f) - elif file_format == "json": - df = pd.read_json(f) - elif file_format == "jsonl": - df = pd.read_json(f, lines=True) + df = pd.read_json(f, lines=True) except pd.errors.EmptyDataError: continue - datasets[username] = df + datasets[username] = df.to_dict(orient="records") return datasets @override_settings(MEDIA_URL=os.path.dirname(__file__)) -class TestExportCategory(TestCase): +class TestExport(TestCase): + def export_dataset(self, confirmed_only=False): + file = export_dataset(self.project.id, "JSONL", confirmed_only) + if self.project.item.collaborative_annotation: + dataset = pd.read_json(file, lines=True).to_dict(orient="records") + else: + dataset = read_zip_content(file) + os.remove(file) + return dataset + + +class TestExportCategory(TestExport): def prepare_data(self, collaborative=False): self.project = prepare_project(DOCUMENT_CLASSIFICATION, collaborative_annotation=collaborative) - self.example1 = mommy.make("Example", project=self.project.item, text="confirmed") - self.example2 = mommy.make("Example", project=self.project.item, text="unconfirmed") - self.category1 = mommy.make("Category", example=self.example1, user=self.project.admin) - self.category2 = mommy.make("Category", example=self.example1, user=self.project.annotator) + self.example1 = mommy.make("ExportedExample", project=self.project.item, text="example1") + self.example2 = mommy.make("ExportedExample", project=self.project.item, text="example2") + self.category1 = mommy.make("ExportedCategory", example=self.example1, user=self.project.admin) + self.category2 = mommy.make("ExportedCategory", example=self.example1, user=self.project.annotator) mommy.make("ExampleState", example=self.example1, confirmed_by=self.project.admin) def test_unconfirmed_and_non_collaborative(self): self.prepare_data() - file = export_dataset(self.project.id, "CSV", False) - datasets = read_zip_content(file) - os.remove(file) + datasets = self.export_dataset() expected_datasets = { - self.project.admin.username: pd.DataFrame( - [ - {"id": self.example1.id, "data": self.example1.text, "categories": self.category1.label.text}, - {"id": self.example2.id, "data": self.example2.text, "categories": np.nan}, - ] - ), - self.project.approver.username: pd.DataFrame( - [ - {"id": self.example1.id, "data": self.example1.text, "categories": np.nan}, - {"id": self.example2.id, "data": self.example2.text, "categories": np.nan}, - ] - ), - self.project.annotator.username: pd.DataFrame( - [ - {"id": self.example1.id, "data": self.example1.text, "categories": self.category2.label.text}, - {"id": self.example2.id, "data": self.example2.text, "categories": np.nan}, - ] - ), + self.project.admin.username: [ + {**self.example1.to_dict(), "categories": [self.category1.to_string()]}, + {**self.example2.to_dict(), "categories": []}, + ], + self.project.approver.username: [ + {**self.example1.to_dict(), "categories": []}, + {**self.example2.to_dict(), "categories": []}, + ], + self.project.annotator.username: [ + {**self.example1.to_dict(), "categories": [self.category2.to_string()]}, + {**self.example2.to_dict(), "categories": []}, + ], } for username, dataset in expected_datasets.items(): - assert_frame_equal(dataset, datasets[username]) + self.assertEqual(datasets[username], dataset) def test_unconfirmed_and_collaborative(self): self.prepare_data(collaborative=True) - file = export_dataset(self.project.id, "CSV", False) - dataset = pd.read_csv(file) - os.remove(file) - expected_dataset = pd.DataFrame( - [ - { - "id": self.example1.id, - "data": self.example1.text, - "categories": "#".join(sorted([self.category1.label.text, self.category2.label.text])), - }, - {"id": self.example2.id, "data": self.example2.text, "categories": np.nan}, - ] - ) - assert_frame_equal(dataset, expected_dataset) + dataset = self.export_dataset() + expected_dataset = [ + { + **self.example1.to_dict(), + "categories": sorted([self.category1.to_string(), self.category2.to_string()]), + }, + {**self.example2.to_dict(), "categories": []}, + ] + self.assertEqual(dataset, expected_dataset) def test_confirmed_and_non_collaborative(self): self.prepare_data() - file = export_dataset(self.project.id, "CSV", True) - datasets = read_zip_content(file) - os.remove(file) + datasets = self.export_dataset(confirmed_only=True) expected_datasets = { - self.project.admin.username: pd.DataFrame( - [{"id": self.example1.id, "data": self.example1.text, "categories": self.category1.label.text}] - ) + self.project.admin.username: [{**self.example1.to_dict(), "categories": [self.category1.to_string()]}] } for username, dataset in expected_datasets.items(): - assert_frame_equal(dataset, datasets[username]) + self.assertEqual(datasets[username], dataset) def test_confirmed_and_collaborative(self): self.prepare_data(collaborative=True) - file = export_dataset(self.project.id, "CSV", True) - dataset = pd.read_csv(file) - os.remove(file) - expected_dataset = pd.DataFrame( - [ - { - "id": self.example1.id, - "data": self.example1.text, - "categories": "#".join(sorted([self.category1.label.text, self.category2.label.text])), - } - ] - ) - assert_frame_equal(dataset, expected_dataset) + dataset = self.export_dataset(confirmed_only=True) + expected_dataset = [ + { + **self.example1.to_dict(), + "categories": sorted([self.category1.to_string(), self.category2.to_string()]), + } + ] + self.assertEqual(dataset, expected_dataset) -@override_settings(MEDIA_URL=os.path.dirname(__file__)) -class TestExportSeq2seq(TestCase): +class TestExportSeq2seq(TestExport): def prepare_data(self, collaborative=False): self.project = prepare_project(SEQ2SEQ, collaborative_annotation=collaborative) - self.example1 = mommy.make("Example", project=self.project.item, text="confirmed") - self.example2 = mommy.make("Example", project=self.project.item, text="unconfirmed") + self.example1 = mommy.make("ExportedExample", project=self.project.item, text="confirmed") + self.example2 = mommy.make("ExportedExample", project=self.project.item, text="unconfirmed") self.text1 = mommy.make("TextLabel", example=self.example1, user=self.project.admin) self.text2 = mommy.make("TextLabel", example=self.example1, user=self.project.annotator) mommy.make("ExampleState", example=self.example1, confirmed_by=self.project.admin) def test_unconfirmed_and_non_collaborative(self): self.prepare_data() - file = export_dataset(self.project.id, "CSV", False) - datasets = read_zip_content(file) - os.remove(file) + datasets = self.export_dataset() expected_datasets = { - self.project.admin.username: pd.DataFrame( - [ - {"id": self.example1.id, "data": self.example1.text, "labels": self.text1.text}, - {"id": self.example2.id, "data": self.example2.text, "labels": np.nan}, - ] - ), - self.project.approver.username: pd.DataFrame( - [ - {"id": self.example1.id, "data": self.example1.text, "labels": np.nan}, - {"id": self.example2.id, "data": self.example2.text, "labels": np.nan}, - ] - ), - self.project.annotator.username: pd.DataFrame( - [ - {"id": self.example1.id, "data": self.example1.text, "labels": self.text2.text}, - {"id": self.example2.id, "data": self.example2.text, "labels": np.nan}, - ] - ), + self.project.admin.username: [ + {**self.example1.to_dict(), "labels": [self.text1.text]}, + {**self.example2.to_dict(), "labels": []}, + ], + self.project.approver.username: [ + {**self.example1.to_dict(), "labels": []}, + {**self.example2.to_dict(), "labels": []}, + ], + self.project.annotator.username: [ + {**self.example1.to_dict(), "labels": [self.text2.text]}, + {**self.example2.to_dict(), "labels": []}, + ], } for username, dataset in expected_datasets.items(): - assert_frame_equal(dataset, datasets[username]) + self.assertEqual(datasets[username], dataset) def test_unconfirmed_and_collaborative(self): self.prepare_data(collaborative=True) - file = export_dataset(self.project.id, "CSV", False) - dataset = pd.read_csv(file) - os.remove(file) - expected_dataset = pd.DataFrame( - [ - { - "id": self.example1.id, - "data": self.example1.text, - "labels": "#".join(sorted([self.text1.text, self.text2.text])), - }, - {"id": self.example2.id, "data": self.example2.text, "labels": np.nan}, - ] - ) - assert_frame_equal(dataset, expected_dataset) + dataset = self.export_dataset() + expected_dataset = [ + { + **self.example1.to_dict(), + "labels": sorted([self.text1.text, self.text2.text]), + }, + {**self.example2.to_dict(), "labels": []}, + ] + self.assertEqual(dataset, expected_dataset) def test_confirmed_and_non_collaborative(self): self.prepare_data() - file = export_dataset(self.project.id, "CSV", True) - datasets = read_zip_content(file) - os.remove(file) + datasets = self.export_dataset(confirmed_only=True) expected_datasets = { - self.project.admin.username: pd.DataFrame( - [ - {"id": self.example1.id, "data": self.example1.text, "labels": self.text1.text}, - ] - ) + self.project.admin.username: [ + {**self.example1.to_dict(), "labels": [self.text1.text]}, + ], + self.project.approver.username: [], + self.project.annotator.username: [], } for username, dataset in datasets.items(): - assert_frame_equal(dataset, expected_datasets[username]) + self.assertEqual(dataset, expected_datasets[username]) def test_confirmed_and_collaborative(self): self.prepare_data(collaborative=True) - file = export_dataset(self.project.id, "CSV", True) - dataset = pd.read_csv(file) - os.remove(file) - expected_dataset = pd.DataFrame( - [ - { - "id": self.example1.id, - "data": self.example1.text, - "labels": "#".join(sorted([self.text1.text, self.text2.text])), - } - ] - ) - assert_frame_equal(dataset, expected_dataset) + dataset = self.export_dataset(confirmed_only=True) + expected_dataset = [ + { + **self.example1.to_dict(), + "labels": sorted([self.text1.text, self.text2.text]), + } + ] + self.assertEqual(dataset, expected_dataset) -@override_settings(MEDIA_URL=os.path.dirname(__file__)) -class TestExportIntentDetectionAndSlotFilling(TestCase): +class TestExportIntentDetectionAndSlotFilling(TestExport): def prepare_data(self, collaborative=False): self.project = prepare_project(INTENT_DETECTION_AND_SLOT_FILLING, collaborative_annotation=collaborative) - self.example1 = mommy.make("Example", project=self.project.item, text="confirmed") - self.example2 = mommy.make("Example", project=self.project.item, text="unconfirmed") + self.example1 = mommy.make("ExportedExample", project=self.project.item, text="confirmed") + self.example2 = mommy.make("ExportedExample", project=self.project.item, text="unconfirmed") self.category1 = mommy.make("ExportedCategory", example=self.example1, user=self.project.admin) self.category2 = mommy.make("ExportedCategory", example=self.example1, user=self.project.annotator) self.span = mommy.make( @@ -225,107 +185,79 @@ class TestExportIntentDetectionAndSlotFilling(TestCase): def test_unconfirmed_and_non_collaborative(self): self.prepare_data() - file = export_dataset(self.project.id, "JSONL", False) - datasets = read_zip_content(file, "jsonl") - os.remove(file) - + datasets = self.export_dataset() expected_datasets = { - self.project.admin.username: pd.DataFrame( - [ - { - "id": self.example1.id, - "data": self.example1.text, - "entities": [list(self.span.to_tuple())], - "categories": [self.category1.to_string()], - }, - {"id": self.example2.id, "data": self.example2.text, "entities": [], "categories": []}, - ] - ), - self.project.annotator.username: pd.DataFrame( - [ - { - "id": self.example1.id, - "data": self.example1.text, - "entities": [], - "categories": [self.category2.to_string()], - }, - {"id": self.example2.id, "data": self.example2.text, "entities": [], "categories": []}, - ] - ), - self.project.approver.username: pd.DataFrame( - [ - {"id": self.example1.id, "data": self.example1.text, "entities": [], "categories": []}, - {"id": self.example2.id, "data": self.example2.text, "entities": [], "categories": []}, - ] - ), + self.project.admin.username: [ + { + **self.example1.to_dict(), + "entities": [list(self.span.to_tuple())], + "categories": [self.category1.to_string()], + }, + {**self.example2.to_dict(), "entities": [], "categories": []}, + ], + self.project.annotator.username: [ + { + **self.example1.to_dict(), + "entities": [], + "categories": [self.category2.to_string()], + }, + {**self.example2.to_dict(), "entities": [], "categories": []}, + ], + self.project.approver.username: [ + {**self.example1.to_dict(), "entities": [], "categories": []}, + {**self.example2.to_dict(), "entities": [], "categories": []}, + ], } for username, dataset in expected_datasets.items(): - self.assertEqual(dataset.to_dict(), datasets[username].to_dict()) + self.assertEqual(dataset, datasets[username]) def test_unconfirmed_and_collaborative(self): self.prepare_data(collaborative=True) - file = export_dataset(self.project.id, "JSONL", False) - dataset = pd.read_json(file, lines=True) - os.remove(file) - expected_dataset = pd.DataFrame( - [ - { - "id": self.example1.id, - "data": self.example1.text, - "entities": [list(self.span.to_tuple())], - "categories": sorted([self.category1.to_string(), self.category2.to_string()]), - }, - {"id": self.example2.id, "data": self.example2.text, "entities": [], "categories": []}, - ] - ) - self.assertEqual(dataset.to_dict(), expected_dataset.to_dict()) + dataset = self.export_dataset() + expected_dataset = [ + { + **self.example1.to_dict(), + "entities": [list(self.span.to_tuple())], + "categories": sorted([self.category1.to_string(), self.category2.to_string()]), + }, + {**self.example2.to_dict(), "entities": [], "categories": []}, + ] + self.assertEqual(dataset, expected_dataset) def test_confirmed_and_non_collaborative(self): self.prepare_data() - file = export_dataset(self.project.id, "JSONL", True) - datasets = read_zip_content(file, "jsonl") - os.remove(file) - + datasets = self.export_dataset(confirmed_only=True) expected_datasets = { - self.project.admin.username: pd.DataFrame( - [ - { - "id": self.example1.id, - "data": self.example1.text, - "entities": [list(self.span.to_tuple())], - "categories": [self.category1.to_string()], - }, - ] - ), - self.project.annotator.username: pd.DataFrame(), - self.project.approver.username: pd.DataFrame(), + self.project.admin.username: [ + { + **self.example1.to_dict(), + "entities": [list(self.span.to_tuple())], + "categories": [self.category1.to_string()], + }, + ], + self.project.annotator.username: [], + self.project.approver.username: [], } for username, dataset in expected_datasets.items(): - self.assertEqual(dataset.to_dict(), datasets[username].to_dict()) + self.assertEqual(dataset, datasets[username]) def test_confirmed_and_collaborative(self): self.prepare_data(collaborative=True) - file = export_dataset(self.project.id, "JSONL", True) - dataset = pd.read_json(file, lines=True) - os.remove(file) - expected_dataset = pd.DataFrame( - [ - { - "id": self.example1.id, - "data": self.example1.text, - "entities": [list(self.span.to_tuple())], - "categories": sorted([self.category1.to_string(), self.category2.to_string()]), - }, - ] - ) - self.assertEqual(dataset.to_dict(), expected_dataset.to_dict()) - - -@override_settings(MEDIA_URL=os.path.dirname(__file__)) -class TestExportSequenceLabeling(TestCase): + dataset = self.export_dataset(confirmed_only=True) + expected_dataset = [ + { + **self.example1.to_dict(), + "entities": [list(self.span.to_tuple())], + "categories": sorted([self.category1.to_string(), self.category2.to_string()]), + }, + ] + self.assertEqual(dataset, expected_dataset) + + +class TestExportSequenceLabeling(TestExport): def prepare_data(self, collaborative=False): self.project = prepare_project(SEQUENCE_LABELING, collaborative_annotation=collaborative) - self.example1 = mommy.make("Example", project=self.project.item, text="confirmed") + self.example1 = mommy.make("ExportedExample", project=self.project.item, text="confirmed") self.span1 = mommy.make( "ExportedSpan", example=self.example1, user=self.project.admin, start_offset=0, end_offset=1 ) @@ -333,277 +265,204 @@ class TestExportSequenceLabeling(TestCase): "ExportedSpan", example=self.example1, user=self.project.annotator, start_offset=1, end_offset=2 ) mommy.make("ExampleState", example=self.example1, confirmed_by=self.project.admin) - self.example2 = mommy.make("Example", project=self.project.item, text="unconfirmed") + self.example2 = mommy.make("ExportedExample", project=self.project.item, text="unconfirmed") def test_unconfirmed_and_non_collaborative(self): self.prepare_data() - file = export_dataset(self.project.id, "JSONL", False) - datasets = read_zip_content(file, "jsonl") - os.remove(file) - + datasets = self.export_dataset() expected_datasets = { - self.project.admin.username: pd.DataFrame( - [ - {"id": self.example1.id, "data": self.example1.text, "entities": [list(self.span1.to_tuple())]}, - {"id": self.example2.id, "data": self.example2.text, "entities": []}, - ] - ), - self.project.annotator.username: pd.DataFrame( - [ - {"id": self.example1.id, "data": self.example1.text, "entities": [list(self.span2.to_tuple())]}, - {"id": self.example2.id, "data": self.example2.text, "entities": []}, - ] - ), - self.project.approver.username: pd.DataFrame( - [ - {"id": self.example1.id, "data": self.example1.text, "entities": []}, - {"id": self.example2.id, "data": self.example2.text, "entities": []}, - ] - ), + self.project.admin.username: [ + {**self.example1.to_dict(), "entities": [list(self.span1.to_tuple())]}, + {**self.example2.to_dict(), "entities": []}, + ], + self.project.annotator.username: [ + {**self.example1.to_dict(), "entities": [list(self.span2.to_tuple())]}, + {**self.example2.to_dict(), "entities": []}, + ], + self.project.approver.username: [ + {**self.example1.to_dict(), "entities": []}, + {**self.example2.to_dict(), "entities": []}, + ], } for username, dataset in expected_datasets.items(): - self.assertEqual(dataset.to_dict(), datasets[username].to_dict()) + self.assertEqual(dataset, datasets[username]) def test_unconfirmed_and_collaborative(self): self.prepare_data(collaborative=True) - file = export_dataset(self.project.id, "JSONL", False) - dataset = pd.read_json(file, lines=True) - os.remove(file) - expected_dataset = pd.DataFrame( - [ - { - "id": self.example1.id, - "data": self.example1.text, - "entities": [self.span1.to_tuple(), self.span2.to_tuple()], - }, - {"id": self.example2.id, "data": self.example2.text, "entities": []}, - ] - ) - assert_frame_equal(dataset, expected_dataset) + dataset = self.export_dataset() + expected_dataset = [ + { + **self.example1.to_dict(), + "entities": [list(self.span1.to_tuple()), list(self.span2.to_tuple())], + }, + {**self.example2.to_dict(), "entities": []}, + ] + self.assertEqual(dataset, expected_dataset) def test_confirmed_and_non_collaborative(self): self.prepare_data() - file = export_dataset(self.project.id, "JSONL", True) - datasets = read_zip_content(file, "jsonl") - os.remove(file) - + datasets = self.export_dataset(confirmed_only=True) expected_datasets = { - self.project.admin.username: pd.DataFrame( - [ - {"id": self.example1.id, "data": self.example1.text, "entities": [list(self.span1.to_tuple())]}, - ] - ), - self.project.annotator.username: pd.DataFrame(), - self.project.approver.username: pd.DataFrame(), + self.project.admin.username: [ + {**self.example1.to_dict(), "entities": [list(self.span1.to_tuple())]}, + ], + self.project.annotator.username: [], + self.project.approver.username: [], } for username, dataset in expected_datasets.items(): - self.assertEqual(dataset.to_dict(), datasets[username].to_dict()) + self.assertEqual(dataset, datasets[username]) def test_confirmed_and_collaborative(self): self.prepare_data(collaborative=True) - file = export_dataset(self.project.id, "JSONL", True) - dataset = pd.read_json(file, lines=True) - os.remove(file) - expected_dataset = pd.DataFrame( - [ - { - "id": self.example1.id, - "data": self.example1.text, - "entities": [self.span1.to_tuple(), self.span2.to_tuple()], - }, - ] - ) - assert_frame_equal(dataset, expected_dataset) + dataset = self.export_dataset(confirmed_only=True) + expected_dataset = [ + { + **self.example1.to_dict(), + "entities": [list(self.span1.to_tuple()), list(self.span2.to_tuple())], + }, + ] + self.assertEqual(dataset, expected_dataset) -@override_settings(MEDIA_URL=os.path.dirname(__file__)) -class TestExportSpeechToText(TestCase): +class TestExportSpeechToText(TestExport): def prepare_data(self, collaborative=False): self.project = prepare_project(SPEECH2TEXT, collaborative_annotation=collaborative) - self.example1 = mommy.make("Example", project=self.project.item, text="confirmed") - self.example2 = mommy.make("Example", project=self.project.item, text="unconfirmed") + self.example1 = mommy.make("ExportedExample", project=self.project.item, text="confirmed") + self.example2 = mommy.make("ExportedExample", project=self.project.item, text="unconfirmed") self.text1 = mommy.make("TextLabel", example=self.example1, user=self.project.admin) self.text2 = mommy.make("TextLabel", example=self.example1, user=self.project.annotator) mommy.make("ExampleState", example=self.example1, confirmed_by=self.project.admin) def test_unconfirmed_and_non_collaborative(self): self.prepare_data() - file = export_dataset(self.project.id, "JSONL", False) - datasets = read_zip_content(file, "jsonl") - os.remove(file) + datasets = self.export_dataset() expected_datasets = { - self.project.admin.username: pd.DataFrame( - [ - {"id": self.example1.id, "data": self.example1.upload_name, "labels": [self.text1.text]}, - {"id": self.example2.id, "data": self.example2.upload_name, "labels": []}, - ] - ), - self.project.approver.username: pd.DataFrame( - [ - {"id": self.example1.id, "data": self.example1.upload_name, "labels": []}, - {"id": self.example2.id, "data": self.example2.upload_name, "labels": []}, - ] - ), - self.project.annotator.username: pd.DataFrame( - [ - {"id": self.example1.id, "data": self.example1.upload_name, "labels": [self.text2.text]}, - {"id": self.example2.id, "data": self.example2.upload_name, "labels": []}, - ] - ), + self.project.admin.username: [ + {**self.example1.to_dict(), "labels": [self.text1.text]}, + {**self.example2.to_dict(), "labels": []}, + ], + self.project.approver.username: [ + {**self.example1.to_dict(), "labels": []}, + {**self.example2.to_dict(), "labels": []}, + ], + self.project.annotator.username: [ + {**self.example1.to_dict(), "labels": [self.text2.text]}, + {**self.example2.to_dict(), "labels": []}, + ], } for username, dataset in expected_datasets.items(): - assert_frame_equal(dataset, datasets[username]) + self.assertEqual(datasets[username], dataset) def test_unconfirmed_and_collaborative(self): self.prepare_data(collaborative=True) - file = export_dataset(self.project.id, "JSONL", False) - dataset = pd.read_json(file, lines=True) - os.remove(file) - expected_dataset = pd.DataFrame( - [ - { - "id": self.example1.id, - "data": self.example1.upload_name, - "labels": sorted([self.text1.text, self.text2.text]), - }, - {"id": self.example2.id, "data": self.example2.upload_name, "labels": []}, - ] - ) - assert_frame_equal(dataset, expected_dataset) + dataset = self.export_dataset() + expected_dataset = [ + { + **self.example1.to_dict(), + "labels": sorted([self.text1.text, self.text2.text]), + }, + {**self.example2.to_dict(), "labels": []}, + ] + self.assertEqual(dataset, expected_dataset) def test_confirmed_and_non_collaborative(self): self.prepare_data() - file = export_dataset(self.project.id, "JSONL", True) - datasets = read_zip_content(file, "jsonl") - os.remove(file) + datasets = self.export_dataset(confirmed_only=True) expected_datasets = { - self.project.admin.username: pd.DataFrame( - [ - {"id": self.example1.id, "data": self.example1.upload_name, "labels": [self.text1.text]}, - ] - ), - self.project.annotator.username: pd.DataFrame(), - self.project.approver.username: pd.DataFrame(), + self.project.admin.username: [ + {**self.example1.to_dict(), "labels": [self.text1.text]}, + ], + self.project.annotator.username: [], + self.project.approver.username: [], } for username, dataset in datasets.items(): - self.assertEqual(dataset.to_dict(), expected_datasets[username].to_dict()) + self.assertEqual(dataset, expected_datasets[username]) def test_confirmed_and_collaborative(self): self.prepare_data(collaborative=True) - file = export_dataset(self.project.id, "JSONL", True) - dataset = pd.read_json(file, lines=True) - os.remove(file) - expected_dataset = pd.DataFrame( - [ - { - "id": self.example1.id, - "data": self.example1.upload_name, - "labels": sorted([self.text1.text, self.text2.text]), - } - ] - ) - assert_frame_equal(dataset, expected_dataset) + dataset = self.export_dataset(confirmed_only=True) + expected_dataset = [ + { + **self.example1.to_dict(), + "labels": sorted([self.text1.text, self.text2.text]), + } + ] + self.assertEqual(dataset, expected_dataset) -@override_settings(MEDIA_URL=os.path.dirname(__file__)) -class TestExportImageClassification(TestCase): +class TestExportImageClassification(TestExport): def prepare_data(self, collaborative=False): self.project = prepare_project(IMAGE_CLASSIFICATION, collaborative_annotation=collaborative) - self.example1 = mommy.make("Example", project=self.project.item, text="confirmed") - self.example2 = mommy.make("Example", project=self.project.item, text="unconfirmed") - self.category1 = mommy.make("Category", example=self.example1, user=self.project.admin) - self.category2 = mommy.make("Category", example=self.example1, user=self.project.annotator) + self.example1 = mommy.make("ExportedExample", project=self.project.item, text="confirmed") + self.example2 = mommy.make("ExportedExample", project=self.project.item, text="unconfirmed") + self.category1 = mommy.make("ExportedCategory", example=self.example1, user=self.project.admin) + self.category2 = mommy.make("ExportedCategory", example=self.example1, user=self.project.annotator) mommy.make("ExampleState", example=self.example1, confirmed_by=self.project.admin) def test_unconfirmed_and_non_collaborative(self): self.prepare_data() - file = export_dataset(self.project.id, "JSONL", False) - datasets = read_zip_content(file, "jsonl") - os.remove(file) + datasets = self.export_dataset() expected_datasets = { - self.project.admin.username: pd.DataFrame( - [ - { - "id": self.example1.id, - "data": self.example1.upload_name, - "categories": [self.category1.label.text], - }, - {"id": self.example2.id, "data": self.example2.upload_name, "categories": []}, - ] - ), - self.project.approver.username: pd.DataFrame( - [ - {"id": self.example1.id, "data": self.example1.upload_name, "categories": []}, - {"id": self.example2.id, "data": self.example2.upload_name, "categories": []}, - ] - ), - self.project.annotator.username: pd.DataFrame( - [ - { - "id": self.example1.id, - "data": self.example1.upload_name, - "categories": [self.category2.label.text], - }, - {"id": self.example2.id, "data": self.example2.upload_name, "categories": []}, - ] - ), + self.project.admin.username: [ + { + **self.example1.to_dict(), + "categories": [self.category1.to_string()], + }, + {**self.example2.to_dict(), "categories": []}, + ], + self.project.approver.username: [ + {**self.example1.to_dict(), "categories": []}, + {**self.example2.to_dict(), "categories": []}, + ], + self.project.annotator.username: [ + { + **self.example1.to_dict(), + "categories": [self.category2.to_string()], + }, + {**self.example2.to_dict(), "categories": []}, + ], } for username, dataset in expected_datasets.items(): - assert_frame_equal(dataset, datasets[username]) + self.assertEqual(datasets[username], dataset) def test_unconfirmed_and_collaborative(self): self.prepare_data(collaborative=True) - file = export_dataset(self.project.id, "JSONL", False) - dataset = pd.read_json(file, lines=True) - os.remove(file) - expected_dataset = pd.DataFrame( - [ - { - "id": self.example1.id, - "data": self.example1.upload_name, - "categories": sorted([self.category1.label.text, self.category2.label.text]), - }, - {"id": self.example2.id, "data": self.example2.upload_name, "categories": []}, - ] - ) - assert_frame_equal(dataset, expected_dataset) + dataset = self.export_dataset() + expected_dataset = [ + { + **self.example1.to_dict(), + "categories": sorted([self.category1.to_string(), self.category2.to_string()]), + }, + {**self.example2.to_dict(), "categories": []}, + ] + self.assertEqual(dataset, expected_dataset) def test_confirmed_and_non_collaborative(self): self.prepare_data() - file = export_dataset(self.project.id, "JSONL", True) - datasets = read_zip_content(file, "jsonl") - os.remove(file) + datasets = self.export_dataset(confirmed_only=True) expected_datasets = { - self.project.admin.username: pd.DataFrame( - [{"id": self.example1.id, "data": self.example1.upload_name, "categories": [self.category1.label.text]}] - ) + self.project.admin.username: [{**self.example1.to_dict(), "categories": [self.category1.to_string()]}] } for username, dataset in expected_datasets.items(): - assert_frame_equal(dataset, datasets[username]) + self.assertEqual(datasets[username], dataset) def test_confirmed_and_collaborative(self): self.prepare_data(collaborative=True) - file = export_dataset(self.project.id, "JSONL", True) - dataset = pd.read_json(file, lines=True) - os.remove(file) - expected_dataset = pd.DataFrame( - [ - { - "id": self.example1.id, - "data": self.example1.upload_name, - "categories": sorted([self.category1.label.text, self.category2.label.text]), - } - ] - ) - assert_frame_equal(dataset, expected_dataset) + dataset = self.export_dataset(confirmed_only=True) + expected_dataset = [ + { + **self.example1.to_dict(), + "categories": sorted([self.category1.to_string(), self.category2.to_string()]), + } + ] + self.assertEqual(dataset, expected_dataset) -@override_settings(MEDIA_URL=os.path.dirname(__file__)) -class TestExportRelation(TestCase): +class TestExportRelation(TestExport): def prepare_data(self, collaborative=False): self.project = prepare_project(SEQUENCE_LABELING, use_relation=True, collaborative_annotation=collaborative) - self.example1 = mommy.make("Example", project=self.project.item, text="example") - self.example2 = mommy.make("Example", project=self.project.item, text="unconfirmed") + self.example1 = mommy.make("ExportedExample", project=self.project.item, text="example") + self.example2 = mommy.make("ExportedExample", project=self.project.item, text="unconfirmed") self.span1 = mommy.make( "ExportedSpan", example=self.example1, user=self.project.admin, start_offset=0, end_offset=1 ) @@ -620,97 +479,70 @@ class TestExportRelation(TestCase): def test_unconfirmed_and_non_collaborative(self): self.prepare_data() - file = export_dataset(self.project.id, "JSONL", False) - datasets = read_zip_content(file, "jsonl") - os.remove(file) - + datasets = self.export_dataset() expected_datasets = { - self.project.admin.username: pd.DataFrame( - [ - { - "id": self.example1.id, - "data": self.example1.text, - "entities": [self.span1.to_dict(), self.span2.to_dict()], - "relations": [self.relation.to_dict()], - }, - {"id": self.example2.id, "data": self.example2.text, "entities": [], "relations": []}, - ] - ), - self.project.annotator.username: pd.DataFrame( - [ - { - "id": self.example1.id, - "data": self.example1.text, - "entities": [self.span3.to_dict()], - "relations": [], - }, - {"id": self.example2.id, "data": self.example2.text, "entities": [], "relations": []}, - ] - ), - self.project.approver.username: pd.DataFrame( - [ - {"id": self.example1.id, "data": self.example1.text, "entities": [], "relations": []}, - {"id": self.example2.id, "data": self.example2.text, "entities": [], "relations": []}, - ] - ), + self.project.admin.username: [ + { + **self.example1.to_dict(), + "entities": [self.span1.to_dict(), self.span2.to_dict()], + "relations": [self.relation.to_dict()], + }, + {**self.example2.to_dict(), "entities": [], "relations": []}, + ], + self.project.annotator.username: [ + { + **self.example1.to_dict(), + "entities": [self.span3.to_dict()], + "relations": [], + }, + {**self.example2.to_dict(), "entities": [], "relations": []}, + ], + self.project.approver.username: [ + {**self.example1.to_dict(), "entities": [], "relations": []}, + {**self.example2.to_dict(), "entities": [], "relations": []}, + ], } for username, dataset in expected_datasets.items(): - self.assertEqual(dataset.to_dict(), datasets[username].to_dict()) + self.assertEqual(datasets[username], dataset) def test_unconfirmed_and_collaborative(self): self.prepare_data(collaborative=True) - file = export_dataset(self.project.id, "JSONL", False) - dataset = pd.read_json(file, lines=True) - os.remove(file) - expected_dataset = pd.DataFrame( - [ - { - "id": self.example1.id, - "data": self.example1.text, - "entities": [self.span1.to_dict(), self.span2.to_dict(), self.span3.to_dict()], - "relations": [self.relation.to_dict()], - }, - {"id": self.example2.id, "data": self.example2.text, "entities": [], "relations": []}, - ] - ) - assert_frame_equal(dataset, expected_dataset) + dataset = self.export_dataset() + expected_dataset = [ + { + **self.example1.to_dict(), + "entities": [self.span1.to_dict(), self.span2.to_dict(), self.span3.to_dict()], + "relations": [self.relation.to_dict()], + }, + {**self.example2.to_dict(), "entities": [], "relations": []}, + ] + self.assertEqual(dataset, expected_dataset) def test_confirmed_and_non_collaborative(self): self.prepare_data() - file = export_dataset(self.project.id, "JSONL", True) - datasets = read_zip_content(file, "jsonl") - os.remove(file) - + datasets = self.export_dataset(confirmed_only=True) expected_datasets = { - self.project.admin.username: pd.DataFrame( - [ - { - "id": self.example1.id, - "data": self.example1.text, - "entities": [self.span1.to_dict(), self.span2.to_dict()], - "relations": [self.relation.to_dict()], - }, - ] - ), - self.project.annotator.username: pd.DataFrame(), - self.project.approver.username: pd.DataFrame(), + self.project.admin.username: [ + { + **self.example1.to_dict(), + "entities": [self.span1.to_dict(), self.span2.to_dict()], + "relations": [self.relation.to_dict()], + }, + ], + self.project.annotator.username: [], + self.project.approver.username: [], } for username, dataset in datasets.items(): - self.assertEqual(dataset.to_dict(), expected_datasets[username].to_dict()) + self.assertEqual(dataset, expected_datasets[username]) def test_confirmed_and_collaborative(self): self.prepare_data(collaborative=True) - file = export_dataset(self.project.id, "JSONL", True) - dataset = pd.read_json(file, lines=True) - os.remove(file) - expected_dataset = pd.DataFrame( - [ - { - "id": self.example1.id, - "data": self.example1.text, - "entities": [self.span1.to_dict(), self.span2.to_dict(), self.span3.to_dict()], - "relations": [self.relation.to_dict()], - } - ] - ) - assert_frame_equal(dataset, expected_dataset) + dataset = self.export_dataset(confirmed_only=True) + expected_dataset = [ + { + **self.example1.to_dict(), + "entities": [self.span1.to_dict(), self.span2.to_dict(), self.span3.to_dict()], + "relations": [self.relation.to_dict()], + } + ] + self.assertEqual(dataset, expected_dataset)