diff --git a/backend/data_export/models.py b/backend/data_export/models.py index ae6dd201..197bb6be 100644 --- a/backend/data_export/models.py +++ b/backend/data_export/models.py @@ -85,5 +85,8 @@ class ExportedComment(Comment): def to_string(self) -> str: return self.text + def to_dict(self): + return {"id": self.id, "comment": self.text} + class Meta: proxy = True diff --git a/backend/data_export/pipeline/factories.py b/backend/data_export/pipeline/factories.py index be139819..2b2e2e96 100644 --- a/backend/data_export/pipeline/factories.py +++ b/backend/data_export/pipeline/factories.py @@ -71,29 +71,52 @@ def create_formatter(project: Project, file_format: str) -> List[Formatter]: JSONL.name: [ DictFormatter(Spans.column), DictFormatter(Relations.column), + DictFormatter(Comments.column), RenameFormatter(**mapper_relation_extraction), ] if use_relation - else [TupledSpanFormatter(Spans.column), RenameFormatter(**mapper_sequence_labeling)] + else [ + TupledSpanFormatter(Spans.column), + ListedCategoryFormatter(Comments.column), + RenameFormatter(**mapper_sequence_labeling) + ] }, SEQ2SEQ: { - CSV.name: [JoinedCategoryFormatter(Texts.column), RenameFormatter(**mapper_seq2seq)], - JSON.name: [ListedCategoryFormatter(Texts.column), RenameFormatter(**mapper_seq2seq)], - JSONL.name: [ListedCategoryFormatter(Texts.column), RenameFormatter(**mapper_seq2seq)], + CSV.name: [ + JoinedCategoryFormatter(Texts.column), + JoinedCategoryFormatter(Comments.column), + RenameFormatter(**mapper_seq2seq) + ], + JSON.name: [ + ListedCategoryFormatter(Texts.column), + ListedCategoryFormatter(Comments.column), + RenameFormatter(**mapper_seq2seq) + ], + JSONL.name: [ + ListedCategoryFormatter(Texts.column), + ListedCategoryFormatter(Comments.column), + RenameFormatter(**mapper_seq2seq) + ], }, IMAGE_CLASSIFICATION: { JSONL.name: [ ListedCategoryFormatter(Categories.column), + ListedCategoryFormatter(Comments.column), RenameFormatter(**mapper_image_classification), ], }, SPEECH2TEXT: { - JSONL.name: [ListedCategoryFormatter(Texts.column), RenameFormatter(**mapper_speech2text)], + JSONL.name: [ + ListedCategoryFormatter(Texts.column), + ListedCategoryFormatter(Comments.column), + RenameFormatter(**mapper_speech2text) + ], }, INTENT_DETECTION_AND_SLOT_FILLING: { JSONL.name: [ ListedCategoryFormatter(Categories.column), TupledSpanFormatter(Spans.column), + ListedCategoryFormatter(Comments.column), RenameFormatter(**mapper_intent_detection), ] }, diff --git a/backend/data_export/pipeline/formatters.py b/backend/data_export/pipeline/formatters.py index 41efa19d..1ce74270 100644 --- a/backend/data_export/pipeline/formatters.py +++ b/backend/data_export/pipeline/formatters.py @@ -52,7 +52,7 @@ class FastTextCategoryFormatter(Formatter): ) dataset[self.target_column] = dataset[self.target_column].fillna("") dataset["Comments"] = dataset["Comments"].apply( - lambda comments: "#".join(comment.to_string() for comment in comments) + lambda comments: " ".join(f"__comment__{comment.to_string()}" for comment in comments) ) dataset = dataset[self.target_column] + " " + dataset[DATA] + " " + dataset["Comments"] return dataset diff --git a/backend/data_export/tests/test_dataset.py b/backend/data_export/tests/test_dataset.py index df9eea57..9ec57a37 100644 --- a/backend/data_export/tests/test_dataset.py +++ b/backend/data_export/tests/test_dataset.py @@ -18,12 +18,12 @@ class TestDataset(unittest.TestCase): self.labels = MagicMock() self.labels.__iter__.return_value = [label] comment = MagicMock() - comment.find_by.return_value = {"Comments": ["comment"]} + comment.find_by.return_value = {"comments": ["comment"]} self.comments = MagicMock() self.comments.__iter__.return_value = [comment] def test_to_dataframe(self): - dataset = Dataset(self.examples, self.labels) + dataset = Dataset(self.examples, self.labels, self.comments) df = dataset.to_dataframe() - expected = pd.DataFrame([{"data": "example", "labels": ["label"], "Comments": ["comment"]}]) + expected = pd.DataFrame([{"data": "example", "labels": ["label"], "comments": ["comment"]}]) assert_frame_equal(df, expected) diff --git a/backend/data_export/tests/test_formatters.py b/backend/data_export/tests/test_formatters.py index 09deb97b..c1c31661 100644 --- a/backend/data_export/tests/test_formatters.py +++ b/backend/data_export/tests/test_formatters.py @@ -75,15 +75,20 @@ class TestTupledSpanFormatter(unittest.TestCase): class TestFastTextFormatter(unittest.TestCase): def setUp(self): - self.return_value = "Label" + self.return_value_label = "Label" + self.return_value_comment = "Comment" label = MagicMock() - label.to_string.return_value = self.return_value - self.dataset = pd.DataFrame([{TARGET_COLUMN: [label], DATA: "example", "Comments": "comment"}]) + comment = MagicMock() + label.to_string.return_value = self.return_value_label + comment.to_string.return_value = self.return_value_comment + self.dataset = pd.DataFrame([{TARGET_COLUMN: [label], DATA: "example", "Comments": [comment]}]) def test_format(self): formatter = FastTextCategoryFormatter(TARGET_COLUMN) dataset = formatter.format(self.dataset) - expected_dataset = pd.DataFrame([f"__label__{self.return_value} example comment"]) + expected_dataset = pd.DataFrame( + [f"__label__{self.return_value_label} example __comment__{self.return_value_comment}"] + ) self.assertEqual(dataset.to_csv(index=False, header=None), expected_dataset.to_csv(index=False, header=None)) diff --git a/backend/data_export/tests/test_task.py b/backend/data_export/tests/test_task.py index a60e2d78..458f0317 100644 --- a/backend/data_export/tests/test_task.py +++ b/backend/data_export/tests/test_task.py @@ -62,7 +62,7 @@ class TestExportCategory(TestExport): self.category1 = mommy.make("ExportedCategory", example=self.example1, user=self.project.admin) self.category2 = mommy.make("ExportedCategory", example=self.example1, user=self.project.annotator) self.comment1 = mommy.make("ExportedComment", example=self.example1, user=self.project.admin) - self.comment2 = mommy.make("ExportedComment", example=self.example2, user=self.project.annotator) + self.comment2 = mommy.make("ExportedComment", example=self.example1, user=self.project.annotator) mommy.make("ExampleState", example=self.example1, confirmed_by=self.project.admin) self.data1 = self.data_to_text(self.example1) self.data2 = self.data_to_text(self.example2) @@ -72,7 +72,7 @@ class TestExportCategory(TestExport): datasets = self.export_dataset() expected_datasets = { self.project.admin.username: [ - {**self.data1, "label": [self.category1.to_string()], "Comments": self.comment1}, + {**self.data1, "label": [self.category1.to_string()], "Comments": [self.comment1.to_string()]}, {**self.data2, "label": [], "Comments": []}, ], self.project.approver.username: [ @@ -80,7 +80,7 @@ class TestExportCategory(TestExport): {**self.data2, "label": [], "Comments": []}, ], self.project.annotator.username: [ - {**self.data1, "label": [self.category2.to_string()], "Comments": self.comment2}, + {**self.data1, "label": [self.category2.to_string()], "Comments": [self.comment2.to_string()]}, {**self.data2, "label": [], "Comments": []}, ], } @@ -94,16 +94,20 @@ class TestExportCategory(TestExport): { **self.data1, "label": sorted([self.category1.to_string(), self.category2.to_string()]), - "Comments": [self.comment1, self.comment2] + "Comments": sorted([self.comment1.to_string(), self.comment2.to_string()]) }, - {**self.data2, "label": []}, + {**self.data2, "label": [], "Comments": []}, ] self.assertEqual(dataset, expected_dataset) def test_confirmed_and_non_collaborative(self): self.prepare_data() datasets = self.export_dataset(confirmed_only=True) - expected_datasets = {self.project.admin.username: [{**self.data1, "label": [self.category1.to_string()], "Comments": self.comment1}]} + expected_datasets = { + self.project.admin.username: [ + {**self.data1, "label": [self.category1.to_string()], "Comments": [self.comment1.to_string()]} + ] + } for username, dataset in expected_datasets.items(): self.assertEqual(datasets[username], dataset) @@ -114,7 +118,7 @@ class TestExportCategory(TestExport): { **self.data1, "label": sorted([self.category1.to_string(), self.category2.to_string()]), - "Comments": [self.comment1, self.comment2] + "Comments": sorted([self.comment1.to_string(), self.comment2.to_string()]) } ] self.assertEqual(dataset, expected_dataset) @@ -128,7 +132,7 @@ class TestExportSeq2seq(TestExport): self.text1 = mommy.make("TextLabel", example=self.example1, user=self.project.admin) self.text2 = mommy.make("TextLabel", example=self.example1, user=self.project.annotator) self.comment1 = mommy.make("ExportedComment", example=self.example1, user=self.project.admin) - self.comment2 = mommy.make("ExportedComment", example=self.example2, user=self.project.annotator) + self.comment2 = mommy.make("ExportedComment", example=self.example1, user=self.project.annotator) mommy.make("ExampleState", example=self.example1, confirmed_by=self.project.admin) self.data1 = self.data_to_text(self.example1) self.data2 = self.data_to_text(self.example2) @@ -138,7 +142,7 @@ class TestExportSeq2seq(TestExport): datasets = self.export_dataset() expected_datasets = { self.project.admin.username: [ - {**self.data1, "label": [self.text1.text], "Comments": self.comment1}, + {**self.data1, "label": [self.text1.text], "Comments": [self.comment1.to_string()]}, {**self.data2, "label": [], "Comments": []}, ], self.project.approver.username: [ @@ -146,7 +150,7 @@ class TestExportSeq2seq(TestExport): {**self.data2, "label": [], "Comments": []}, ], self.project.annotator.username: [ - {**self.data1, "label": [self.text2.text], "Comments": self.comment2}, + {**self.data1, "label": [self.text2.text], "Comments": [self.comment2.to_string()]}, {**self.data2, "label": [], "Comments": []}, ], } @@ -160,9 +164,9 @@ class TestExportSeq2seq(TestExport): { **self.data1, "label": sorted([self.text1.text, self.text2.text]), - "Comments": [self.comment1, self.comment2] + "Comments": sorted([self.comment1.to_string(), self.comment2.to_string()]) }, - {**self.data2, "label": []}, + {**self.data2, "label": [], "Comments": []}, ] self.assertEqual(dataset, expected_dataset) @@ -171,7 +175,7 @@ class TestExportSeq2seq(TestExport): datasets = self.export_dataset(confirmed_only=True) expected_datasets = { self.project.admin.username: [ - {**self.data1, "label": [self.text1.text], "Comments": self.comment1}, + {**self.data1, "label": [self.text1.text], "Comments": [self.comment1.to_string()]}, ], self.project.approver.username: [], self.project.annotator.username: [], @@ -186,7 +190,7 @@ class TestExportSeq2seq(TestExport): { **self.data1, "label": sorted([self.text1.text, self.text2.text]), - "Comments": [self.comment1, self.comment2] + "Comments": sorted([self.comment1.to_string(), self.comment2.to_string()]) } ] self.assertEqual(dataset, expected_dataset) @@ -200,7 +204,7 @@ class TestExportIntentDetectionAndSlotFilling(TestExport): self.category1 = mommy.make("ExportedCategory", example=self.example1, user=self.project.admin) self.category2 = mommy.make("ExportedCategory", example=self.example1, user=self.project.annotator) self.comment1 = mommy.make("ExportedComment", example=self.example1, user=self.project.admin) - self.comment2 = mommy.make("ExportedComment", example=self.example2, user=self.project.annotator) + self.comment2 = mommy.make("ExportedComment", example=self.example1, user=self.project.annotator) self.span = mommy.make( "ExportedSpan", example=self.example1, user=self.project.admin, start_offset=0, end_offset=1 ) @@ -217,7 +221,7 @@ class TestExportIntentDetectionAndSlotFilling(TestExport): **self.data1, "entities": [list(self.span.to_tuple())], "cats": [self.category1.to_string()], - "Comments": self.comment1 + "Comments": [self.comment1.to_string()] }, {**self.data2, "entities": [], "cats": [], "Comments": []}, ], @@ -226,7 +230,7 @@ class TestExportIntentDetectionAndSlotFilling(TestExport): **self.data1, "entities": [], "cats": [self.category2.to_string()], - "Comments": self.comment2 + "Comments": [self.comment2.to_string()] }, {**self.data2, "entities": [], "cats": [], "Comments": []}, ], @@ -246,7 +250,7 @@ class TestExportIntentDetectionAndSlotFilling(TestExport): **self.data1, "entities": [list(self.span.to_tuple())], "cats": sorted([self.category1.to_string(), self.category2.to_string()]), - "Comments": self.comment1 + "Comments": [self.comment1.to_string(), self.comment2.to_string()] }, {**self.data2, "entities": [], "cats": [], "Comments": []}, ] @@ -261,7 +265,7 @@ class TestExportIntentDetectionAndSlotFilling(TestExport): **self.data1, "entities": [list(self.span.to_tuple())], "cats": [self.category1.to_string()], - "Comments": self.comment1 + "Comments": [self.comment1.to_string()] }, ], self.project.annotator.username: [], @@ -278,7 +282,7 @@ class TestExportIntentDetectionAndSlotFilling(TestExport): **self.data1, "entities": [list(self.span.to_tuple())], "cats": sorted([self.category1.to_string(), self.category2.to_string()]), - "Comments": [self.comment1, self.comment2] + "Comments": sorted([self.comment1.to_string(), self.comment2.to_string()]) }, ] self.assertEqual(dataset, expected_dataset) @@ -297,7 +301,7 @@ class TestExportSequenceLabeling(TestExport): mommy.make("ExampleState", example=self.example1, confirmed_by=self.project.admin) self.example2 = mommy.make("ExportedExample", project=self.project.item, text="unconfirmed") self.comment1 = mommy.make("ExportedComment", example=self.example1, user=self.project.admin) - self.comment2 = mommy.make("ExportedComment", example=self.example2, user=self.project.annotator) + self.comment2 = mommy.make("ExportedComment", example=self.example1, user=self.project.annotator) self.data1 = self.data_to_text(self.example1) self.data2 = self.data_to_text(self.example2) @@ -306,11 +310,11 @@ class TestExportSequenceLabeling(TestExport): datasets = self.export_dataset() expected_datasets = { self.project.admin.username: [ - {**self.data1, "label": [list(self.span1.to_tuple())], "Comments": self.comment1}, + {**self.data1, "label": [list(self.span1.to_tuple())], "Comments": [self.comment1.to_string()]}, {**self.data2, "label": [], "Comments": []}, ], self.project.annotator.username: [ - {**self.data1, "label": [list(self.span2.to_tuple())], "Comments": self.comment2}, + {**self.data1, "label": [list(self.span2.to_tuple())], "Comments": [self.comment2.to_string()]}, {**self.data2, "label": [], "Comments": []}, ], self.project.approver.username: [ @@ -328,9 +332,9 @@ class TestExportSequenceLabeling(TestExport): { **self.data1, "label": [list(self.span1.to_tuple()), list(self.span2.to_tuple())], - "Comments": [self.comment1, self.comment2] + "Comments": sorted([self.comment1.to_string(), self.comment2.to_string()]) }, - {**self.data2, "label": []}, + {**self.data2, "label": [], "Comments": []}, ] self.assertEqual(dataset, expected_dataset) @@ -339,7 +343,7 @@ class TestExportSequenceLabeling(TestExport): datasets = self.export_dataset(confirmed_only=True) expected_datasets = { self.project.admin.username: [ - {**self.data1, "label": [list(self.span1.to_tuple())], "Comments": self.comment1}, + {**self.data1, "label": [list(self.span1.to_tuple())], "Comments": [self.comment1.to_string()]}, ], self.project.annotator.username: [], self.project.approver.username: [], @@ -354,7 +358,7 @@ class TestExportSequenceLabeling(TestExport): { **self.data1, "label": [list(self.span1.to_tuple()), list(self.span2.to_tuple())], - "Comments": [self.comment1, self.comment2] + "Comments": sorted([self.comment1.to_string(), self.comment2.to_string()]) }, ] self.assertEqual(dataset, expected_dataset) @@ -368,7 +372,7 @@ class TestExportSpeechToText(TestExport): self.text1 = mommy.make("TextLabel", example=self.example1, user=self.project.admin) self.text2 = mommy.make("TextLabel", example=self.example1, user=self.project.annotator) self.comment1 = mommy.make("ExportedComment", example=self.example1, user=self.project.admin) - self.comment2 = mommy.make("ExportedComment", example=self.example2, user=self.project.annotator) + self.comment2 = mommy.make("ExportedComment", example=self.example1, user=self.project.annotator) mommy.make("ExampleState", example=self.example1, confirmed_by=self.project.admin) self.data1 = self.data_to_filename(self.example1) self.data2 = self.data_to_filename(self.example2) @@ -378,7 +382,7 @@ class TestExportSpeechToText(TestExport): datasets = self.export_dataset() expected_datasets = { self.project.admin.username: [ - {**self.data1, "label": [self.text1.text], "Comments": self.comment1}, + {**self.data1, "label": [self.text1.text], "Comments": [self.comment1.to_string()]}, {**self.data2, "label": [], "Comments": []}, ], self.project.approver.username: [ @@ -386,7 +390,7 @@ class TestExportSpeechToText(TestExport): {**self.data2, "label": [], "Comments": []}, ], self.project.annotator.username: [ - {**self.data1, "label": [self.text2.text], "Comments": self.comment2}, + {**self.data1, "label": [self.text2.text], "Comments": [self.comment2.to_string()]}, {**self.data2, "label": [], "Comments": []}, ], } @@ -400,9 +404,9 @@ class TestExportSpeechToText(TestExport): { **self.data1, "label": sorted([self.text1.text, self.text2.text]), - "Comments": [self.comment1, self.comment2] + "Comments": sorted([self.comment1.to_string(), self.comment2.to_string()]) }, - {**self.data2, "label": []}, + {**self.data2, "label": [], "Comments": []}, ] self.assertEqual(dataset, expected_dataset) @@ -411,7 +415,7 @@ class TestExportSpeechToText(TestExport): datasets = self.export_dataset(confirmed_only=True) expected_datasets = { self.project.admin.username: [ - {**self.data1, "label": [self.text1.text], "Comments": self.comment1}, + {**self.data1, "label": [self.text1.text], "Comments": [self.comment1.to_string()]}, ], self.project.annotator.username: [], self.project.approver.username: [], @@ -426,7 +430,7 @@ class TestExportSpeechToText(TestExport): { **self.data1, "label": sorted([self.text1.text, self.text2.text]), - "Comments": [self.comment1, self.comment2] + "Comments": sorted([self.comment1.to_string(), self.comment2.to_string()]) } ] self.assertEqual(dataset, expected_dataset) @@ -440,7 +444,7 @@ class TestExportImageClassification(TestExport): self.category1 = mommy.make("ExportedCategory", example=self.example1, user=self.project.admin) self.category2 = mommy.make("ExportedCategory", example=self.example1, user=self.project.annotator) self.comment1 = mommy.make("ExportedComment", example=self.example1, user=self.project.admin) - self.comment2 = mommy.make("ExportedComment", example=self.example2, user=self.project.annotator) + self.comment2 = mommy.make("ExportedComment", example=self.example1, user=self.project.annotator) mommy.make("ExampleState", example=self.example1, confirmed_by=self.project.admin) self.data1 = self.data_to_filename(self.example1) self.data2 = self.data_to_filename(self.example2) @@ -453,9 +457,9 @@ class TestExportImageClassification(TestExport): { **self.data1, "label": [self.category1.to_string()], - "Comments": self.comment1 + "Comments": [self.comment1.to_string()] }, - {**self.data2, "label": [], "Comments": self.comment2}, + {**self.data2, "label": [], "Comments": []}, ], self.project.approver.username: [ {**self.data1, "label": [], "Comments": []}, @@ -465,7 +469,7 @@ class TestExportImageClassification(TestExport): { **self.data1, "label": [self.category2.to_string()], - "Comments": self.comment2 + "Comments": [self.comment2.to_string()] }, {**self.data2, "label": [], "Comments": []}, ], @@ -480,7 +484,7 @@ class TestExportImageClassification(TestExport): { **self.data1, "label": sorted([self.category1.to_string(), self.category2.to_string()]), - "Comments": [self.comment1, self.comment2] + "Comments": sorted([self.comment1.to_string(), self.comment2.to_string()]) }, {**self.data2, "label": [], "Comments": []}, ] @@ -489,7 +493,11 @@ class TestExportImageClassification(TestExport): def test_confirmed_and_non_collaborative(self): self.prepare_data() datasets = self.export_dataset(confirmed_only=True) - expected_datasets = {self.project.admin.username: [{**self.data1, "label": [self.category1.to_string()]}]} + expected_datasets = { + self.project.admin.username: [ + {**self.data1, "label": [self.category1.to_string()], "Comments": [self.comment1.to_string()]} + ] + } for username, dataset in expected_datasets.items(): self.assertEqual(datasets[username], dataset) @@ -500,6 +508,7 @@ class TestExportImageClassification(TestExport): { **self.data1, "label": sorted([self.category1.to_string(), self.category2.to_string()]), + "Comments": sorted([self.comment1.to_string(), self.comment2.to_string()]) } ] self.assertEqual(dataset, expected_dataset) @@ -523,7 +532,7 @@ class TestExportRelation(TestExport): "ExportedRelation", from_id=self.span1, to_id=self.span2, example=self.example1, user=self.project.admin ) self.comment1 = mommy.make("ExportedComment", example=self.example1, user=self.project.admin) - self.comment2 = mommy.make("ExportedComment", example=self.example2, user=self.project.annotator) + self.comment2 = mommy.make("ExportedComment", example=self.example1, user=self.project.annotator) mommy.make("ExampleState", example=self.example1, confirmed_by=self.project.admin) self.data1 = self.data_to_text(self.example1) self.data2 = self.data_to_text(self.example2) @@ -537,7 +546,7 @@ class TestExportRelation(TestExport): **self.data1, "entities": [self.span1.to_dict(), self.span2.to_dict()], "relations": [self.relation.to_dict()], - "Comments": self.comment1 + "Comments": [self.comment1.to_dict()] }, {**self.data2, "entities": [], "relations": [], "Comments": []}, ], @@ -546,9 +555,9 @@ class TestExportRelation(TestExport): **self.data1, "entities": [self.span3.to_dict()], "relations": [], - "Comments": self.comment2 + "Comments": [self.comment2.to_dict()] }, - {**self.data2, "entities": [], "relations": [], "Comments": self.comment2}, + {**self.data2, "entities": [], "relations": [], "Comments": []}, ], self.project.approver.username: [ {**self.data1, "entities": [], "relations": [], "Comments": []}, @@ -566,7 +575,7 @@ class TestExportRelation(TestExport): **self.data1, "entities": [self.span1.to_dict(), self.span2.to_dict(), self.span3.to_dict()], "relations": [self.relation.to_dict()], - "Comments": [self.comment1, self.comment2] + "Comments": [self.comment1.to_dict(), self.comment2.to_dict()] }, {**self.data2, "entities": [], "relations": [], "Comments": []}, ] @@ -581,7 +590,7 @@ class TestExportRelation(TestExport): **self.data1, "entities": [self.span1.to_dict(), self.span2.to_dict()], "relations": [self.relation.to_dict()], - "Comments": self.comment1 + "Comments": [self.comment1.to_dict()] }, ], self.project.annotator.username: [], @@ -598,7 +607,7 @@ class TestExportRelation(TestExport): **self.data1, "entities": [self.span1.to_dict(), self.span2.to_dict(), self.span3.to_dict()], "relations": [self.relation.to_dict()], - "Comments": [self.comment1, self.comment2] + "Comments": [self.comment1.to_dict(), self.comment2.to_dict()] } ] self.assertEqual(dataset, expected_dataset)