diff --git a/backend/api/tests/data/example.txt b/backend/api/tests/data/example.txt index 161bb8ec..da38709c 100644 --- a/backend/api/tests/data/example.txt +++ b/backend/api/tests/data/example.txt @@ -1,3 +1,3 @@ -example1 -example2 -example3 \ No newline at end of file +exampleA +exampleB +exampleC diff --git a/backend/api/tests/data/seq2seq/example.csv b/backend/api/tests/data/seq2seq/example.csv index d8ccc403..41bae2ef 100644 --- a/backend/api/tests/data/seq2seq/example.csv +++ b/backend/api/tests/data/seq2seq/example.csv @@ -1,5 +1,5 @@ -text,label,meta -AAA,LabelA, -BBB,LabelB,MetaB -CCC,LabelC -DDD,,MetaD +text,label +exampleA,label1 +exampleB, +,label2 +, diff --git a/backend/api/tests/data/seq2seq/example.json b/backend/api/tests/data/seq2seq/example.json index dfac44b9..e83bbb8b 100644 --- a/backend/api/tests/data/seq2seq/example.json +++ b/backend/api/tests/data/seq2seq/example.json @@ -1,5 +1,6 @@ [ - {"text": "example", "label": ["example1", "example2"]}, - {"text": "example", "label": ["example"]}, - {"text": "example", "label": ["example"]} + {"text": "exampleA", "label": ["label1"]}, + {"text": "exampleB", "label": []}, + {"label": ["example"]}, + {} ] diff --git a/backend/api/tests/data/seq2seq/example.jsonl b/backend/api/tests/data/seq2seq/example.jsonl index 2069686e..823bf621 100644 --- a/backend/api/tests/data/seq2seq/example.jsonl +++ b/backend/api/tests/data/seq2seq/example.jsonl @@ -1,3 +1,4 @@ -{"text": "example", "label": ["example1", "example2"], "meta": {"wikiPageID": 1}} -{"text": "example", "label": ["example"], "meta": {"wikiPageID": 2}} -{"text": "example", "label": ["example"], "meta": {"wikiPageID": 3}} +{"text": "exampleA", "label": ["label1"]} +{"text": "exampleB", "label": []} +{"label": ["label2"]} +{} diff --git a/backend/api/tests/data/sequence_labeling/example.conll b/backend/api/tests/data/sequence_labeling/example.conll index 6acbf79f..4893d360 100644 --- a/backend/api/tests/data/sequence_labeling/example.conll +++ b/backend/api/tests/data/sequence_labeling/example.conll @@ -1,22 +1,5 @@ -SOCCER O -- O JAPAN B-LOC GET O -LUCKY O -WIN O -, O -CHINA B-PER -IN O -SURPRISE O -DEFEAT O -. O Nadim B-PER Ladki I-PER - -AL-AIN B-LOC -, O -United B-LOC -Arab I-LOC -Emirates I-LOC -1996-12-06 O diff --git a/backend/api/tests/data/sequence_labeling/example.jsonl b/backend/api/tests/data/sequence_labeling/example.jsonl index 418b1b5c..ecb92aac 100644 --- a/backend/api/tests/data/sequence_labeling/example.jsonl +++ b/backend/api/tests/data/sequence_labeling/example.jsonl @@ -1,3 +1,3 @@ -{"text": "example", "label": [[0, 1, "LOC"], [0, 2, "ORG"]], "meta": {"wikiPageID": 1}} -{"text": "example", "label": [[0, 1, "LOC"]], "meta": {"wikiPageID": 2}} -{"text": "example", "label": [[0, 1, "PER"]], "meta": {"wikiPageID": 3}} +{"text": "exampleA", "label": [[0, 1, "LOC"]], "meta": {"wikiPageID": 1}} +{"text": "exampleB", "label": [], "meta": {"wikiPageID": 2}} +{"label": [[0, 1, "PER"]], "meta": {"wikiPageID": 3}} diff --git a/backend/api/tests/data/text_classification/example.csv b/backend/api/tests/data/text_classification/example.csv index 6b1a31de..30f43cc3 100644 --- a/backend/api/tests/data/text_classification/example.csv +++ b/backend/api/tests/data/text_classification/example.csv @@ -1,5 +1,5 @@ -text,label,meta -AAA,, -BBB,Positive,The following is meta data -CCC,Negative -DDD,,This is meta data \ No newline at end of file +text,label +exampleA,positive +exampleB, +,negative +, diff --git a/backend/api/tests/data/text_classification/example.json b/backend/api/tests/data/text_classification/example.json index 7143f59a..2243bcb1 100644 --- a/backend/api/tests/data/text_classification/example.json +++ b/backend/api/tests/data/text_classification/example.json @@ -1,6 +1,6 @@ [ - {"text": "example", "label": ["positive"]}, - {"text": "example", "label": ["positive", "negative"]}, - {"text": "example", "label": ["negative"]}, - {"text": "example", "label": ["neutral"]} + {"text": "exampleA", "label": ["positive"]}, + {"text": "exampleB", "label": ["positive", "negative"]}, + {"text": "exampleC", "label": []}, + {"label": ["neutral"]} ] diff --git a/backend/api/tests/data/text_classification/example.jsonl b/backend/api/tests/data/text_classification/example.jsonl index 13a16369..910d3965 100644 --- a/backend/api/tests/data/text_classification/example.jsonl +++ b/backend/api/tests/data/text_classification/example.jsonl @@ -1,4 +1,3 @@ -{"text": "example", "labels": ["positive"], "meta": {"wikiPageID": 1}} -{"text": "example", "labels": ["positive", "negative"], "meta": {"wikiPageID": 2}} -{"text": "example", "labels": ["negative"], "meta": {"wikiPageID": 3}} -{"text": "example", "labels": ["neutral"], "meta": {"wikiPageID": 4}} +{"text": "exampleA", "labels": ["positive"], "meta": {"wikiPageID": 1}} +{"text": "exampleB", "labels": ["positive", "negative"], "meta": {"wikiPageID": 2}} +{"text": "exampleC", "labels": [], "meta": {"wikiPageID": 3}} diff --git a/backend/api/tests/data/text_classification/example.xlsx b/backend/api/tests/data/text_classification/example.xlsx index 12913fdd..3b1821c3 100644 Binary files a/backend/api/tests/data/text_classification/example.xlsx and b/backend/api/tests/data/text_classification/example.xlsx differ diff --git a/backend/api/tests/data/text_classification/example_column_and_row_not_matching.csv b/backend/api/tests/data/text_classification/example_column_and_row_not_matching.csv deleted file mode 100644 index 4e2a2552..00000000 --- a/backend/api/tests/data/text_classification/example_column_and_row_not_matching.csv +++ /dev/null @@ -1,4 +0,0 @@ -text, label -AAA -BBB -CCC \ No newline at end of file diff --git a/backend/api/tests/data/text_classification/example_column_and_row_not_matching.xlsx b/backend/api/tests/data/text_classification/example_column_and_row_not_matching.xlsx deleted file mode 100644 index 0731da01..00000000 Binary files a/backend/api/tests/data/text_classification/example_column_and_row_not_matching.xlsx and /dev/null differ diff --git a/backend/api/tests/data/text_classification/example_fasttext.txt b/backend/api/tests/data/text_classification/example_fasttext.txt index d994c9fd..819b8c87 100644 --- a/backend/api/tests/data/text_classification/example_fasttext.txt +++ b/backend/api/tests/data/text_classification/example_fasttext.txt @@ -1,4 +1,4 @@ -__label__house mansion home -__label__president __label__american __label__us Obama Trump Kennedy -VW __label__car BMW -dog cat \ No newline at end of file +__label__positive exampleA +__label__positive __label__negative exampleB +exampleC +__label__positive diff --git a/backend/api/tests/data/text_classification/example_fasttext_label_tag_without_name.txt b/backend/api/tests/data/text_classification/example_fasttext_label_tag_without_name.txt deleted file mode 100644 index 41457ae4..00000000 --- a/backend/api/tests/data/text_classification/example_fasttext_label_tag_without_name.txt +++ /dev/null @@ -1 +0,0 @@ -__label__ house cat dog \ No newline at end of file diff --git a/backend/api/tests/data/text_classification/example_fasttext_without_text.txt b/backend/api/tests/data/text_classification/example_fasttext_without_text.txt deleted file mode 100644 index a85dc28d..00000000 --- a/backend/api/tests/data/text_classification/example_fasttext_without_text.txt +++ /dev/null @@ -1,2 +0,0 @@ -__label__cat ex ex ex -__label__dog \ No newline at end of file diff --git a/backend/api/tests/data/text_classification/example_one_column.csv b/backend/api/tests/data/text_classification/example_one_column.csv deleted file mode 100644 index 9b34b36b..00000000 --- a/backend/api/tests/data/text_classification/example_one_column.csv +++ /dev/null @@ -1,4 +0,0 @@ -text -AAA -BBB -CCC \ No newline at end of file diff --git a/backend/api/tests/data/text_classification/example_one_column.xlsx b/backend/api/tests/data/text_classification/example_one_column.xlsx deleted file mode 100644 index a9fc4eee..00000000 Binary files a/backend/api/tests/data/text_classification/example_one_column.xlsx and /dev/null differ diff --git a/backend/api/tests/data/text_classification/example_out_of_order_columns.csv b/backend/api/tests/data/text_classification/example_out_of_order_columns.csv index 4bba91c8..c463fa29 100644 --- a/backend/api/tests/data/text_classification/example_out_of_order_columns.csv +++ b/backend/api/tests/data/text_classification/example_out_of_order_columns.csv @@ -1,4 +1,4 @@ -label,foo,text,bar,baz -Positive,foo1,AAA,barA,baz -Positive,foo2,BBB,barB,bazz -Negative,foo3,CCC,barC,bazzz \ No newline at end of file +label,foo,text +positive,foo1,exampleA +,foo2,exampleB +negative,foo3, diff --git a/backend/api/tests/test_tasks.py b/backend/api/tests/test_tasks.py index db7ea29d..5b3845c6 100644 --- a/backend/api/tests/test_tasks.py +++ b/backend/api/tests/test_tasks.py @@ -3,7 +3,7 @@ import pathlib from django.test import TestCase from ..models import (DOCUMENT_CLASSIFICATION, SEQ2SEQ, SEQUENCE_LABELING, - Category, Example, Label, Span, TextLabel) + Category, Example) from ..tasks import injest_data from .api.utils import prepare_project @@ -17,92 +17,171 @@ class TestIngestData(TestCase): self.user = self.project.users[0] self.data_path = pathlib.Path(__file__).parent / 'data' - def assert_count(self, - filename, - file_format, - kwargs=None, - expected_example=0, - expected_label=0, - expected_annotation=0): + def ingest_data(self, filename, file_format, kwargs=None): filenames = [str(self.data_path / filename)] kwargs = kwargs or {} injest_data(self.user.id, self.project.item.id, filenames, file_format, **kwargs) - self.assertEqual(Example.objects.count(), expected_example) - self.assertEqual(Label.objects.count(), expected_label) - self.assertEqual(self.annotation_class.objects.count(), expected_annotation) class TestIngestClassificationData(TestIngestData): task = DOCUMENT_CLASSIFICATION - annotation_class = Category + + def assert_examples(self, dataset): + for text, expected_labels in dataset: + example = Example.objects.get(text=text) + labels = set(cat.label.text for cat in example.categories.all()) + self.assertEqual(labels, set(expected_labels)) def test_jsonl(self): filename = 'text_classification/example.jsonl' file_format = 'JSONL' kwargs = {'column_label': 'labels'} - self.assert_count(filename, file_format, kwargs, expected_example=4, expected_label=3, expected_annotation=5) + dataset = [ + ('exampleA', ['positive']), + ('exampleB', ['positive', 'negative']), + ('exampleC', []) + ] + self.ingest_data(filename, file_format, kwargs) + self.assert_examples(dataset) def test_csv(self): filename = 'text_classification/example.csv' file_format = 'CSV' - self.assert_count(filename, file_format, expected_example=4, expected_label=2, expected_annotation=2) + dataset = [ + ('exampleA', ['positive']), + ('exampleB', []) + ] + self.ingest_data(filename, file_format) + self.assert_examples(dataset) + + def test_csv_out_of_order_columns(self): + filename = 'text_classification/example_out_of_order_columns.csv' + file_format = 'CSV' + dataset = [ + ('exampleA', ['positive']), + ('exampleB', []) + ] + self.ingest_data(filename, file_format) + self.assert_examples(dataset) def test_fasttext(self): filename = 'text_classification/example_fasttext.txt' file_format = 'fastText' - self.assert_count(filename, file_format, expected_example=4, expected_label=5, expected_annotation=5) + dataset = [ + ('exampleA', ['positive']), + ('exampleB', ['positive', 'negative']), + ('exampleC', []) + ] + self.ingest_data(filename, file_format) + self.assert_examples(dataset) def test_excel(self): filename = 'text_classification/example.xlsx' file_format = 'Excel' - self.assert_count(filename, file_format, expected_example=3, expected_label=2, expected_annotation=3) + dataset = [ + ('exampleA', ['positive']), + ('exampleB', []) + ] + self.ingest_data(filename, file_format) + self.assert_examples(dataset) def test_json(self): filename = 'text_classification/example.json' file_format = 'JSON' - self.assert_count(filename, file_format, expected_example=4, expected_label=3, expected_annotation=5) + dataset = [ + ('exampleA', ['positive']), + ('exampleB', ['positive', 'negative']), + ('exampleC', []) + ] + self.ingest_data(filename, file_format) + self.assert_examples(dataset) def test_textfile(self): filename = 'example.txt' file_format = 'TextFile' - self.assert_count(filename, file_format, expected_example=1, expected_label=0, expected_annotation=0) + dataset = [ + ('exampleA\nexampleB\nexampleC\n', []) + ] + self.ingest_data(filename, file_format) + self.assert_examples(dataset) def test_textline(self): filename = 'example.txt' file_format = 'TextLine' - self.assert_count(filename, file_format, expected_example=3, expected_label=0, expected_annotation=0) + dataset = [ + ('exampleA', []), + ('exampleB', []), + ('exampleC', []) + ] + self.ingest_data(filename, file_format) + self.assert_examples(dataset) class TestIngestSequenceLabelingData(TestIngestData): task = SEQUENCE_LABELING - annotation_class = Span + + def assert_examples(self, dataset): + for text, expected_labels in dataset: + example = Example.objects.get(text=text) + labels = [[span.start_offset, span.end_offset, span.label.text] for span in example.spans.all()] + self.assertEqual(labels, expected_labels) def test_jsonl(self): filename = 'sequence_labeling/example.jsonl' file_format = 'JSONL' - self.assert_count(filename, file_format, expected_example=3, expected_label=3, expected_annotation=4) + dataset = [ + ('exampleA', [[0, 1, 'LOC']]), + ('exampleB', []) + ] + self.ingest_data(filename, file_format) + self.assert_examples(dataset) def test_conll(self): filename = 'sequence_labeling/example.conll' file_format = 'CoNLL' - self.assert_count(filename, file_format, expected_example=3, expected_label=2, expected_annotation=5) + dataset = [ + ('JAPAN GET', [[0, 5, 'LOC']]), + ('Nadim Ladki', [[0, 11, 'PER']]) + ] + self.ingest_data(filename, file_format) + self.assert_examples(dataset) class TestIngestSeq2seqData(TestIngestData): task = SEQ2SEQ - annotation_class = TextLabel + + def assert_examples(self, dataset): + for text, expected_labels in dataset: + example = Example.objects.get(text=text) + labels = set(text_label.text for text_label in example.texts.all()) + self.assertEqual(labels, set(expected_labels)) def test_jsonl(self): filename = 'seq2seq/example.jsonl' file_format = 'JSONL' - self.assert_count(filename, file_format, expected_example=3, expected_label=0, expected_annotation=4) + dataset = [ + ('exampleA', ['label1']), + ('exampleB', []) + ] + self.ingest_data(filename, file_format) + self.assert_examples(dataset) def test_json(self): filename = 'seq2seq/example.json' file_format = 'JSON' - self.assert_count(filename, file_format, expected_example=3, expected_label=0, expected_annotation=4) + dataset = [ + ('exampleA', ['label1']), + ('exampleB', []) + ] + self.ingest_data(filename, file_format) + self.assert_examples(dataset) def test_csv(self): filename = 'seq2seq/example.csv' file_format = 'CSV' - self.assert_count(filename, file_format, expected_example=4, expected_label=0, expected_annotation=3) + dataset = [ + ('exampleA', ['label1']), + ('exampleB', []) + ] + self.ingest_data(filename, file_format) + self.assert_examples(dataset)