Update test cases for ingesting data to check mapping

3 years ago · 4fc3496529
19 changed files with 144 additions and 92 deletions
--- a/backend/api/tests/data/example.txt
+++ b/backend/api/tests/data/example.txt
@@ -1,3 +1,3 @@
-example1
-example2
-example3
+exampleA
+exampleB
+exampleC
--- a/backend/api/tests/data/seq2seq/example.csv
+++ b/backend/api/tests/data/seq2seq/example.csv
@@ -1,5 +1,5 @@
-text,label,meta
-AAA,LabelA,
-BBB,LabelB,MetaB
-CCC,LabelC
-DDD,,MetaD
+text,label
+exampleA,label1
+exampleB,
+,label2
+,
--- a/backend/api/tests/data/seq2seq/example.json
+++ b/backend/api/tests/data/seq2seq/example.json
@@ -1,5 +1,6 @@
 [
-  {"text": "example", "label": ["example1", "example2"]},
-  {"text": "example", "label": ["example"]},
-  {"text": "example", "label": ["example"]}
+  {"text": "exampleA", "label": ["label1"]},
+  {"text": "exampleB", "label": []},
+  {"label": ["example"]},
+  {}
 ]
--- a/backend/api/tests/data/seq2seq/example.jsonl
+++ b/backend/api/tests/data/seq2seq/example.jsonl
@@ -1,3 +1,4 @@
-{"text": "example", "label": ["example1", "example2"], "meta": {"wikiPageID": 1}}
-{"text": "example", "label": ["example"], "meta": {"wikiPageID": 2}}
-{"text": "example", "label": ["example"], "meta": {"wikiPageID": 3}}
+{"text": "exampleA", "label": ["label1"]}
+{"text": "exampleB", "label": []}
+{"label": ["label2"]}
+{}
--- a/backend/api/tests/data/sequence_labeling/example.conll
+++ b/backend/api/tests/data/sequence_labeling/example.conll
@@ -1,22 +1,5 @@
-SOCCER	O
-	O
 JAPAN	B-LOC
 GET	O
-LUCKY	O
-WIN	O
-,	O
-CHINA	B-PER
-IN	O
-SURPRISE	O
-DEFEAT	O
-.	O

 Nadim	B-PER
 Ladki	I-PER
-
-AL-AIN	B-LOC
-,	O
-United	B-LOC
-Arab	I-LOC
-Emirates	I-LOC
-1996-12-06	O
--- a/backend/api/tests/data/sequence_labeling/example.jsonl
+++ b/backend/api/tests/data/sequence_labeling/example.jsonl
@@ -1,3 +1,3 @@
-{"text": "example", "label": [[0, 1, "LOC"], [0, 2, "ORG"]], "meta": {"wikiPageID": 1}}
-{"text": "example", "label": [[0, 1, "LOC"]], "meta": {"wikiPageID": 2}}
-{"text": "example", "label": [[0, 1, "PER"]], "meta": {"wikiPageID": 3}}
+{"text": "exampleA", "label": [[0, 1, "LOC"]], "meta": {"wikiPageID": 1}}
+{"text": "exampleB", "label": [], "meta": {"wikiPageID": 2}}
+{"label": [[0, 1, "PER"]], "meta": {"wikiPageID": 3}}
--- a/backend/api/tests/data/text_classification/example.csv
+++ b/backend/api/tests/data/text_classification/example.csv
@@ -1,5 +1,5 @@
-text,label,meta
-AAA,,
-BBB,Positive,The following is meta data
-CCC,Negative
-DDD,,This is meta data
+text,label
+exampleA,positive
+exampleB,
+,negative
+,
--- a/backend/api/tests/data/text_classification/example.json
+++ b/backend/api/tests/data/text_classification/example.json
@@ -1,6 +1,6 @@
 [
-  {"text": "example", "label": ["positive"]},
-  {"text": "example", "label": ["positive", "negative"]},
-  {"text": "example", "label": ["negative"]},
-  {"text": "example", "label": ["neutral"]}
+  {"text": "exampleA", "label": ["positive"]},
+  {"text": "exampleB", "label": ["positive", "negative"]},
+  {"text": "exampleC", "label": []},
+  {"label": ["neutral"]}
 ]
--- a/backend/api/tests/data/text_classification/example.jsonl
+++ b/backend/api/tests/data/text_classification/example.jsonl
@@ -1,4 +1,3 @@
-{"text": "example", "labels": ["positive"], "meta": {"wikiPageID": 1}}
-{"text": "example", "labels": ["positive", "negative"], "meta": {"wikiPageID": 2}}
-{"text": "example", "labels": ["negative"], "meta": {"wikiPageID": 3}}
-{"text": "example", "labels": ["neutral"], "meta": {"wikiPageID": 4}}
+{"text": "exampleA", "labels": ["positive"], "meta": {"wikiPageID": 1}}
+{"text": "exampleB", "labels": ["positive", "negative"], "meta": {"wikiPageID": 2}}
+{"text": "exampleC", "labels": [], "meta": {"wikiPageID": 3}}
--- a/backend/api/tests/data/text_classification/example.xlsx
+++ b/backend/api/tests/data/text_classification/example.xlsx
--- a/backend/api/tests/data/text_classification/example_column_and_row_not_matching.csv
+++ b/backend/api/tests/data/text_classification/example_column_and_row_not_matching.csv
@@ -1,4 +0,0 @@
-text, label
-AAA
-BBB
-CCC
--- a/backend/api/tests/data/text_classification/example_column_and_row_not_matching.xlsx
+++ b/backend/api/tests/data/text_classification/example_column_and_row_not_matching.xlsx
--- a/backend/api/tests/data/text_classification/example_fasttext.txt
+++ b/backend/api/tests/data/text_classification/example_fasttext.txt
@@ -1,4 +1,4 @@
-__label__house mansion home
-__label__president __label__american __label__us Obama Trump Kennedy
-VW __label__car BMW
-dog cat
+__label__positive exampleA
+__label__positive __label__negative exampleB
+exampleC
+__label__positive
--- a/backend/api/tests/data/text_classification/example_fasttext_label_tag_without_name.txt
+++ b/backend/api/tests/data/text_classification/example_fasttext_label_tag_without_name.txt
@@ -1 +0,0 @@
-__label__ house cat dog
--- a/backend/api/tests/data/text_classification/example_fasttext_without_text.txt
+++ b/backend/api/tests/data/text_classification/example_fasttext_without_text.txt
@@ -1,2 +0,0 @@
-__label__cat ex ex ex
-__label__dog 
--- a/backend/api/tests/data/text_classification/example_one_column.csv
+++ b/backend/api/tests/data/text_classification/example_one_column.csv
@@ -1,4 +0,0 @@
-text
-AAA
-BBB
-CCC
--- a/backend/api/tests/data/text_classification/example_one_column.xlsx
+++ b/backend/api/tests/data/text_classification/example_one_column.xlsx
--- a/backend/api/tests/data/text_classification/example_out_of_order_columns.csv
+++ b/backend/api/tests/data/text_classification/example_out_of_order_columns.csv
@@ -1,4 +1,4 @@
-label,foo,text,bar,baz
-Positive,foo1,AAA,barA,baz
-Positive,foo2,BBB,barB,bazz
-Negative,foo3,CCC,barC,bazzz
+label,foo,text
+positive,foo1,exampleA
+,foo2,exampleB
+negative,foo3,
--- a/backend/api/tests/test_tasks.py
+++ b/backend/api/tests/test_tasks.py
@@ -3,7 +3,7 @@ import pathlib
 from django.test import TestCase

 from ..models import (DOCUMENT_CLASSIFICATION, SEQ2SEQ, SEQUENCE_LABELING,
-                      Category, Example, Label, Span, TextLabel)
+                      Category, Example)
 from ..tasks import injest_data
 from .api.utils import prepare_project

@@ -17,92 +17,171 @@ class TestIngestData(TestCase):
        self.user = self.project.users[0]
        self.data_path = pathlib.Path(__file__).parent / 'data'

-    def assert_count(self,
-                     filename,
-                     file_format,
-                     kwargs=None,
-                     expected_example=0,
-                     expected_label=0,
-                     expected_annotation=0):
+    def ingest_data(self, filename, file_format, kwargs=None):
        filenames = [str(self.data_path / filename)]
        kwargs = kwargs or {}
        injest_data(self.user.id, self.project.item.id, filenames, file_format, **kwargs)
-        self.assertEqual(Example.objects.count(), expected_example)
-        self.assertEqual(Label.objects.count(), expected_label)
-        self.assertEqual(self.annotation_class.objects.count(), expected_annotation)


 class TestIngestClassificationData(TestIngestData):
    task = DOCUMENT_CLASSIFICATION
-    annotation_class = Category
+
+    def assert_examples(self, dataset):
+        for text, expected_labels in dataset:
+            example = Example.objects.get(text=text)
+            labels = set(cat.label.text for cat in example.categories.all())
+            self.assertEqual(labels, set(expected_labels))

    def test_jsonl(self):
        filename = 'text_classification/example.jsonl'
        file_format = 'JSONL'
        kwargs = {'column_label': 'labels'}
-        self.assert_count(filename, file_format, kwargs, expected_example=4, expected_label=3, expected_annotation=5)
+        dataset = [
+            ('exampleA', ['positive']),
+            ('exampleB', ['positive', 'negative']),
+            ('exampleC', [])
+        ]
+        self.ingest_data(filename, file_format, kwargs)
+        self.assert_examples(dataset)

    def test_csv(self):
        filename = 'text_classification/example.csv'
        file_format = 'CSV'
-        self.assert_count(filename, file_format, expected_example=4, expected_label=2, expected_annotation=2)
+        dataset = [
+            ('exampleA', ['positive']),
+            ('exampleB', [])
+        ]
+        self.ingest_data(filename, file_format)
+        self.assert_examples(dataset)
+
+    def test_csv_out_of_order_columns(self):
+        filename = 'text_classification/example_out_of_order_columns.csv'
+        file_format = 'CSV'
+        dataset = [
+            ('exampleA', ['positive']),
+            ('exampleB', [])
+        ]
+        self.ingest_data(filename, file_format)
+        self.assert_examples(dataset)

    def test_fasttext(self):
        filename = 'text_classification/example_fasttext.txt'
        file_format = 'fastText'
-        self.assert_count(filename, file_format, expected_example=4, expected_label=5, expected_annotation=5)
+        dataset = [
+            ('exampleA', ['positive']),
+            ('exampleB', ['positive', 'negative']),
+            ('exampleC', [])
+        ]
+        self.ingest_data(filename, file_format)
+        self.assert_examples(dataset)

    def test_excel(self):
        filename = 'text_classification/example.xlsx'
        file_format = 'Excel'
-        self.assert_count(filename, file_format, expected_example=3, expected_label=2, expected_annotation=3)
+        dataset = [
+            ('exampleA', ['positive']),
+            ('exampleB', [])
+        ]
+        self.ingest_data(filename, file_format)
+        self.assert_examples(dataset)

    def test_json(self):
        filename = 'text_classification/example.json'
        file_format = 'JSON'
-        self.assert_count(filename, file_format, expected_example=4, expected_label=3, expected_annotation=5)
+        dataset = [
+            ('exampleA', ['positive']),
+            ('exampleB', ['positive', 'negative']),
+            ('exampleC', [])
+        ]
+        self.ingest_data(filename, file_format)
+        self.assert_examples(dataset)

    def test_textfile(self):
        filename = 'example.txt'
        file_format = 'TextFile'
-        self.assert_count(filename, file_format, expected_example=1, expected_label=0, expected_annotation=0)
+        dataset = [
+            ('exampleA\nexampleB\nexampleC\n', [])
+        ]
+        self.ingest_data(filename, file_format)
+        self.assert_examples(dataset)

    def test_textline(self):
        filename = 'example.txt'
        file_format = 'TextLine'
-        self.assert_count(filename, file_format, expected_example=3, expected_label=0, expected_annotation=0)
+        dataset = [
+            ('exampleA', []),
+            ('exampleB', []),
+            ('exampleC', [])
+        ]
+        self.ingest_data(filename, file_format)
+        self.assert_examples(dataset)


 class TestIngestSequenceLabelingData(TestIngestData):
    task = SEQUENCE_LABELING
-    annotation_class = Span
+
+    def assert_examples(self, dataset):
+        for text, expected_labels in dataset:
+            example = Example.objects.get(text=text)
+            labels = [[span.start_offset, span.end_offset, span.label.text] for span in example.spans.all()]
+            self.assertEqual(labels, expected_labels)

    def test_jsonl(self):
        filename = 'sequence_labeling/example.jsonl'
        file_format = 'JSONL'
-        self.assert_count(filename, file_format, expected_example=3, expected_label=3, expected_annotation=4)
+        dataset = [
+            ('exampleA', [[0, 1, 'LOC']]),
+            ('exampleB', [])
+        ]
+        self.ingest_data(filename, file_format)
+        self.assert_examples(dataset)

    def test_conll(self):
        filename = 'sequence_labeling/example.conll'
        file_format = 'CoNLL'
-        self.assert_count(filename, file_format, expected_example=3, expected_label=2, expected_annotation=5)
+        dataset = [
+            ('JAPAN GET', [[0, 5, 'LOC']]),
+            ('Nadim Ladki', [[0, 11, 'PER']])
+        ]
+        self.ingest_data(filename, file_format)
+        self.assert_examples(dataset)


 class TestIngestSeq2seqData(TestIngestData):
    task = SEQ2SEQ
-    annotation_class = TextLabel
+
+    def assert_examples(self, dataset):
+        for text, expected_labels in dataset:
+            example = Example.objects.get(text=text)
+            labels = set(text_label.text for text_label in example.texts.all())
+            self.assertEqual(labels, set(expected_labels))

    def test_jsonl(self):
        filename = 'seq2seq/example.jsonl'
        file_format = 'JSONL'
-        self.assert_count(filename, file_format, expected_example=3, expected_label=0, expected_annotation=4)
+        dataset = [
+            ('exampleA', ['label1']),
+            ('exampleB', [])
+        ]
+        self.ingest_data(filename, file_format)
+        self.assert_examples(dataset)

    def test_json(self):
        filename = 'seq2seq/example.json'
        file_format = 'JSON'
-        self.assert_count(filename, file_format, expected_example=3, expected_label=0, expected_annotation=4)
+        dataset = [
+            ('exampleA', ['label1']),
+            ('exampleB', [])
+        ]
+        self.ingest_data(filename, file_format)
+        self.assert_examples(dataset)

    def test_csv(self):
        filename = 'seq2seq/example.csv'
        file_format = 'CSV'
-        self.assert_count(filename, file_format, expected_example=4, expected_label=0, expected_annotation=3)
+        dataset = [
+            ('exampleA', ['label1']),
+            ('exampleB', [])
+        ]
+        self.ingest_data(filename, file_format)
+        self.assert_examples(dataset)