Browse Source

Update test cases for ingesting data to check mapping

pull/1544/head
Hironsan 3 years ago
parent
commit
4fc3496529
19 changed files with 144 additions and 92 deletions
  1. 6
      backend/api/tests/data/example.txt
  2. 10
      backend/api/tests/data/seq2seq/example.csv
  3. 7
      backend/api/tests/data/seq2seq/example.json
  4. 7
      backend/api/tests/data/seq2seq/example.jsonl
  5. 17
      backend/api/tests/data/sequence_labeling/example.conll
  6. 6
      backend/api/tests/data/sequence_labeling/example.jsonl
  7. 10
      backend/api/tests/data/text_classification/example.csv
  8. 8
      backend/api/tests/data/text_classification/example.json
  9. 7
      backend/api/tests/data/text_classification/example.jsonl
  10. BIN
      backend/api/tests/data/text_classification/example.xlsx
  11. 4
      backend/api/tests/data/text_classification/example_column_and_row_not_matching.csv
  12. BIN
      backend/api/tests/data/text_classification/example_column_and_row_not_matching.xlsx
  13. 8
      backend/api/tests/data/text_classification/example_fasttext.txt
  14. 1
      backend/api/tests/data/text_classification/example_fasttext_label_tag_without_name.txt
  15. 2
      backend/api/tests/data/text_classification/example_fasttext_without_text.txt
  16. 4
      backend/api/tests/data/text_classification/example_one_column.csv
  17. BIN
      backend/api/tests/data/text_classification/example_one_column.xlsx
  18. 8
      backend/api/tests/data/text_classification/example_out_of_order_columns.csv
  19. 131
      backend/api/tests/test_tasks.py

6
backend/api/tests/data/example.txt

@ -1,3 +1,3 @@
example1
example2
example3
exampleA
exampleB
exampleC

10
backend/api/tests/data/seq2seq/example.csv

@ -1,5 +1,5 @@
text,label,meta
AAA,LabelA,
BBB,LabelB,MetaB
CCC,LabelC
DDD,,MetaD
text,label
exampleA,label1
exampleB,
,label2
,

7
backend/api/tests/data/seq2seq/example.json

@ -1,5 +1,6 @@
[
{"text": "example", "label": ["example1", "example2"]},
{"text": "example", "label": ["example"]},
{"text": "example", "label": ["example"]}
{"text": "exampleA", "label": ["label1"]},
{"text": "exampleB", "label": []},
{"label": ["example"]},
{}
]

7
backend/api/tests/data/seq2seq/example.jsonl

@ -1,3 +1,4 @@
{"text": "example", "label": ["example1", "example2"], "meta": {"wikiPageID": 1}}
{"text": "example", "label": ["example"], "meta": {"wikiPageID": 2}}
{"text": "example", "label": ["example"], "meta": {"wikiPageID": 3}}
{"text": "exampleA", "label": ["label1"]}
{"text": "exampleB", "label": []}
{"label": ["label2"]}
{}

17
backend/api/tests/data/sequence_labeling/example.conll

@ -1,22 +1,5 @@
SOCCER O
- O
JAPAN B-LOC
GET O
LUCKY O
WIN O
, O
CHINA B-PER
IN O
SURPRISE O
DEFEAT O
. O
Nadim B-PER
Ladki I-PER
AL-AIN B-LOC
, O
United B-LOC
Arab I-LOC
Emirates I-LOC
1996-12-06 O

6
backend/api/tests/data/sequence_labeling/example.jsonl

@ -1,3 +1,3 @@
{"text": "example", "label": [[0, 1, "LOC"], [0, 2, "ORG"]], "meta": {"wikiPageID": 1}}
{"text": "example", "label": [[0, 1, "LOC"]], "meta": {"wikiPageID": 2}}
{"text": "example", "label": [[0, 1, "PER"]], "meta": {"wikiPageID": 3}}
{"text": "exampleA", "label": [[0, 1, "LOC"]], "meta": {"wikiPageID": 1}}
{"text": "exampleB", "label": [], "meta": {"wikiPageID": 2}}
{"label": [[0, 1, "PER"]], "meta": {"wikiPageID": 3}}

10
backend/api/tests/data/text_classification/example.csv

@ -1,5 +1,5 @@
text,label,meta
AAA,,
BBB,Positive,The following is meta data
CCC,Negative
DDD,,This is meta data
text,label
exampleA,positive
exampleB,
,negative
,

8
backend/api/tests/data/text_classification/example.json

@ -1,6 +1,6 @@
[
{"text": "example", "label": ["positive"]},
{"text": "example", "label": ["positive", "negative"]},
{"text": "example", "label": ["negative"]},
{"text": "example", "label": ["neutral"]}
{"text": "exampleA", "label": ["positive"]},
{"text": "exampleB", "label": ["positive", "negative"]},
{"text": "exampleC", "label": []},
{"label": ["neutral"]}
]

7
backend/api/tests/data/text_classification/example.jsonl

@ -1,4 +1,3 @@
{"text": "example", "labels": ["positive"], "meta": {"wikiPageID": 1}}
{"text": "example", "labels": ["positive", "negative"], "meta": {"wikiPageID": 2}}
{"text": "example", "labels": ["negative"], "meta": {"wikiPageID": 3}}
{"text": "example", "labels": ["neutral"], "meta": {"wikiPageID": 4}}
{"text": "exampleA", "labels": ["positive"], "meta": {"wikiPageID": 1}}
{"text": "exampleB", "labels": ["positive", "negative"], "meta": {"wikiPageID": 2}}
{"text": "exampleC", "labels": [], "meta": {"wikiPageID": 3}}

BIN
backend/api/tests/data/text_classification/example.xlsx

4
backend/api/tests/data/text_classification/example_column_and_row_not_matching.csv

@ -1,4 +0,0 @@
text, label
AAA
BBB
CCC

BIN
backend/api/tests/data/text_classification/example_column_and_row_not_matching.xlsx

8
backend/api/tests/data/text_classification/example_fasttext.txt

@ -1,4 +1,4 @@
__label__house mansion home
__label__president __label__american __label__us Obama Trump Kennedy
VW __label__car BMW
dog cat
__label__positive exampleA
__label__positive __label__negative exampleB
exampleC
__label__positive

1
backend/api/tests/data/text_classification/example_fasttext_label_tag_without_name.txt

@ -1 +0,0 @@
__label__ house cat dog

2
backend/api/tests/data/text_classification/example_fasttext_without_text.txt

@ -1,2 +0,0 @@
__label__cat ex ex ex
__label__dog

4
backend/api/tests/data/text_classification/example_one_column.csv

@ -1,4 +0,0 @@
text
AAA
BBB
CCC

BIN
backend/api/tests/data/text_classification/example_one_column.xlsx

8
backend/api/tests/data/text_classification/example_out_of_order_columns.csv

@ -1,4 +1,4 @@
label,foo,text,bar,baz
Positive,foo1,AAA,barA,baz
Positive,foo2,BBB,barB,bazz
Negative,foo3,CCC,barC,bazzz
label,foo,text
positive,foo1,exampleA
,foo2,exampleB
negative,foo3,

131
backend/api/tests/test_tasks.py

@ -3,7 +3,7 @@ import pathlib
from django.test import TestCase
from ..models import (DOCUMENT_CLASSIFICATION, SEQ2SEQ, SEQUENCE_LABELING,
Category, Example, Label, Span, TextLabel)
Category, Example)
from ..tasks import injest_data
from .api.utils import prepare_project
@ -17,92 +17,171 @@ class TestIngestData(TestCase):
self.user = self.project.users[0]
self.data_path = pathlib.Path(__file__).parent / 'data'
def assert_count(self,
filename,
file_format,
kwargs=None,
expected_example=0,
expected_label=0,
expected_annotation=0):
def ingest_data(self, filename, file_format, kwargs=None):
    """Run ``injest_data`` on a single fixture file.

    Args:
        filename: Path of the fixture, joined onto ``self.data_path``.
        file_format: Format name handed to ``injest_data`` unchanged.
        kwargs: Optional dict of extra keyword arguments forwarded to
            ``injest_data``; a falsy value is treated as no extra arguments.
    """
    options = kwargs if kwargs else {}
    # injest_data receives a list of string paths, so wrap the single file.
    fixture_paths = [str(self.data_path / filename)]
    injest_data(
        self.user.id,
        self.project.item.id,
        fixture_paths,
        file_format,
        **options
    )
self.assertEqual(Example.objects.count(), expected_example)
self.assertEqual(Label.objects.count(), expected_label)
self.assertEqual(self.annotation_class.objects.count(), expected_annotation)
class TestIngestClassificationData(TestIngestData):
task = DOCUMENT_CLASSIFICATION
annotation_class = Category
def assert_examples(self, dataset):
    """Check the category labels stored for each expected example.

    Args:
        dataset: Iterable of ``(text, expected_labels)`` pairs. Each example
            is fetched by its text and the label texts of its categories are
            compared with ``expected_labels`` as sets, so label order and
            duplicates are ignored.
    """
    for text, expected_labels in dataset:
        stored = Example.objects.get(text=text)
        actual = {category.label.text for category in stored.categories.all()}
        self.assertEqual(actual, set(expected_labels))
def test_jsonl(self):
filename = 'text_classification/example.jsonl'
file_format = 'JSONL'
kwargs = {'column_label': 'labels'}
self.assert_count(filename, file_format, kwargs, expected_example=4, expected_label=3, expected_annotation=5)
dataset = [
('exampleA', ['positive']),
('exampleB', ['positive', 'negative']),
('exampleC', [])
]
self.ingest_data(filename, file_format, kwargs)
self.assert_examples(dataset)
def test_csv(self):
filename = 'text_classification/example.csv'
file_format = 'CSV'
self.assert_count(filename, file_format, expected_example=4, expected_label=2, expected_annotation=2)
dataset = [
('exampleA', ['positive']),
('exampleB', [])
]
self.ingest_data(filename, file_format)
self.assert_examples(dataset)
def test_csv_out_of_order_columns(self):
filename = 'text_classification/example_out_of_order_columns.csv'
file_format = 'CSV'
dataset = [
('exampleA', ['positive']),
('exampleB', [])
]
self.ingest_data(filename, file_format)
self.assert_examples(dataset)
def test_fasttext(self):
filename = 'text_classification/example_fasttext.txt'
file_format = 'fastText'
self.assert_count(filename, file_format, expected_example=4, expected_label=5, expected_annotation=5)
dataset = [
('exampleA', ['positive']),
('exampleB', ['positive', 'negative']),
('exampleC', [])
]
self.ingest_data(filename, file_format)
self.assert_examples(dataset)
def test_excel(self):
filename = 'text_classification/example.xlsx'
file_format = 'Excel'
self.assert_count(filename, file_format, expected_example=3, expected_label=2, expected_annotation=3)
dataset = [
('exampleA', ['positive']),
('exampleB', [])
]
self.ingest_data(filename, file_format)
self.assert_examples(dataset)
def test_json(self):
filename = 'text_classification/example.json'
file_format = 'JSON'
self.assert_count(filename, file_format, expected_example=4, expected_label=3, expected_annotation=5)
dataset = [
('exampleA', ['positive']),
('exampleB', ['positive', 'negative']),
('exampleC', [])
]
self.ingest_data(filename, file_format)
self.assert_examples(dataset)
def test_textfile(self):
filename = 'example.txt'
file_format = 'TextFile'
self.assert_count(filename, file_format, expected_example=1, expected_label=0, expected_annotation=0)
dataset = [
('exampleA\nexampleB\nexampleC\n', [])
]
self.ingest_data(filename, file_format)
self.assert_examples(dataset)
def test_textline(self):
filename = 'example.txt'
file_format = 'TextLine'
self.assert_count(filename, file_format, expected_example=3, expected_label=0, expected_annotation=0)
dataset = [
('exampleA', []),
('exampleB', []),
('exampleC', [])
]
self.ingest_data(filename, file_format)
self.assert_examples(dataset)
class TestIngestSequenceLabelingData(TestIngestData):
task = SEQUENCE_LABELING
annotation_class = Span
def assert_examples(self, dataset):
    """Check the spans stored for each expected example.

    Args:
        dataset: Iterable of ``(text, expected_labels)`` pairs, where
            ``expected_labels`` is a list of
            ``[start_offset, end_offset, label_text]`` triples. Each example
            is fetched by its text.
    """
    for text, expected_labels in dataset:
        stored = Example.objects.get(text=text)
        actual = []
        for span in stored.spans.all():
            actual.append([span.start_offset, span.end_offset, span.label.text])
        # NOTE(review): list equality is order-sensitive, so this relies on
        # the order in which spans.all() yields rows -- confirm that order
        # is deterministic for examples with more than one span.
        self.assertEqual(actual, expected_labels)
def test_jsonl(self):
filename = 'sequence_labeling/example.jsonl'
file_format = 'JSONL'
self.assert_count(filename, file_format, expected_example=3, expected_label=3, expected_annotation=4)
dataset = [
('exampleA', [[0, 1, 'LOC']]),
('exampleB', [])
]
self.ingest_data(filename, file_format)
self.assert_examples(dataset)
def test_conll(self):
filename = 'sequence_labeling/example.conll'
file_format = 'CoNLL'
self.assert_count(filename, file_format, expected_example=3, expected_label=2, expected_annotation=5)
dataset = [
('JAPAN GET', [[0, 5, 'LOC']]),
('Nadim Ladki', [[0, 11, 'PER']])
]
self.ingest_data(filename, file_format)
self.assert_examples(dataset)
class TestIngestSeq2seqData(TestIngestData):
task = SEQ2SEQ
annotation_class = TextLabel
def assert_examples(self, dataset):
    """Check the text labels stored for each expected example.

    Args:
        dataset: Iterable of ``(text, expected_labels)`` pairs. Each example
            is fetched by its text and the ``text`` values of its related
            ``texts`` are compared with ``expected_labels`` as sets, so
            order and duplicates are ignored.
    """
    for text, expected_labels in dataset:
        stored = Example.objects.get(text=text)
        actual = {text_label.text for text_label in stored.texts.all()}
        self.assertEqual(actual, set(expected_labels))
def test_jsonl(self):
filename = 'seq2seq/example.jsonl'
file_format = 'JSONL'
self.assert_count(filename, file_format, expected_example=3, expected_label=0, expected_annotation=4)
dataset = [
('exampleA', ['label1']),
('exampleB', [])
]
self.ingest_data(filename, file_format)
self.assert_examples(dataset)
def test_json(self):
filename = 'seq2seq/example.json'
file_format = 'JSON'
self.assert_count(filename, file_format, expected_example=3, expected_label=0, expected_annotation=4)
dataset = [
('exampleA', ['label1']),
('exampleB', [])
]
self.ingest_data(filename, file_format)
self.assert_examples(dataset)
def test_csv(self):
filename = 'seq2seq/example.csv'
file_format = 'CSV'
self.assert_count(filename, file_format, expected_example=4, expected_label=0, expected_annotation=3)
dataset = [
('exampleA', ['label1']),
('exampleB', [])
]
self.ingest_data(filename, file_format)
self.assert_examples(dataset)
Loading…
Cancel
Save