|
|
@ -8,6 +8,7 @@ from django_drf_filepond.models import StoredUpload, TemporaryUpload |
|
|
|
from django_drf_filepond.utils import _get_file_id |
|
|
|
|
|
|
|
from data_import.celery_tasks import import_dataset |
|
|
|
from data_import.pipeline.catalog import RELATION_EXTRACTION |
|
|
|
from examples.models import Example |
|
|
|
from label_types.models import SpanType |
|
|
|
from labels.models import Category, Span |
|
|
@ -40,7 +41,7 @@ class TestImportData(TestCase): |
|
|
|
except StoredUpload.DoesNotExist: |
|
|
|
pass |
|
|
|
|
|
|
|
def import_dataset(self, filename, file_format, kwargs=None): |
|
|
|
def import_dataset(self, filename, file_format, task, kwargs=None): |
|
|
|
file_path = str(self.data_path / filename) |
|
|
|
TemporaryUpload.objects.create( |
|
|
|
upload_id=self.upload_id, |
|
|
@ -51,7 +52,7 @@ class TestImportData(TestCase): |
|
|
|
) |
|
|
|
upload_ids = [self.upload_id] |
|
|
|
kwargs = kwargs or {} |
|
|
|
return import_dataset(self.user.id, self.project.item.id, file_format, upload_ids, **kwargs) |
|
|
|
return import_dataset(self.user.id, self.project.item.id, file_format, upload_ids, task, **kwargs) |
|
|
|
|
|
|
|
|
|
|
|
@override_settings(MAX_UPLOAD_SIZE=0) |
|
|
@ -62,7 +63,7 @@ class TestMaxFileSize(TestImportData): |
|
|
|
filename = "text_classification/example.jsonl" |
|
|
|
file_format = "JSONL" |
|
|
|
kwargs = {"column_label": "labels"} |
|
|
|
response = self.import_dataset(filename, file_format, kwargs) |
|
|
|
response = self.import_dataset(filename, file_format, self.task, kwargs) |
|
|
|
self.assertEqual(len(response["error"]), 1) |
|
|
|
self.assertIn("maximum file size", response["error"][0]["message"]) |
|
|
|
|
|
|
@ -89,80 +90,80 @@ class TestImportClassificationData(TestImportData): |
|
|
|
file_format = "JSONL" |
|
|
|
kwargs = {"column_label": "labels"} |
|
|
|
dataset = [("exampleA", ["positive"]), ("exampleB", ["positive", "negative"]), ("exampleC", [])] |
|
|
|
self.import_dataset(filename, file_format, kwargs) |
|
|
|
self.import_dataset(filename, file_format, self.task, kwargs) |
|
|
|
self.assert_examples(dataset) |
|
|
|
|
|
|
|
def test_csv(self): |
|
|
|
filename = "text_classification/example.csv" |
|
|
|
file_format = "CSV" |
|
|
|
dataset = [("exampleA", ["positive"]), ("exampleB", [])] |
|
|
|
self.import_dataset(filename, file_format) |
|
|
|
self.import_dataset(filename, file_format, self.task) |
|
|
|
self.assert_examples(dataset) |
|
|
|
|
|
|
|
def test_csv_out_of_order_columns(self): |
|
|
|
filename = "text_classification/example_out_of_order_columns.csv" |
|
|
|
file_format = "CSV" |
|
|
|
dataset = [("exampleA", ["positive"]), ("exampleB", [])] |
|
|
|
self.import_dataset(filename, file_format) |
|
|
|
self.import_dataset(filename, file_format, self.task) |
|
|
|
self.assert_examples(dataset) |
|
|
|
|
|
|
|
def test_fasttext(self): |
|
|
|
filename = "text_classification/example_fasttext.txt" |
|
|
|
file_format = "fastText" |
|
|
|
dataset = [("exampleA", ["positive"]), ("exampleB", ["positive", "negative"]), ("exampleC", [])] |
|
|
|
self.import_dataset(filename, file_format) |
|
|
|
self.import_dataset(filename, file_format, self.task) |
|
|
|
self.assert_examples(dataset) |
|
|
|
|
|
|
|
def test_excel(self): |
|
|
|
filename = "text_classification/example.xlsx" |
|
|
|
file_format = "Excel" |
|
|
|
dataset = [("exampleA", ["positive"]), ("exampleB", [])] |
|
|
|
self.import_dataset(filename, file_format) |
|
|
|
self.import_dataset(filename, file_format, self.task) |
|
|
|
self.assert_examples(dataset) |
|
|
|
|
|
|
|
def test_json(self): |
|
|
|
filename = "text_classification/example.json" |
|
|
|
file_format = "JSON" |
|
|
|
dataset = [("exampleA", ["positive"]), ("exampleB", ["positive", "negative"]), ("exampleC", [])] |
|
|
|
self.import_dataset(filename, file_format) |
|
|
|
self.import_dataset(filename, file_format, self.task) |
|
|
|
self.assert_examples(dataset) |
|
|
|
|
|
|
|
def test_textfile(self): |
|
|
|
filename = "example.txt" |
|
|
|
file_format = "TextFile" |
|
|
|
dataset = [("exampleA\nexampleB\n\nexampleC\n", [])] |
|
|
|
self.import_dataset(filename, file_format) |
|
|
|
self.import_dataset(filename, file_format, self.task) |
|
|
|
self.assert_examples(dataset) |
|
|
|
|
|
|
|
def test_textline(self): |
|
|
|
filename = "example.txt" |
|
|
|
file_format = "TextLine" |
|
|
|
dataset = [("exampleA", []), ("exampleB", []), ("exampleC", [])] |
|
|
|
self.import_dataset(filename, file_format) |
|
|
|
self.import_dataset(filename, file_format, self.task) |
|
|
|
self.assert_examples(dataset) |
|
|
|
|
|
|
|
def test_wrong_jsonl(self): |
|
|
|
filename = "text_classification/example.json" |
|
|
|
file_format = "JSONL" |
|
|
|
response = self.import_dataset(filename, file_format) |
|
|
|
response = self.import_dataset(filename, file_format, self.task) |
|
|
|
self.assert_parse_error(response) |
|
|
|
|
|
|
|
def test_wrong_json(self): |
|
|
|
filename = "text_classification/example.jsonl" |
|
|
|
file_format = "JSON" |
|
|
|
response = self.import_dataset(filename, file_format) |
|
|
|
response = self.import_dataset(filename, file_format, self.task) |
|
|
|
self.assert_parse_error(response) |
|
|
|
|
|
|
|
def test_wrong_excel(self): |
|
|
|
filename = "text_classification/example.jsonl" |
|
|
|
file_format = "Excel" |
|
|
|
response = self.import_dataset(filename, file_format) |
|
|
|
response = self.import_dataset(filename, file_format, self.task) |
|
|
|
self.assert_parse_error(response) |
|
|
|
|
|
|
|
def test_wrong_csv(self): |
|
|
|
filename = "text_classification/example.jsonl" |
|
|
|
file_format = "CSV" |
|
|
|
response = self.import_dataset(filename, file_format) |
|
|
|
response = self.import_dataset(filename, file_format, self.task) |
|
|
|
self.assert_parse_error(response) |
|
|
|
|
|
|
|
|
|
|
@ -186,26 +187,26 @@ class TestImportSequenceLabelingData(TestImportData): |
|
|
|
filename = "sequence_labeling/example.jsonl" |
|
|
|
file_format = "JSONL" |
|
|
|
dataset = [("exampleA", [[0, 1, "LOC"]]), ("exampleB", [])] |
|
|
|
self.import_dataset(filename, file_format) |
|
|
|
self.import_dataset(filename, file_format, self.task) |
|
|
|
self.assert_examples(dataset) |
|
|
|
|
|
|
|
def test_conll(self): |
|
|
|
filename = "sequence_labeling/example.conll" |
|
|
|
file_format = "CoNLL" |
|
|
|
dataset = [("JAPAN GET", [[0, 5, "LOC"]]), ("Nadim Ladki", [[0, 11, "PER"]])] |
|
|
|
self.import_dataset(filename, file_format) |
|
|
|
self.import_dataset(filename, file_format, self.task) |
|
|
|
self.assert_examples(dataset) |
|
|
|
|
|
|
|
def test_wrong_conll(self): |
|
|
|
filename = "sequence_labeling/example.jsonl" |
|
|
|
file_format = "CoNLL" |
|
|
|
response = self.import_dataset(filename, file_format) |
|
|
|
response = self.import_dataset(filename, file_format, self.task) |
|
|
|
self.assert_parse_error(response) |
|
|
|
|
|
|
|
def test_jsonl_with_overlapping(self): |
|
|
|
filename = "sequence_labeling/example_overlapping.jsonl" |
|
|
|
file_format = "JSONL" |
|
|
|
response = self.import_dataset(filename, file_format) |
|
|
|
response = self.import_dataset(filename, file_format, self.task) |
|
|
|
self.assertEqual(len(response["error"]), 0) |
|
|
|
|
|
|
|
|
|
|
@ -241,7 +242,7 @@ class TestImportRelationExtractionData(TestImportData): |
|
|
|
[[0, 6, "ORG"], [22, 39, "DATE"], [44, 54, "PERSON"], [59, 70, "PERSON"]], |
|
|
|
), |
|
|
|
] |
|
|
|
self.import_dataset(filename, file_format) |
|
|
|
self.import_dataset(filename, file_format, RELATION_EXTRACTION) |
|
|
|
self.assert_examples(dataset) |
|
|
|
|
|
|
|
|
|
|
@ -259,21 +260,21 @@ class TestImportSeq2seqData(TestImportData): |
|
|
|
filename = "seq2seq/example.jsonl" |
|
|
|
file_format = "JSONL" |
|
|
|
dataset = [("exampleA", ["label1"]), ("exampleB", [])] |
|
|
|
self.import_dataset(filename, file_format) |
|
|
|
self.import_dataset(filename, file_format, self.task) |
|
|
|
self.assert_examples(dataset) |
|
|
|
|
|
|
|
def test_json(self): |
|
|
|
filename = "seq2seq/example.json" |
|
|
|
file_format = "JSON" |
|
|
|
dataset = [("exampleA", ["label1"]), ("exampleB", [])] |
|
|
|
self.import_dataset(filename, file_format) |
|
|
|
self.import_dataset(filename, file_format, self.task) |
|
|
|
self.assert_examples(dataset) |
|
|
|
|
|
|
|
def test_csv(self): |
|
|
|
filename = "seq2seq/example.csv" |
|
|
|
file_format = "CSV" |
|
|
|
dataset = [("exampleA", ["label1"]), ("exampleB", [])] |
|
|
|
self.import_dataset(filename, file_format) |
|
|
|
self.import_dataset(filename, file_format, self.task) |
|
|
|
self.assert_examples(dataset) |
|
|
|
|
|
|
|
|
|
|
@ -298,7 +299,7 @@ class TestImportIntentDetectionAndSlotFillingData(TestImportData): |
|
|
|
("exampleC", {"cats": [], "entities": [(0, 1, "LOC")]}), |
|
|
|
("exampleD", {"cats": [], "entities": []}), |
|
|
|
] |
|
|
|
self.import_dataset(filename, file_format) |
|
|
|
self.import_dataset(filename, file_format, self.task) |
|
|
|
self.assert_examples(dataset) |
|
|
|
|
|
|
|
|
|
|
@ -308,7 +309,7 @@ class TestImportImageClassificationData(TestImportData): |
|
|
|
def test_example(self): |
|
|
|
filename = "images/1500x500.jpeg" |
|
|
|
file_format = "ImageFile" |
|
|
|
self.import_dataset(filename, file_format) |
|
|
|
self.import_dataset(filename, file_format, self.task) |
|
|
|
self.assertEqual(Example.objects.count(), 1) |
|
|
|
|
|
|
|
|
|
|
@ -319,6 +320,6 @@ class TestFileTypeChecking(TestImportData): |
|
|
|
def test_example(self): |
|
|
|
filename = "images/example.ico" |
|
|
|
file_format = "ImageFile" |
|
|
|
response = self.import_dataset(filename, file_format) |
|
|
|
response = self.import_dataset(filename, file_format, self.task) |
|
|
|
self.assertEqual(len(response["error"]), 1) |
|
|
|
self.assertIn("unexpected", response["error"][0]["message"]) |