mirror of https://github.com/doccano/doccano.git
pythondatasetsactive-learningtext-annotationdatasetnatural-language-processingdata-labelingmachine-learningannotation-tool
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
229 lines
7.3 KiB
229 lines
7.3 KiB
import pathlib
|
|
|
|
from django.test import TestCase
|
|
|
|
from ..models import (DOCUMENT_CLASSIFICATION, SEQ2SEQ, SEQUENCE_LABELING,
|
|
Category, Example, Label, Span)
|
|
from ..tasks import injest_data
|
|
from .api.utils import prepare_project
|
|
|
|
|
|
class TestIngestData(TestCase):
|
|
task = 'Any'
|
|
annotation_class = Category
|
|
|
|
def setUp(self):
|
|
self.project = prepare_project(self.task)
|
|
self.user = self.project.users[0]
|
|
self.data_path = pathlib.Path(__file__).parent / 'data'
|
|
|
|
def ingest_data(self, filename, file_format, kwargs=None):
|
|
filenames = [str(self.data_path / filename)]
|
|
kwargs = kwargs or {}
|
|
return injest_data(self.user.id, self.project.item.id, filenames, file_format, **kwargs)
|
|
|
|
|
|
class TestIngestClassificationData(TestIngestData):
|
|
task = DOCUMENT_CLASSIFICATION
|
|
|
|
def assert_examples(self, dataset):
|
|
for text, expected_labels in dataset:
|
|
example = Example.objects.get(text=text)
|
|
labels = set(cat.label.text for cat in example.categories.all())
|
|
self.assertEqual(labels, set(expected_labels))
|
|
|
|
def assert_parse_error(self, response):
|
|
self.assertGreaterEqual(len(response['error']), 1)
|
|
self.assertEqual(Example.objects.count(), 0)
|
|
self.assertEqual(Label.objects.count(), 0)
|
|
self.assertEqual(Category.objects.count(), 0)
|
|
|
|
def test_jsonl(self):
|
|
filename = 'text_classification/example.jsonl'
|
|
file_format = 'JSONL'
|
|
kwargs = {'column_label': 'labels'}
|
|
dataset = [
|
|
('exampleA', ['positive']),
|
|
('exampleB', ['positive', 'negative']),
|
|
('exampleC', [])
|
|
]
|
|
self.ingest_data(filename, file_format, kwargs)
|
|
self.assert_examples(dataset)
|
|
|
|
def test_csv(self):
|
|
filename = 'text_classification/example.csv'
|
|
file_format = 'CSV'
|
|
dataset = [
|
|
('exampleA', ['positive']),
|
|
('exampleB', [])
|
|
]
|
|
self.ingest_data(filename, file_format)
|
|
self.assert_examples(dataset)
|
|
|
|
def test_csv_out_of_order_columns(self):
|
|
filename = 'text_classification/example_out_of_order_columns.csv'
|
|
file_format = 'CSV'
|
|
dataset = [
|
|
('exampleA', ['positive']),
|
|
('exampleB', [])
|
|
]
|
|
self.ingest_data(filename, file_format)
|
|
self.assert_examples(dataset)
|
|
|
|
def test_fasttext(self):
|
|
filename = 'text_classification/example_fasttext.txt'
|
|
file_format = 'fastText'
|
|
dataset = [
|
|
('exampleA', ['positive']),
|
|
('exampleB', ['positive', 'negative']),
|
|
('exampleC', [])
|
|
]
|
|
self.ingest_data(filename, file_format)
|
|
self.assert_examples(dataset)
|
|
|
|
def test_excel(self):
|
|
filename = 'text_classification/example.xlsx'
|
|
file_format = 'Excel'
|
|
dataset = [
|
|
('exampleA', ['positive']),
|
|
('exampleB', [])
|
|
]
|
|
self.ingest_data(filename, file_format)
|
|
self.assert_examples(dataset)
|
|
|
|
def test_json(self):
|
|
filename = 'text_classification/example.json'
|
|
file_format = 'JSON'
|
|
dataset = [
|
|
('exampleA', ['positive']),
|
|
('exampleB', ['positive', 'negative']),
|
|
('exampleC', [])
|
|
]
|
|
self.ingest_data(filename, file_format)
|
|
self.assert_examples(dataset)
|
|
|
|
def test_textfile(self):
|
|
filename = 'example.txt'
|
|
file_format = 'TextFile'
|
|
dataset = [
|
|
('exampleA\nexampleB\nexampleC\n', [])
|
|
]
|
|
self.ingest_data(filename, file_format)
|
|
self.assert_examples(dataset)
|
|
|
|
def test_textline(self):
|
|
filename = 'example.txt'
|
|
file_format = 'TextLine'
|
|
dataset = [
|
|
('exampleA', []),
|
|
('exampleB', []),
|
|
('exampleC', [])
|
|
]
|
|
self.ingest_data(filename, file_format)
|
|
self.assert_examples(dataset)
|
|
|
|
def test_wrong_jsonl(self):
|
|
filename = 'text_classification/example.json'
|
|
file_format = 'JSONL'
|
|
response = self.ingest_data(filename, file_format)
|
|
self.assert_parse_error(response)
|
|
|
|
def test_wrong_json(self):
|
|
filename = 'text_classification/example.jsonl'
|
|
file_format = 'JSON'
|
|
response = self.ingest_data(filename, file_format)
|
|
self.assert_parse_error(response)
|
|
|
|
def test_wrong_excel(self):
|
|
filename = 'text_classification/example.jsonl'
|
|
file_format = 'Excel'
|
|
response = self.ingest_data(filename, file_format)
|
|
self.assert_parse_error(response)
|
|
|
|
def test_wrong_csv(self):
|
|
filename = 'text_classification/example.jsonl'
|
|
file_format = 'CSV'
|
|
response = self.ingest_data(filename, file_format)
|
|
self.assert_parse_error(response)
|
|
|
|
|
|
class TestIngestSequenceLabelingData(TestIngestData):
|
|
task = SEQUENCE_LABELING
|
|
|
|
def assert_examples(self, dataset):
|
|
for text, expected_labels in dataset:
|
|
example = Example.objects.get(text=text)
|
|
labels = [[span.start_offset, span.end_offset, span.label.text] for span in example.spans.all()]
|
|
self.assertEqual(labels, expected_labels)
|
|
|
|
def assert_parse_error(self, response):
|
|
self.assertGreaterEqual(len(response['error']), 1)
|
|
self.assertEqual(Example.objects.count(), 0)
|
|
self.assertEqual(Label.objects.count(), 0)
|
|
self.assertEqual(Span.objects.count(), 0)
|
|
|
|
def test_jsonl(self):
|
|
filename = 'sequence_labeling/example.jsonl'
|
|
file_format = 'JSONL'
|
|
dataset = [
|
|
('exampleA', [[0, 1, 'LOC']]),
|
|
('exampleB', [])
|
|
]
|
|
self.ingest_data(filename, file_format)
|
|
self.assert_examples(dataset)
|
|
|
|
def test_conll(self):
|
|
filename = 'sequence_labeling/example.conll'
|
|
file_format = 'CoNLL'
|
|
dataset = [
|
|
('JAPAN GET', [[0, 5, 'LOC']]),
|
|
('Nadim Ladki', [[0, 11, 'PER']])
|
|
]
|
|
self.ingest_data(filename, file_format)
|
|
self.assert_examples(dataset)
|
|
|
|
def test_wrong_conll(self):
|
|
filename = 'sequence_labeling/example.jsonl'
|
|
file_format = 'CoNLL'
|
|
response = self.ingest_data(filename, file_format)
|
|
self.assert_parse_error(response)
|
|
|
|
|
|
class TestIngestSeq2seqData(TestIngestData):
|
|
task = SEQ2SEQ
|
|
|
|
def assert_examples(self, dataset):
|
|
for text, expected_labels in dataset:
|
|
example = Example.objects.get(text=text)
|
|
labels = set(text_label.text for text_label in example.texts.all())
|
|
self.assertEqual(labels, set(expected_labels))
|
|
|
|
def test_jsonl(self):
|
|
filename = 'seq2seq/example.jsonl'
|
|
file_format = 'JSONL'
|
|
dataset = [
|
|
('exampleA', ['label1']),
|
|
('exampleB', [])
|
|
]
|
|
self.ingest_data(filename, file_format)
|
|
self.assert_examples(dataset)
|
|
|
|
def test_json(self):
|
|
filename = 'seq2seq/example.json'
|
|
file_format = 'JSON'
|
|
dataset = [
|
|
('exampleA', ['label1']),
|
|
('exampleB', [])
|
|
]
|
|
self.ingest_data(filename, file_format)
|
|
self.assert_examples(dataset)
|
|
|
|
def test_csv(self):
|
|
filename = 'seq2seq/example.csv'
|
|
file_format = 'CSV'
|
|
dataset = [
|
|
('exampleA', ['label1']),
|
|
('exampleB', [])
|
|
]
|
|
self.ingest_data(filename, file_format)
|
|
self.assert_examples(dataset)
|