doccano/backend/api/tests/test_tasks.py


								import pathlib


								from django.test import TestCase


								from ..models import (DOCUMENT_CLASSIFICATION, SEQ2SEQ, SEQUENCE_LABELING,

								                      Category, Example, Label, Span)

								from ..tasks import injest_data

								from .api.utils import prepare_project


								class TestIngestData(TestCase):

								    task = 'Any'

								    annotation_class = Category


								    def setUp(self):

								        self.project = prepare_project(self.task)

								        self.user = self.project.users[0]

								        self.data_path = pathlib.Path(__file__).parent / 'data'


								    def ingest_data(self, filename, file_format, kwargs=None):

								        filenames = [str(self.data_path / filename)]

								        kwargs = kwargs or {}

								        return injest_data(self.user.id, self.project.item.id, filenames, file_format, **kwargs)


								class TestIngestClassificationData(TestIngestData):

								    task = DOCUMENT_CLASSIFICATION


								    def assert_examples(self, dataset):

								        for text, expected_labels in dataset:

								            example = Example.objects.get(text=text)

								            labels = set(cat.label.text for cat in example.categories.all())

								            self.assertEqual(labels, set(expected_labels))


								    def assert_parse_error(self, response):

								        self.assertGreaterEqual(len(response['error']), 1)

								        self.assertEqual(Example.objects.count(), 0)

								        self.assertEqual(Label.objects.count(), 0)

								        self.assertEqual(Category.objects.count(), 0)


								    def test_jsonl(self):

								        filename = 'text_classification/example.jsonl'

								        file_format = 'JSONL'

								        kwargs = {'column_label': 'labels'}

								        dataset = [

								            ('exampleA', ['positive']),

								            ('exampleB', ['positive', 'negative']),

								            ('exampleC', [])

								        ]

								        self.ingest_data(filename, file_format, kwargs)

								        self.assert_examples(dataset)


								    def test_csv(self):

								        filename = 'text_classification/example.csv'

								        file_format = 'CSV'

								        dataset = [

								            ('exampleA', ['positive']),

								            ('exampleB', [])

								        ]

								        self.ingest_data(filename, file_format)

								        self.assert_examples(dataset)


								    def test_csv_out_of_order_columns(self):

								        filename = 'text_classification/example_out_of_order_columns.csv'

								        file_format = 'CSV'

								        dataset = [

								            ('exampleA', ['positive']),

								            ('exampleB', [])

								        ]

								        self.ingest_data(filename, file_format)

								        self.assert_examples(dataset)


								    def test_fasttext(self):

								        filename = 'text_classification/example_fasttext.txt'

								        file_format = 'fastText'

								        dataset = [

								            ('exampleA', ['positive']),

								            ('exampleB', ['positive', 'negative']),

								            ('exampleC', [])

								        ]

								        self.ingest_data(filename, file_format)

								        self.assert_examples(dataset)


								    def test_excel(self):

								        filename = 'text_classification/example.xlsx'

								        file_format = 'Excel'

								        dataset = [

								            ('exampleA', ['positive']),

								            ('exampleB', [])

								        ]

								        self.ingest_data(filename, file_format)

								        self.assert_examples(dataset)


								    def test_json(self):

								        filename = 'text_classification/example.json'

								        file_format = 'JSON'

								        dataset = [

								            ('exampleA', ['positive']),

								            ('exampleB', ['positive', 'negative']),

								            ('exampleC', [])

								        ]

								        self.ingest_data(filename, file_format)

								        self.assert_examples(dataset)


								    def test_textfile(self):

								        filename = 'example.txt'

								        file_format = 'TextFile'

								        dataset = [

								            ('exampleA\nexampleB\nexampleC\n', [])

								        ]

								        self.ingest_data(filename, file_format)

								        self.assert_examples(dataset)


								    def test_textline(self):

								        filename = 'example.txt'

								        file_format = 'TextLine'

								        dataset = [

								            ('exampleA', []),

								            ('exampleB', []),

								            ('exampleC', [])

								        ]

								        self.ingest_data(filename, file_format)

								        self.assert_examples(dataset)


								    def test_wrong_jsonl(self):

								        filename = 'text_classification/example.json'

								        file_format = 'JSONL'

								        response = self.ingest_data(filename, file_format)

								        self.assert_parse_error(response)


								    def test_wrong_json(self):

								        filename = 'text_classification/example.jsonl'

								        file_format = 'JSON'

								        response = self.ingest_data(filename, file_format)

								        self.assert_parse_error(response)


								    def test_wrong_excel(self):

								        filename = 'text_classification/example.jsonl'

								        file_format = 'Excel'

								        response = self.ingest_data(filename, file_format)

								        self.assert_parse_error(response)


								    def test_wrong_csv(self):

								        filename = 'text_classification/example.jsonl'

								        file_format = 'CSV'

								        response = self.ingest_data(filename, file_format)

								        self.assert_parse_error(response)


								class TestIngestSequenceLabelingData(TestIngestData):

								    task = SEQUENCE_LABELING


								    def assert_examples(self, dataset):

								        for text, expected_labels in dataset:

								            example = Example.objects.get(text=text)

								            labels = [[span.start_offset, span.end_offset, span.label.text] for span in example.spans.all()]

								            self.assertEqual(labels, expected_labels)


								    def assert_parse_error(self, response):

								        self.assertGreaterEqual(len(response['error']), 1)

								        self.assertEqual(Example.objects.count(), 0)

								        self.assertEqual(Label.objects.count(), 0)

								        self.assertEqual(Span.objects.count(), 0)


								    def test_jsonl(self):

								        filename = 'sequence_labeling/example.jsonl'

								        file_format = 'JSONL'

								        dataset = [

								            ('exampleA', [[0, 1, 'LOC']]),

								            ('exampleB', [])

								        ]

								        self.ingest_data(filename, file_format)

								        self.assert_examples(dataset)


								    def test_conll(self):

								        filename = 'sequence_labeling/example.conll'

								        file_format = 'CoNLL'

								        dataset = [

								            ('JAPAN GET', [[0, 5, 'LOC']]),

								            ('Nadim Ladki', [[0, 11, 'PER']])

								        ]

								        self.ingest_data(filename, file_format)

								        self.assert_examples(dataset)


								    def test_wrong_conll(self):

								        filename = 'sequence_labeling/example.jsonl'

								        file_format = 'CoNLL'

								        response = self.ingest_data(filename, file_format)

								        self.assert_parse_error(response)


								class TestIngestSeq2seqData(TestIngestData):

								    task = SEQ2SEQ


								    def assert_examples(self, dataset):

								        for text, expected_labels in dataset:

								            example = Example.objects.get(text=text)

								            labels = set(text_label.text for text_label in example.texts.all())

								            self.assertEqual(labels, set(expected_labels))


								    def test_jsonl(self):

								        filename = 'seq2seq/example.jsonl'

								        file_format = 'JSONL'

								        dataset = [

								            ('exampleA', ['label1']),

								            ('exampleB', [])

								        ]

								        self.ingest_data(filename, file_format)

								        self.assert_examples(dataset)


								    def test_json(self):

								        filename = 'seq2seq/example.json'

								        file_format = 'JSON'

								        dataset = [

								            ('exampleA', ['label1']),

								            ('exampleB', [])

								        ]

								        self.ingest_data(filename, file_format)

								        self.assert_examples(dataset)


								    def test_csv(self):

								        filename = 'seq2seq/example.csv'

								        file_format = 'CSV'

								        dataset = [

								            ('exampleA', ['label1']),

								            ('exampleB', [])

								        ]

								        self.ingest_data(filename, file_format)

								        self.assert_examples(dataset)