diff --git a/backend/api/models.py b/backend/api/models.py
index 2bf35542..d278a48f 100644
--- a/backend/api/models.py
+++ b/backend/api/models.py
@@ -40,39 +40,18 @@ class Project(PolymorphicModel):
     def get_absolute_url(self):
         return reverse('upload', args=[self.id])
 
-    def get_bundle_name(self):
-        raise NotImplementedError()
-
-    def get_bundle_name_upload(self):
-        raise NotImplementedError()
-
-    def get_bundle_name_download(self):
-        raise NotImplementedError()
-
     def get_annotation_serializer(self):
         raise NotImplementedError()
 
     def get_annotation_class(self):
         raise NotImplementedError()
 
-    def get_storage(self, data):
-        raise NotImplementedError()
-
     def __str__(self):
         return self.name
 
 
 class TextClassificationProject(Project):
 
-    def get_bundle_name(self):
-        return 'document_classification'
-
-    def get_bundle_name_upload(self):
-        return 'upload_text_classification'
-
-    def get_bundle_name_download(self):
-        return 'download_text_classification'
-
     def get_annotation_serializer(self):
         from .serializers import DocumentAnnotationSerializer
         return DocumentAnnotationSerializer
@@ -80,22 +59,9 @@ class TextClassificationProject(Project):
     def get_annotation_class(self):
         return DocumentAnnotation
 
-    def get_storage(self, data):
-        from .utils import ClassificationStorage
-        return ClassificationStorage(data, self)
-
 
 class SequenceLabelingProject(Project):
 
-    def get_bundle_name(self):
-        return 'sequence_labeling'
-
-    def get_bundle_name_upload(self):
-        return 'upload_sequence_labeling'
-
-    def get_bundle_name_download(self):
-        return 'download_sequence_labeling'
-
     def get_annotation_serializer(self):
         from .serializers import SequenceAnnotationSerializer
         return SequenceAnnotationSerializer
@@ -103,22 +69,9 @@ class SequenceLabelingProject(Project):
     def get_annotation_class(self):
         return SequenceAnnotation
 
-    def get_storage(self, data):
-        from .utils import SequenceLabelingStorage
-        return SequenceLabelingStorage(data, self)
-
 
 class Seq2seqProject(Project):
 
-    def get_bundle_name(self):
-        return 'seq2seq'
-
-    def get_bundle_name_upload(self):
-        return 'upload_seq2seq'
-
-    def get_bundle_name_download(self):
-        return 'download_seq2seq'
-
     def get_annotation_serializer(self):
         from .serializers import Seq2seqAnnotationSerializer
         return Seq2seqAnnotationSerializer
@@ -126,22 +79,9 @@ class Seq2seqProject(Project):
     def get_annotation_class(self):
         return Seq2seqAnnotation
 
-    def get_storage(self, data):
-        from .utils import Seq2seqStorage
-        return Seq2seqStorage(data, self)
-
 
 class Speech2textProject(Project):
 
-    def get_bundle_name(self):
-        return 'speech2text'
-
-    def get_bundle_name_upload(self):
-        return 'upload_speech2text'
-
-    def get_bundle_name_download(self):
-        return 'download_speech2text'
-
     def get_annotation_serializer(self):
         from .serializers import Speech2textAnnotationSerializer
         return Speech2textAnnotationSerializer
@@ -149,10 +89,6 @@ class Speech2textProject(Project):
     def get_annotation_class(self):
         return Speech2textAnnotation
 
-    def get_storage(self, data):
-        from .utils import Speech2textStorage
-        return Speech2textStorage(data, self)
-
 
 class Label(models.Model):
     PREFIX_KEYS = (
diff --git a/backend/api/tests/api/test_upload.py b/backend/api/tests/api/test_upload.py
index 08b373d8..3966443b 100644
--- a/backend/api/tests/api/test_upload.py
+++ b/backend/api/tests/api/test_upload.py
@@ -1,14 +1,9 @@
-import os
-
 from django.contrib.auth.models import User
 from django.test import override_settings
 from rest_framework.reverse import reverse
 from rest_framework.test import APITestCase
 
-from ...exceptions import FileParseException
-from ...utils import (CoNLLParser, CSVParser, FastTextParser, JSONParser,
-                      PlainTextParser)
-from .utils import DATA_DIR, create_default_roles
+from .utils import create_default_roles
 
 
 class TestFeatures(APITestCase):
@@ -27,62 +22,3 @@ class TestFeatures(APITestCase):
 
         response = self.client.get(reverse('features'))
         self.assertFalse(response.json().get('cloud_upload'))
-
-
-@override_settings(IMPORT_BATCH_SIZE=2)
-class TestParser(APITestCase):
-
-    def parser_helper(self, filename, parser, include_label=True):
-        with open(os.path.join(DATA_DIR, filename), mode='rb') as f:
-            result = list(parser.parse(f))
-            for data in result:
-                for r in data:
-                    self.assertIn('text', r)
-                    if include_label:
-                        self.assertIn('labels', r)
-        return result
-
-    def test_give_valid_data_to_conll_parser(self):
-        self.parser_helper(filename='labeling.conll', parser=CoNLLParser())
-
-    def test_give_valid_data_to_conll_parser_with_trailing_newlines(self):
-        result = self.parser_helper(filename='labeling.trailing.conll', parser=CoNLLParser())
-        self.assertEqual(len(result), 1)
-        self.assertEqual(len(result[0]), 1)
-
-    def test_plain_parser(self):
-        self.parser_helper(filename='example.txt', parser=PlainTextParser(), include_label=False)
-
-    def test_give_invalid_data_to_conll_parser(self):
-        with self.assertRaises(FileParseException):
-            self.parser_helper(filename='labeling.invalid.conll',
-                               parser=CoNLLParser())
-
-    def test_give_classification_data_to_csv_parser(self):
-        self.parser_helper(filename='example.csv', parser=CSVParser(), include_label=False)
-
-    def test_give_seq2seq_data_to_csv_parser(self):
-        self.parser_helper(filename='example.csv', parser=CSVParser(), include_label=False)
-
-    def test_give_classification_data_to_json_parser(self):
-        self.parser_helper(filename='classification.jsonl', parser=JSONParser())
-
-    def test_give_labeling_data_to_json_parser(self):
-        self.parser_helper(filename='labeling.jsonl', parser=JSONParser())
-
-    def test_give_seq2seq_data_to_json_parser(self):
-        self.parser_helper(filename='seq2seq.jsonl', parser=JSONParser())
-
-    def test_give_data_without_label_to_json_parser(self):
-        self.parser_helper(filename='example.jsonl', parser=JSONParser(), include_label=False)
-
-    def test_give_labeling_data_to_fasttext_parser(self):
-        self.parser_helper(filename='example_fasttext.txt', parser=FastTextParser())
-
-    def test_give_data_without_label_name_to_fasttext_parser(self):
-        with self.assertRaises(FileParseException):
-            self.parser_helper(filename='example_fasttext_label_tag_without_name.txt', parser=FastTextParser())
-
-    def test_give_data_without_text_to_fasttext_parser(self):
-        with self.assertRaises(FileParseException):
-            self.parser_helper(filename='example_fasttext_without_text.txt', parser=FastTextParser())
diff --git a/backend/api/tests/test_models.py b/backend/api/tests/test_models.py
index bbdc88a1..8e08c7ba 100644
--- a/backend/api/tests/test_models.py
+++ b/backend/api/tests/test_models.py
@@ -18,10 +18,6 @@ class TestTextClassificationProject(TestCase):
     def setUpTestData(cls):
         cls.project = mommy.make('TextClassificationProject')
 
-    def test_get_bundle_name(self):
-        template = self.project.get_bundle_name()
-        self.assertEqual(template, 'document_classification')
-
     def test_get_annotation_serializer(self):
         serializer = self.project.get_annotation_serializer()
         self.assertEqual(serializer, DocumentAnnotationSerializer)
@@ -38,10 +34,6 @@ class TestSequenceLabelingProject(TestCase):
     def setUpTestData(cls):
         cls.project = mommy.make('SequenceLabelingProject')
 
-    def test_get_bundle_name(self):
-        template = self.project.get_bundle_name()
-        self.assertEqual(template, 'sequence_labeling')
-
     def test_get_annotation_serializer(self):
         serializer = self.project.get_annotation_serializer()
         self.assertEqual(serializer, SequenceAnnotationSerializer)
@@ -58,10 +50,6 @@ class TestSeq2seqProject(TestCase):
     def setUpTestData(cls):
         cls.project = mommy.make('Seq2seqProject')
 
-    def test_get_bundle_name(self):
-        template = self.project.get_bundle_name()
-        self.assertEqual(template, 'seq2seq')
-
     def test_get_annotation_serializer(self):
         serializer = self.project.get_annotation_serializer()
         self.assertEqual(serializer, Seq2seqAnnotationSerializer)
@@ -78,10 +66,6 @@ class TestSpeech2textProject(TestCase):
     def setUpTestData(cls):
         cls.project = mommy.make('Speech2textProject')
 
-    def test_get_bundle_name(self):
-        template = self.project.get_bundle_name()
-        self.assertEqual(template, 'speech2text')
-
     def test_get_annotation_serializer(self):
         serializer = self.project.get_annotation_serializer()
         self.assertEqual(serializer, Speech2textAnnotationSerializer)
diff --git a/backend/api/tests/test_utils.py b/backend/api/tests/test_utils.py
deleted file mode 100644
index fec5b1ba..00000000
--- a/backend/api/tests/test_utils.py
+++ /dev/null
@@ -1,172 +0,0 @@
-import io
-
-from django.test import TestCase
-from seqeval.metrics.sequence_labeling import get_entities
-
-from ..exceptions import FileParseException
-from ..models import Document, Label
-from ..utils import (AudioParser, BaseStorage, ClassificationStorage,
-                     CoNLLParser, Seq2seqStorage, SequenceLabelingStorage,
-                     iterable_to_io)
-
-
-class TestBaseStorage(TestCase):
-    def test_extract_label(self):
-        data = [{'labels': ['positive']}, {'labels': ['negative']}]
-
-        actual = BaseStorage.extract_label(data)
-
-        self.assertEqual(actual, [['positive'], ['negative']])
-
-    def test_exclude_created_labels(self):
-        labels = ['positive', 'negative']
-        created = {'positive': Label(text='positive')}
-
-        actual = BaseStorage.exclude_created_labels(labels, created)
-
-        self.assertEqual(actual, ['negative'])
-
-    def test_to_serializer_format(self):
-        labels = ['positive']
-        created = {}
-
-        actual = BaseStorage.to_serializer_format(labels, created)
-
-        self.assertEqual(len(actual), 1)
-        self.assertEqual(actual[0]['text'], 'positive')
-        self.assertIsNone(actual[0]['prefix_key'])
-        self.assertEqual(actual[0]['suffix_key'], 'p')
-        self.assertIsNotNone(actual[0]['background_color'])
-        self.assertIsNotNone(actual[0]['text_color'])
-
-    def test_get_shortkey_without_existing_shortkey(self):
-        label = 'positive'
-        created = {}
-
-        actual = BaseStorage.get_shortkey(label, created)
-
-        self.assertEqual(actual, ('p', None))
-
-    def test_get_shortkey_with_existing_shortkey(self):
-        label = 'positive'
-        created = {('p', None)}
-
-        actual = BaseStorage.get_shortkey(label, created)
-
-        self.assertEqual(actual, ('p', 'ctrl'))
-
-    def test_update_saved_labels(self):
-        saved = {'positive': Label(text='positive', text_color='#000000')}
-        new = [Label(text='positive', text_color='#ffffff')]
-
-        actual = BaseStorage.update_saved_labels(saved, new)
-
-        self.assertEqual(actual['positive'].text_color, '#ffffff')
-
-
-class TestClassificationStorage(TestCase):
-    def test_extract_unique_labels(self):
-        labels = [['positive'], ['positive', 'negative'], ['negative']]
-
-        actual = ClassificationStorage.extract_unique_labels(labels)
-
-        self.assertCountEqual(actual, ['positive', 'negative'])
-
-    def test_make_annotations(self):
-        docs = [Document(text='a', id=1), Document(text='b', id=2), Document(text='c', id=3)]
-        labels = [['positive'], ['positive', 'negative'], ['negative']]
-        saved_labels = {'positive': Label(text='positive', id=1), 'negative': Label(text='negative', id=2)}
-
-        actual = ClassificationStorage.make_annotations(docs, labels, saved_labels)
-
-        self.assertCountEqual(actual, [
-            {'document': 1, 'label': 1},
-            {'document': 2, 'label': 1},
-            {'document': 2, 'label': 2},
-            {'document': 3, 'label': 2},
-        ])
-
-
-class TestSequenceLabelingStorage(TestCase):
-    def test_extract_unique_labels(self):
-        labels = [[[0, 1, 'LOC']], [[3, 4, 'ORG']]]
-
-        actual = SequenceLabelingStorage.extract_unique_labels(labels)
-
-        self.assertCountEqual(actual, ['LOC', 'ORG'])
-
-    def test_make_annotations(self):
-        docs = [Document(text='a', id=1), Document(text='b', id=2)]
-        labels = [[[0, 1, 'LOC']], [[3, 4, 'ORG']]]
-        saved_labels = {'LOC': Label(text='LOC', id=1), 'ORG': Label(text='ORG', id=2)}
-
-        actual = SequenceLabelingStorage.make_annotations(docs, labels, saved_labels)
-
-        self.assertEqual(actual, [
-            {'document': 1, 'label': 1, 'start_offset': 0, 'end_offset': 1},
-            {'document': 2, 'label': 2, 'start_offset': 3, 'end_offset': 4},
-        ])
-
-
-class TestSeq2seqStorage(TestCase):
-    def test_make_annotations(self):
-        docs = [Document(text='a', id=1), Document(text='b', id=2)]
-        labels = [['Hello!'], ['How are you?', "What's up?"]]
-
-        actual = Seq2seqStorage.make_annotations(docs, labels)
-
-        self.assertEqual(actual, [
-            {'document': 1, 'text': 'Hello!'},
-            {'document': 2, 'text': 'How are you?'},
-            {'document': 2, 'text': "What's up?"},
-        ])
-
-
-class TestCoNLLParser(TestCase):
-    def test_calc_char_offset(self):
-        f = io.BytesIO(
-            b"EU\tORG\n"
-            b"rejects\t_\n"
-            b"German\tMISC\n"
-            b"call\t_\n"
-        )
-
-        actual = next(CoNLLParser().parse(f))[0]
-
-        self.assertEqual(actual, {
-            'text': 'EU rejects German call',
-            'labels': [[0, 2, 'ORG'], [11, 17, 'MISC']]
-        })
-
-
-class TestAudioParser(TestCase):
-    def test_parse_mp3(self):
-        f = io.BytesIO(b'...')
-        f.name = 'test.mp3'
-
-        actual = next(AudioParser().parse(f))
-
-        self.assertEqual(actual, [{
-            'audio': 'data:audio/mpeg;base64,Li4u',
-            'meta': '{"filename": "test.mp3"}',
-        }])
-
-    def test_parse_unknown(self):
-        f = io.BytesIO(b'...')
-        f.name = 'unknown.unknown'
-
-        with self.assertRaises(FileParseException):
-            next(AudioParser().parse(f))
-
-
-class TestIterableToIO(TestCase):
-    def test(self):
-        def iterable():
-            yield b'fo'
-            yield b'o\nbar\n'
-            yield b'baz\nrest'
-
-        stream = iterable_to_io(iterable())
-        stream = io.TextIOWrapper(stream)
-
-        self.assertEqual(stream.readlines(), ['foo\n', 'bar\n', 'baz\n', 'rest'])
diff --git a/backend/api/urls.py b/backend/api/urls.py
index ebd565b9..774cd328 100644
--- a/backend/api/urls.py
+++ b/backend/api/urls.py
@@ -157,11 +157,6 @@ urlpatterns = [
         view=views.Features.as_view(),
         name='features'
     ),
-    path(
-        route='cloud-upload',
-        view=views.CloudUploadAPI.as_view(),
-        name='cloud_uploader'
-    ),
     path(
         route='projects',
         view=views.ProjectList.as_view(),
diff --git a/backend/api/utils.py b/backend/api/utils.py
deleted file mode 100644
index c019e51d..00000000
--- a/backend/api/utils.py
+++ /dev/null
@@ -1,634 +0,0 @@
-import base64
-import csv
-import io
-import itertools
-import json
-import mimetypes
-import re
-from collections import defaultdict
-
-import conllu
-import pyexcel
-from chardet import UniversalDetector
-from colour import Color
-from django.conf import settings
-from django.db import transaction
-from rest_framework.renderers import BaseRenderer, JSONRenderer
-from seqeval.metrics.sequence_labeling import get_entities
-
-from .exceptions import FileParseException
-from .models import Label
-from .serializers import DocumentSerializer, LabelSerializer
-
-
-def extract_label(tag):
-    ptn = re.compile(r'(B|I|E|S)-(.+)')
-    m = ptn.match(tag)
-    if m:
-        return m.groups()[1]
-    else:
-        return tag
-
-
-class BaseStorage(object):
-
-    def __init__(self, data, project):
-        self.data = data
-        self.project = project
-
-    @transaction.atomic
-    def save(self, user):
-        raise NotImplementedError()
-
-    def save_doc(self, data):
-        serializer = DocumentSerializer(data=data, many=True)
-        serializer.is_valid(raise_exception=True)
-        doc = serializer.save(project=self.project)
-        return doc
-
-    def save_label(self, data):
-        serializer = LabelSerializer(data=data, many=True)
-        serializer.is_valid(raise_exception=True)
-        label = serializer.save(project=self.project)
-        return label
-
-    def save_annotation(self, data, user):
-        annotation_serializer = self.project.get_annotation_serializer()
-        serializer = annotation_serializer(data=data, many=True)
-        serializer.is_valid(raise_exception=True)
-        annotation = serializer.save(user=user)
-        return annotation
-
-    @classmethod
-    def extract_label(cls, data):
-        return [d.get('labels', []) for d in data]
-
-    @classmethod
-    def exclude_created_labels(cls, labels, created):
-        return [label for label in labels if label not in created]
-
-    @classmethod
-    def to_serializer_format(cls, labels, created):
-        existing_shortkeys = {(label.suffix_key, label.prefix_key)
-                              for label in created.values()}
-
-        serializer_labels = []
-
-        for label in sorted(labels):
-            serializer_label = {'text': label}
-
-            shortkey = cls.get_shortkey(label, existing_shortkeys)
-            if shortkey:
-                serializer_label['suffix_key'] = shortkey[0]
-                serializer_label['prefix_key'] = shortkey[1]
-                existing_shortkeys.add(shortkey)
-
-            background_color = Color(pick_for=label)
-            text_color = Color('white') if background_color.get_luminance() < 0.5 else Color('black')
-            serializer_label['background_color'] = background_color.hex
-            serializer_label['text_color'] = text_color.hex
-
-            serializer_labels.append(serializer_label)
-
-        return serializer_labels
-
-    @classmethod
-    def get_shortkey(cls, label, existing_shortkeys):
-        model_prefix_keys = [key for (key, _) in Label.PREFIX_KEYS]
-        prefix_keys = [None] + model_prefix_keys
-
-        model_suffix_keys = {key for (key, _) in Label.SUFFIX_KEYS}
-        suffix_keys = [key for key in label.lower() if key in model_suffix_keys]
-
-        for shortkey in itertools.product(suffix_keys, prefix_keys):
-            if shortkey not in existing_shortkeys:
-                return shortkey
-
-        return None
-
-    @classmethod
-    def update_saved_labels(cls, saved, new):
-        for label in new:
-            saved[label.text] = label
-        return saved
-
-
-class PlainStorage(BaseStorage):
-
-    @transaction.atomic
-    def save(self, user):
-        for text in self.data:
-            self.save_doc(text)
-
-
-class ClassificationStorage(BaseStorage):
-    """Store json for text classification.
-
-    The format is as follows:
-    {"text": "Python is awesome!", "labels": ["positive"]}
-    ...
-    """
-    @transaction.atomic
-    def save(self, user):
-        saved_labels = {label.text: label for label in self.project.labels.all()}
-        for data in self.data:
-            docs = self.save_doc(data)
-            labels = self.extract_label(data)
-            unique_labels = self.extract_unique_labels(labels)
-            unique_labels = self.exclude_created_labels(unique_labels, saved_labels)
-            unique_labels = self.to_serializer_format(unique_labels, saved_labels)
-            new_labels = self.save_label(unique_labels)
-            saved_labels = self.update_saved_labels(saved_labels, new_labels)
-            annotations = self.make_annotations(docs, labels, saved_labels)
-            self.save_annotation(annotations, user)
-
-    @classmethod
-    def extract_unique_labels(cls, labels):
-        return set(itertools.chain(*labels))
-
-    @classmethod
-    def make_annotations(cls, docs, labels, saved_labels):
-        annotations = []
-        for doc, label in zip(docs, labels):
-            for name in label:
-                label = saved_labels[name]
-                annotations.append({'document': doc.id, 'label': label.id})
-        return annotations
-
-
-class SequenceLabelingStorage(BaseStorage):
-    """Upload jsonl for sequence labeling.
-
-    The format is as follows:
-    {"text": "Python is awesome!", "labels": [[0, 6, "Product"],]}
-    ...
-    """
-    @transaction.atomic
-    def save(self, user):
-        saved_labels = {label.text: label for label in self.project.labels.all()}
-        for data in self.data:
-            docs = self.save_doc(data)
-            labels = self.extract_label(data)
-            unique_labels = self.extract_unique_labels(labels)
-            unique_labels = self.exclude_created_labels(unique_labels, saved_labels)
-            unique_labels = self.to_serializer_format(unique_labels, saved_labels)
-            new_labels = self.save_label(unique_labels)
-            saved_labels = self.update_saved_labels(saved_labels, new_labels)
-            annotations = self.make_annotations(docs, labels, saved_labels)
-            self.save_annotation(annotations, user)
-
-    @classmethod
-    def extract_unique_labels(cls, labels):
-        return set([label for _, _, label in itertools.chain(*labels)])
-
-    @classmethod
-    def make_annotations(cls, docs, labels, saved_labels):
-        annotations = []
-        for doc, spans in zip(docs, labels):
-            for span in spans:
-                start_offset, end_offset, name = span
-                label = saved_labels[name]
-                annotations.append({'document': doc.id,
-                                    'label': label.id,
-                                    'start_offset': start_offset,
-                                    'end_offset': end_offset})
-        return annotations
-
-
-class Seq2seqStorage(BaseStorage):
-    """Store json for seq2seq.
-
-    The format is as follows:
-    {"text": "Hello, World!", "labels": ["こんにちは、世界!"]}
-    ...
-    """
-    @transaction.atomic
-    def save(self, user):
-        for data in self.data:
-            doc = self.save_doc(data)
-            labels = self.extract_label(data)
-            annotations = self.make_annotations(doc, labels)
-            self.save_annotation(annotations, user)
-
-    @classmethod
-    def make_annotations(cls, docs, labels):
-        annotations = []
-        for doc, texts in zip(docs, labels):
-            for text in texts:
-                annotations.append({'document': doc.id, 'text': text})
-        return annotations
-
-
-class Speech2textStorage(BaseStorage):
-    """Store json for speech2text.
-
-    The format is as follows:
-    {"audio": "data:audio/mpeg;base64,...", "transcription": "こんにちは、世界!"}
-    ...
-    """
-    @transaction.atomic
-    def save(self, user):
-        for data in self.data:
-            for audio in data:
-                audio['text'] = audio.pop('audio')
-            doc = self.save_doc(data)
-            annotations = self.make_annotations(doc, data)
-            self.save_annotation(annotations, user)
-
-    @classmethod
-    def make_annotations(cls, docs, data):
-        annotations = []
-        for doc, datum in zip(docs, data):
-            try:
-                annotations.append({'document': doc.id, 'text': datum['transcription']})
-            except KeyError:
-                continue
-        return annotations
-
-
-class FileParser(object):
-
-    def parse(self, file):
-        raise NotImplementedError()
-
-    @staticmethod
-    def encode_metadata(data):
-        return json.dumps(data, ensure_ascii=False)
-
-
-class CoNLLParser(FileParser):
-    """Uploads CoNLL format file.
-
-    The file format is tab-separated values.
-    A blank line is required at the end of a sentence.
-    For example:
-    ```
-    EU B-ORG
-    rejects O
-    German B-MISC
-    call O
-    to O
-    boycott O
-    British B-MISC
-    lamb O
-    . O
-
-    Peter B-PER
-    Blackburn I-PER
-    ...
-    ```
-    """
-    def parse(self, file):
-        data = []
-        file = EncodedIO(file)
-        file = io.TextIOWrapper(file, encoding=file.encoding)
-
-        # Add check exception
-
-        field_parsers = {
-            "ne": lambda line, i: conllu.parser.parse_nullable_value(line[i]),
-        }
-
-        gen_parser = conllu.parse_incr(
-            file,
-            fields=("form", "ne"),
-            field_parsers=field_parsers
-        )
-
-        try:
-            for sentence in gen_parser:
-                if not sentence:
-                    continue
-                if len(data) >= settings.IMPORT_BATCH_SIZE:
-                    yield data
-                    data = []
-                words, labels = [], []
-                for item in sentence:
-                    word = item.get("form")
-                    tag = item.get("ne")
-
-                    if tag is not None:
-                        char_left = sum(map(len, words)) + len(words)
-                        char_right = char_left + len(word)
-                        span = [char_left, char_right, tag]
-                        labels.append(span)
-
-                    words.append(word)
-
-                # Create and add JSONL
-                data.append({'text': ' '.join(words), 'labels': labels})
-
-        except conllu.parser.ParseException as e:
-            raise FileParseException(line_num=-1, line=str(e))
-
-        if data:
-            yield data
-
-
-class PlainTextParser(FileParser):
-    """Uploads plain text.
-
-    The file format is as follows:
-    ```
-    EU rejects German call to boycott British lamb.
-    President Obama is speaking at the White House.
-    ...
-    ```
-    """
-    def parse(self, file):
-        file = EncodedIO(file)
-        file = io.TextIOWrapper(file, encoding=file.encoding)
-        while True:
-            batch = list(itertools.islice(file, settings.IMPORT_BATCH_SIZE))
-            if not batch:
-                break
-            yield [{'text': line.strip()} for line in batch]
-
-
-class CSVParser(FileParser):
-    """Uploads csv file.
-
-    The file format is comma separated values.
-    Column names are required at the top of a file.
-    For example:
-    ```
-    text, label
-    "EU rejects German call to boycott British lamb.",Politics
-    "President Obama is speaking at the White House.",Politics
-    "He lives in Newark, Ohio.",Other
-    ...
-    ```
-    """
-    def parse(self, file):
-        file = EncodedIO(file)
-        file = io.TextIOWrapper(file, encoding=file.encoding)
-        reader = csv.reader(file)
-        yield from ExcelParser.parse_excel_csv_reader(reader)
-
-
-class ExcelParser(FileParser):
-    def parse(self, file):
-        excel_book = pyexcel.iget_book(file_type="xlsx", file_content=file.read())
-        # Handle multiple sheets
-        for sheet_name in excel_book.sheet_names():
-            reader = excel_book[sheet_name].to_array()
-            yield from self.parse_excel_csv_reader(reader)
-
-    @staticmethod
-    def parse_excel_csv_reader(reader):
-        columns = next(reader)
-        data = []
-        if len(columns) == 1 and columns[0] != 'text':
-            data.append({'text': columns[0]})
-        for i, row in enumerate(reader, start=2):
-            if len(data) >= settings.IMPORT_BATCH_SIZE:
-                yield data
-                data = []
-            # Only text column
-            if len(row) <= len(columns) and len(row) == 1:
-                data.append({'text': row[0]})
-            # Text, labels and metadata columns
-            elif 2 <= len(row) <= len(columns):
-                datum = dict(zip(columns, row))
-                text, label = datum.pop('text'), datum.pop('label')
-                meta = FileParser.encode_metadata(datum)
-                if label != '':
-                    j = {'text': text, 'labels': [label], 'meta': meta}
-                else:
-                    j = {'text': text, 'meta': meta}
-                data.append(j)
-            else:
-                raise FileParseException(line_num=i, line=row)
-        if data:
-            yield data
-
-
-class JSONParser(FileParser):
-
-    def parse(self, file):
-        file = EncodedIO(file)
-        file = io.TextIOWrapper(file, encoding=file.encoding)
-        data = []
-        for i, line in enumerate(file, start=1):
-            if len(data) >= settings.IMPORT_BATCH_SIZE:
-                yield data
-                data = []
-            try:
-                j = json.loads(line)
-                j['meta'] = FileParser.encode_metadata(j.get('meta', {}))
-                data.append(j)
-            except json.decoder.JSONDecodeError:
-                raise FileParseException(line_num=i, line=line)
-        if data:
-            yield data
-
-
-class FastTextParser(FileParser):
-    """
-    Parse files in fastText format.
-    Labels are marked with the __label__ prefix
-    and the corresponding text comes afterwards in the same line
-    For example:
-    ```
-    __label__dog poodle
-    __label__house mansion
-    ```
-    """
-    def parse(self, file):
-        file = EncodedIO(file)
-        file = io.TextIOWrapper(file, encoding=file.encoding)
-        data = []
-        for i, line in enumerate(file, start=0):
-            if len(data) >= settings.IMPORT_BATCH_SIZE:
-                yield data
-                data = []
-
-            # Search labels and text, check correct syntax and append
-            labels = []
-            text = []
-            for token in line.rstrip().split(" "):
-                if token.startswith('__label__'):
-                    if token == '__label__':
-                        raise FileParseException(line_num=i, line=line)
-                    labels.append(token[len('__label__'):])
-                else:
-                    text.append(token)
-
-            # Check if text for labels is given
-            if not text:
-                raise FileParseException(line_num=i, line=line)
-
-            data.append({'text': " ".join(text), 'labels': labels})
-
-        if data:
-            yield data
-
-
-
-class AudioParser(FileParser):
-    def parse(self, file):
-        file_type, _ = mimetypes.guess_type(file.name, strict=False)
-        if not file_type:
-            raise FileParseException(line_num=1, line='Unable to guess file type')
-
-        audio = base64.b64encode(file.read())
-        yield [{
-            'audio': f'data:{file_type};base64,{audio.decode("ascii")}',
-            'meta': json.dumps({'filename': file.name}),
-        }]
-
-
-class JSONLRenderer(JSONRenderer):
-
-    def render(self, data, accepted_media_type=None, renderer_context=None):
-        """
-        Render `data` into JSON, returning a bytestring.
-        """
-        if data is None:
-            return bytes()
-
-        if not isinstance(data, list):
-            data = [data]
-
-        for d in data:
-            yield json.dumps(d,
-                             cls=self.encoder_class,
-                             ensure_ascii=self.ensure_ascii,
-                             allow_nan=not self.strict) + '\n'
-
-
-class FastTextPainter(object):
-
-    @staticmethod
-    def paint_labels(documents, labels):
-        serializer = DocumentSerializer(documents, many=True)
-        serializer_labels = LabelSerializer(labels, many=True)
-        data = []
-        for d in serializer.data:
-            labels = []
-            for a in d['annotations']:
-                label_obj = [x for x in serializer_labels.data if x['id'] == a['label']][0]
-                labels.append('__label__{}'.format(label_obj['text'].replace(' ', '_')))
-            text = d['text'].replace('\n', ' ')
-            if labels:
-                data.append('{} {}'.format(' '.join(labels), text))
-            else:
-                data.append(text)
-        return data
-
-
-class PlainTextRenderer(BaseRenderer):
-    media_type = 'text/plain'
-    format = 'txt'
-    charset = 'utf-8'
-
-    def render(self, data, accepted_media_type=None, renderer_context=None):
-        if data is None:
-            return bytes()
-
-        if not isinstance(data, list):
-            data = [data]
-
-        buffer = io.BytesIO()
-        for d in data:
-            buffer.write((d + '\n').encode(self.charset))
-        return buffer.getvalue()
-
-
-class JSONPainter(object):
-
-    def paint(self, documents):
-        serializer = DocumentSerializer(documents, many=True)
-        data = []
-        for d in serializer.data:
-            d['meta'] = json.loads(d['meta'])
-            for a in d['annotations']:
-                a.pop('id')
-                a.pop('prob')
-                a.pop('document')
-            data.append(d)
-        return data
-
-    @staticmethod
-    def paint_labels(documents, labels):
-        serializer_labels = LabelSerializer(labels, many=True)
-        serializer = DocumentSerializer(documents, many=True)
-        data = []
-        for d in serializer.data:
-            labels = []
-            for a in d['annotations']:
-                label_obj = [x for x in serializer_labels.data if x['id'] == a['label']][0]
-                label_text = label_obj['text']
-                label_start = a['start_offset']
-                label_end = a['end_offset']
-                labels.append([label_start, label_end, label_text])
-            d.pop('annotations')
-            d['labels'] = labels
-            d['meta'] = json.loads(d['meta'])
-            data.append(d)
-        return data
-
-
-class CSVPainter(JSONPainter):
-
-    def paint(self, documents):
-        data = super().paint(documents)
-        res = []
-        for d in data:
-            annotations = d.pop('annotations')
-            for a in annotations:
-                res.append({**d, **a})
-        return res
-
-
-def iterable_to_io(iterable, buffer_size=io.DEFAULT_BUFFER_SIZE):
-    """See https://stackoverflow.com/a/20260030/3817588."""
-    class IterStream(io.RawIOBase):
-        def __init__(self):
-            self.leftover = None
-
-        def readable(self):
-            return True
-
-        def readinto(self, b):
-            try:
-                l = len(b)  # We're supposed to return at most this much
-                chunk = self.leftover or next(iterable)
-                output, self.leftover = chunk[:l], chunk[l:]
-                b[:len(output)] = output
-                return len(output)
-            except StopIteration:
-                return 0    # indicate EOF
-
-    return io.BufferedReader(IterStream(), buffer_size=buffer_size)
-
-
-class EncodedIO(io.RawIOBase):
-    def __init__(self, fobj, buffer_size=io.DEFAULT_BUFFER_SIZE, default_encoding='utf-8'):
-        buffer = b''
-        detector = UniversalDetector()
-
-        while True:
-            read = fobj.read(buffer_size)
-            detector.feed(read)
-            buffer += read
-            if detector.done or len(read) < buffer_size:
-                break
-
-        if detector.done:
-            self.encoding = detector.result['encoding']
-        else:
-            self.encoding = default_encoding
-
-        self._fobj = fobj
-        self._buffer = buffer
-
-    def readable(self):
-        return self._fobj.readable()
-
-    def readinto(self, b):
-        l = len(b)
-        chunk = self._buffer or self._fobj.read(l)
-        output, self._buffer = chunk[:l], chunk[l:]
-        b[:len(output)] = output
-        return len(output)
diff --git a/backend/api/views/import_export.py b/backend/api/views/import_export.py
index 18a0e3a3..c1198928 100644
--- a/backend/api/views/import_export.py
+++ b/backend/api/views/import_export.py
@@ -1,22 +1,7 @@
 from django.conf import settings
-from django.shortcuts import get_object_or_404, redirect
-from libcloud import DriverType, get_driver
-from libcloud.storage.types import (ContainerDoesNotExistError,
-                                    ObjectDoesNotExistError)
-from rest_framework import status
-from rest_framework.exceptions import ParseError, ValidationError
-from rest_framework.parsers import MultiPartParser
 from rest_framework.permissions import IsAuthenticated
 from rest_framework.response import Response
 from rest_framework.views import APIView
-from rest_framework_csv.renderers import CSVRenderer
-
-from ..models import Project
-from ..permissions import IsProjectAdmin
-from ..utils import (AudioParser, CoNLLParser, CSVPainter, CSVParser,
-                     ExcelParser, FastTextPainter, FastTextParser,
-                     JSONLRenderer, JSONPainter, JSONParser, PlainTextParser,
-                     PlainTextRenderer, iterable_to_io)
 
 
 class Features(APIView):
@@ -28,134 +13,52 @@ class Features(APIView):
         })
 
 
-class TextUploadAPI(APIView):
-    parser_classes = (MultiPartParser,)
-    permission_classes = [IsAuthenticated & IsProjectAdmin]
-
-    def post(self, request, *args, **kwargs):
-        if 'file' not in request.data:
-            raise ParseError('Empty content')
-
-        self.save_file(
-            user=request.user,
-            file=request.data['file'],
-            file_format=request.data['format'],
-            project_id=kwargs['project_id'],
-        )
-
-        return Response(status=status.HTTP_201_CREATED)
-
-    @classmethod
-    def save_file(cls, user, file, file_format, project_id):
-        project = get_object_or_404(Project, pk=project_id)
-        parser = cls.select_parser(file_format)
-        data = parser.parse(file)
-        storage = project.get_storage(data)
-        storage.save(user)
-
-    @classmethod
-    def select_parser(cls, file_format):
-        if file_format == 'plain':
-            return PlainTextParser()
-        elif file_format == 'csv':
-            return CSVParser()
-        elif file_format == 'json':
-            return JSONParser()
-        elif file_format == 'conll':
-            return CoNLLParser()
-        elif file_format == 'excel':
-            return ExcelParser()
-        elif file_format == 'audio':
-            return AudioParser()
-        elif file_format == 'fastText':
-            return FastTextParser()
-        else:
-            raise ValidationError('format {} is invalid.'.format(file_format))
-
-
-class CloudUploadAPI(APIView):
-    permission_classes = TextUploadAPI.permission_classes
-
-    def get(self, request, *args, **kwargs):
-        try:
-            project_id = request.query_params['project_id']
-            file_format = request.query_params['upload_format']
-            cloud_container = request.query_params['container']
-            cloud_object = request.query_params['object']
-        except KeyError as ex:
-            raise ValidationError('query parameter {} is missing'.format(ex))
-
-        try:
-            cloud_file = self.get_cloud_object_as_io(cloud_container, cloud_object)
-        except ContainerDoesNotExistError:
-            raise ValidationError('cloud container {} does not exist'.format(cloud_container))
-        except ObjectDoesNotExistError:
-            raise ValidationError('cloud object {} does not exist'.format(cloud_object))
-
-        TextUploadAPI.save_file(
-            user=request.user,
-            file=cloud_file,
-            file_format=file_format,
-            project_id=project_id,
-        )
-
-        next_url = request.query_params.get('next')
-
-        if next_url == 'about:blank':
-            return Response(data='', content_type='text/plain', status=status.HTTP_201_CREATED)
-
-        if next_url:
-            return redirect(next_url)
-
-        return Response(status=status.HTTP_201_CREATED)
-
-    @classmethod
-    def get_cloud_object_as_io(cls, container_name, object_name):
-        provider = settings.CLOUD_BROWSER_APACHE_LIBCLOUD_PROVIDER.lower()
-        account = settings.CLOUD_BROWSER_APACHE_LIBCLOUD_ACCOUNT
-        key = settings.CLOUD_BROWSER_APACHE_LIBCLOUD_SECRET_KEY
-
-        driver = get_driver(DriverType.STORAGE, provider)
-        client = driver(account, key)
-
-        cloud_container = client.get_container(container_name)
-        cloud_object = cloud_container.get_object(object_name)
-
-        return iterable_to_io(cloud_object.as_stream())
-
-
-class TextDownloadAPI(APIView):
-    permission_classes = TextUploadAPI.permission_classes
-
-    renderer_classes = (CSVRenderer, JSONLRenderer, PlainTextRenderer)
-
-    def get(self, request, *args, **kwargs):
-        format = request.query_params.get('q')
-        only_approved = request.query_params.get('onlyApproved')
-        project = get_object_or_404(Project, pk=self.kwargs['project_id'])
-        documents = (
-            project.documents.exclude(annotations_approved_by = None)
-            if only_approved == 'true'
-            else project.documents.all()
-        )
-        painter = self.select_painter(format)
-
-        # jsonl-textlabel format prints text labels while jsonl format prints annotations with label ids
-        # jsonl-textlabel format - "labels": [[0, 15, "PERSON"], ..]
-        # jsonl format - "annotations": [{"label": 5, "start_offset": 0, "end_offset": 2, "user": 1},..]
-        if format in ('jsonl', 'txt'):
-            labels = project.labels.all()
-            data = painter.paint_labels(documents, labels)
-        else:
-            data = painter.paint(documents)
-        return Response(data)
-
-    def select_painter(self, format):
-        if format == 'csv':
-            return CSVPainter()
-        elif format == 'jsonl' or format == 'json':
-            return JSONPainter()
-        elif format == 'txt':
-            return FastTextPainter()
-        else:
-            raise ValidationError('format {} is invalid.'.format(format))
+# class CloudUploadAPI(APIView):
+#     permission_classes = TextUploadAPI.permission_classes
+#
+#     def get(self, request, *args, **kwargs):
+#         try:
+#             project_id = request.query_params['project_id']
+#             file_format = request.query_params['upload_format']
+#             cloud_container = request.query_params['container']
+#             cloud_object = request.query_params['object']
+#         except KeyError as ex:
+#             raise ValidationError('query parameter {} is missing'.format(ex))
+#
+#         try:
+#             cloud_file = self.get_cloud_object_as_io(cloud_container, cloud_object)
+#         except ContainerDoesNotExistError:
+#             raise ValidationError('cloud container {} does not exist'.format(cloud_container))
+#         except ObjectDoesNotExistError:
+#             raise ValidationError('cloud object {} does not exist'.format(cloud_object))
+#
+#         TextUploadAPI.save_file(
+#             user=request.user,
+#             file=cloud_file,
+#             file_format=file_format,
+#             project_id=project_id,
+#         )
+#
+#         next_url = request.query_params.get('next')
+#
+#         if next_url == 'about:blank':
+#             return Response(data='', content_type='text/plain', status=status.HTTP_201_CREATED)
+#
+#         if next_url:
+#             return redirect(next_url)
+#
+#         return Response(status=status.HTTP_201_CREATED)
+#
+#     @classmethod
+#     def get_cloud_object_as_io(cls, container_name, object_name):
+#         provider = settings.CLOUD_BROWSER_APACHE_LIBCLOUD_PROVIDER.lower()
+#         account = settings.CLOUD_BROWSER_APACHE_LIBCLOUD_ACCOUNT
+#         key = settings.CLOUD_BROWSER_APACHE_LIBCLOUD_SECRET_KEY
+#
+#         driver = get_driver(DriverType.STORAGE, provider)
+#         client = driver(account, key)
+#
+#         cloud_container = client.get_container(container_name)
+#         cloud_object = cloud_container.get_object(object_name)
+#
+#         return iterable_to_io(cloud_object.as_stream())