
Remove unused code

pull/1334/head
Hironsan committed 3 years ago
parent commit 08524ff3a6
7 changed files with 50 additions and 1102 deletions
  1. backend/api/models.py (64 changes)
  2. backend/api/tests/api/test_upload.py (66 changes)
  3. backend/api/tests/test_models.py (16 changes)
  4. backend/api/tests/test_utils.py (172 changes)
  5. backend/api/urls.py (5 changes)
  6. backend/api/utils.py (634 changes)
  7. backend/api/views/import_export.py (195 changes)

backend/api/models.py (64 changes)

@@ -40,39 +40,18 @@ class Project(PolymorphicModel):
def get_absolute_url(self):
return reverse('upload', args=[self.id])
def get_bundle_name(self):
raise NotImplementedError()
def get_bundle_name_upload(self):
raise NotImplementedError()
def get_bundle_name_download(self):
raise NotImplementedError()
def get_annotation_serializer(self):
raise NotImplementedError()
def get_annotation_class(self):
raise NotImplementedError()
def get_storage(self, data):
raise NotImplementedError()
def __str__(self):
return self.name
class TextClassificationProject(Project):
def get_bundle_name(self):
return 'document_classification'
def get_bundle_name_upload(self):
return 'upload_text_classification'
def get_bundle_name_download(self):
return 'download_text_classification'
def get_annotation_serializer(self):
from .serializers import DocumentAnnotationSerializer
return DocumentAnnotationSerializer
@@ -80,22 +59,9 @@ class TextClassificationProject(Project):
def get_annotation_class(self):
return DocumentAnnotation
def get_storage(self, data):
from .utils import ClassificationStorage
return ClassificationStorage(data, self)
class SequenceLabelingProject(Project):
def get_bundle_name(self):
return 'sequence_labeling'
def get_bundle_name_upload(self):
return 'upload_sequence_labeling'
def get_bundle_name_download(self):
return 'download_sequence_labeling'
def get_annotation_serializer(self):
from .serializers import SequenceAnnotationSerializer
return SequenceAnnotationSerializer
@@ -103,22 +69,9 @@ class SequenceLabelingProject(Project):
def get_annotation_class(self):
return SequenceAnnotation
def get_storage(self, data):
from .utils import SequenceLabelingStorage
return SequenceLabelingStorage(data, self)
class Seq2seqProject(Project):
def get_bundle_name(self):
return 'seq2seq'
def get_bundle_name_upload(self):
return 'upload_seq2seq'
def get_bundle_name_download(self):
return 'download_seq2seq'
def get_annotation_serializer(self):
from .serializers import Seq2seqAnnotationSerializer
return Seq2seqAnnotationSerializer
@@ -126,22 +79,9 @@ class Seq2seqProject(Project):
def get_annotation_class(self):
return Seq2seqAnnotation
def get_storage(self, data):
from .utils import Seq2seqStorage
return Seq2seqStorage(data, self)
class Speech2textProject(Project):
def get_bundle_name(self):
return 'speech2text'
def get_bundle_name_upload(self):
return 'upload_speech2text'
def get_bundle_name_download(self):
return 'download_speech2text'
def get_annotation_serializer(self):
from .serializers import Speech2textAnnotationSerializer
return Speech2textAnnotationSerializer
@@ -149,10 +89,6 @@ class Speech2textProject(Project):
def get_annotation_class(self):
return Speech2textAnnotation
def get_storage(self, data):
from .utils import Speech2textStorage
return Speech2textStorage(data, self)
class Label(models.Model):
PREFIX_KEYS = (

backend/api/tests/api/test_upload.py (66 changes)

@@ -1,14 +1,9 @@
import os
from django.contrib.auth.models import User
from django.test import override_settings
from rest_framework.reverse import reverse
from rest_framework.test import APITestCase
from ...exceptions import FileParseException
from ...utils import (CoNLLParser, CSVParser, FastTextParser, JSONParser,
PlainTextParser)
from .utils import DATA_DIR, create_default_roles
from .utils import create_default_roles
class TestFeatures(APITestCase):
@@ -27,62 +22,3 @@ class TestFeatures(APITestCase):
response = self.client.get(reverse('features'))
self.assertFalse(response.json().get('cloud_upload'))
@override_settings(IMPORT_BATCH_SIZE=2)
class TestParser(APITestCase):
def parser_helper(self, filename, parser, include_label=True):
with open(os.path.join(DATA_DIR, filename), mode='rb') as f:
result = list(parser.parse(f))
for data in result:
for r in data:
self.assertIn('text', r)
if include_label:
self.assertIn('labels', r)
return result
def test_give_valid_data_to_conll_parser(self):
self.parser_helper(filename='labeling.conll', parser=CoNLLParser())
def test_give_valid_data_to_conll_parser_with_trailing_newlines(self):
result = self.parser_helper(filename='labeling.trailing.conll', parser=CoNLLParser())
self.assertEqual(len(result), 1)
self.assertEqual(len(result[0]), 1)
def test_plain_parser(self):
self.parser_helper(filename='example.txt', parser=PlainTextParser(), include_label=False)
def test_give_invalid_data_to_conll_parser(self):
with self.assertRaises(FileParseException):
self.parser_helper(filename='labeling.invalid.conll',
parser=CoNLLParser())
def test_give_classification_data_to_csv_parser(self):
self.parser_helper(filename='example.csv', parser=CSVParser(), include_label=False)
def test_give_seq2seq_data_to_csv_parser(self):
self.parser_helper(filename='example.csv', parser=CSVParser(), include_label=False)
def test_give_classification_data_to_json_parser(self):
self.parser_helper(filename='classification.jsonl', parser=JSONParser())
def test_give_labeling_data_to_json_parser(self):
self.parser_helper(filename='labeling.jsonl', parser=JSONParser())
def test_give_seq2seq_data_to_json_parser(self):
self.parser_helper(filename='seq2seq.jsonl', parser=JSONParser())
def test_give_data_without_label_to_json_parser(self):
self.parser_helper(filename='example.jsonl', parser=JSONParser(), include_label=False)
def test_give_labeling_data_to_fasttext_parser(self):
self.parser_helper(filename='example_fasttext.txt', parser=FastTextParser())
def test_give_data_without_label_name_to_fasttext_parser(self):
with self.assertRaises(FileParseException):
self.parser_helper(filename='example_fasttext_label_tag_without_name.txt', parser=FastTextParser())
def test_give_data_without_text_to_fasttext_parser(self):
with self.assertRaises(FileParseException):
self.parser_helper(filename='example_fasttext_without_text.txt', parser=FastTextParser())

backend/api/tests/test_models.py (16 changes)

@@ -18,10 +18,6 @@ class TestTextClassificationProject(TestCase):
def setUpTestData(cls):
cls.project = mommy.make('TextClassificationProject')
def test_get_bundle_name(self):
template = self.project.get_bundle_name()
self.assertEqual(template, 'document_classification')
def test_get_annotation_serializer(self):
serializer = self.project.get_annotation_serializer()
self.assertEqual(serializer, DocumentAnnotationSerializer)
@@ -38,10 +34,6 @@ class TestSequenceLabelingProject(TestCase):
def setUpTestData(cls):
cls.project = mommy.make('SequenceLabelingProject')
def test_get_bundle_name(self):
template = self.project.get_bundle_name()
self.assertEqual(template, 'sequence_labeling')
def test_get_annotation_serializer(self):
serializer = self.project.get_annotation_serializer()
self.assertEqual(serializer, SequenceAnnotationSerializer)
@@ -58,10 +50,6 @@ class TestSeq2seqProject(TestCase):
def setUpTestData(cls):
cls.project = mommy.make('Seq2seqProject')
def test_get_bundle_name(self):
template = self.project.get_bundle_name()
self.assertEqual(template, 'seq2seq')
def test_get_annotation_serializer(self):
serializer = self.project.get_annotation_serializer()
self.assertEqual(serializer, Seq2seqAnnotationSerializer)
@@ -78,10 +66,6 @@ class TestSpeech2textProject(TestCase):
def setUpTestData(cls):
cls.project = mommy.make('Speech2textProject')
def test_get_bundle_name(self):
template = self.project.get_bundle_name()
self.assertEqual(template, 'speech2text')
def test_get_annotation_serializer(self):
serializer = self.project.get_annotation_serializer()
self.assertEqual(serializer, Speech2textAnnotationSerializer)

backend/api/tests/test_utils.py (172 changes)

@@ -1,172 +0,0 @@
import io
from django.test import TestCase
from seqeval.metrics.sequence_labeling import get_entities
from ..exceptions import FileParseException
from ..models import Document, Label
from ..utils import (AudioParser, BaseStorage, ClassificationStorage,
CoNLLParser, Seq2seqStorage, SequenceLabelingStorage,
iterable_to_io)
class TestBaseStorage(TestCase):
def test_extract_label(self):
data = [{'labels': ['positive']}, {'labels': ['negative']}]
actual = BaseStorage.extract_label(data)
self.assertEqual(actual, [['positive'], ['negative']])
def test_exclude_created_labels(self):
labels = ['positive', 'negative']
created = {'positive': Label(text='positive')}
actual = BaseStorage.exclude_created_labels(labels, created)
self.assertEqual(actual, ['negative'])
def test_to_serializer_format(self):
labels = ['positive']
created = {}
actual = BaseStorage.to_serializer_format(labels, created)
self.assertEqual(len(actual), 1)
self.assertEqual(actual[0]['text'], 'positive')
self.assertIsNone(actual[0]['prefix_key'])
self.assertEqual(actual[0]['suffix_key'], 'p')
self.assertIsNotNone(actual[0]['background_color'])
self.assertIsNotNone(actual[0]['text_color'])
def test_get_shortkey_without_existing_shortkey(self):
label = 'positive'
created = {}
actual = BaseStorage.get_shortkey(label, created)
self.assertEqual(actual, ('p', None))
def test_get_shortkey_with_existing_shortkey(self):
label = 'positive'
created = {('p', None)}
actual = BaseStorage.get_shortkey(label, created)
self.assertEqual(actual, ('p', 'ctrl'))
def test_update_saved_labels(self):
saved = {'positive': Label(text='positive', text_color='#000000')}
new = [Label(text='positive', text_color='#ffffff')]
actual = BaseStorage.update_saved_labels(saved, new)
self.assertEqual(actual['positive'].text_color, '#ffffff')
class TestClassificationStorage(TestCase):
def test_extract_unique_labels(self):
labels = [['positive'], ['positive', 'negative'], ['negative']]
actual = ClassificationStorage.extract_unique_labels(labels)
self.assertCountEqual(actual, ['positive', 'negative'])
def test_make_annotations(self):
docs = [Document(text='a', id=1), Document(text='b', id=2), Document(text='c', id=3)]
labels = [['positive'], ['positive', 'negative'], ['negative']]
saved_labels = {'positive': Label(text='positive', id=1), 'negative': Label(text='negative', id=2)}
actual = ClassificationStorage.make_annotations(docs, labels, saved_labels)
self.assertCountEqual(actual, [
{'document': 1, 'label': 1},
{'document': 2, 'label': 1},
{'document': 2, 'label': 2},
{'document': 3, 'label': 2},
])
class TestSequenceLabelingStorage(TestCase):
def test_extract_unique_labels(self):
labels = [[[0, 1, 'LOC']], [[3, 4, 'ORG']]]
actual = SequenceLabelingStorage.extract_unique_labels(labels)
self.assertCountEqual(actual, ['LOC', 'ORG'])
def test_make_annotations(self):
docs = [Document(text='a', id=1), Document(text='b', id=2)]
labels = [[[0, 1, 'LOC']], [[3, 4, 'ORG']]]
saved_labels = {'LOC': Label(text='LOC', id=1), 'ORG': Label(text='ORG', id=2)}
actual = SequenceLabelingStorage.make_annotations(docs, labels, saved_labels)
self.assertEqual(actual, [
{'document': 1, 'label': 1, 'start_offset': 0, 'end_offset': 1},
{'document': 2, 'label': 2, 'start_offset': 3, 'end_offset': 4},
])
class TestSeq2seqStorage(TestCase):
def test_make_annotations(self):
docs = [Document(text='a', id=1), Document(text='b', id=2)]
labels = [['Hello!'], ['How are you?', "What's up?"]]
actual = Seq2seqStorage.make_annotations(docs, labels)
self.assertEqual(actual, [
{'document': 1, 'text': 'Hello!'},
{'document': 2, 'text': 'How are you?'},
{'document': 2, 'text': "What's up?"},
])
class TestCoNLLParser(TestCase):
def test_calc_char_offset(self):
f = io.BytesIO(
b"EU\tORG\n"
b"rejects\t_\n"
b"German\tMISC\n"
b"call\t_\n"
)
actual = next(CoNLLParser().parse(f))[0]
self.assertEqual(actual, {
'text': 'EU rejects German call',
'labels': [[0, 2, 'ORG'], [11, 17, 'MISC']]
})
class TestAudioParser(TestCase):
def test_parse_mp3(self):
f = io.BytesIO(b'...')
f.name = 'test.mp3'
actual = next(AudioParser().parse(f))
self.assertEqual(actual, [{
'audio': 'data:audio/mpeg;base64,Li4u',
'meta': '{"filename": "test.mp3"}',
}])
def test_parse_unknown(self):
f = io.BytesIO(b'...')
f.name = 'unknown.unknown'
with self.assertRaises(FileParseException):
next(AudioParser().parse(f))
class TestIterableToIO(TestCase):
def test(self):
def iterable():
yield b'fo'
yield b'o\nbar\n'
yield b'baz\nrest'
stream = iterable_to_io(iterable())
stream = io.TextIOWrapper(stream)
self.assertEqual(stream.readlines(), ['foo\n', 'bar\n', 'baz\n', 'rest'])

backend/api/urls.py (5 changes)

@@ -157,11 +157,6 @@ urlpatterns = [
view=views.Features.as_view(),
name='features'
),
path(
route='cloud-upload',
view=views.CloudUploadAPI.as_view(),
name='cloud_uploader'
),
path(
route='projects',
view=views.ProjectList.as_view(),

backend/api/utils.py (634 changes)

@@ -1,634 +0,0 @@
import base64
import csv
import io
import itertools
import json
import mimetypes
import re
from collections import defaultdict
import conllu
import pyexcel
from chardet import UniversalDetector
from colour import Color
from django.conf import settings
from django.db import transaction
from rest_framework.renderers import BaseRenderer, JSONRenderer
from seqeval.metrics.sequence_labeling import get_entities
from .exceptions import FileParseException
from .models import Label
from .serializers import DocumentSerializer, LabelSerializer
def extract_label(tag):
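# Strip a BIO/BIOES prefix from an NE tag: 'B-ORG' -> 'ORG'; a tag without a prefix is returned unchanged.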
ptn = re.compile(r'(B|I|E|S)-(.+)')
m = ptn.match(tag)
if m:
return m.groups()[1]
else:
return tag
class BaseStorage(object):
def __init__(self, data, project):
self.data = data
self.project = project
@transaction.atomic
def save(self, user):
raise NotImplementedError()
def save_doc(self, data):
serializer = DocumentSerializer(data=data, many=True)
serializer.is_valid(raise_exception=True)
doc = serializer.save(project=self.project)
return doc
def save_label(self, data):
serializer = LabelSerializer(data=data, many=True)
serializer.is_valid(raise_exception=True)
label = serializer.save(project=self.project)
return label
def save_annotation(self, data, user):
annotation_serializer = self.project.get_annotation_serializer()
serializer = annotation_serializer(data=data, many=True)
serializer.is_valid(raise_exception=True)
annotation = serializer.save(user=user)
return annotation
@classmethod
def extract_label(cls, data):
return [d.get('labels', []) for d in data]
@classmethod
def exclude_created_labels(cls, labels, created):
return [label for label in labels if label not in created]
@classmethod
def to_serializer_format(cls, labels, created):
existing_shortkeys = {(label.suffix_key, label.prefix_key)
for label in created.values()}
serializer_labels = []
for label in sorted(labels):
serializer_label = {'text': label}
shortkey = cls.get_shortkey(label, existing_shortkeys)
if shortkey:
serializer_label['suffix_key'] = shortkey[0]
serializer_label['prefix_key'] = shortkey[1]
existing_shortkeys.add(shortkey)
background_color = Color(pick_for=label)
text_color = Color('white') if background_color.get_luminance() < 0.5 else Color('black')
serializer_label['background_color'] = background_color.hex
serializer_label['text_color'] = text_color.hex
serializer_labels.append(serializer_label)
return serializer_labels
@classmethod
def get_shortkey(cls, label, existing_shortkeys):
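# Candidate suffix keys are the label's own letters, restricted to the model's valid suffix keys.
# Each is tried with no prefix key first, then with each model prefix key; returns None if every combination is taken.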
model_prefix_keys = [key for (key, _) in Label.PREFIX_KEYS]
prefix_keys = [None] + model_prefix_keys
model_suffix_keys = {key for (key, _) in Label.SUFFIX_KEYS}
suffix_keys = [key for key in label.lower() if key in model_suffix_keys]
for shortkey in itertools.product(suffix_keys, prefix_keys):
if shortkey not in existing_shortkeys:
return shortkey
return None
@classmethod
def update_saved_labels(cls, saved, new):
for label in new:
saved[label.text] = label
return saved
class PlainStorage(BaseStorage):
@transaction.atomic
def save(self, user):
for text in self.data:
self.save_doc(text)
class ClassificationStorage(BaseStorage):
"""Store json for text classification.
The format is as follows:
{"text": "Python is awesome!", "labels": ["positive"]}
...
"""
@transaction.atomic
def save(self, user):
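# Each item in self.data is a batch of records yielded by the parser; labels not yet in the project are created before the annotations are saved.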
saved_labels = {label.text: label for label in self.project.labels.all()}
for data in self.data:
docs = self.save_doc(data)
labels = self.extract_label(data)
unique_labels = self.extract_unique_labels(labels)
unique_labels = self.exclude_created_labels(unique_labels, saved_labels)
unique_labels = self.to_serializer_format(unique_labels, saved_labels)
new_labels = self.save_label(unique_labels)
saved_labels = self.update_saved_labels(saved_labels, new_labels)
annotations = self.make_annotations(docs, labels, saved_labels)
self.save_annotation(annotations, user)
@classmethod
def extract_unique_labels(cls, labels):
return set(itertools.chain(*labels))
@classmethod
def make_annotations(cls, docs, labels, saved_labels):
annotations = []
for doc, label in zip(docs, labels):
for name in label:
label = saved_labels[name]
annotations.append({'document': doc.id, 'label': label.id})
return annotations
class SequenceLabelingStorage(BaseStorage):
"""Upload jsonl for sequence labeling.
The format is as follows:
{"text": "Python is awesome!", "labels": [[0, 6, "Product"],]}
...
"""
@transaction.atomic
def save(self, user):
saved_labels = {label.text: label for label in self.project.labels.all()}
for data in self.data:
docs = self.save_doc(data)
labels = self.extract_label(data)
unique_labels = self.extract_unique_labels(labels)
unique_labels = self.exclude_created_labels(unique_labels, saved_labels)
unique_labels = self.to_serializer_format(unique_labels, saved_labels)
new_labels = self.save_label(unique_labels)
saved_labels = self.update_saved_labels(saved_labels, new_labels)
annotations = self.make_annotations(docs, labels, saved_labels)
self.save_annotation(annotations, user)
@classmethod
def extract_unique_labels(cls, labels):
return set([label for _, _, label in itertools.chain(*labels)])
@classmethod
def make_annotations(cls, docs, labels, saved_labels):
annotations = []
for doc, spans in zip(docs, labels):
for span in spans:
start_offset, end_offset, name = span
label = saved_labels[name]
annotations.append({'document': doc.id,
'label': label.id,
'start_offset': start_offset,
'end_offset': end_offset})
return annotations
class Seq2seqStorage(BaseStorage):
"""Store json for seq2seq.
The format is as follows:
{"text": "Hello, World!", "labels": ["こんにちは、世界!"]}
...
"""
@transaction.atomic
def save(self, user):
for data in self.data:
doc = self.save_doc(data)
labels = self.extract_label(data)
annotations = self.make_annotations(doc, labels)
self.save_annotation(annotations, user)
@classmethod
def make_annotations(cls, docs, labels):
annotations = []
for doc, texts in zip(docs, labels):
for text in texts:
annotations.append({'document': doc.id, 'text': text})
return annotations
class Speech2textStorage(BaseStorage):
"""Store json for speech2text.
The format is as follows:
{"audio": "data:audio/mpeg;base64,...", "transcription": "こんにちは、世界!"}
...
"""
@transaction.atomic
def save(self, user):
for data in self.data:
for audio in data:
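# Move the base64 audio payload into the document's text field before saving.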
audio['text'] = audio.pop('audio')
doc = self.save_doc(data)
annotations = self.make_annotations(doc, data)
self.save_annotation(annotations, user)
@classmethod
def make_annotations(cls, docs, data):
annotations = []
for doc, datum in zip(docs, data):
try:
annotations.append({'document': doc.id, 'text': datum['transcription']})
except KeyError:
continue
return annotations
class FileParser(object):
def parse(self, file):
raise NotImplementedError()
@staticmethod
def encode_metadata(data):
return json.dumps(data, ensure_ascii=False)
class CoNLLParser(FileParser):
"""Uploads CoNLL format file.
The file format is tab-separated values.
A blank line is required at the end of a sentence.
For example:
```
EU B-ORG
rejects O
German B-MISC
call O
to O
boycott O
British B-MISC
lamb O
. O
Peter B-PER
Blackburn I-PER
...
```
"""
def parse(self, file):
data = []
file = EncodedIO(file)
file = io.TextIOWrapper(file, encoding=file.encoding)
# Add check exception
field_parsers = {
"ne": lambda line, i: conllu.parser.parse_nullable_value(line[i]),
}
gen_parser = conllu.parse_incr(
file,
fields=("form", "ne"),
field_parsers=field_parsers
)
try:
for sentence in gen_parser:
if not sentence:
continue
if len(data) >= settings.IMPORT_BATCH_SIZE:
yield data
data = []
words, labels = [], []
for item in sentence:
word = item.get("form")
tag = item.get("ne")
if tag is not None:
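# Span offsets are character positions in the sentence text joined with single spaces.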
char_left = sum(map(len, words)) + len(words)
char_right = char_left + len(word)
span = [char_left, char_right, tag]
labels.append(span)
words.append(word)
# Create and add JSONL
data.append({'text': ' '.join(words), 'labels': labels})
except conllu.parser.ParseException as e:
raise FileParseException(line_num=-1, line=str(e))
if data:
yield data
class PlainTextParser(FileParser):
"""Uploads plain text.
The file format is as follows:
```
EU rejects German call to boycott British lamb.
President Obama is speaking at the White House.
...
```
"""
def parse(self, file):
file = EncodedIO(file)
file = io.TextIOWrapper(file, encoding=file.encoding)
while True:
batch = list(itertools.islice(file, settings.IMPORT_BATCH_SIZE))
if not batch:
break
yield [{'text': line.strip()} for line in batch]
class CSVParser(FileParser):
"""Uploads csv file.
The file format is comma separated values.
Column names are required at the top of a file.
For example:
```
text, label
"EU rejects German call to boycott British lamb.",Politics
"President Obama is speaking at the White House.",Politics
"He lives in Newark, Ohio.",Other
...
```
"""
def parse(self, file):
file = EncodedIO(file)
file = io.TextIOWrapper(file, encoding=file.encoding)
reader = csv.reader(file)
yield from ExcelParser.parse_excel_csv_reader(reader)
class ExcelParser(FileParser):
def parse(self, file):
excel_book = pyexcel.iget_book(file_type="xlsx", file_content=file.read())
# Handle multiple sheets
for sheet_name in excel_book.sheet_names():
reader = excel_book[sheet_name].to_array()
yield from self.parse_excel_csv_reader(reader)
@staticmethod
def parse_excel_csv_reader(reader):
columns = next(reader)
data = []
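# A single-column file whose first row is not the 'text' header is treated as headerless, so that row is kept as data.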
if len(columns) == 1 and columns[0] != 'text':
data.append({'text': columns[0]})
for i, row in enumerate(reader, start=2):
if len(data) >= settings.IMPORT_BATCH_SIZE:
yield data
data = []
# Only text column
if len(row) <= len(columns) and len(row) == 1:
data.append({'text': row[0]})
# Text, labels and metadata columns
elif 2 <= len(row) <= len(columns):
datum = dict(zip(columns, row))
text, label = datum.pop('text'), datum.pop('label')
meta = FileParser.encode_metadata(datum)
if label != '':
j = {'text': text, 'labels': [label], 'meta': meta}
else:
j = {'text': text, 'meta': meta}
data.append(j)
else:
raise FileParseException(line_num=i, line=row)
if data:
yield data
class JSONParser(FileParser):
def parse(self, file):
file = EncodedIO(file)
file = io.TextIOWrapper(file, encoding=file.encoding)
data = []
for i, line in enumerate(file, start=1):
if len(data) >= settings.IMPORT_BATCH_SIZE:
yield data
data = []
try:
j = json.loads(line)
j['meta'] = FileParser.encode_metadata(j.get('meta', {}))
data.append(j)
except json.decoder.JSONDecodeError:
raise FileParseException(line_num=i, line=line)
if data:
yield data
class FastTextParser(FileParser):
"""
Parse files in fastText format.
Labels are marked with the __label__ prefix
and the corresponding text comes afterwards on the same line.
For example:
```
__label__dog poodle
__label__house mansion
```
"""
def parse(self, file):
file = EncodedIO(file)
file = io.TextIOWrapper(file, encoding=file.encoding)
data = []
for i, line in enumerate(file, start=0):
if len(data) >= settings.IMPORT_BATCH_SIZE:
yield data
data = []
# Search labels and text, check correct syntax and append
labels = []
text = []
for token in line.rstrip().split(" "):
if token.startswith('__label__'):
if token == '__label__':
raise FileParseException(line_num=i, line=line)
labels.append(token[len('__label__'):])
else:
text.append(token)
# Check if text for labels is given
if not text:
raise FileParseException(line_num=i, line=line)
data.append({'text': " ".join(text), 'labels': labels})
if data:
yield data
class AudioParser(FileParser):
def parse(self, file):
file_type, _ = mimetypes.guess_type(file.name, strict=False)
if not file_type:
raise FileParseException(line_num=1, line='Unable to guess file type')
audio = base64.b64encode(file.read())
yield [{
'audio': f'data:{file_type};base64,{audio.decode("ascii")}',
'meta': json.dumps({'filename': file.name}),
}]
class JSONLRenderer(JSONRenderer):
def render(self, data, accepted_media_type=None, renderer_context=None):
"""
Render `data` into JSON, returning a bytestring.
"""
if data is None:
return bytes()
if not isinstance(data, list):
data = [data]
for d in data:
yield json.dumps(d,
cls=self.encoder_class,
ensure_ascii=self.ensure_ascii,
allow_nan=not self.strict) + '\n'
class FastTextPainter(object):
@staticmethod
def paint_labels(documents, labels):
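# Emit one line per document: space-joined '__label__<name>' tags (spaces in names become underscores) followed by the text.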
serializer = DocumentSerializer(documents, many=True)
serializer_labels = LabelSerializer(labels, many=True)
data = []
for d in serializer.data:
labels = []
for a in d['annotations']:
label_obj = [x for x in serializer_labels.data if x['id'] == a['label']][0]
labels.append('__label__{}'.format(label_obj['text'].replace(' ', '_')))
text = d['text'].replace('\n', ' ')
if labels:
data.append('{} {}'.format(' '.join(labels), text))
else:
data.append(text)
return data
class PlainTextRenderer(BaseRenderer):
media_type = 'text/plain'
format = 'txt'
charset = 'utf-8'
def render(self, data, accepted_media_type=None, renderer_context=None):
if data is None:
return bytes()
if not isinstance(data, list):
data = [data]
buffer = io.BytesIO()
for d in data:
buffer.write((d + '\n').encode(self.charset))
return buffer.getvalue()
class JSONPainter(object):
def paint(self, documents):
serializer = DocumentSerializer(documents, many=True)
data = []
for d in serializer.data:
d['meta'] = json.loads(d['meta'])
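# Drop internal fields from each annotation before export.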
for a in d['annotations']:
a.pop('id')
a.pop('prob')
a.pop('document')
data.append(d)
return data
@staticmethod
def paint_labels(documents, labels):
serializer_labels = LabelSerializer(labels, many=True)
serializer = DocumentSerializer(documents, many=True)
data = []
for d in serializer.data:
labels = []
for a in d['annotations']:
label_obj = [x for x in serializer_labels.data if x['id'] == a['label']][0]
label_text = label_obj['text']
label_start = a['start_offset']
label_end = a['end_offset']
labels.append([label_start, label_end, label_text])
d.pop('annotations')
d['labels'] = labels
d['meta'] = json.loads(d['meta'])
data.append(d)
return data
class CSVPainter(JSONPainter):
def paint(self, documents):
data = super().paint(documents)
res = []
for d in data:
annotations = d.pop('annotations')
for a in annotations:
res.append({**d, **a})
return res
def iterable_to_io(iterable, buffer_size=io.DEFAULT_BUFFER_SIZE):
"""See https://stackoverflow.com/a/20260030/3817588."""
class IterStream(io.RawIOBase):
def __init__(self):
self.leftover = None
def readable(self):
return True
def readinto(self, b):
try:
l = len(b) # We're supposed to return at most this much
chunk = self.leftover or next(iterable)
output, self.leftover = chunk[:l], chunk[l:]
b[:len(output)] = output
return len(output)
except StopIteration:
return 0 # indicate EOF
return io.BufferedReader(IterStream(), buffer_size=buffer_size)
class EncodedIO(io.RawIOBase):
def __init__(self, fobj, buffer_size=io.DEFAULT_BUFFER_SIZE, default_encoding='utf-8'):
buffer = b''
detector = UniversalDetector()
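# Read ahead until chardet is confident or the input is exhausted; the consumed bytes are kept so readinto() can replay them.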
while True:
read = fobj.read(buffer_size)
detector.feed(read)
buffer += read
if detector.done or len(read) < buffer_size:
break
if detector.done:
self.encoding = detector.result['encoding']
else:
self.encoding = default_encoding
self._fobj = fobj
self._buffer = buffer
def readable(self):
return self._fobj.readable()
def readinto(self, b):
l = len(b)
chunk = self._buffer or self._fobj.read(l)
output, self._buffer = chunk[:l], chunk[l:]
b[:len(output)] = output
return len(output)

backend/api/views/import_export.py (195 changes)

@@ -1,22 +1,7 @@
from django.conf import settings
from django.shortcuts import get_object_or_404, redirect
from libcloud import DriverType, get_driver
from libcloud.storage.types import (ContainerDoesNotExistError,
ObjectDoesNotExistError)
from rest_framework import status
from rest_framework.exceptions import ParseError, ValidationError
from rest_framework.parsers import MultiPartParser
from rest_framework.permissions import IsAuthenticated
from rest_framework.response import Response
from rest_framework.views import APIView
from rest_framework_csv.renderers import CSVRenderer
from ..models import Project
from ..permissions import IsProjectAdmin
from ..utils import (AudioParser, CoNLLParser, CSVPainter, CSVParser,
ExcelParser, FastTextPainter, FastTextParser,
JSONLRenderer, JSONPainter, JSONParser, PlainTextParser,
PlainTextRenderer, iterable_to_io)
class Features(APIView):
@@ -28,134 +13,52 @@ class Features(APIView):
})
class TextUploadAPI(APIView):
parser_classes = (MultiPartParser,)
permission_classes = [IsAuthenticated & IsProjectAdmin]
def post(self, request, *args, **kwargs):
if 'file' not in request.data:
raise ParseError('Empty content')
self.save_file(
user=request.user,
file=request.data['file'],
file_format=request.data['format'],
project_id=kwargs['project_id'],
)
return Response(status=status.HTTP_201_CREATED)
@classmethod
def save_file(cls, user, file, file_format, project_id):
project = get_object_or_404(Project, pk=project_id)
parser = cls.select_parser(file_format)
data = parser.parse(file)
storage = project.get_storage(data)
storage.save(user)
@classmethod
def select_parser(cls, file_format):
if file_format == 'plain':
return PlainTextParser()
elif file_format == 'csv':
return CSVParser()
elif file_format == 'json':
return JSONParser()
elif file_format == 'conll':
return CoNLLParser()
elif file_format == 'excel':
return ExcelParser()
elif file_format == 'audio':
return AudioParser()
elif file_format == 'fastText':
return FastTextParser()
else:
raise ValidationError('format {} is invalid.'.format(file_format))
class CloudUploadAPI(APIView):
permission_classes = TextUploadAPI.permission_classes
def get(self, request, *args, **kwargs):
try:
project_id = request.query_params['project_id']
file_format = request.query_params['upload_format']
cloud_container = request.query_params['container']
cloud_object = request.query_params['object']
except KeyError as ex:
raise ValidationError('query parameter {} is missing'.format(ex))
try:
cloud_file = self.get_cloud_object_as_io(cloud_container, cloud_object)
except ContainerDoesNotExistError:
raise ValidationError('cloud container {} does not exist'.format(cloud_container))
except ObjectDoesNotExistError:
raise ValidationError('cloud object {} does not exist'.format(cloud_object))
TextUploadAPI.save_file(
user=request.user,
file=cloud_file,
file_format=file_format,
project_id=project_id,
)
next_url = request.query_params.get('next')
if next_url == 'about:blank':
return Response(data='', content_type='text/plain', status=status.HTTP_201_CREATED)
if next_url:
return redirect(next_url)
return Response(status=status.HTTP_201_CREATED)
@classmethod
def get_cloud_object_as_io(cls, container_name, object_name):
provider = settings.CLOUD_BROWSER_APACHE_LIBCLOUD_PROVIDER.lower()
account = settings.CLOUD_BROWSER_APACHE_LIBCLOUD_ACCOUNT
key = settings.CLOUD_BROWSER_APACHE_LIBCLOUD_SECRET_KEY
driver = get_driver(DriverType.STORAGE, provider)
client = driver(account, key)
cloud_container = client.get_container(container_name)
cloud_object = cloud_container.get_object(object_name)
return iterable_to_io(cloud_object.as_stream())
class TextDownloadAPI(APIView):
permission_classes = TextUploadAPI.permission_classes
renderer_classes = (CSVRenderer, JSONLRenderer, PlainTextRenderer)
def get(self, request, *args, **kwargs):
format = request.query_params.get('q')
only_approved = request.query_params.get('onlyApproved')
project = get_object_or_404(Project, pk=self.kwargs['project_id'])
documents = (
project.documents.exclude(annotations_approved_by = None)
if only_approved == 'true'
else project.documents.all()
)
painter = self.select_painter(format)
# jsonl-textlabel format prints text labels while jsonl format prints annotations with label ids
# jsonl-textlabel format - "labels": [[0, 15, "PERSON"], ..]
# jsonl format - "annotations": [{"label": 5, "start_offset": 0, "end_offset": 2, "user": 1},..]
if format in ('jsonl', 'txt'):
labels = project.labels.all()
data = painter.paint_labels(documents, labels)
else:
data = painter.paint(documents)
return Response(data)
def select_painter(self, format):
if format == 'csv':
return CSVPainter()
elif format == 'jsonl' or format == 'json':
return JSONPainter()
elif format == 'txt':
return FastTextPainter()
else:
raise ValidationError('format {} is invalid.'.format(format))
# class CloudUploadAPI(APIView):
# permission_classes = TextUploadAPI.permission_classes
#
# def get(self, request, *args, **kwargs):
# try:
# project_id = request.query_params['project_id']
# file_format = request.query_params['upload_format']
# cloud_container = request.query_params['container']
# cloud_object = request.query_params['object']
# except KeyError as ex:
# raise ValidationError('query parameter {} is missing'.format(ex))
#
# try:
# cloud_file = self.get_cloud_object_as_io(cloud_container, cloud_object)
# except ContainerDoesNotExistError:
# raise ValidationError('cloud container {} does not exist'.format(cloud_container))
# except ObjectDoesNotExistError:
# raise ValidationError('cloud object {} does not exist'.format(cloud_object))
#
# TextUploadAPI.save_file(
# user=request.user,
# file=cloud_file,
# file_format=file_format,
# project_id=project_id,
# )
#
# next_url = request.query_params.get('next')
#
# if next_url == 'about:blank':
# return Response(data='', content_type='text/plain', status=status.HTTP_201_CREATED)
#
# if next_url:
# return redirect(next_url)
#
# return Response(status=status.HTTP_201_CREATED)
#
# @classmethod
# def get_cloud_object_as_io(cls, container_name, object_name):
# provider = settings.CLOUD_BROWSER_APACHE_LIBCLOUD_PROVIDER.lower()
# account = settings.CLOUD_BROWSER_APACHE_LIBCLOUD_ACCOUNT
# key = settings.CLOUD_BROWSER_APACHE_LIBCLOUD_SECRET_KEY
#
# driver = get_driver(DriverType.STORAGE, provider)
# client = driver(account, key)
#
# cloud_container = client.get_container(container_name)
# cloud_object = cloud_container.get_object(object_name)
#
# return iterable_to_io(cloud_object.as_stream())