Browse Source

Support batch import

pull/110/head
Hironsan 6 years ago
parent
commit
49d41416e4
9 changed files with 445 additions and 345 deletions
  1. 72
      app/server/api.py
  2. 44
      app/server/models.py
  3. 17
      app/server/serializers.py
  4. 13
      app/server/static/js/upload.js
  5. 2
      app/server/templates/admin/upload/base.html
  6. 2
      app/server/templates/admin/upload/sequence_labeling.html
  7. 129
      app/server/tests/test_api.py
  8. 509
      app/server/utils.py
  9. 2
      requirements.txt

72
app/server/api.py

@ -2,18 +2,23 @@ from collections import Counter
from django.shortcuts import get_object_or_404
from django_filters.rest_framework import DjangoFilterBackend
from django.db.models import Count
from rest_framework import generics, filters, status
from rest_framework.exceptions import ParseError
from rest_framework.exceptions import ParseError, ValidationError
from rest_framework.permissions import IsAuthenticated, IsAdminUser
from rest_framework.response import Response
from rest_framework.views import APIView
from rest_framework.parsers import MultiPartParser
from rest_framework_csv.renderers import CSVRenderer
from .filters import DocumentFilter
from .models import Project, Label, Document
from .permissions import IsAdminUserAndWriteOnly, IsProjectUser, IsOwnAnnotation
from .serializers import ProjectSerializer, LabelSerializer, DocumentSerializer
from .serializers import ProjectPolymorphicSerializer
from .utils import CSVParser, JSONParser, PlainTextParser, CoNLLParser
from .utils import JSONLRenderer
from .utils import JSONPainter, CSVPainter
class ProjectList(generics.ListCreateAPIView):
@ -51,24 +56,23 @@ class StatisticsAPI(APIView):
return Response(response)
def progress(self, project):
total = project.documents.count()
remaining = 0
docs = project.documents
annotation_class = project.get_annotation_class()
for d in project.documents.all():
count = annotation_class.objects.filter(document=d).count()
if count == 0:
remaining += 1
total = docs.count()
done = annotation_class.objects.filter(document_id__in=docs.all()).\
aggregate(Count('document', distinct=True))['document__count']
remaining = total - done
return {'total': total, 'remaining': remaining}
def label_per_data(self, project):
label_count = Counter()
user_count = Counter()
annotation_class = project.get_annotation_class()
for doc in project.documents.all():
annotations = annotation_class.objects.filter(document=doc.id)
for a in annotations:
label_count[a.label.text] += 1
user_count[a.user.username] += 1
docs = project.documents.all()
annotations = annotation_class.objects.filter(document_id__in=docs.all())
for d in annotations.values('label__text', 'user__username').annotate(Count('label'), Count('user')):
label_count[d['label__text']] += d['label__count']
user_count[d['user__username']] += d['user__count']
return label_count, user_count
@ -132,9 +136,14 @@ class AnnotationList(generics.ListCreateAPIView):
def get_queryset(self):
project = get_object_or_404(Project, pk=self.kwargs['project_id'])
model = project.get_annotation_class()
self.queryset = model.objects.filter(document=self.kwargs['doc_id'], user=self.request.user)
self.queryset = model.objects.filter(document=self.kwargs['doc_id'],
user=self.request.user)
return self.queryset
def create(self, request, *args, **kwargs):
    """Create an annotation for the document named in the URL.

    The ``doc_id`` URL keyword is copied into the request payload under the
    ``document`` key, then creation is delegated to the parent class.
    """
    # NOTE(review): request.data can be an immutable QueryDict for form
    # uploads; confirm this view only receives mutable (e.g. JSON) payloads.
    request.data['document'] = self.kwargs['doc_id']
    # Unpack the extra arguments so the parent receives them as
    # *args/**kwargs rather than as a tuple and a dict passed positionally.
    return super().create(request, *args, **kwargs)
def perform_create(self, serializer):
doc = get_object_or_404(Document, pk=self.kwargs['doc_id'])
serializer.save(document=doc, user=self.request.user)
@ -164,18 +173,41 @@ class TextUploadAPI(APIView):
if 'file' not in request.data:
raise ParseError('Empty content')
project = get_object_or_404(Project, pk=self.kwargs['project_id'])
handler = project.get_file_handler(request.data['format'])
handler.handle_uploaded_file(request.data['file'], self.request.user)
parser = self.select_parser(request.data['format'])
data = parser.parse(request.data['file'])
storage = project.get_storage(data)
storage.save(self.request.user)
return Response(status=status.HTTP_201_CREATED)
def select_parser(self, format):
    """Return a new parser instance for the given upload format name.

    Recognised names are 'plain', 'csv', 'json' and 'conll'.

    Raises:
        ValidationError: if ``format`` is none of the recognised names.
    """
    known_parsers = (
        ('plain', PlainTextParser),
        ('csv', CSVParser),
        ('json', JSONParser),
        ('conll', CoNLLParser),
    )
    for name, parser_class in known_parsers:
        if format == name:
            return parser_class()
    raise ValidationError('format {} is invalid.'.format(format))
class TextDownloadAPI(APIView):
permission_classes = (IsAuthenticated, IsProjectUser, IsAdminUser)
renderer_classes = (CSVRenderer, JSONLRenderer)
def get(self, request, *args, **kwargs):
project_id = self.kwargs['project_id']
format = request.query_params.get('q')
project = get_object_or_404(Project, pk=project_id)
handler = project.get_file_handler(format)
response = handler.render()
return response
project = get_object_or_404(Project, pk=self.kwargs['project_id'])
documents = project.documents.all()
painter = self.select_painter(format)
data = painter.paint(documents)
return Response(data)
def select_painter(self, format):
    """Return a new painter instance for the given download format name.

    Recognised names are 'csv' and 'json'.

    Raises:
        ValidationError: if ``format`` is neither of the recognised names.
    """
    known_painters = (
        ('csv', CSVPainter),
        ('json', JSONPainter),
    )
    for name, painter_class in known_painters:
        if format == name:
            return painter_class()
    raise ValidationError('format {} is invalid.'.format(format))

44
app/server/models.py

@ -58,7 +58,7 @@ class Project(PolymorphicModel):
def get_annotation_class(self):
raise NotImplementedError()
def get_file_handler(self, format):
def get_storage(self, data):
raise NotImplementedError()
def __str__(self):
@ -87,17 +87,9 @@ class TextClassificationProject(Project):
def get_annotation_class(self):
return DocumentAnnotation
def get_file_handler(self, format):
from .utils import JsonClassificationHandler
from .utils import CSVClassificationHandler
from .utils import PlainTextHandler
if format == 'plain':
return PlainTextHandler(self)
elif format == 'csv':
return CSVClassificationHandler(self)
elif format == 'json':
return JsonClassificationHandler(self)
raise ValidationError('format {} is invalid.'.format(format))
def get_storage(self, data):
from .utils import ClassificationStorage
return ClassificationStorage(data, self)
class SequenceLabelingProject(Project):
@ -122,17 +114,9 @@ class SequenceLabelingProject(Project):
def get_annotation_class(self):
return SequenceAnnotation
def get_file_handler(self, format):
from .utils import JsonLabelingHandler
from .utils import PlainTextHandler
from .utils import CoNLLHandler
if format == 'plain':
return PlainTextHandler(self)
elif format == 'conll':
return CoNLLHandler(self)
elif format == 'json':
return JsonLabelingHandler(self)
raise ValidationError('format {} is invalid.'.format(format))
def get_storage(self, data):
from .utils import SequenceLabelingStorage
return SequenceLabelingStorage(data, self)
class Seq2seqProject(Project):
@ -157,17 +141,9 @@ class Seq2seqProject(Project):
def get_annotation_class(self):
return Seq2seqAnnotation
def get_file_handler(self, format):
from .utils import JsonSeq2seqHandler
from .utils import CSVSeq2seqHandler
from .utils import PlainTextHandler
if format == 'plain':
return PlainTextHandler(self)
elif format == 'csv':
return CSVSeq2seqHandler(self)
elif format == 'json':
return JsonSeq2seqHandler(self)
raise ValidationError('format {} is invalid.'.format(format))
def get_storage(self, data):
from .utils import Seq2seqStorage
return Seq2seqStorage(data, self)
class Label(models.Model):

17
app/server/serializers.py

@ -87,34 +87,29 @@ class ProjectFilteredPrimaryKeyRelatedField(serializers.PrimaryKeyRelatedField):
class DocumentAnnotationSerializer(serializers.ModelSerializer):
# label = ProjectFilteredPrimaryKeyRelatedField(queryset=Label.objects.all())
label = serializers.PrimaryKeyRelatedField(queryset=Label.objects.all())
document = serializers.PrimaryKeyRelatedField(queryset=Document.objects.all())
class Meta:
model = DocumentAnnotation
fields = ('id', 'prob', 'label', 'user')
fields = ('id', 'prob', 'label', 'user', 'document')
read_only_fields = ('user', )
def create(self, validated_data):
annotation = DocumentAnnotation.objects.create(**validated_data)
return annotation
class SequenceAnnotationSerializer(serializers.ModelSerializer):
#label = ProjectFilteredPrimaryKeyRelatedField(queryset=Label.objects.all())
label = serializers.PrimaryKeyRelatedField(queryset=Label.objects.all())
document = serializers.PrimaryKeyRelatedField(queryset=Document.objects.all())
class Meta:
model = SequenceAnnotation
fields = ('id', 'prob', 'label', 'start_offset', 'end_offset', 'user')
fields = ('id', 'prob', 'label', 'start_offset', 'end_offset', 'user', 'document')
read_only_fields = ('user',)
def create(self, validated_data):
annotation = SequenceAnnotation.objects.create(**validated_data)
return annotation
class Seq2seqAnnotationSerializer(serializers.ModelSerializer):
document = serializers.PrimaryKeyRelatedField(queryset=Document.objects.all())
class Meta:
model = Seq2seqAnnotation
fields = ('id', 'text', 'user')
fields = ('id', 'text', 'user', 'document')
read_only_fields = ('user',)

13
app/server/static/js/upload.js

@ -8,11 +8,13 @@ const vm = new Vue({
file: '',
messages: [],
format: 'json',
isLoading: false,
},
methods: {
upload() {
this.isLoading = true;
this.file = this.$refs.file.files[0];
let formData = new FormData();
formData.append('file', this.file);
@ -27,8 +29,10 @@ const vm = new Vue({
.then((response) => {
console.log(response);
this.messages = [];
window.location = window.location.pathname.split('/').slice(0, -1).join('/');
})
.catch((error) => {
this.isLoading = false;
if ('detail' in error.response.data) {
this.messages.push(error.response.data.detail);
} else if ('text' in error.response.data) {
@ -38,6 +42,14 @@ const vm = new Vue({
},
download() {
let headers = {};
if (this.format === 'csv') {
headers.Accept = 'text/csv; charset=utf-8';
headers['Content-Type'] = 'text/csv; charset=utf-8';
} else {
headers.Accept = 'application/json';
headers['Content-Type'] = 'application/json';
}
HTTP({
url: 'docs/download',
method: 'GET',
@ -45,6 +57,7 @@ const vm = new Vue({
params: {
q: this.format,
},
headers,
}).then((response) => {
const url = window.URL.createObjectURL(new Blob([response.data]));
const link = document.createElement('a');

2
app/server/templates/admin/upload/base.html

@ -27,7 +27,7 @@
<div class="file has-name is-primary">
<label class="file-label">
<input class="file-input" type="file" ref="file" name="file" required v-on:change="upload()">
<span class="file-cta">
<span class="file-cta button" v-bind:class="{'is-loading': isLoading}">
<span class="file-icon">
<i class="fas fa-upload"></i>
</span>

2
app/server/templates/admin/upload/sequence_labeling.html

@ -3,7 +3,7 @@
{% block select-format-area %}
<label class="radio">
<input type="radio" name="format" value="conll" :checked="format=='conll'" v-model="format">
CoNll
CoNLL
</label>
<label class="radio">
<input type="radio" name="format" value="json" :checked="format=='json'" v-model="format">

129
app/server/tests/test_api.py

@ -4,10 +4,9 @@ from rest_framework import status
from rest_framework.reverse import reverse
from rest_framework.test import APITestCase
from model_mommy import mommy
from ..models import User, SequenceAnnotation, Document, Label, Seq2seqAnnotation, DocumentAnnotation
from ..models import User, SequenceAnnotation, Document
from ..models import DOCUMENT_CLASSIFICATION, SEQUENCE_LABELING, SEQ2SEQ
from ..utils import CoNLLHandler, CSVClassificationHandler, CSVSeq2seqHandler
from ..utils import JsonClassificationHandler, JsonLabelingHandler, JsonSeq2seqHandler
from ..utils import PlainTextParser, CoNLLParser, JSONParser, CSVParser
from ..exceptions import FileParseException
DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
@ -733,92 +732,52 @@ class TestUploader(APITestCase):
format='plain',
expected_status=status.HTTP_201_CREATED)
def test_can_upload_data_without_label(self):
self.upload_test_helper(url=self.classification_url,
filename='example.jsonl',
format='json',
expected_status=status.HTTP_201_CREATED)
class TestFileHandler(APITestCase):
@classmethod
def setUpTestData(cls):
cls.super_user_name = 'super_user_name'
cls.super_user_pass = 'super_user_pass'
# Todo: change super_user to project_admin.
cls.super_user = User.objects.create_superuser(username=cls.super_user_name,
password=cls.super_user_pass,
email='fizz@buzz.com')
cls.classification_project = mommy.make('server.TextClassificationProject', users=[cls.super_user])
cls.labeling_project = mommy.make('server.SequenceLabelingProject', users=[cls.super_user])
cls.seq2seq_project = mommy.make('server.Seq2seqProject', users=[cls.super_user])
def handler_test_helper(self, filename, handler):
class TestParser(APITestCase):
def parser_helper(self, filename, parser, include_label=True):
with open(os.path.join(DATA_DIR, filename), mode='rb') as f:
handler.handle_uploaded_file(f, self.super_user)
result = parser.parse(f)
for data in result:
for r in data:
self.assertIn('text', r)
if include_label:
self.assertIn('labels', r)
def test_give_valid_data_to_conll_handler(self):
self.handler_test_helper(filename='labeling.conll',
handler=CoNLLHandler(self.labeling_project))
self.assertEqual(Document.objects.count(), 3)
self.assertEqual(Label.objects.count(), 3) # LOC, PER, O
self.assertEqual(SequenceAnnotation.objects.count(), 20) # num of annotation line
def test_give_valid_data_to_conll_parser(self):
self.parser_helper(filename='labeling.conll', parser=CoNLLParser())
def test_give_invalid_data_to_conll_handler(self):
def test_plain_parser(self):
self.parser_helper(filename='example.txt', parser=PlainTextParser(), include_label=False)
def test_give_invalid_data_to_conll_parser(self):
with self.assertRaises(FileParseException):
self.handler_test_helper(filename='labeling.invalid.conll',
handler=CoNLLHandler(self.labeling_project))
self.assertEqual(Document.objects.count(), 0)
self.assertEqual(Label.objects.count(), 0)
self.assertEqual(SequenceAnnotation.objects.count(), 0)
def test_give_valid_data_to_csv_classification_handler(self):
self.handler_test_helper(filename='example.csv',
handler=CSVClassificationHandler(self.classification_project))
self.assertEqual(Document.objects.count(), 3)
self.assertEqual(Label.objects.count(), 2)
self.assertEqual(DocumentAnnotation.objects.count(), 3)
def test_give_valid_data_to_csv_seq2seq_handler(self):
self.handler_test_helper(filename='example.csv',
handler=CSVSeq2seqHandler(self.seq2seq_project))
self.assertEqual(Document.objects.count(), 3)
self.assertEqual(Seq2seqAnnotation.objects.count(), 3)
def test_give_valid_data_to_json_classification_handler(self):
self.handler_test_helper(filename='classification.jsonl',
handler=JsonClassificationHandler(self.classification_project))
self.assertEqual(Document.objects.count(), 3)
self.assertEqual(Label.objects.count(), 2)
self.assertEqual(DocumentAnnotation.objects.count(), 4)
def test_give_valid_data_to_json_labeling_handler(self):
self.handler_test_helper(filename='labeling.jsonl',
handler=JsonLabelingHandler(self.labeling_project))
self.assertEqual(Document.objects.count(), 3)
self.assertEqual(Label.objects.count(), 3)
self.assertEqual(SequenceAnnotation.objects.count(), 4)
def test_give_valid_data_to_json_seq2seq_handler(self):
self.handler_test_helper(filename='seq2seq.jsonl',
handler=JsonSeq2seqHandler(self.seq2seq_project))
self.assertEqual(Document.objects.count(), 3)
self.assertEqual(Seq2seqAnnotation.objects.count(), 4)
def test_give_data_without_label_to_json_classification_handler(self):
self.handler_test_helper(filename='example.jsonl',
handler=JsonClassificationHandler(self.classification_project))
self.assertEqual(Document.objects.count(), 3)
self.assertEqual(Label.objects.count(), 0)
self.assertEqual(DocumentAnnotation.objects.count(), 0)
def test_give_data_without_label_to_json_labeling_handler(self):
self.handler_test_helper(filename='example.jsonl',
handler=JsonLabelingHandler(self.labeling_project))
self.assertEqual(Document.objects.count(), 3)
self.assertEqual(Label.objects.count(), 0)
self.assertEqual(SequenceAnnotation.objects.count(), 0)
def test_give_data_without_label_to_json_seq2seq_handler(self):
self.handler_test_helper(filename='example.jsonl',
handler=JsonSeq2seqHandler(self.seq2seq_project))
self.assertEqual(Document.objects.count(), 3)
self.assertEqual(Seq2seqAnnotation.objects.count(), 0)
self.parser_helper(filename='labeling.invalid.conll',
parser=CoNLLParser())
def test_give_classification_data_to_csv_parser(self):
self.parser_helper(filename='example.csv', parser=CSVParser())
def test_give_seq2seq_data_to_csv_parser(self):
self.parser_helper(filename='example.csv', parser=CSVParser())
def test_give_classification_data_to_json_parser(self):
self.parser_helper(filename='classification.jsonl', parser=JSONParser())
def test_give_labeling_data_to_json_parser(self):
self.parser_helper(filename='labeling.jsonl', parser=JSONParser())
def test_give_seq2seq_data_to_json_parser(self):
self.parser_helper(filename='seq2seq.jsonl', parser=JSONParser())
def test_give_data_without_label_to_json_parser(self):
self.parser_helper(filename='example.jsonl', parser=JSONParser(), include_label=False)
class TestDownloader(APITestCase):
@ -858,10 +817,10 @@ class TestDownloader(APITestCase):
format='csv',
expected_status=status.HTTP_200_OK)
def test_cannot_download_labeling_csv(self):
def test_can_download_labeling_csv(self):
self.download_test_helper(url=self.labeling_url,
format='csv',
expected_status=status.HTTP_400_BAD_REQUEST)
expected_status=status.HTTP_200_OK)
def test_can_download_seq2seq_csv(self):
self.download_test_helper(url=self.seq2seq_url,

509
app/server/utils.py

@ -1,15 +1,18 @@
import csv
import io
import itertools
import json
import re
from collections import defaultdict
from django.db import transaction
from django.http import HttpResponse
from rest_framework.exceptions import ValidationError
from rest_framework.renderers import JSONRenderer
from seqeval.metrics.sequence_labeling import get_entities
from app.settings import IMPORT_BATCH_SIZE
from .exceptions import FileParseException
from .models import Label
from .serializers import DocumentSerializer, LabelSerializer
from .serializers import SequenceAnnotationSerializer, DocumentAnnotationSerializer, Seq2seqAnnotationSerializer
def extract_label(tag):
@ -21,44 +24,232 @@ def extract_label(tag):
return tag
class FileHandler(object):
annotation_serializer = None
class BaseStorage(object):
def __init__(self, project):
def __init__(self, data, project):
self.data = data
self.project = project
@transaction.atomic
def handle_uploaded_file(self, file, user):
raise NotImplementedError()
def parse(self, file):
raise NotImplementedError()
def render(self):
def save(self, user):
raise NotImplementedError()
def save_doc(self, data):
serializer = DocumentSerializer(data=data)
serializer = DocumentSerializer(data=data, many=True)
serializer.is_valid(raise_exception=True)
doc = serializer.save(project=self.project)
return doc
def save_label(self, data):
from .models import Label
label = Label.objects.filter(project=self.project, **data).first()
serializer = LabelSerializer(label, data=data)
serializer = LabelSerializer(data=data, many=True)
serializer.is_valid(raise_exception=True)
label = serializer.save(project=self.project)
return label
def save_annotation(self, data, doc, user):
serializer = self.annotation_serializer(data=data)
def save_annotation(self, data, user):
annotation_serializer = self.project.get_annotation_serializer()
serializer = annotation_serializer(data=data, many=True)
serializer.is_valid(raise_exception=True)
annotation = serializer.save(document=doc, user=user)
annotation = serializer.save(user=user)
return annotation
def extract_label(self, data):
    """Collect the ``labels`` value of every parsed record.

    Records without a ``labels`` key contribute an empty list.

    Example:
        >>> data = [{"labels": ["positive"]}, {"labels": ["negative"]}]
        >>> self.extract_label(data)
        [["positive"], ["negative"]]
    """
    extracted = []
    for record in data:
        extracted.append(record.get('labels', []))
    return extracted
def exclude_created_labels(self, labels, created):
    """Return the labels that are not already present in ``created``.

    Example:
        >>> labels = ["positive", "negative"]
        >>> created = {"positive": ...}
        >>> self.exclude_created_labels(labels, created)
        ["negative"]
    """
    pending = []
    for name in labels:
        if name in created:
            continue
        pending.append(name)
    return pending
class CoNLLHandler(FileHandler):
def to_serializer_format(self, labels):
    """Wrap each label name in a ``{"text": name}`` dict.

    The result is the list-of-dicts shape that is handed to
    ``save_label``.

    Example:
        >>> labels = ["positive"]
        >>> self.to_serializer_format(labels)
        [{"text": "positive"}]
    """
    return [{'text': label} for label in labels]
def update_saved_labels(self, saved, new):
    """Register newly created labels in the ``saved`` lookup.

    Each object in ``new`` is stored in ``saved`` under its ``text``
    attribute. ``saved`` is modified in place and also returned.

    Example:
        >>> saved = {'positive': ...}
        >>> new = [<Label: negative>]
        >>> self.update_saved_labels(saved, new)
        {'positive': ..., 'negative': <Label: negative>}
    """
    saved.update((label.text, label) for label in new)
    return saved
class PlainStorage(BaseStorage):
    """Storage that persists documents only, without labels or annotations."""

    @transaction.atomic
    def save(self, user):
        """Pass every item of ``self.data`` to ``save_doc``.

        ``user`` is accepted for interface compatibility and is not used.
        """
        # presumably each item is a batch (list) of {"text": ...} records,
        # as produced by the parsers -- verify against caller
        for batch in self.data:
            self.save_doc(batch)
class ClassificationStorage(BaseStorage):
    """Persist parsed data for a text classification project.

    Each parsed record is expected to look like:

        {"text": "Python is awesome!", "labels": ["positive"]}
    """

    @transaction.atomic
    def save(self, user):
        """Save documents, not-yet-known labels and annotations per batch."""
        # Lookup of label text -> label object, seeded from the project.
        known_labels = {}
        for existing in self.project.labels.all():
            known_labels[existing.text] = existing

        for batch in self.data:
            documents = self.save_doc(batch)
            label_lists = self.extract_label(batch)
            candidates = self.extract_unique_labels(label_lists)
            missing = self.exclude_created_labels(candidates, known_labels)
            payload = self.to_serializer_format(missing)
            created = self.save_label(payload)
            known_labels = self.update_saved_labels(known_labels, created)
            annotations = self.make_annotations(documents, label_lists, known_labels)
            self.save_annotation(annotations, user)

    def extract_unique_labels(self, labels):
        """Return the set of distinct label names across all records.

        Example:
            >>> labels = [["positive"], ["positive", "negative"], ["negative"]]
            >>> self.extract_unique_labels(labels)
            {"positive", "negative"}
        """
        return {name for names in labels for name in names}

    def make_annotations(self, docs, labels, saved_labels):
        """Build the annotation dicts handed to the annotation serializer.

        ``docs`` and ``labels`` are paired positionally; one dict is
        produced per (document, label name) combination.

        Example:
            >>> docs = ["<Document: a>", "<Document: b>", "<Document: c>"]
            >>> labels = [["positive"], ["positive", "negative"], ["negative"]]
            >>> saved_labels = {"positive": "<Label: positive>", "negative": "<Label: negative>"}
            >>> self.make_annotations(docs, labels, saved_labels)
            [{"document": 1, "label": 1}, {"document": 2, "label": 1},
             {"document": 2, "label": 2}, {"document": 3, "label": 2}]
        """
        annotations = []
        for doc, names in zip(docs, labels):
            for name in names:
                saved = saved_labels[name]
                annotations.append({'document': doc.id, 'label': saved.id})
        return annotations
class SequenceLabelingStorage(BaseStorage):
    """Persist parsed data for a sequence labeling project.

    Each parsed record is expected to look like:

        {"text": "Python is awesome!", "labels": [[0, 6, "Product"]]}
    """

    @transaction.atomic
    def save(self, user):
        """Save documents, not-yet-known labels and span annotations per batch."""
        # Lookup of label text -> label object, seeded from the project.
        known_labels = {}
        for existing in self.project.labels.all():
            known_labels[existing.text] = existing

        for batch in self.data:
            documents = self.save_doc(batch)
            span_lists = self.extract_label(batch)
            candidates = self.extract_unique_labels(span_lists)
            missing = self.exclude_created_labels(candidates, known_labels)
            payload = self.to_serializer_format(missing)
            created = self.save_label(payload)
            known_labels = self.update_saved_labels(known_labels, created)
            annotations = self.make_annotations(documents, span_lists, known_labels)
            self.save_annotation(annotations, user)

    def extract_unique_labels(self, labels):
        """Return the set of distinct label names used by any span.

        Example:
            >>> labels = [[[0, 1, "LOC"]], [[3, 4, "ORG"]]]
            >>> self.extract_unique_labels(labels)
            {"LOC", "ORG"}
        """
        return {name for spans in labels for _, _, name in spans}

    def make_annotations(self, docs, labels, saved_labels):
        """Build the span annotation dicts handed to the annotation serializer.

        ``docs`` and ``labels`` are paired positionally; one dict is
        produced per ``[start, end, name]`` span.

        Example:
            >>> docs = ["<Document: a>", "<Document: b>"]
            >>> labels = [[[0, 1, "LOC"]], [[3, 4, "ORG"]]]
            >>> saved_labels = {"LOC": "<Label: LOC>", "ORG": "<Label: ORG>"}
            >>> self.make_annotations(docs, labels, saved_labels)
            [
                {"document": 1, "label": 1, "start_offset": 0, "end_offset": 1},
                {"document": 2, "label": 2, "start_offset": 3, "end_offset": 4}
            ]
        """
        annotations = []
        for doc, spans in zip(docs, labels):
            for start_offset, end_offset, name in spans:
                saved = saved_labels[name]
                annotations.append({
                    'document': doc.id,
                    'label': saved.id,
                    'start_offset': start_offset,
                    'end_offset': end_offset,
                })
        return annotations
class Seq2seqStorage(BaseStorage):
    """Persist parsed data for a seq2seq project.

    Each parsed record is expected to look like:

        {"text": "Hello, World!", "labels": ["こんにちは、世界!"]}
    """

    @transaction.atomic
    def save(self, user):
        """Save documents and their target-text annotations per batch."""
        for batch in self.data:
            documents = self.save_doc(batch)
            text_lists = self.extract_label(batch)
            annotations = self.make_annotations(documents, text_lists)
            self.save_annotation(annotations, user)

    def make_annotations(self, docs, labels):
        """Build the annotation dicts handed to the annotation serializer.

        ``docs`` and ``labels`` are paired positionally; one dict is
        produced per target text.

        Example:
            >>> docs = ["<Document: a>", "<Document: b>"]
            >>> labels = [["Hello!"], ["How are you?", "What's up?"]]
            >>> self.make_annotations(docs, labels)
            [{"document": 1, "text": "Hello!"}, {"document": 2, "text": "How are you?"},
             {"document": 2, "text": "What's up?"}]
        """
        annotations = []
        for doc, texts in zip(docs, labels):
            annotations.extend({'document': doc.id, 'text': text} for text in texts)
        return annotations
class FileParser(object):
    """Base class for upload parsers; subclasses implement ``parse``."""

    def parse(self, file):
        """Parse ``file``; not implemented on the base class."""
        raise NotImplementedError()
class CoNLLParser(FileParser):
"""Uploads CoNLL format file.
The file format is tab-separated values.
@ -80,26 +271,19 @@ class CoNLLHandler(FileHandler):
...
```
"""
annotation_serializer = SequenceAnnotationSerializer
@transaction.atomic
def handle_uploaded_file(self, file, user):
for words, tags in self.parse(file):
start_offset = 0
sent = ' '.join(words)
doc = self.save_doc({'text': sent})
for word, tag in zip(words, tags):
label = extract_label(tag)
label = self.save_label({'text': label})
data = {'start_offset': start_offset,
'end_offset': start_offset + len(word),
'label': label.id}
start_offset += len(word) + 1
self.save_annotation(data, doc, user)
def parse(self, file):
"""Store json for seq2seq.
Return format:
{"text": "Python is awesome!", "labels": [[0, 6, "Product"],]}
...
"""
words, tags = [], []
data = []
for i, line in enumerate(file, start=1):
if len(data) >= IMPORT_BATCH_SIZE:
yield data
data = []
line = line.decode('utf-8')
line = line.strip()
if line:
@ -110,16 +294,42 @@ class CoNLLHandler(FileHandler):
words.append(word)
tags.append(tag)
else:
yield words, tags
j = self.calc_char_offset(words, tags)
data.append(j)
words, tags = [], []
if len(words) > 0:
yield words, tags
def render(self):
raise ValidationError("This project type doesn't support CoNLL format.")
class PlainTextHandler(FileHandler):
j = self.calc_char_offset(words, tags)
data.append(j)
yield data
def calc_char_offset(self, words, tags):
    """Convert a tagged token sequence into text plus character spans.

    The text is the tokens joined by single spaces. Every entity returned
    by ``get_entities(tags)`` becomes a ``[char_start, char_end, label]``
    span, where ``char_end`` is exclusive.

    Offsets are derived from token positions instead of searching the
    joined text for the entity string. A text search picks the wrong
    position when the same string occurs earlier in the sentence, either
    as a non-entity token or inside another token.

    Examples:
        >>> words = ['EU', 'rejects', 'German', 'call']
        >>> tags = ['B-ORG', 'O', 'B-MISC', 'O']
        >>> entities = get_entities(tags)
        >>> entities
        [['ORG', 0, 0], ['MISC', 2, 2]]
        >>> self.calc_char_offset(words, tags)
        {
            'text': 'EU rejects German call',
            'labels': [[0, 2, 'ORG'], [11, 17, 'MISC']]
        }
    """
    # Character offset at which each token starts in the joined text.
    word_starts = []
    cursor = 0
    for word in words:
        word_starts.append(cursor)
        cursor += len(word) + 1  # +1 for the joining space

    j = {'text': ' '.join(words), 'labels': []}
    # get_entities yields (label, first_token_index, last_token_index),
    # with the last index inclusive.
    for label, start_offset, end_offset in get_entities(tags):
        char_left = word_starts[start_offset]
        char_right = word_starts[end_offset] + len(words[end_offset])
        j['labels'].append([char_left, char_right, label])
    return j
class PlainTextParser(FileParser):
"""Uploads plain text.
The file format is as follows:
@ -129,21 +339,16 @@ class PlainTextHandler(FileHandler):
...
```
"""
@transaction.atomic
def handle_uploaded_file(self, file, user):
for text in self.parse(file):
self.save_doc({'text': text})
def parse(self, file):
    """Yield the uploaded plain text as batches of document dicts.

    ``file`` is a binary file object; it is decoded as UTF-8. Each yielded
    batch is a list of at most ``IMPORT_BATCH_SIZE`` dicts of the form
    ``{'text': <stripped line>}``, one per input line.
    """
    file = io.TextIOWrapper(file, encoding='utf-8')
    while True:
        batch = list(itertools.islice(file, IMPORT_BATCH_SIZE))
        if not batch:
            # End the generator with a plain return: under PEP 479
            # (Python 3.7+) raising StopIteration inside a generator is
            # converted to RuntimeError.
            return
        yield [{'text': line.strip()} for line in batch]
class CSVHandler(FileHandler):
class CSVParser(FileParser):
"""Uploads csv file.
The file format is comma separated values.
@ -161,163 +366,81 @@ class CSVHandler(FileHandler):
file = io.TextIOWrapper(file, encoding='utf-8')
reader = csv.reader(file)
columns = next(reader)
data = []
for i, row in enumerate(reader, start=2):
if len(data) >= IMPORT_BATCH_SIZE:
yield data
data = []
if len(row) == len(columns) and len(row) >= 2:
text, label = row[:2]
meta = json.dumps(dict(zip(columns[2:], row[2:])))
data = {'text': text, 'meta': meta}
yield data, label
j = {'text': text, 'labels': [label], 'meta': meta}
data.append(j)
else:
raise FileParseException(line_num=i, line=row)
if data:
yield data
def render(self):
queryset = self.project.documents.all()
serializer = DocumentSerializer(queryset, many=True)
filename = '_'.join(self.project.name.lower().split())
response = HttpResponse(content_type='text/csv')
response['Content-Disposition'] = 'attachment; filename="{}.csv"'.format(filename)
writer = csv.writer(response)
columns = ['id', 'text', 'label', 'user']
meta_keys = self.get_meta_keys(serializer.data)
columns.extend(meta_keys)
writer.writerow(columns)
for d in serializer.data:
meta = json.loads(d['meta'])
for a in d['annotations']:
row = self.make_row(d, a)
row.extend([meta[k] for k in meta_keys])
writer.writerow(row)
return response
def get_meta_keys(self, data):
if len(data):
meta = json.loads(data[0]['meta'])
return sorted(meta.keys())
else:
return []
def make_row(self, doc, annotation):
raise NotImplementedError('Please implement in subclass.')
class CSVClassificationHandler(CSVHandler):
annotation_serializer = DocumentAnnotationSerializer
@transaction.atomic
def handle_uploaded_file(self, file, user):
for data, label in self.parse(file):
doc = self.save_doc(data)
label = self.save_label({'text': label})
self.save_annotation({'label': label.id}, doc, user)
def make_row(self, doc, annotation):
row = [doc['id'], doc['text'], annotation['label'], annotation['user']]
return row
class CSVSeq2seqHandler(CSVHandler):
annotation_serializer = Seq2seqAnnotationSerializer
@transaction.atomic
def handle_uploaded_file(self, file, user):
for data, label in self.parse(file):
doc = self.save_doc(data)
self.save_annotation({'text': label}, doc, user)
def make_row(self, doc, annotation):
row = [doc['id'], doc['text'], annotation['text'], annotation['user']]
return row
class JSONParser(FileParser):
class JsonHandler(FileHandler):
"""Uploads jsonl file.
The file format is as follows:
```
{"text": "example1"}
{"text": "example2"}
...
```
"""
def parse(self, file):
data = []
for i, line in enumerate(file, start=1):
if len(data) >= IMPORT_BATCH_SIZE:
yield data
data = []
try:
j = json.loads(line)
j['meta'] = json.dumps(j.get('meta', {}))
yield j
data.append(j)
except json.decoder.JSONDecodeError:
raise FileParseException(line_num=i, line=line)
if data:
yield data
def render(self):
queryset = self.project.documents.all()
serializer = DocumentSerializer(queryset, many=True)
filename = '_'.join(self.project.name.lower().split())
response = HttpResponse(content_type='application/json')
response['Content-Disposition'] = 'attachment; filename="{}.jsonl"'.format(filename)
for d in serializer.data:
d['meta'] = json.loads(d['meta'])
dump = json.dumps(d, ensure_ascii=False)
response.write(dump + '\n')
return response
class JSONLRenderer(JSONRenderer):
class JsonClassificationHandler(JsonHandler):
"""Upload jsonl for text classification.
def render(self, data, accepted_media_type=None, renderer_context=None):
"""
Render `data` into JSON, returning a bytestring.
"""
if data is None:
return bytes()
The format is as follows:
```
{"text": "Python is awesome!", "labels": ["positive"]}
...
```
"""
annotation_serializer = DocumentAnnotationSerializer
@transaction.atomic
def handle_uploaded_file(self, file, user):
for data in self.parse(file):
doc = self.save_doc(data)
for label in data.get('labels', []):
label = self.save_label({'text': label})
self.save_annotation({'label': label.id}, doc, user)
class JsonLabelingHandler(JsonHandler):
"""Upload jsonl for sequence labeling.
The format is as follows:
```
{"text": "Python is awesome!", "labels": [[0, 6, "Product"],]}
...
```
"""
annotation_serializer = SequenceAnnotationSerializer
if not isinstance(data, list):
data = [data]
@transaction.atomic
def handle_uploaded_file(self, file, user):
for data in self.parse(file):
doc = self.save_doc(data)
for start_offset, end_offset, label in data.get('labels', []):
label = self.save_label({'text': label})
data = {'label': label.id,
'start_offset': start_offset,
'end_offset': end_offset}
self.save_annotation(data, doc, user)
for d in data:
yield json.dumps(d,
cls=self.encoder_class,
ensure_ascii=self.ensure_ascii,
allow_nan=not self.strict) + '\n'
class JsonSeq2seqHandler(JsonHandler):
"""Upload jsonl for seq2seq.
The format is as follows:
```
{"text": "Hello, World!", "labels": ["こんにちは、世界!"]}
...
```
"""
annotation_serializer = Seq2seqAnnotationSerializer
class JSONPainter(object):
    """Prepare serialized documents for JSON download."""

    def paint(self, documents):
        """Serialize ``documents`` and tidy each record for export.

        For every serialized document the ``meta`` JSON string is decoded,
        and the ``id``, ``prob`` and ``document`` keys are removed from
        each of its annotations.

        Returns:
            A list of the serialized document records.
        """
        serializer = DocumentSerializer(documents, many=True)
        data = []
        for d in serializer.data:
            d['meta'] = json.loads(d['meta'])
            for a in d['annotations']:
                # Not every annotation serializer exposes all of these keys
                # (the seq2seq serializer has no 'prob'), so a missing key
                # is skipped instead of raising KeyError.
                for key in ('id', 'prob', 'document'):
                    a.pop(key, None)
            data.append(d)
        return data
class CSVPainter(JSONPainter):
    """Prepare serialized documents for CSV download, one row per annotation."""

    def paint(self, documents):
        """Flatten each document and its annotations into row dicts.

        Each row merges the document's fields (without ``annotations``) with
        one annotation's fields; annotation values win on key clashes.
        Documents without annotations produce no rows.
        """
        rows = []
        for doc in super().paint(documents):
            doc_annotations = doc.pop('annotations')
            rows.extend({**doc, **annotation} for annotation in doc_annotations)
        return rows

2
requirements.txt

@ -7,6 +7,7 @@ django-widget-tweaks==1.4.2
django-polymorphic==2.0.3
django-rest-polymorphic==0.1.8
djangorestframework==3.8.2
djangorestframework-csv==2.1.0
djangorestframework-filters==0.10.2
djangorestframework-xml==1.4.0
Faker==0.8.8
@ -18,6 +19,7 @@ psycopg2==2.7.5
python-dateutil==2.7.3
pytz==2018.4
six==1.11.0
seqeval==0.0.6
social-auth-app-django==3.1.0
social-auth-core[azuread]==3.0.0
text-unidecode==1.2

Loading…
Cancel
Save