Browse Source

Add file uploader for CoNLL format

Corresponding to issue #11
pull/110/head
Hironsan 6 years ago
parent
commit
04e2e55693
6 changed files with 183 additions and 1 deletions
  1. 69
      app/server/api.py
  2. 3
      app/server/api_urls.py
  3. 16
      app/server/exceptions.py
  4. 22
      app/server/tests/data/conll.tsv
  5. 22
      app/server/tests/data/conll_wrong.tsv
  6. 52
      app/server/tests/test_api.py

69
app/server/api.py

@ -1,13 +1,17 @@
from collections import Counter
from itertools import chain
from django.db import transaction
from django.shortcuts import get_object_or_404
from django_filters.rest_framework import DjangoFilterBackend
from rest_framework import generics, filters
from rest_framework import generics, filters, status
from rest_framework.exceptions import ParseError
from rest_framework.permissions import IsAuthenticated, IsAdminUser
from rest_framework.response import Response
from rest_framework.views import APIView
from rest_framework.parsers import MultiPartParser
from .exceptions import CoNLLParseException
from .models import Project, Label, Document
from .models import SequenceAnnotation
from .permissions import IsAdminUserAndWriteOnly, IsProjectUser, IsMyEntity
@ -129,3 +133,66 @@ class EntityDetail(generics.RetrieveUpdateDestroyAPIView):
serializer_class = SequenceAnnotationSerializer
lookup_url_kwarg = 'entity_id'
permission_classes = (IsAuthenticated, IsProjectUser, IsMyEntity)
class CoNLLFileUploadAPI(APIView):
    """Upload a CoNLL-style file and store each sentence as a document.

    The file is expected to be UTF-8 text with exactly two tab-separated
    columns per line: the word and its tag. Sentences are separated by
    blank lines. For example (``<TAB>`` stands for a tab character):

    ```
    EU<TAB>B-ORG
    rejects<TAB>O
    German<TAB>B-MISC
    call<TAB>O
    to<TAB>O
    boycott<TAB>O
    British<TAB>B-MISC
    lamb<TAB>O
    .<TAB>O

    Peter<TAB>B-PER
    Blackburn<TAB>I-PER
    ...
    ```

    Only the words are stored: each sentence becomes one document whose
    text is the words joined by single spaces. The tags are checked for
    presence but are not saved.
    """
    parser_classes = (MultiPartParser,)
    permission_classes = (IsAuthenticated, IsProjectUser, IsAdminUser)

    def post(self, request, *args, **kwargs):
        """Handle a multipart upload whose file field is named ``file``.

        Raises:
            ParseError: if the request carries no ``file`` field.
            CoNLLParseException: if a line of the file is malformed.

        Returns:
            An empty response with status 201 once the file is processed.
        """
        if 'file' not in request.FILES:
            raise ParseError('Empty content')
        self.handle_uploaded_file(request.FILES['file'])
        return Response(status=status.HTTP_201_CREATED)

    @transaction.atomic
    def handle_uploaded_file(self, file):
        """Create one document per sentence of ``file`` in the URL's project.

        The method runs inside ``transaction.atomic``, so an exception raised
        part-way through (a parse error or a validation error) rolls back the
        documents created so far.
        """
        project = get_object_or_404(Project, pk=self.kwargs['project_id'])
        for words in self.parse(file):
            serializer = DocumentSerializer(data={'text': self.words_to_sent(words)})
            serializer.is_valid(raise_exception=True)
            serializer.save(project=project)

    def words_to_sent(self, words):
        """Join the words of one sentence with single spaces."""
        return ' '.join(words)

    def parse(self, file):
        """Yield the list of words of each sentence found in ``file``.

        Args:
            file: an iterable of ``bytes`` lines (the uploaded file).

        Yields:
            A non-empty list of words for every sentence.

        Raises:
            CoNLLParseException: if a line is not valid UTF-8 or does not
                consist of exactly two tab-separated columns. Line numbers
                in the error are 1-based.
        """
        words = []
        for i, raw_line in enumerate(file, start=1):
            try:
                line = raw_line.decode('utf-8')
            except UnicodeDecodeError:
                # Report undecodable input as a client error instead of
                # letting the decode failure escape as a server error.
                shown = raw_line.decode('utf-8', errors='replace').strip()
                raise CoNLLParseException(line_num=i, line=shown)
            line = line.strip()
            if not line:
                # A blank line closes the current sentence. Leading or
                # repeated blank lines must not produce empty sentences.
                if words:
                    yield words
                    words = []
                continue
            columns = line.split('\t')
            if len(columns) != 2:
                raise CoNLLParseException(line_num=i, line=line)
            # The second column (the tag) is required but not stored.
            words.append(columns[0])
        # The final sentence may not be followed by a blank line.
        if words:
            yield words

3
app/server/api_urls.py

@ -5,6 +5,7 @@ from .api import ProjectList, ProjectDetail
from .api import LabelList, LabelDetail
from .api import DocumentList, DocumentDetail
from .api import EntityList, EntityDetail
from .api import CoNLLFileUploadAPI
from .api import StatisticsAPI
@ -25,6 +26,8 @@ urlpatterns = [
EntityList.as_view(), name='entity_list'),
path('projects/<int:project_id>/docs/<int:doc_id>/entities/<int:entity_id>',
EntityDetail.as_view(), name='entity_detail'),
path('projects/<int:project_id>/conll_uploader',
CoNLLFileUploadAPI.as_view(), name='conll_uploader'),
]
urlpatterns = format_suffix_patterns(urlpatterns, allowed=['json', 'xml'])

16
app/server/exceptions.py

@ -0,0 +1,16 @@
from rest_framework import status
from rest_framework.exceptions import APIException
class FileParseException(APIException):
    """API exception type for file parsing failures.

    Adds nothing to ``APIException``; presumably intended as a common base
    for format-specific parse errors (TODO confirm with the author).
    """
class CoNLLParseException(APIException):
    """API error describing a malformed line in an uploaded CoNLL file.

    The error detail is built from ``default_detail`` and carries the
    offending line number and the line's content. ``status_code`` is
    400 (bad request).
    """
    status_code = status.HTTP_400_BAD_REQUEST
    default_detail = 'Invalid file format, line {}: {}'
    default_code = 'invalid'

    def __init__(self, line_num, line, code=None):
        """Build the detail message for ``line`` at position ``line_num``.

        Args:
            line_num: number of the offending line, as reported to the client.
            line: content of the offending line.
            code: optional error code forwarded to ``APIException``.
        """
        message = self.default_detail.format(line_num, line)
        super().__init__(detail=message, code=code)

22
app/server/tests/data/conll.tsv

@ -0,0 +1,22 @@
SOCCER O
- O
JAPAN B-LOC
GET O
LUCKY O
WIN O
, O
CHINA B-PER
IN O
SURPRISE O
DEFEAT O
. O
Nadim B-PER
Ladki I-PER
AL-AIN B-LOC
, O
United B-LOC
Arab I-LOC
Emirates I-LOC
1996-12-06 O

22
app/server/tests/data/conll_wrong.tsv

@ -0,0 +1,22 @@
SOCCERO
- O
JAPAN B-LOC
GET O
LUCKY O
WIN O
, O
CHINA B-PER
IN O
SURPRISE O
DEFEAT O
. O
Nadim B-PER
Ladki I-PER
AL-AIN B-LOC
, O
United B-LOC
Arab I-LOC
Emirates I-LOC
1996-12-06 O

52
app/server/tests/test_api.py

@ -1,8 +1,11 @@
import os
from rest_framework import status
from rest_framework.reverse import reverse
from rest_framework.test import APITestCase
from mixer.backend.django import mixer
from ..models import User, SequenceAnnotation, Document
DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
class TestProjectListAPI(APITestCase):
@ -613,3 +616,52 @@ class TestFilter(APITestCase):
seq_annotations__label__id=self.label1.id).values()
for d1, d2 in zip(response.data['results'], docs):
self.assertEqual(d1['id'], d2['id'])
class TestUploader(APITestCase):
    """API tests for the CoNLL upload endpoint (URL name ``conll_uploader``)."""

    @classmethod
    def setUpTestData(cls):
        """Create a regular member, a superuser and a project holding both."""
        cls.project_member_name = 'project_member_name'
        cls.project_member_pass = 'project_member_pass'
        project_member = User.objects.create_user(username=cls.project_member_name,
                                                  password=cls.project_member_pass)
        cls.super_user_name = 'super_user_name'
        cls.super_user_pass = 'super_user_pass'
        # Todo: change super_user to project_admin.
        super_user = User.objects.create_superuser(username=cls.super_user_name,
                                                   password=cls.super_user_pass,
                                                   email='fizz@buzz.com')
        cls.main_project = mixer.blend('server.Project', users=[project_member, super_user])

    def _upload(self, filename, field='file'):
        """Log in as the superuser and POST a fixture file to the uploader.

        Args:
            filename: name of a file inside ``DATA_DIR``.
            field: multipart form field under which the file is sent.

        Returns:
            The response returned by the test client.
        """
        self.client.login(username=self.super_user_name,
                          password=self.super_user_pass)
        url = reverse(viewname='conll_uploader', args=[self.main_project.id])
        # Binary mode sends the fixture bytes unchanged, independent of the
        # platform's default text encoding and newline translation.
        with open(os.path.join(DATA_DIR, filename), 'rb') as f:
            return self.client.post(url, data={field: f})

    def test_can_upload_conll_format_file(self):
        """A well-formed file creates one document per sentence."""
        self.assertEqual(Document.objects.count(), 0)
        response = self._upload('conll.tsv')
        self.assertEqual(response.status_code, status.HTTP_201_CREATED)
        self.assertEqual(Document.objects.count(), 3)

    def test_cannot_upload_wrong_conll_format_file(self):
        """A malformed file is rejected and leaves no documents behind."""
        self.assertEqual(Document.objects.count(), 0)
        response = self._upload('conll_wrong.tsv')
        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
        self.assertEqual(Document.objects.count(), 0)

    def test_cannot_upload_wrong_filename(self):
        """A file sent under an unexpected form field name is rejected."""
        self.assertEqual(Document.objects.count(), 0)
        response = self._upload('conll.tsv', field='fizz')
        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
        # The request is refused before any parsing, so nothing is stored.
        self.assertEqual(Document.objects.count(), 0)
Loading…
Cancel
Save