Browse Source

Add PlainText, CSV, JSONL uploader

pull/110/head
Hironsan 5 years ago
parent
commit
f4f55b4b03
13 changed files with 183 additions and 43 deletions
  1. 128
      app/server/api.py
  2. 8
      app/server/api_urls.py
  3. 4
      app/server/exceptions.py
  4. 4
      app/server/tests/data/example.invalid.1.csv
  5. 4
      app/server/tests/data/example.invalid.2.csv
  6. 0
      app/server/tests/data/example.invalid.conll
  7. 0
      app/server/tests/data/example.jsonl
  8. 3
      app/server/tests/data/example.txt
  9. 1
      app/server/tests/data/example.valid.1.csv
  10. 4
      app/server/tests/data/example.valid.2.csv
  11. 0
      app/server/tests/data/example.valid.conll
  12. 66
      app/server/tests/test_api.py
  13. 4
      app/server/tests/test_views.py

128
app/server/api.py

@ -1,3 +1,6 @@
import csv
import io
import json
from collections import Counter
from itertools import chain
@ -11,7 +14,7 @@ from rest_framework.response import Response
from rest_framework.views import APIView
from rest_framework.parsers import MultiPartParser
from .exceptions import CoNLLParseException
from .exceptions import FileParseException
from .models import Project, Label, Document
from .models import SequenceAnnotation
from .permissions import IsAdminUserAndWriteOnly, IsProjectUser, IsMyEntity
@ -135,7 +138,26 @@ class EntityDetail(generics.RetrieveUpdateDestroyAPIView):
permission_classes = (IsAuthenticated, IsProjectUser, IsMyEntity)
class CoNLLFileUploadAPI(APIView):
class TextUploadAPI(APIView):
    """Shared base view for endpoints that import documents from an uploaded file.

    The multipart form must carry the upload under the ``file`` field.
    Concrete uploaders override ``handle_uploaded_file`` and ``parse``;
    the versions defined here only raise ``NotImplementedError``.
    """
    permission_classes = (IsAuthenticated, IsProjectUser, IsAdminUser)
    parser_classes = (MultiPartParser,)

    def post(self, request, *args, **kwargs):
        """Hand the uploaded ``file`` part to ``handle_uploaded_file``.

        Returns an empty 201 response once the handler finishes, and raises
        ``ParseError`` when the request has no ``file`` part.
        """
        uploads = request.FILES
        if 'file' in uploads:
            self.handle_uploaded_file(uploads['file'])
            return Response(status=status.HTTP_201_CREATED)
        raise ParseError('Empty content')

    @transaction.atomic
    def handle_uploaded_file(self, file):
        """Store the contents of ``file``; must be overridden by subclasses."""
        raise NotImplementedError()

    def parse(self, file):
        """Iterate over the records in ``file``; must be overridden by subclasses."""
        raise NotImplementedError()
class CoNLLFileUploadAPI(TextUploadAPI):
"""Uploads CoNLL format file.
The file format is tab-separated values.
@ -157,14 +179,6 @@ class CoNLLFileUploadAPI(APIView):
...
```
"""
parser_classes = (MultiPartParser,)
permission_classes = (IsAuthenticated, IsProjectUser, IsAdminUser)
def post(self, request, *args, **kwargs):
if 'file' not in request.FILES:
raise ParseError('Empty content')
self.handle_uploaded_file(request.FILES['file'])
return Response(status=status.HTTP_201_CREATED)
@transaction.atomic
def handle_uploaded_file(self, file):
@ -188,7 +202,7 @@ class CoNLLFileUploadAPI(APIView):
try:
word, tag = line.split('\t')
except ValueError:
raise CoNLLParseException(line_num=i, line=line)
raise FileParseException(line_num=i, line=line)
words.append(word)
tags.append(tag)
else:
@ -196,3 +210,95 @@ class CoNLLFileUploadAPI(APIView):
words, tags = [], []
if len(words) > 0:
yield words
class PlainTextUploadAPI(TextUploadAPI):
    """Uploads plain text.

    Every non-blank line of the file becomes one document.
    The file format is as follows:
    ```
    EU rejects German call to boycott British lamb.
    President Obama is speaking at the White House.
    ...
    ```
    """

    @transaction.atomic
    def handle_uploaded_file(self, file):
        """Create one document in the target project per parsed line.

        The project is looked up from the ``project_id`` URL kwarg (404 if it
        does not exist). Serializer validation errors propagate to the caller.
        """
        project = get_object_or_404(Project, pk=self.kwargs['project_id'])
        for text in self.parse(file):
            serializer = DocumentSerializer(data={'text': text})
            serializer.is_valid(raise_exception=True)
            serializer.save(project=project)

    def parse(self, file):
        """Yield each non-blank line of ``file``, stripped of surrounding whitespace.

        ``file`` is a binary file object; it is decoded as UTF-8.
        """
        stream = io.TextIOWrapper(file, encoding='utf-8')
        for line in stream:
            text = line.strip()
            # A blank line (e.g. a trailing newline at the end of the file)
            # carries no text to annotate, so it is skipped instead of being
            # turned into an empty document.
            if text:
                yield text
class CSVUploadAPI(TextUploadAPI):
    """Uploads csv file.

    The file format is comma separated values.
    Column names are required at the top of a file.
    For example:
    ```
    text, label(optional)
    "EU rejects German call to boycott British lamb.",
    "President Obama is speaking at the White House.",
    "He lives in Newark, Ohio.",
    ...
    ```
    """

    @transaction.atomic
    def handle_uploaded_file(self, file):
        """Create one document in the target project per parsed CSV row.

        The project is looked up from the ``project_id`` URL kwarg (404 if it
        does not exist). Only the text column is stored; the label column is
        parsed but not persisted here.
        """
        project = get_object_or_404(Project, pk=self.kwargs['project_id'])
        for text, _label in self.parse(file):
            serializer = DocumentSerializer(data={'text': text})
            serializer.is_valid(raise_exception=True)
            serializer.save(project=project)

    def parse(self, file):
        """Yield ``(text, label)`` pairs from a two-column CSV file.

        ``file`` is a binary file object; it is decoded as UTF-8. The first
        row is treated as the header and is not yielded. Completely blank
        rows are skipped.

        Raises:
            FileParseException: if the header or a data row does not have
                exactly two columns.
        """
        # newline='' lets the csv module do its own newline handling, so
        # line breaks inside quoted fields are preserved untouched.
        stream = io.TextIOWrapper(file, encoding='utf-8', newline='')
        reader = csv.reader(stream)
        columns = next(reader, None)
        if columns is None:  # empty file: nothing to import
            return
        # Data rows start on the second row; the header was row 1.
        for i, row in enumerate(reader, start=2):
            if not row:
                # csv.reader returns [] for an empty line (e.g. a trailing
                # newline); that is not a malformed record.
                continue
            if len(row) == len(columns) == 2:  # text with a label
                text, label = row
                yield text, label
            else:
                raise FileParseException(line_num=i, line=row)
class JSONLUploadAPI(TextUploadAPI):
    """Uploads jsonl file.

    Each line holds one JSON value.
    The file format is as follows:
    ```
    {"text": "example1"}
    {"text": "example2"}
    ...
    ```
    """

    @transaction.atomic
    def handle_uploaded_file(self, file):
        """Create one document in the target project per parsed JSON line.

        The project is looked up from the ``project_id`` URL kwarg (404 if it
        does not exist). Each decoded value is passed to the serializer as-is,
        so serializer validation errors propagate to the caller.
        """
        project = get_object_or_404(Project, pk=self.kwargs['project_id'])
        for data in self.parse(file):
            serializer = DocumentSerializer(data=data)
            serializer.is_valid(raise_exception=True)
            serializer.save(project=project)

    def parse(self, file):
        """Yield the decoded JSON value of every line in ``file``.

        ``file`` is a binary file object; it is decoded as UTF-8, the same
        way the plain-text and CSV uploaders read their input, so the line
        reported in an error is text rather than raw bytes.

        Raises:
            FileParseException: if a line is not valid JSON.
        """
        stream = io.TextIOWrapper(file, encoding='utf-8')
        for i, line in enumerate(stream, start=1):
            # Keep only the decode inside the try so that nothing raised
            # by the consumer of this generator is mistaken for a parse error.
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                raise FileParseException(line_num=i, line=line)
            yield record

8
app/server/api_urls.py

@ -5,7 +5,7 @@ from .api import ProjectList, ProjectDetail
from .api import LabelList, LabelDetail
from .api import DocumentList, DocumentDetail
from .api import EntityList, EntityDetail
from .api import CoNLLFileUploadAPI
from .api import CoNLLFileUploadAPI, CSVUploadAPI, JSONLUploadAPI, PlainTextUploadAPI
from .api import StatisticsAPI
@ -26,8 +26,14 @@ urlpatterns = [
EntityList.as_view(), name='entity_list'),
path('projects/<int:project_id>/docs/<int:doc_id>/entities/<int:entity_id>',
EntityDetail.as_view(), name='entity_detail'),
path('projects/<int:project_id>/plain_uploader',
PlainTextUploadAPI.as_view(), name='plain_uploader'),
path('projects/<int:project_id>/conll_uploader',
CoNLLFileUploadAPI.as_view(), name='conll_uploader'),
path('projects/<int:project_id>/csv_uploader',
CSVUploadAPI.as_view(), name='csv_uploader'),
path('projects/<int:project_id>/json_uploader',
JSONLUploadAPI.as_view(), name='json_uploader'),
]
urlpatterns = format_suffix_patterns(urlpatterns, allowed=['json', 'xml'])

4
app/server/exceptions.py

@ -3,10 +3,6 @@ from rest_framework.exceptions import APIException
class FileParseException(APIException):
pass
class CoNLLParseException(APIException):
status_code = status.HTTP_400_BAD_REQUEST
default_detail = 'Invalid file format, line {}: {}'
default_code = 'invalid'

4
app/server/tests/data/example.invalid.1.csv

@ -0,0 +1,4 @@
text, label
AAA
BBB
CCC

4
app/server/tests/data/example.invalid.2.csv

@ -0,0 +1,4 @@
text, label
AAA, Negative, Positive
BBB, Negative
CCC, Negative

app/server/tests/data/conll_wrong.tsv → app/server/tests/data/example.invalid.conll

app/server/tests/data/test.jsonl → app/server/tests/data/example.jsonl

3
app/server/tests/data/example.txt

@ -0,0 +1,3 @@
example1
example2
example3

app/server/tests/data/test.csv → app/server/tests/data/example.valid.1.csv

@ -1,3 +1,4 @@
text
AAA
BBB
CCC

4
app/server/tests/data/example.valid.2.csv

@ -0,0 +1,4 @@
text, label
AAA, Positive
BBB, Positive
CCC, Negative

app/server/tests/data/conll.tsv → app/server/tests/data/example.valid.conll

66
app/server/tests/test_api.py

@ -633,35 +633,51 @@ class TestUploader(APITestCase):
password=cls.super_user_pass,
email='fizz@buzz.com')
cls.main_project = mixer.blend('server.Project', users=[project_member, super_user])
cls.conll_url = reverse(viewname='conll_uploader', args=[cls.main_project.id])
cls.csv_url = reverse(viewname='csv_uploader', args=[cls.main_project.id])
cls.json_url = reverse(viewname='json_uploader', args=[cls.main_project.id])
cls.plain_url = reverse(viewname='plain_uploader', args=[cls.main_project.id])
def test_can_upload_conll_format_file(self):
self.assertEqual(Document.objects.count(), 0)
def setUp(self):
self.client.login(username=self.super_user_name,
password=self.super_user_pass)
filename = 'conll.tsv'
with open(os.path.join(DATA_DIR, filename)) as f:
url = reverse(viewname='conll_uploader', args=[self.main_project.id])
response = self.client.post(url, data={'file': f})
self.assertEqual(response.status_code, status.HTTP_201_CREATED)
self.assertEqual(Document.objects.count(), 3)
def test_cannot_upload_wrong_conll_format_file(self):
self.assertEqual(Document.objects.count(), 0)
self.client.login(username=self.super_user_name,
password=self.super_user_pass)
filename = 'conll_wrong.tsv'
def upload_test_helper(self, filename, url, expected_status):
with open(os.path.join(DATA_DIR, filename)) as f:
url = reverse(viewname='conll_uploader', args=[self.main_project.id])
response = self.client.post(url, data={'file': f})
self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
self.assertEqual(Document.objects.count(), 0)
self.assertEqual(response.status_code, expected_status)
def test_cannot_upload_wrong_filename(self):
self.assertEqual(Document.objects.count(), 0)
self.client.login(username=self.super_user_name,
password=self.super_user_pass)
filename = 'conll.tsv'
with open(os.path.join(DATA_DIR, filename)) as f:
url = reverse(viewname='conll_uploader', args=[self.main_project.id])
response = self.client.post(url, data={'fizz': f})
self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
def test_can_upload_conll_format_file(self):
self.upload_test_helper(filename='example.valid.conll',
url=self.conll_url,
expected_status=status.HTTP_201_CREATED)
def test_cannot_upload_wrong_conll_format_file(self):
self.upload_test_helper(filename='example.invalid.conll',
url=self.conll_url,
expected_status=status.HTTP_400_BAD_REQUEST)
def test_can_upload_csv_with_label(self):
self.upload_test_helper(filename='example.valid.2.csv',
url=self.csv_url,
expected_status=status.HTTP_201_CREATED)
def test_cannot_upload_csv_file_does_not_match_column_and_row(self):
self.upload_test_helper(filename='example.invalid.1.csv',
url=self.csv_url,
expected_status=status.HTTP_400_BAD_REQUEST)
def test_cannot_upload_csv_file_has_too_many_columns(self):
self.upload_test_helper(filename='example.invalid.2.csv',
url=self.csv_url,
expected_status=status.HTTP_400_BAD_REQUEST)
def test_can_upload_jsonl(self):
self.upload_test_helper(filename='example.jsonl',
url=self.json_url,
expected_status=status.HTTP_201_CREATED)
def test_can_upload_plain_text(self):
self.upload_test_helper(filename='example.txt',
url=self.plain_url,
expected_status=status.HTTP_201_CREATED)

4
app/server/tests/test_views.py

@ -11,8 +11,8 @@ class TestUpload(TestCase):
def setUp(self):
self.username, self.password = 'user', 'pass'
self.client = Client()
self.csv_path = os.path.join(os.path.dirname(__file__), 'data/test.csv')
self.json_path = os.path.join(os.path.dirname(__file__), 'data/test.jsonl')
self.csv_path = os.path.join(os.path.dirname(__file__), 'data/example.valid.1.csv')
self.json_path = os.path.join(os.path.dirname(__file__), 'data/example.jsonl')
def create_user(self):
user = User.objects.create_user(username=self.username, password=self.password)

Loading…
Cancel
Save