diff --git a/app/server/api.py b/app/server/api.py index 172a3a97..c3dbfc6e 100644 --- a/app/server/api.py +++ b/app/server/api.py @@ -1,3 +1,6 @@ +import csv +import io +import json from collections import Counter from itertools import chain @@ -11,7 +14,7 @@ from rest_framework.response import Response from rest_framework.views import APIView from rest_framework.parsers import MultiPartParser -from .exceptions import CoNLLParseException +from .exceptions import FileParseException from .models import Project, Label, Document from .models import SequenceAnnotation from .permissions import IsAdminUserAndWriteOnly, IsProjectUser, IsMyEntity @@ -135,7 +138,26 @@ class EntityDetail(generics.RetrieveUpdateDestroyAPIView): permission_classes = (IsAuthenticated, IsProjectUser, IsMyEntity) -class CoNLLFileUploadAPI(APIView): +class TextUploadAPI(APIView): + """Base API for text upload.""" + parser_classes = (MultiPartParser,) + permission_classes = (IsAuthenticated, IsProjectUser, IsAdminUser) + + def post(self, request, *args, **kwargs): + if 'file' not in request.FILES: + raise ParseError('Empty content') + self.handle_uploaded_file(request.FILES['file']) + return Response(status=status.HTTP_201_CREATED) + + @transaction.atomic + def handle_uploaded_file(self, file): + raise NotImplementedError() + + def parse(self, file): + raise NotImplementedError() + + +class CoNLLFileUploadAPI(TextUploadAPI): """Uploads CoNLL format file. The file format is tab-separated values. @@ -157,14 +179,6 @@ class CoNLLFileUploadAPI(APIView): ... ``` """ - parser_classes = (MultiPartParser,) - permission_classes = (IsAuthenticated, IsProjectUser, IsAdminUser) - - def post(self, request, *args, **kwargs): - if 'file' not in request.FILES: - raise ParseError('Empty content') - self.handle_uploaded_file(request.FILES['file']) - return Response(status=status.HTTP_201_CREATED) @transaction.atomic def handle_uploaded_file(self, file): @@ -188,7 +202,7 @@ class CoNLLFileUploadAPI(APIView): try: word, tag = line.split('\t') except ValueError: - raise CoNLLParseException(line_num=i, line=line) + raise FileParseException(line_num=i, line=line) words.append(word) tags.append(tag) else: @@ -196,3 +210,95 @@ class CoNLLFileUploadAPI(APIView): words, tags = [], [] if len(words) > 0: yield words + + +class PlainTextUploadAPI(TextUploadAPI): + """Uploads plain text. + + The file format is as follows: + ``` + EU rejects German call to boycott British lamb. + President Obama is speaking at the White House. + ... + ``` + """ + @transaction.atomic + def handle_uploaded_file(self, file): + project = get_object_or_404(Project, pk=self.kwargs['project_id']) + for text in self.parse(file): + data = {'text': text} + serializer = DocumentSerializer(data=data) + serializer.is_valid(raise_exception=True) + serializer.save(project=project) + + def parse(self, file): + file = io.TextIOWrapper(file, encoding='utf-8') + for i, line in enumerate(file, start=1): + yield line.strip() + + +class CSVUploadAPI(TextUploadAPI): + """Uploads csv file. + + The file format is comma separated values. + Column names are required at the top of a file. + For example: + ``` + text, label(optional) + "EU rejects German call to boycott British lamb.", + "President Obama is speaking at the White House.", + "He lives in Newark, Ohio.", + ... + ``` + """ + + @transaction.atomic + def handle_uploaded_file(self, file): + project = get_object_or_404(Project, pk=self.kwargs['project_id']) + for text, label in self.parse(file): + data = {'text': text} + serializer = DocumentSerializer(data=data) + serializer.is_valid(raise_exception=True) + serializer.save(project=project) + + def parse(self, file): + file = io.TextIOWrapper(file, encoding='utf-8') + reader = csv.reader(file) + columns = None + for i, row in enumerate(reader, start=1): + if i == 1: # skip header + columns = row + continue + elif len(row) == len(columns) == 2: # text with a label + text, label = row + yield text, label + else: + raise FileParseException(line_num=i, line=row) + + +class JSONLUploadAPI(TextUploadAPI): + """Uploads jsonl file. + + The file format is as follows: + ``` + {"text": "example1"} + {"text": "example2"} + ... + ``` + """ + + @transaction.atomic + def handle_uploaded_file(self, file): + project = get_object_or_404(Project, pk=self.kwargs['project_id']) + for data in self.parse(file): + serializer = DocumentSerializer(data=data) + serializer.is_valid(raise_exception=True) + serializer.save(project=project) + + def parse(self, file): + for i, line in enumerate(file, start=1): + try: + j = json.loads(line) + yield j + except json.decoder.JSONDecodeError: + raise FileParseException(line_num=i, line=line) diff --git a/app/server/api_urls.py b/app/server/api_urls.py index a4b5c3fb..1f403188 100644 --- a/app/server/api_urls.py +++ b/app/server/api_urls.py @@ -5,7 +5,7 @@ from .api import ProjectList, ProjectDetail from .api import LabelList, LabelDetail from .api import DocumentList, DocumentDetail from .api import EntityList, EntityDetail -from .api import CoNLLFileUploadAPI +from .api import CoNLLFileUploadAPI, CSVUploadAPI, JSONLUploadAPI, PlainTextUploadAPI from .api import StatisticsAPI @@ -26,8 +26,14 @@ urlpatterns = [ EntityList.as_view(), name='entity_list'), path('projects//docs//entities/', EntityDetail.as_view(), name='entity_detail'), + path('projects//plain_uploader', + PlainTextUploadAPI.as_view(), name='plain_uploader'), path('projects//conll_uploader', CoNLLFileUploadAPI.as_view(), name='conll_uploader'), + path('projects//csv_uploader', + CSVUploadAPI.as_view(), name='csv_uploader'), + path('projects//json_uploader', + JSONLUploadAPI.as_view(), name='json_uploader'), ] urlpatterns = format_suffix_patterns(urlpatterns, allowed=['json', 'xml']) diff --git a/app/server/exceptions.py b/app/server/exceptions.py index b288f8a7..02adb1d0 100644 --- a/app/server/exceptions.py +++ b/app/server/exceptions.py @@ -3,10 +3,6 @@ from rest_framework.exceptions import APIException class FileParseException(APIException): - pass - - -class CoNLLParseException(APIException): status_code = status.HTTP_400_BAD_REQUEST default_detail = 'Invalid file format, line {}: {}' default_code = 'invalid' diff --git a/app/server/tests/data/example.invalid.1.csv b/app/server/tests/data/example.invalid.1.csv new file mode 100644 index 00000000..4e2a2552 --- /dev/null +++ b/app/server/tests/data/example.invalid.1.csv @@ -0,0 +1,4 @@ +text, label +AAA +BBB +CCC \ No newline at end of file diff --git a/app/server/tests/data/example.invalid.2.csv b/app/server/tests/data/example.invalid.2.csv new file mode 100644 index 00000000..0df963d4 --- /dev/null +++ b/app/server/tests/data/example.invalid.2.csv @@ -0,0 +1,4 @@ +text, label +AAA, Negative, Positive +BBB, Negative +CCC, Negative \ No newline at end of file diff --git a/app/server/tests/data/conll_wrong.tsv b/app/server/tests/data/example.invalid.conll similarity index 100% rename from app/server/tests/data/conll_wrong.tsv rename to app/server/tests/data/example.invalid.conll diff --git a/app/server/tests/data/test.jsonl b/app/server/tests/data/example.jsonl similarity index 100% rename from app/server/tests/data/test.jsonl rename to app/server/tests/data/example.jsonl diff --git a/app/server/tests/data/example.txt b/app/server/tests/data/example.txt new file mode 100644 index 00000000..161bb8ec --- /dev/null +++ b/app/server/tests/data/example.txt @@ -0,0 +1,3 @@ +example1 +example2 +example3 \ No newline at end of file diff --git a/app/server/tests/data/test.csv b/app/server/tests/data/example.valid.1.csv similarity index 50% rename from app/server/tests/data/test.csv rename to app/server/tests/data/example.valid.1.csv index 32bd28a6..9b34b36b 100644 --- a/app/server/tests/data/test.csv +++ b/app/server/tests/data/example.valid.1.csv @@ -1,3 +1,4 @@ +text AAA BBB CCC \ No newline at end of file diff --git a/app/server/tests/data/example.valid.2.csv b/app/server/tests/data/example.valid.2.csv new file mode 100644 index 00000000..89d78846 --- /dev/null +++ b/app/server/tests/data/example.valid.2.csv @@ -0,0 +1,4 @@ +text, label +AAA, Positive +BBB, Positive +CCC, Negative \ No newline at end of file diff --git a/app/server/tests/data/conll.tsv b/app/server/tests/data/example.valid.conll similarity index 100% rename from app/server/tests/data/conll.tsv rename to app/server/tests/data/example.valid.conll diff --git a/app/server/tests/test_api.py b/app/server/tests/test_api.py index 20211d48..b5008559 100644 --- a/app/server/tests/test_api.py +++ b/app/server/tests/test_api.py @@ -633,35 +633,51 @@ class TestUploader(APITestCase): password=cls.super_user_pass, email='fizz@buzz.com') cls.main_project = mixer.blend('server.Project', users=[project_member, super_user]) + cls.conll_url = reverse(viewname='conll_uploader', args=[cls.main_project.id]) + cls.csv_url = reverse(viewname='csv_uploader', args=[cls.main_project.id]) + cls.json_url = reverse(viewname='json_uploader', args=[cls.main_project.id]) + cls.plain_url = reverse(viewname='plain_uploader', args=[cls.main_project.id]) - def test_can_upload_conll_format_file(self): - self.assertEqual(Document.objects.count(), 0) + def setUp(self): self.client.login(username=self.super_user_name, password=self.super_user_pass) - filename = 'conll.tsv' - with open(os.path.join(DATA_DIR, filename)) as f: - url = reverse(viewname='conll_uploader', args=[self.main_project.id]) - response = self.client.post(url, data={'file': f}) - self.assertEqual(response.status_code, status.HTTP_201_CREATED) - self.assertEqual(Document.objects.count(), 3) - def test_cannot_upload_wrong_conll_format_file(self): - self.assertEqual(Document.objects.count(), 0) - self.client.login(username=self.super_user_name, - password=self.super_user_pass) - filename = 'conll_wrong.tsv' + def upload_test_helper(self, filename, url, expected_status): with open(os.path.join(DATA_DIR, filename)) as f: - url = reverse(viewname='conll_uploader', args=[self.main_project.id]) response = self.client.post(url, data={'file': f}) - self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) - self.assertEqual(Document.objects.count(), 0) + self.assertEqual(response.status_code, expected_status) - def test_cannot_upload_wrong_filename(self): - self.assertEqual(Document.objects.count(), 0) - self.client.login(username=self.super_user_name, - password=self.super_user_pass) - filename = 'conll.tsv' - with open(os.path.join(DATA_DIR, filename)) as f: - url = reverse(viewname='conll_uploader', args=[self.main_project.id]) - response = self.client.post(url, data={'fizz': f}) - self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + def test_can_upload_conll_format_file(self): + self.upload_test_helper(filename='example.valid.conll', + url=self.conll_url, + expected_status=status.HTTP_201_CREATED) + + def test_cannot_upload_wrong_conll_format_file(self): + self.upload_test_helper(filename='example.invalid.conll', + url=self.conll_url, + expected_status=status.HTTP_400_BAD_REQUEST) + + def test_can_upload_csv_with_label(self): + self.upload_test_helper(filename='example.valid.2.csv', + url=self.csv_url, + expected_status=status.HTTP_201_CREATED) + + def test_cannot_upload_csv_file_does_not_match_column_and_row(self): + self.upload_test_helper(filename='example.invalid.1.csv', + url=self.csv_url, + expected_status=status.HTTP_400_BAD_REQUEST) + + def test_cannot_upload_csv_file_has_too_many_columns(self): + self.upload_test_helper(filename='example.invalid.2.csv', + url=self.csv_url, + expected_status=status.HTTP_400_BAD_REQUEST) + + def test_can_upload_jsonl(self): + self.upload_test_helper(filename='example.jsonl', + url=self.json_url, + expected_status=status.HTTP_201_CREATED) + + def test_can_upload_plain_text(self): + self.upload_test_helper(filename='example.txt', + url=self.plain_url, + expected_status=status.HTTP_201_CREATED) diff --git a/app/server/tests/test_views.py b/app/server/tests/test_views.py index 56b333cf..2ea3557e 100644 --- a/app/server/tests/test_views.py +++ b/app/server/tests/test_views.py @@ -11,8 +11,8 @@ class TestUpload(TestCase): def setUp(self): self.username, self.password = 'user', 'pass' self.client = Client() - self.csv_path = os.path.join(os.path.dirname(__file__), 'data/test.csv') - self.json_path = os.path.join(os.path.dirname(__file__), 'data/test.jsonl') + self.csv_path = os.path.join(os.path.dirname(__file__), 'data/example.valid.1.csv') + self.json_path = os.path.join(os.path.dirname(__file__), 'data/example.jsonl') def create_user(self): user = User.objects.create_user(username=self.username, password=self.password)