From 18f4f650d806cf6ca37e94f96f1eb2dffea38cbe Mon Sep 17 00:00:00 2001
From: Paul
Date: Tue, 1 Dec 2020 23:21:55 +0100
Subject: [PATCH] Feature: FastText Export (#362)

---
 app/api/tests/test_api.py             |  5 ++++
 app/api/utils.py                      | 52 ++++++++++++++++++++++++++-
 app/api/views.py                      | 14 ++++++----
 frontend/services/document.service.js |  3 ++
 frontend/store/projects.js            | 12 +++++++-
 5 files changed, 78 insertions(+), 8 deletions(-)

diff --git a/app/api/tests/test_api.py b/app/api/tests/test_api.py
index 8a77179b..aa877c97 100644
--- a/app/api/tests/test_api.py
+++ b/app/api/tests/test_api.py
@@ -1528,6 +1528,11 @@ class TestDownloader(APITestCase):
                                   format='plain',
                                   expected_status=status.HTTP_400_BAD_REQUEST)
 
+    def test_can_download_classification_fasttext(self):
+        self.download_test_helper(url=self.classification_url,
+                                  format='txt',
+                                  expected_status=status.HTTP_200_OK)
+
 
 class TestStatisticsAPI(APITestCase, TestUtilsMixin):
 
diff --git a/app/api/utils.py b/app/api/utils.py
index 977cba5a..7edda553 100644
--- a/app/api/utils.py
+++ b/app/api/utils.py
@@ -13,7 +13,7 @@ from django.db import transaction
 from django.conf import settings
 from colour import Color
 import pyexcel
-from rest_framework.renderers import JSONRenderer
+from rest_framework.renderers import JSONRenderer, BaseRenderer
 from seqeval.metrics.sequence_labeling import get_entities
 
 from .exceptions import FileParseException
@@ -497,6 +497,56 @@ class JSONLRenderer(JSONRenderer):
                              allow_nan=not self.strict) + '\n'
 
 
+class FastTextPainter(object):
+    """Serialize classified documents into fastText-style labelled lines."""
+
+    @staticmethod
+    def paint_labels(documents, labels):
+        """Return one fastText line per document.
+
+        Annotated documents become ``__label__<name> ... <text>``; documents
+        without annotations are emitted as bare text.
+        """
+        serializer = DocumentSerializer(documents, many=True)
+        serializer_labels = LabelSerializer(labels, many=True)
+        # Index label names by id once rather than rescanning the label
+        # list for every annotation.
+        label_texts = {x['id']: x['text'] for x in serializer_labels.data}
+        data = []
+        for d in serializer.data:
+            tags = [
+                '__label__{}'.format(label_texts[a['label']].replace(' ', '_'))
+                for a in d['annotations']
+            ]
+            # One example per line: flatten CR as well as LF, since readers
+            # using universal newlines split on a bare '\r' too.
+            text = d['text'].replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')
+            if tags:
+                data.append('{} {}'.format(' '.join(tags), text))
+            else:
+                data.append(text)
+        return data
+
+
+class PlainTextRenderer(BaseRenderer):
+    """Render a list of lines as newline-terminated plain text."""
+
+    media_type = 'text/plain'
+    format = 'txt'
+    charset = 'utf-8'
+
+    def render(self, data, accepted_media_type=None, renderer_context=None):
+        if data is None:
+            return bytes()
+
+        if not isinstance(data, list):
+            data = [data]
+
+        # str.format() stringifies non-str payloads (e.g. the dict DRF builds
+        # for an error response) instead of raising TypeError on d + '\n'.
+        return ''.join('{}\n'.format(d) for d in data).encode(self.charset)
+
+
 class JSONPainter(object):
 
     def paint(self, documents):
diff --git a/app/api/views.py b/app/api/views.py
index 20dd1ce2..1fff9770 100644
--- a/app/api/views.py
+++ b/app/api/views.py
@@ -24,9 +24,9 @@ from .models import Project, Label, Document, RoleMapping, Role
 from .permissions import IsProjectAdmin, IsAnnotatorAndReadOnly, IsAnnotator, IsAnnotationApproverAndReadOnly, IsOwnAnnotation, IsAnnotationApprover
 from .serializers import ProjectSerializer, LabelSerializer, DocumentSerializer, UserSerializer, ApproverSerializer
 from .serializers import ProjectPolymorphicSerializer, RoleMappingSerializer, RoleSerializer
-from .utils import CSVParser, ExcelParser, JSONParser, PlainTextParser, CoNLLParser, AudioParser, FastTextParser, iterable_to_io
-from .utils import JSONLRenderer
-from .utils import JSONPainter, CSVPainter
+from .utils import CSVParser, ExcelParser, JSONParser, PlainTextParser, FastTextParser, CoNLLParser, AudioParser, iterable_to_io
+from .utils import JSONLRenderer, PlainTextRenderer
+from .utils import JSONPainter, CSVPainter, FastTextPainter
 
 IsInProjectReadOnlyOrAdmin = (IsAnnotatorAndReadOnly | IsAnnotationApproverAndReadOnly | IsProjectAdmin)
 IsInProjectOrAdmin = (IsAnnotator | IsAnnotationApprover | IsProjectAdmin)
@@ -359,7 +359,7 @@ class CloudUploadAPI(APIView):
 class TextDownloadAPI(APIView):
     permission_classes = TextUploadAPI.permission_classes
 
-    renderer_classes = (CSVRenderer, JSONLRenderer)
+    renderer_classes = (CSVRenderer, JSONLRenderer, PlainTextRenderer)
 
     def get(self, request, *args, **kwargs):
         format = request.query_params.get('q')
@@ -369,9 +369,9 @@ class TextDownloadAPI(APIView):
         # jsonl-textlabel format prints text labels while jsonl format prints annotations with label ids
         # jsonl-textlabel format - "labels": [[0, 15, "PERSON"], ..]
         # jsonl format - "annotations": [{"label": 5, "start_offset": 0, "end_offset": 2, "user": 1},..]
-        if format == 'jsonl':
+        if format in ('jsonl', 'txt'):
             labels = project.labels.all()
-            data = JSONPainter.paint_labels(documents, labels)
+            data = painter.paint_labels(documents, labels)
         else:
             data = painter.paint(documents)
         return Response(data)
@@ -381,6 +381,8 @@ class TextDownloadAPI(APIView):
             return CSVPainter()
         elif format == 'jsonl' or format == 'json':
             return JSONPainter()
+        elif format == 'txt':
+            return FastTextPainter()
         else:
             raise ValidationError('format {} is invalid.'.format(format))
 
diff --git a/frontend/services/document.service.js b/frontend/services/document.service.js
index 8f72d8f0..09a2a131 100644
--- a/frontend/services/document.service.js
+++ b/frontend/services/document.service.js
@@ -30,6 +30,9 @@ class DocumentService {
     if (format === 'csv') {
       headers.Accept = 'text/csv; charset=utf-8'
       headers['Content-Type'] = 'text/csv; charset=utf-8'
+    } else if (format === 'txt') {
+      headers.Accept = 'text/plain; charset=utf-8'
+      headers['Content-Type'] = 'text/plain; charset=utf-8'
     } else {
       headers.Accept = 'application/json'
       headers['Content-Type'] = 'application/json'
diff --git a/frontend/store/projects.js b/frontend/store/projects.js
index 5622be20..48d057c0 100644
--- a/frontend/store/projects.js
+++ b/frontend/store/projects.js
@@ -180,6 +180,11 @@ export const getters = {
       text: 'JSONL(Text label)',
       suffix: 'jsonl'
     }
+    const fastText = {
+      type: 'txt',
+      text: 'FastText',
+      suffix: 'txt'
+    }
     if (state.current.project_type === 'DocumentClassification') {
       json.examples = [
         '{"id": 1, "text": "Terrible customer service.", "annotations": [{"id": 1, "label": 1, "user": 1}]}\n',
@@ -192,9 +197,14 @@ export const getters = {
         '2,"Really great transaction.",2,1\n',
         '3,"Great price.",2,1'
       ]
+      fastText.examples = [
+        '__label__pet dog cat \n',
+        '__label__car VW BMW'
+      ]
       return [
         csv,
-        json
+        json,
+        fastText
       ]
     } else if (state.current.project_type === 'SequenceLabeling') {
       json.examples = [