From 18f4f650d806cf6ca37e94f96f1eb2dffea38cbe Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 1 Dec 2020 23:21:55 +0100 Subject: [PATCH 1/2] Feature: FastText Export (#362) --- app/api/tests/test_api.py | 5 ++++ app/api/utils.py | 40 ++++++++++++++++++++++++++- app/api/views.py | 14 ++++++---- frontend/services/document.service.js | 3 ++ frontend/store/projects.js | 12 +++++++- 5 files changed, 66 insertions(+), 8 deletions(-) diff --git a/app/api/tests/test_api.py b/app/api/tests/test_api.py index 8a77179b..aa877c97 100644 --- a/app/api/tests/test_api.py +++ b/app/api/tests/test_api.py @@ -1528,6 +1528,11 @@ class TestDownloader(APITestCase): format='plain', expected_status=status.HTTP_400_BAD_REQUEST) + def test_can_download_classification_fasttext(self): + self.download_test_helper(url=self.classification_url, + format='txt', + expected_status=status.HTTP_200_OK) + class TestStatisticsAPI(APITestCase, TestUtilsMixin): diff --git a/app/api/utils.py b/app/api/utils.py index 977cba5a..7edda553 100644 --- a/app/api/utils.py +++ b/app/api/utils.py @@ -13,7 +13,7 @@ from django.db import transaction from django.conf import settings from colour import Color import pyexcel -from rest_framework.renderers import JSONRenderer +from rest_framework.renderers import JSONRenderer, BaseRenderer from seqeval.metrics.sequence_labeling import get_entities from .exceptions import FileParseException @@ -497,6 +497,44 @@ class JSONLRenderer(JSONRenderer): allow_nan=not self.strict) + '\n' +class FastTextPainter(object): + + @staticmethod + def paint_labels(documents, labels): + serializer = DocumentSerializer(documents, many=True) + serializer_labels = LabelSerializer(labels, many=True) + data = [] + for d in serializer.data: + labels = [] + for a in d['annotations']: + label_obj = [x for x in serializer_labels.data if x['id'] == a['label']][0] + labels.append('__label__{}'.format(label_obj['text'].replace(' ', '_'))) + text = d['text'].replace('\n', ' ') + if labels: + 
data.append('{} {}'.format(' '.join(labels), text)) + else: + data.append(text) + return data + + +class PlainTextRenderer(BaseRenderer): + media_type = 'text/plain' + format = 'txt' + charset = 'utf-8' + + def render(self, data, accepted_media_type=None, renderer_context=None): + if data is None: + return bytes() + + if not isinstance(data, list): + data = [data] + + buffer = io.BytesIO() + for d in data: + buffer.write((str(d) + '\n').encode(self.charset)) + return buffer.getvalue() + + class JSONPainter(object): def paint(self, documents): diff --git a/app/api/views.py b/app/api/views.py index 20dd1ce2..1fff9770 100644 --- a/app/api/views.py +++ b/app/api/views.py @@ -24,9 +24,9 @@ from .models import Project, Label, Document, RoleMapping, Role from .permissions import IsProjectAdmin, IsAnnotatorAndReadOnly, IsAnnotator, IsAnnotationApproverAndReadOnly, IsOwnAnnotation, IsAnnotationApprover from .serializers import ProjectSerializer, LabelSerializer, DocumentSerializer, UserSerializer, ApproverSerializer from .serializers import ProjectPolymorphicSerializer, RoleMappingSerializer, RoleSerializer -from .utils import CSVParser, ExcelParser, JSONParser, PlainTextParser, CoNLLParser, AudioParser, FastTextParser, iterable_to_io -from .utils import JSONLRenderer -from .utils import JSONPainter, CSVPainter +from .utils import CSVParser, ExcelParser, JSONParser, PlainTextParser, FastTextParser, CoNLLParser, AudioParser, iterable_to_io +from .utils import JSONLRenderer, PlainTextRenderer +from .utils import JSONPainter, CSVPainter, FastTextPainter IsInProjectReadOnlyOrAdmin = (IsAnnotatorAndReadOnly | IsAnnotationApproverAndReadOnly | IsProjectAdmin) IsInProjectOrAdmin = (IsAnnotator | IsAnnotationApprover | IsProjectAdmin) @@ -359,7 +359,7 @@ class CloudUploadAPI(APIView): class TextDownloadAPI(APIView): permission_classes = TextUploadAPI.permission_classes - renderer_classes = (CSVRenderer, JSONLRenderer) + renderer_classes = (CSVRenderer, JSONLRenderer, PlainTextRenderer)
def get(self, request, *args, **kwargs): format = request.query_params.get('q') @@ -369,9 +369,9 @@ class TextDownloadAPI(APIView): # jsonl-textlabel format prints text labels while jsonl format prints annotations with label ids # jsonl-textlabel format - "labels": [[0, 15, "PERSON"], ..] # jsonl format - "annotations": [{"label": 5, "start_offset": 0, "end_offset": 2, "user": 1},..] - if format == 'jsonl': + if format in ('jsonl', 'txt'): labels = project.labels.all() - data = JSONPainter.paint_labels(documents, labels) + data = painter.paint_labels(documents, labels) else: data = painter.paint(documents) return Response(data) @@ -381,6 +381,8 @@ class TextDownloadAPI(APIView): return CSVPainter() elif format == 'jsonl' or format == 'json': return JSONPainter() + elif format == 'txt': + return FastTextPainter() else: raise ValidationError('format {} is invalid.'.format(format)) diff --git a/frontend/services/document.service.js b/frontend/services/document.service.js index 8f72d8f0..09a2a131 100644 --- a/frontend/services/document.service.js +++ b/frontend/services/document.service.js @@ -30,6 +30,9 @@ class DocumentService { if (format === 'csv') { headers.Accept = 'text/csv; charset=utf-8' headers['Content-Type'] = 'text/csv; charset=utf-8' + } else if (format === 'txt') { + headers.Accept = 'text/plain; charset=utf-8' + headers['Content-Type'] = 'text/plain; charset=utf-8' } else { headers.Accept = 'application/json' headers['Content-Type'] = 'application/json' diff --git a/frontend/store/projects.js b/frontend/store/projects.js index 5622be20..48d057c0 100644 --- a/frontend/store/projects.js +++ b/frontend/store/projects.js @@ -180,6 +180,11 @@ export const getters = { text: 'JSONL(Text label)', suffix: 'jsonl' } + const fastText = { + type: 'txt', + text: 'FastText', + suffix: 'txt' + } if (state.current.project_type === 'DocumentClassification') { json.examples = [ '{"id": 1, "text": "Terrible customer service.", "annotations": [{"id": 1, "label": 1, "user": 
1}]}\n', @@ -192,9 +197,14 @@ export const getters = { '2,"Really great transaction.",2,1\n', '3,"Great price.",2,1' ] + fastText.examples = [ + '__label__pet dog cat \n', + '__label__car VW BMW' + ] return [ csv, - json + json, + fastText ] } else if (state.current.project_type === 'SequenceLabeling') { json.examples = [ From cfafe1fd663f2d5c4ac8bedbe00c321435d550a6 Mon Sep 17 00:00:00 2001 From: Chris Date: Mon, 18 Jan 2021 20:41:45 +0100 Subject: [PATCH 2/2] Add feature bulkimport for same filetype --- .../documents/DocumentUploadForm.vue | 39 +++++++++++++------ frontend/i18n/en/projects/dataset.js | 2 +- frontend/i18n/en/projects/errors.js | 2 +- frontend/rules/index.js | 2 +- 4 files changed, 31 insertions(+), 14 deletions(-) diff --git a/frontend/components/organisms/documents/DocumentUploadForm.vue b/frontend/components/organisms/documents/DocumentUploadForm.vue index 623ae1d3..6159728a 100644 --- a/frontend/components/organisms/documents/DocumentUploadForm.vue +++ b/frontend/components/organisms/documents/DocumentUploadForm.vue @@ -18,7 +18,7 @@ type="error" dismissible > - {{ $t('errors.fileCannotUpload') }} + {{ $t('errors.fileCannotUpload') + errorMsg }}

{{ $t('dataset.importDataMessage1') }}

{{ $t('dataset.importDataMessage2') }} { - this.reset() - this.cancel() + this.errors = [] + const promises = [] + const id = this.$route.params.id + const type = this.selectedFormat.type + this.file.forEach((item) => { + promises.push({ + projectId: id, + format: type, + file: item }) - .catch(() => { + }) + let p = Promise.resolve() + promises.forEach((item) => { + p = p.then(() => this.uploadDocument(item)).catch(() => { + this.errors.push(item.file.name) this.showError = true }) + }) + p.finally(() => { + if (!this.errors.length) { + this.reset() + this.cancel() + } else { + this.errorMsg = this.errors.join(', ') + } + }) } } } diff --git a/frontend/i18n/en/projects/dataset.js b/frontend/i18n/en/projects/dataset.js index ac8cccd1..d645083b 100644 --- a/frontend/i18n/en/projects/dataset.js +++ b/frontend/i18n/en/projects/dataset.js @@ -9,7 +9,7 @@ export default { annotate: 'Annotate', importDataTitle: 'Upload Data', importDataMessage1: 'Select a file format', - importDataMessage2: 'Select a file', + importDataMessage2: 'Select file(s)', importDataPlaceholder: 'File input', exportDataTitle: 'Export Data', exportDataMessage: 'Select a file format', diff --git a/frontend/i18n/en/projects/errors.js b/frontend/i18n/en/projects/errors.js index f84d04a9..fcb93846 100644 --- a/frontend/i18n/en/projects/errors.js +++ b/frontend/i18n/en/projects/errors.js @@ -1,5 +1,5 @@ export default { - fileCannotUpload: 'The file could not be uploaded. Maybe invalid format.\n Please check available formats carefully.', + fileCannotUpload: 'The file(s) could not be uploaded. Maybe invalid format.\n Please check available formats and the following file(s): ', labelCannotCreate: 'The label could not be created.\n You cannot use the same label name or shortcut key.', invalidUserOrPass: 'Incorrect username or password, or something went wrong.' 
} diff --git a/frontend/rules/index.js b/frontend/rules/index.js index 3f5219b3..51224440 100644 --- a/frontend/rules/index.js +++ b/frontend/rules/index.js @@ -57,7 +57,7 @@ export const fileFormatRules = (msg) => { export const uploadFileRules = (msg) => { return [ v => !!v || msg.fileRequired, - v => !v || v.size < 1000000 || msg.fileLessThan1MB + v => !v || v.every(file => file.size < 1000000) || msg.fileLessThan1MB ] }