Browse Source

Merge pull request #1156 from SwiftPredator/feature/fasttext_export

Feature: FastText Export (#362)
pull/1177/head
Hiroki Nakayama 4 years ago
committed by GitHub
parent
commit
da18b5f5c0
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 66 additions and 8 deletions
  1. 5
      app/api/tests/test_api.py
  2. 40
      app/api/utils.py
  3. 14
      app/api/views.py
  4. 3
      frontend/services/document.service.js
  5. 12
      frontend/store/projects.js

5
app/api/tests/test_api.py

@ -1617,6 +1617,11 @@ class TestDownloader(APITestCase):
format='plain',
expected_status=status.HTTP_400_BAD_REQUEST)
def test_can_download_classification_fasttext(self):
    """Download from the classification URL with format 'txt' and expect HTTP 200."""
    self.download_test_helper(
        url=self.classification_url,
        format='txt',
        expected_status=status.HTTP_200_OK,
    )
class TestStatisticsAPI(APITestCase, TestUtilsMixin):

40
app/api/utils.py

@ -13,7 +13,7 @@ from django.db import transaction
from django.conf import settings
from colour import Color
import pyexcel
from rest_framework.renderers import JSONRenderer
from rest_framework.renderers import JSONRenderer, BaseRenderer
from seqeval.metrics.sequence_labeling import get_entities
from .exceptions import FileParseException
@ -497,6 +497,44 @@ class JSONLRenderer(JSONRenderer):
allow_nan=not self.strict) + '\n'
class FastTextPainter(object):
    """Format serialized documents as fastText-style labelled text lines."""

    @staticmethod
    def paint_labels(documents, labels):
        """Return one line of text per document, prefixed with its labels.

        Every annotation of a document contributes a ``__label__<text>``
        token, where spaces in the label text are replaced by underscores.
        Newlines in the document text are replaced by spaces so that each
        document occupies a single line. A document without annotations
        yields its text alone.

        Args:
            documents: Documents handed to ``DocumentSerializer(many=True)``.
            labels: Labels handed to ``LabelSerializer(many=True)``.

        Returns:
            A list of strings, one per document, in serializer order.

        Raises:
            KeyError: If an annotation refers to a label id that is not
                present in ``labels``.
        """
        serializer = DocumentSerializer(documents, many=True)
        serializer_labels = LabelSerializer(labels, many=True)

        # Index label text by id once, instead of scanning the whole label
        # list again for every single annotation.
        label_text_by_id = {
            label['id']: label['text'] for label in serializer_labels.data
        }

        data = []
        for d in serializer.data:
            # A separate local name keeps the ``labels`` argument intact.
            tags = [
                '__label__{}'.format(
                    label_text_by_id[a['label']].replace(' ', '_'))
                for a in d['annotations']
            ]
            text = d['text'].replace('\n', ' ')
            if tags:
                data.append('{} {}'.format(' '.join(tags), text))
            else:
                data.append(text)
        return data
class PlainTextRenderer(BaseRenderer):
    """Render response data as newline-terminated plain text lines."""

    media_type = 'text/plain'
    format = 'txt'
    charset = 'utf-8'

    def render(self, data, accepted_media_type=None, renderer_context=None):
        """Encode ``data`` as bytes, one item per line.

        Args:
            data: ``None``, a single item, or a list of items. Anything that
                is not a list is treated as a one-item list.
            accepted_media_type: Unused; kept for the renderer signature.
            renderer_context: Unused; kept for the renderer signature.

        Returns:
            Empty bytes when ``data`` is ``None``; otherwise every item
            followed by a newline, encoded with ``self.charset``.
        """
        if data is None:
            return bytes()

        if not isinstance(data, list):
            data = [data]

        # str() leaves string items unchanged and keeps non-string items
        # (for example a dict payload) from raising TypeError when the
        # newline is appended.
        return ''.join(str(d) + '\n' for d in data).encode(self.charset)
class JSONPainter(object):
def paint(self, documents):

14
app/api/views.py

@ -24,9 +24,9 @@ from .models import Project, Label, Document, RoleMapping, Role, Comment
from .permissions import IsProjectAdmin, IsAnnotatorAndReadOnly, IsAnnotator, IsAnnotationApproverAndReadOnly, IsOwnAnnotation, IsAnnotationApprover, IsOwnComment
from .serializers import ProjectSerializer, LabelSerializer, DocumentSerializer, UserSerializer, ApproverSerializer, CommentSerializer
from .serializers import ProjectPolymorphicSerializer, RoleMappingSerializer, RoleSerializer
from .utils import CSVParser, ExcelParser, JSONParser, PlainTextParser, CoNLLParser, AudioParser, FastTextParser, iterable_to_io
from .utils import JSONLRenderer
from .utils import JSONPainter, CSVPainter
from .utils import CSVParser, ExcelParser, JSONParser, PlainTextParser, FastTextParser, CoNLLParser, AudioParser, iterable_to_io
from .utils import JSONLRenderer, PlainTextRenderer
from .utils import JSONPainter, CSVPainter, FastTextPainter
IsInProjectReadOnlyOrAdmin = (IsAnnotatorAndReadOnly | IsAnnotationApproverAndReadOnly | IsProjectAdmin)
IsInProjectOrAdmin = (IsAnnotator | IsAnnotationApprover | IsProjectAdmin)
@ -391,7 +391,7 @@ class CloudUploadAPI(APIView):
class TextDownloadAPI(APIView):
permission_classes = TextUploadAPI.permission_classes
renderer_classes = (CSVRenderer, JSONLRenderer)
renderer_classes = (CSVRenderer, JSONLRenderer, PlainTextRenderer)
def get(self, request, *args, **kwargs):
format = request.query_params.get('q')
@ -407,9 +407,9 @@ class TextDownloadAPI(APIView):
# jsonl-textlabel format prints text labels while jsonl format prints annotations with label ids
# jsonl-textlabel format - "labels": [[0, 15, "PERSON"], ..]
# jsonl format - "annotations": [{"label": 5, "start_offset": 0, "end_offset": 2, "user": 1},..]
if format == 'jsonl':
if format in ('jsonl', 'txt'):
labels = project.labels.all()
data = JSONPainter.paint_labels(documents, labels)
data = painter.paint_labels(documents, labels)
else:
data = painter.paint(documents)
return Response(data)
@ -419,6 +419,8 @@ class TextDownloadAPI(APIView):
return CSVPainter()
elif format == 'jsonl' or format == 'json':
return JSONPainter()
elif format == 'txt':
return FastTextPainter()
else:
raise ValidationError('format {} is invalid.'.format(format))

3
frontend/services/document.service.js

@ -34,6 +34,9 @@ class DocumentService {
if (format === 'csv') {
headers.Accept = 'text/csv; charset=utf-8'
headers['Content-Type'] = 'text/csv; charset=utf-8'
} else if (format === 'txt') {
headers.Accept = 'text/plain; charset=utf-8'
headers['Content-Type'] = 'text/plain; charset=utf-8'
} else {
headers.Accept = 'application/json'
headers['Content-Type'] = 'application/json'

12
frontend/store/projects.js

@ -180,6 +180,11 @@ export const getters = {
text: 'JSONL(Text label)',
suffix: 'jsonl'
}
const fastText = {
type: 'txt',
text: 'FastText',
suffix: 'txt'
}
if (state.current.project_type === 'DocumentClassification') {
json.examples = [
'{"id": 1, "text": "Terrible customer service.", "annotations": [{"id": 1, "label": 1, "user": 1}]}\n',
@ -192,9 +197,14 @@ export const getters = {
'2,"Really great transaction.",2,1\n',
'3,"Great price.",2,1'
]
fastText.examples = [
'__label__pet dog cat \n',
'__label__car VW BMW'
]
return [
csv,
json
json,
fastText
]
} else if (state.current.project_type === 'SequenceLabeling') {
json.examples = [

Loading…
Cancel
Save