mirror of https://github.com/doccano/doccano.git
pythonannotation-tooldatasetsactive-learningtext-annotationdatasetnatural-language-processingdata-labelingmachine-learning
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
161 lines
5.9 KiB
161 lines
5.9 KiB
from django.conf import settings
|
|
from django.shortcuts import get_object_or_404, redirect
|
|
from libcloud import DriverType, get_driver
|
|
from libcloud.storage.types import (ContainerDoesNotExistError,
|
|
ObjectDoesNotExistError)
|
|
from rest_framework import status
|
|
from rest_framework.exceptions import ParseError, ValidationError
|
|
from rest_framework.parsers import MultiPartParser
|
|
from rest_framework.permissions import IsAuthenticated
|
|
from rest_framework.response import Response
|
|
from rest_framework.views import APIView
|
|
from rest_framework_csv.renderers import CSVRenderer
|
|
|
|
from ..models import Project
|
|
from ..permissions import IsProjectAdmin
|
|
from ..utils import (AudioParser, CoNLLParser, CSVPainter, CSVParser,
|
|
ExcelParser, FastTextPainter, FastTextParser,
|
|
JSONLRenderer, JSONPainter, JSONParser, PlainTextParser,
|
|
PlainTextRenderer, iterable_to_io)
|
|
|
|
|
|
class Features(APIView):
    """Expose which optional server-side features are switched on."""

    permission_classes = (IsAuthenticated,)

    def get(self, request, *args, **kwargs):
        """Return the feature flags as a mapping of flag name to boolean.

        ``cloud_upload`` is true when the
        ``CLOUD_BROWSER_APACHE_LIBCLOUD_PROVIDER`` setting holds a truthy
        value.
        """
        cloud_provider = settings.CLOUD_BROWSER_APACHE_LIBCLOUD_PROVIDER
        feature_flags = {'cloud_upload': bool(cloud_provider)}
        return Response(feature_flags)
|
|
|
|
|
|
class TextUploadAPI(APIView):
    """Upload a dataset file into a project.

    Expects a multipart POST carrying a ``file`` part and a ``format`` field.
    The file is parsed with the parser registered for that format and the
    parsed data is saved through the storage object the project hands out.
    """

    parser_classes = (MultiPartParser,)
    permission_classes = [IsAuthenticated & IsProjectAdmin]

    def post(self, request, *args, **kwargs):
        """Parse and store the uploaded file, answering 201 on success.

        Raises:
            ParseError: if the ``file`` part or the ``format`` field is
                missing from the request.
            ValidationError: if ``format`` names no known parser.
        """
        if 'file' not in request.data:
            raise ParseError('Empty content')
        # Without this guard a request lacking 'format' escaped as a bare
        # KeyError (HTTP 500) instead of a client error.
        if 'format' not in request.data:
            raise ParseError('format is missing')

        self.save_file(
            user=request.user,
            file=request.data['file'],
            file_format=request.data['format'],
            project_id=kwargs['project_id'],
        )

        return Response(status=status.HTTP_201_CREATED)

    @classmethod
    def save_file(cls, user, file, file_format, project_id):
        """Parse ``file`` as ``file_format`` and save it into the project.

        The project is looked up first (404 when it does not exist), then the
        parser is selected, so an unknown project wins over an unknown format.

        Raises:
            ValidationError: if ``file_format`` names no known parser.
        """
        project = get_object_or_404(Project, pk=project_id)
        parser = cls.select_parser(file_format)
        data = parser.parse(file)
        storage = project.get_storage(data)
        storage.save(user)

    @classmethod
    def select_parser(cls, file_format):
        """Return a new parser instance for ``file_format``.

        Raises:
            ValidationError: if ``file_format`` is not one of the keys below.
        """
        parsers = {
            'plain': PlainTextParser,
            'csv': CSVParser,
            'json': JSONParser,
            'conll': CoNLLParser,
            'excel': ExcelParser,
            'audio': AudioParser,
            'fastText': FastTextParser,
        }
        parser_class = parsers.get(file_format)
        if parser_class is None:
            raise ValidationError('format {} is invalid.'.format(file_format))
        return parser_class()
|
|
|
|
|
|
class CloudUploadAPI(APIView):
    """Import a dataset file from a cloud storage container into a project.

    All inputs arrive as query parameters of a GET request: ``project_id``,
    ``upload_format``, ``container``, ``object`` and the optional ``next``.
    """

    permission_classes = TextUploadAPI.permission_classes

    def get(self, request, *args, **kwargs):
        """Fetch the cloud object, store it in the project and respond.

        The response depends on the ``next`` query parameter:
        ``about:blank`` yields an empty ``text/plain`` 201 response, any other
        non-empty value yields a redirect to it, and no value yields a bare
        201 response.

        Raises:
            ValidationError: if a required query parameter is missing, the
                container or object does not exist, cloud upload is not
                configured, or the upload format is invalid.
        """
        try:
            project_id = request.query_params['project_id']
            file_format = request.query_params['upload_format']
            cloud_container = request.query_params['container']
            cloud_object = request.query_params['object']
        except KeyError as ex:
            raise ValidationError('query parameter {} is missing'.format(ex)) from ex

        try:
            cloud_file = self.get_cloud_object_as_io(cloud_container, cloud_object)
        except ContainerDoesNotExistError as ex:
            raise ValidationError(
                'cloud container {} does not exist'.format(cloud_container)
            ) from ex
        except ObjectDoesNotExistError as ex:
            raise ValidationError(
                'cloud object {} does not exist'.format(cloud_object)
            ) from ex

        TextUploadAPI.save_file(
            user=request.user,
            file=cloud_file,
            file_format=file_format,
            project_id=project_id,
        )

        next_url = request.query_params.get('next')

        if next_url == 'about:blank':
            return Response(data='', content_type='text/plain', status=status.HTTP_201_CREATED)

        if next_url:
            # NOTE(review): next_url is caller-supplied and is redirected to
            # without any host/scheme validation (open redirect). Left as-is
            # because the set of legitimate targets is not visible from this
            # file -- confirm and restrict to allowed hosts.
            return redirect(next_url)

        return Response(status=status.HTTP_201_CREATED)

    @classmethod
    def get_cloud_object_as_io(cls, container_name, object_name):
        """Return the named cloud object wrapped by ``iterable_to_io``.

        The libcloud storage driver is built from the
        ``CLOUD_BROWSER_APACHE_LIBCLOUD_*`` settings.

        Raises:
            ValidationError: if no cloud provider is configured.
            ContainerDoesNotExistError / ObjectDoesNotExistError: these are
                the libcloud errors the caller in ``get`` handles for a
                missing container or object.
        """
        provider = settings.CLOUD_BROWSER_APACHE_LIBCLOUD_PROVIDER
        # The provider setting may be empty (the Features view reports it as a
        # boolean flag); calling .lower() on it unguarded crashed with an
        # AttributeError instead of a client-facing error.
        if not provider:
            raise ValidationError('cloud upload is not configured')
        account = settings.CLOUD_BROWSER_APACHE_LIBCLOUD_ACCOUNT
        key = settings.CLOUD_BROWSER_APACHE_LIBCLOUD_SECRET_KEY

        driver = get_driver(DriverType.STORAGE, provider.lower())
        client = driver(account, key)

        cloud_container = client.get_container(container_name)
        cloud_object = cloud_container.get_object(object_name)

        return iterable_to_io(cloud_object.as_stream())
|
|
|
|
|
|
class TextDownloadAPI(APIView):
    """Export a project's documents in the format named by the ``q`` parameter."""

    permission_classes = TextUploadAPI.permission_classes

    renderer_classes = (CSVRenderer, JSONLRenderer, PlainTextRenderer)

    def get(self, request, *args, **kwargs):
        """Build the export payload for the project and return it.

        Query parameters read: ``q`` (export format) and ``onlyApproved``
        (the literal string ``true`` restricts the export to documents whose
        ``annotations_approved_by`` is set).
        """
        params = request.query_params
        export_format = params.get('q')
        approved_only = params.get('onlyApproved') == 'true'
        project = get_object_or_404(Project, pk=self.kwargs['project_id'])

        if approved_only:
            documents = project.documents.exclude(annotations_approved_by=None)
        else:
            documents = project.documents.all()

        painter = self.select_painter(export_format)

        # 'jsonl' and 'txt' exports are painted together with the project's
        # labels; every other format is painted from the documents alone.
        # NOTE(review): per the earlier author's comment the label-aware output
        # looks like  "labels": [[0, 15, "PERSON"], ..]  while the plain one
        # looks like  "annotations": [{"label": 5, "start_offset": 0,
        # "end_offset": 2, "user": 1},..]  -- confirm against the painters.
        if export_format not in ('jsonl', 'txt'):
            return Response(painter.paint(documents))

        project_labels = project.labels.all()
        return Response(painter.paint_labels(documents, project_labels))

    def select_painter(self, format):
        """Return a new painter instance for ``format``.

        Raises:
            ValidationError: if ``format`` is not csv, jsonl, json or txt.
        """
        # The parameter keeps its original name for keyword callers, even
        # though it shadows the builtin.
        if format == 'csv':
            return CSVPainter()
        if format in ('jsonl', 'json'):
            return JSONPainter()
        if format == 'txt':
            return FastTextPainter()
        raise ValidationError('format {} is invalid.'.format(format))
|