# Optional cloud-storage integration. The provider, account and secret key are
# read from the CLOUD_BROWSER_LIBCLOUD_PROVIDER / _ACCOUNT / _KEY environment
# variables and default to None when unset.
CLOUD_BROWSER_APACHE_LIBCLOUD_PROVIDER = env('CLOUD_BROWSER_LIBCLOUD_PROVIDER', None)
CLOUD_BROWSER_APACHE_LIBCLOUD_ACCOUNT = env('CLOUD_BROWSER_LIBCLOUD_ACCOUNT', None)
CLOUD_BROWSER_APACHE_LIBCLOUD_SECRET_KEY = env('CLOUD_BROWSER_LIBCLOUD_KEY', None)

# The cloud_browser app is only installed when a provider is configured.
if CLOUD_BROWSER_APACHE_LIBCLOUD_PROVIDER:
    CLOUD_BROWSER_DATASTORE = 'ApacheLibcloud'
    # Path of the cloud upload API endpoint under the 'v1/' prefix.
    # NOTE(review): presumably cloud_browser redirects here once an object is
    # selected -- confirm against the django-cloud-browser documentation.
    CLOUD_BROWSER_OBJECT_REDIRECT_URL = '/v1/cloud-upload'
    INSTALLED_APPS.append('cloud_browser')
class CloudUploadAPI(APIView):
    """Import a dataset file from cloud storage into a project.

    The object named by the ``container`` and ``object`` query parameters is
    fetched through Apache Libcloud and passed to ``TextUploadAPI.save_file``,
    so it follows the same parsing and storage path as a direct file upload.
    """
    permission_classes = TextUploadAPI.permission_classes

    def get(self, request, *args, **kwargs):
        """Fetch a cloud object and save its contents into a project.

        Required query parameters: ``project_id``, ``upload_format``,
        ``container`` and ``object``. Optional: ``next``.

        Returns:
            A redirect to ``next`` when it is given, otherwise an empty
            ``201 Created`` response.

        Raises:
            ValidationError: when a required query parameter is missing, the
                container or object does not exist, or cloud storage is not
                configured.
        """
        try:
            project_id = request.query_params['project_id']
            file_format = request.query_params['upload_format']
            cloud_container = request.query_params['container']
            cloud_object = request.query_params['object']
        except KeyError as ex:
            raise ValidationError('query parameter {} is missing'.format(ex))

        try:
            cloud_file = self.get_cloud_object_as_io(cloud_container, cloud_object)
        except ContainerDoesNotExistError:
            raise ValidationError('cloud container {} does not exist'.format(cloud_container))
        except ObjectDoesNotExistError:
            raise ValidationError('cloud object {} does not exist'.format(cloud_object))

        TextUploadAPI.save_file(
            user=request.user,
            file=cloud_file,
            file_format=file_format,
            project_id=project_id,
        )

        # NOTE(review): ``next`` comes straight from the query string and is
        # used as the redirect target without validation, which allows open
        # redirects. Consider restricting it to same-host URLs.
        next_url = request.query_params.get('next')
        return redirect(next_url) if next_url else Response(status=status.HTTP_201_CREATED)

    @classmethod
    def get_cloud_object_as_io(cls, container_name, object_name):
        """Return the named cloud object as a readable binary stream.

        Args:
            container_name: name of the storage container holding the object.
            object_name: name of the object inside that container.

        Raises:
            ValidationError: when no Libcloud provider is configured.
            ContainerDoesNotExistError / ObjectDoesNotExistError: propagated
                from the Libcloud lookups; the caller translates them.
        """
        provider = getattr(settings, 'CLOUD_BROWSER_APACHE_LIBCLOUD_PROVIDER', None)
        # The provider setting defaults to None, so calling .lower() on it
        # would crash with an AttributeError on an unconfigured deployment.
        if not provider:
            raise ValidationError('cloud storage is not configured')

        account = settings.CLOUD_BROWSER_APACHE_LIBCLOUD_ACCOUNT
        key = settings.CLOUD_BROWSER_APACHE_LIBCLOUD_SECRET_KEY

        driver = get_driver(DriverType.STORAGE, provider.lower())
        client = driver(account, key)

        cloud_container = client.get_container(container_name)
        cloud_object = cloud_container.get_object(object_name)

        # as_stream() yields byte chunks; wrap them so parsers can read a file
        return iterable_to_io(cloud_object.as_stream())
def has_permission(self, request, view):
    """Allow the request only when the requesting user belongs to the project.

    The project id is taken from the view's URL keyword arguments, falling
    back to the ``project_id`` query parameter for views that receive it in
    the query string instead.

    Returns:
        True when ``request.user`` is one of the project's users.

    Raises:
        Http404 (via ``get_object_or_404``) when no matching project exists.
    """
    project_id = view.kwargs.get('project_id') or request.query_params.get('project_id')
    project = get_object_or_404(Project, pk=project_id)
    # Let the database answer the membership question instead of loading
    # every user of the project into memory on each request.
    return project.users.filter(pk=request.user.pk).exists()
@override_settings(
    CLOUD_BROWSER_APACHE_LIBCLOUD_PROVIDER='LOCAL',
    CLOUD_BROWSER_APACHE_LIBCLOUD_ACCOUNT=os.path.dirname(DATA_DIR),
    CLOUD_BROWSER_APACHE_LIBCLOUD_SECRET_KEY='not-used',
)
class TestCloudUploader(TestUploader):
    """Run the uploader tests through the cloud upload endpoint.

    The Libcloud ``LOCAL`` provider is pointed at the parent of the test data
    directory, so the data directory itself acts as the default container.
    """

    def upload_test_helper(self, project_id, filename, format, expected_status, **kwargs):
        """GET the cloud upload endpoint and check the response status.

        Extra keyword arguments are sent as additional query parameters;
        ``container`` overrides the default container name.
        """
        container = kwargs.pop('container', os.path.basename(DATA_DIR))
        query_params = dict(
            project_id=project_id,
            upload_format=format,
            container=container,
            object=filename,
        )
        query_params.update(kwargs)

        url = reverse('cloud_uploader')
        response = self.client.get(url, query_params)

        self.assertEqual(response.status_code, expected_status)

    def test_cannot_upload_with_missing_file(self):
        project = self.classification_project
        self.upload_test_helper(
            project_id=project.id,
            filename='does-not-exist',
            format='json',
            expected_status=status.HTTP_400_BAD_REQUEST,
        )

    def test_cannot_upload_with_missing_container(self):
        project = self.classification_project
        self.upload_test_helper(
            project_id=project.id,
            filename='example.jsonl',
            container='does-not-exist',
            format='json',
            expected_status=status.HTTP_400_BAD_REQUEST,
        )

    def test_cannot_upload_with_missing_query_parameters(self):
        # Only project_id is supplied; the other required parameters are absent
        incomplete_params = {'project_id': self.classification_project.id}
        response = self.client.get(reverse('cloud_uploader'), incomplete_params)

        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)

    def test_can_upload_with_redirect(self):
        project = self.classification_project
        self.upload_test_helper(
            project_id=project.id,
            filename='example.jsonl',
            next='http://somewhere',
            format='json',
            expected_status=status.HTTP_302_FOUND,
        )
self.assertEqual(stream.readlines(), ['foo\n', 'bar\n', 'baz\n', 'rest']) diff --git a/app/server/utils.py b/app/server/utils.py index e33bbed5..3065518c 100644 --- a/app/server/utils.py +++ b/app/server/utils.py @@ -343,6 +343,7 @@ class CSVParser(FileParser): class JSONParser(FileParser): def parse(self, file): + file = io.TextIOWrapper(file, encoding='utf-8') data = [] for i, line in enumerate(file, start=1): if len(data) >= IMPORT_BATCH_SIZE: @@ -443,3 +444,25 @@ class Color: def random(cls, seed=None): rgb = Random(seed).choices(range(256), k=3) return cls(*rgb) + + +def iterable_to_io(iterable, buffer_size=io.DEFAULT_BUFFER_SIZE): + """See https://stackoverflow.com/a/20260030/3817588.""" + class IterStream(io.RawIOBase): + def __init__(self): + self.leftover = None + + def readable(self): + return True + + def readinto(self, b): + try: + l = len(b) # We're supposed to return at most this much + chunk = self.leftover or next(iterable) + output, self.leftover = chunk[:l], chunk[l:] + b[:len(output)] = output + return len(output) + except StopIteration: + return 0 # indicate EOF + + return io.BufferedReader(IterStream(), buffer_size=buffer_size) diff --git a/requirements.txt b/requirements.txt index 85240443..f2985992 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ +https://github.com/ryan-roemer/django-cloud-browser/archive/bb7ca58.zip +apache-libcloud==2.4.0 applicationinsights==0.11.7 coverage==4.5.3 dj-database-url==0.5.0 @@ -17,6 +19,7 @@ Faker==0.8.8 flake8==3.6.0 furl==2.0.0 gunicorn==19.9.0 +lockfile==0.12.2 mixer==6.1.3 model-mommy==1.6.0 psycopg2-binary==2.7.7