Browse Source

Implement data import from cloud

pull/213/head
Clemens Wolff 6 years ago
parent
commit
d165851965
11 changed files with 213 additions and 28 deletions
  1. 9
      app/app/settings.py
  2. 4
      app/app/urls.py
  3. 75
      app/server/api.py
  4. 3
      app/server/api_urls.py
  5. 2
      app/server/permissions.py
  6. 19
      app/server/static/components/mixin.js
  7. 42
      app/server/static/components/upload.pug
  8. 44
      app/server/tests/test_api.py
  9. 17
      app/server/tests/test_utils.py
  10. 23
      app/server/utils.py
  11. 3
      requirements.txt

9
app/app/settings.py

@ -57,6 +57,15 @@ INSTALLED_APPS = [
'webpack_loader', 'webpack_loader',
] ]
CLOUD_BROWSER_APACHE_LIBCLOUD_PROVIDER = env('CLOUD_BROWSER_LIBCLOUD_PROVIDER', None)
CLOUD_BROWSER_APACHE_LIBCLOUD_ACCOUNT = env('CLOUD_BROWSER_LIBCLOUD_ACCOUNT', None)
CLOUD_BROWSER_APACHE_LIBCLOUD_SECRET_KEY = env('CLOUD_BROWSER_LIBCLOUD_KEY', None)
if CLOUD_BROWSER_APACHE_LIBCLOUD_PROVIDER:
CLOUD_BROWSER_DATASTORE = 'ApacheLibcloud'
CLOUD_BROWSER_OBJECT_REDIRECT_URL = '/v1/cloud-upload'
INSTALLED_APPS.append('cloud_browser')
MIDDLEWARE = [ MIDDLEWARE = [
'django.middleware.security.SecurityMiddleware', 'django.middleware.security.SecurityMiddleware',
'whitenoise.middleware.WhiteNoiseMiddleware', 'whitenoise.middleware.WhiteNoiseMiddleware',

4
app/app/urls.py

@ -13,6 +13,7 @@ Including another URLconf
1. Import the include() function: from django.urls import include, path 1. Import the include() function: from django.urls import include, path
2. Add a URL to urlpatterns: path('blog/', include('blog.urls')) 2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
""" """
from django.conf import settings
from django.contrib import admin from django.contrib import admin
from django.urls import path, include, re_path from django.urls import path, include, re_path
from django.contrib.auth.views import PasswordResetView, LogoutView from django.contrib.auth.views import PasswordResetView, LogoutView
@ -29,3 +30,6 @@ urlpatterns = [
path('api-auth/', include('rest_framework.urls')), path('api-auth/', include('rest_framework.urls')),
path('v1/', include('server.api_urls')), path('v1/', include('server.api_urls')),
] ]
if 'cloud_browser' in settings.INSTALLED_APPS:
urlpatterns.append(path('cloud-storage/', include('cloud_browser.urls')))

75
app/server/api.py

@ -1,8 +1,11 @@
from collections import Counter from collections import Counter
from django.shortcuts import get_object_or_404
from django.conf import settings
from django.shortcuts import get_object_or_404, redirect
from django_filters.rest_framework import DjangoFilterBackend from django_filters.rest_framework import DjangoFilterBackend
from django.db.models import Count from django.db.models import Count
from libcloud.base import DriverType, get_driver
from libcloud.storage.types import ContainerDoesNotExistError, ObjectDoesNotExistError
from rest_framework import generics, filters, status from rest_framework import generics, filters, status
from rest_framework.exceptions import ParseError, ValidationError from rest_framework.exceptions import ParseError, ValidationError
from rest_framework.permissions import IsAuthenticated, IsAdminUser from rest_framework.permissions import IsAuthenticated, IsAdminUser
@ -16,7 +19,7 @@ from .models import Project, Label, Document
from .permissions import IsAdminUserAndWriteOnly, IsProjectUser, IsOwnAnnotation from .permissions import IsAdminUserAndWriteOnly, IsProjectUser, IsOwnAnnotation
from .serializers import ProjectSerializer, LabelSerializer, DocumentSerializer, UserSerializer from .serializers import ProjectSerializer, LabelSerializer, DocumentSerializer, UserSerializer
from .serializers import ProjectPolymorphicSerializer from .serializers import ProjectPolymorphicSerializer
from .utils import CSVParser, JSONParser, PlainTextParser, CoNLLParser
from .utils import CSVParser, JSONParser, PlainTextParser, CoNLLParser, iterable_to_io
from .utils import JSONLRenderer from .utils import JSONLRenderer
from .utils import JSONPainter, CSVPainter from .utils import JSONPainter, CSVPainter
@ -180,14 +183,26 @@ class TextUploadAPI(APIView):
def post(self, request, *args, **kwargs): def post(self, request, *args, **kwargs):
if 'file' not in request.data: if 'file' not in request.data:
raise ParseError('Empty content') raise ParseError('Empty content')
project = get_object_or_404(Project, pk=self.kwargs['project_id'])
parser = self.select_parser(request.data['format'])
data = parser.parse(request.data['file'])
storage = project.get_storage(data)
storage.save(self.request.user)
self.save_file(
user=request.user,
file=request.data['file'],
file_format=request.data['format'],
project_id=kwargs['project_id'],
)
return Response(status=status.HTTP_201_CREATED) return Response(status=status.HTTP_201_CREATED)
def select_parser(self, format):
@classmethod
def save_file(cls, user, file, file_format, project_id):
project = get_object_or_404(Project, pk=project_id)
parser = cls.select_parser(file_format)
data = parser.parse(file)
storage = project.get_storage(data)
storage.save(user)
@classmethod
def select_parser(cls, format):
if format == 'plain': if format == 'plain':
return PlainTextParser() return PlainTextParser()
elif format == 'csv': elif format == 'csv':
@ -200,6 +215,50 @@ class TextUploadAPI(APIView):
raise ValidationError('format {} is invalid.'.format(format)) raise ValidationError('format {} is invalid.'.format(format))
class CloudUploadAPI(APIView):
    """Import a file stored in cloud object storage into a project.

    The view is driven entirely by query parameters so that it can be used
    as a redirect target: ``project_id``, ``upload_format``, ``container``
    and ``object`` are required, ``next`` is optional.
    """

    # Same access rules as a direct file upload.
    permission_classes = TextUploadAPI.permission_classes

    def get(self, request, *args, **kwargs):
        """Fetch the cloud object and save it through ``TextUploadAPI``.

        Returns a redirect to ``next`` when that query parameter is given,
        otherwise an empty 201 response.

        Raises:
            ValidationError: a required query parameter is missing, the
                container or object does not exist, or cloud storage is
                not configured.
        """
        try:
            project_id = request.query_params['project_id']
            file_format = request.query_params['upload_format']
            cloud_container = request.query_params['container']
            cloud_object = request.query_params['object']
        except KeyError as ex:
            raise ValidationError('query parameter {} is missing'.format(ex)) from ex

        try:
            cloud_file = self.get_cloud_object_as_io(cloud_container, cloud_object)
        except ContainerDoesNotExistError as ex:
            raise ValidationError('cloud container {} does not exist'.format(cloud_container)) from ex
        except ObjectDoesNotExistError as ex:
            raise ValidationError('cloud object {} does not exist'.format(cloud_object)) from ex

        TextUploadAPI.save_file(
            user=request.user,
            file=cloud_file,
            file_format=file_format,
            project_id=project_id,
        )

        next_url = request.query_params.get('next')
        # NOTE(review): next_url is caller-controlled and is redirected to
        # without validation (open redirect). Consider restricting it to
        # same-host URLs; left unchanged here to keep existing behaviour.
        return redirect(next_url) if next_url else Response(status=status.HTTP_201_CREATED)

    @classmethod
    def get_cloud_object_as_io(cls, container_name, object_name):
        """Return the named cloud object as a buffered binary stream.

        The libcloud storage driver is built from the
        ``CLOUD_BROWSER_APACHE_LIBCLOUD_*`` settings.

        Raises:
            ValidationError: no cloud provider is configured.
            ContainerDoesNotExistError, ObjectDoesNotExistError: handled by
                ``get`` above; presumably raised by the libcloud lookups
                (verify against the libcloud driver in use).
        """
        provider = settings.CLOUD_BROWSER_APACHE_LIBCLOUD_PROVIDER
        if not provider:
            # The provider setting may be unset (None) while this endpoint is
            # still routed; fail with a clear client-facing error instead of
            # an AttributeError on None.lower().
            raise ValidationError('cloud storage is not configured')

        account = settings.CLOUD_BROWSER_APACHE_LIBCLOUD_ACCOUNT
        key = settings.CLOUD_BROWSER_APACHE_LIBCLOUD_SECRET_KEY

        driver = get_driver(DriverType.STORAGE, provider.lower())
        client = driver(account, key)

        cloud_container = client.get_container(container_name)
        cloud_object = cloud_container.get_object(object_name)

        return iterable_to_io(cloud_object.as_stream())
class TextDownloadAPI(APIView): class TextDownloadAPI(APIView):
permission_classes = (IsAuthenticated, IsProjectUser, IsAdminUser) permission_classes = (IsAuthenticated, IsProjectUser, IsAdminUser)
renderer_classes = (CSVRenderer, JSONLRenderer) renderer_classes = (CSVRenderer, JSONLRenderer)

3
app/server/api_urls.py

@ -6,12 +6,13 @@ from .api import ProjectList, ProjectDetail
from .api import LabelList, LabelDetail from .api import LabelList, LabelDetail
from .api import DocumentList, DocumentDetail from .api import DocumentList, DocumentDetail
from .api import AnnotationList, AnnotationDetail from .api import AnnotationList, AnnotationDetail
from .api import TextUploadAPI, TextDownloadAPI
from .api import TextUploadAPI, TextDownloadAPI, CloudUploadAPI
from .api import StatisticsAPI from .api import StatisticsAPI
urlpatterns = [ urlpatterns = [
path('me', Me.as_view(), name='me'), path('me', Me.as_view(), name='me'),
path('cloud-upload', CloudUploadAPI.as_view(), name='cloud_uploader'),
path('projects', ProjectList.as_view(), name='project_list'), path('projects', ProjectList.as_view(), name='project_list'),
path('projects/<int:project_id>', ProjectDetail.as_view(), name='project_detail'), path('projects/<int:project_id>', ProjectDetail.as_view(), name='project_detail'),
path('projects/<int:project_id>/statistics', path('projects/<int:project_id>/statistics',

2
app/server/permissions.py

@ -9,7 +9,7 @@ class IsProjectUser(BasePermission):
def has_permission(self, request, view): def has_permission(self, request, view):
user = request.user user = request.user
project_id = view.kwargs.get('project_id')
project_id = view.kwargs.get('project_id') or request.query_params.get('project_id')
project = get_object_or_404(Project, pk=project_id) project = get_object_or_404(Project, pk=project_id)
return user in project.users.all() return user in project.users.all()

19
app/server/static/components/mixin.js

@ -233,6 +233,23 @@ export const uploadMixin = {
hljs.initHighlighting(); hljs.initHighlighting();
}, },
computed: {
projectId() {
return window.location.pathname.split('/')[2];
},
postUploadUrl() {
return window.location.pathname.split('/').slice(0, -1).join('/');
},
cloudUploadUrl() {
return '/cloud-storage'
+ `?project_id=${this.projectId}`
+ `&upload_format=${this.format}`
+ `&next=${encodeURIComponent(this.postUploadUrl)}`
},
},
methods: { methods: {
upload() { upload() {
this.isLoading = true; this.isLoading = true;
@ -250,7 +267,7 @@ export const uploadMixin = {
.then((response) => { .then((response) => {
console.log(response); // eslint-disable-line no-console console.log(response); // eslint-disable-line no-console
this.messages = []; this.messages = [];
window.location = window.location.pathname.split('/').slice(0, -1).join('/');
window.location = this.postUploadUrl;
}) })
.catch((error) => { .catch((error) => {
this.isLoading = false; this.isLoading = false;

42
app/server/static/components/upload.pug

@ -24,19 +24,29 @@ div.columns(v-cloak="")
block example-format-area block example-format-area
div.control(style="margin-top: 1em;")
div.file.has-name.is-primary
label.file-label
input.file-input(
v-on:change="upload()"
type="file"
ref="file"
name="file"
required
)
span.file-cta.button(v-bind:class="{'is-loading': isLoading}")
span.file-icon
i.fas.fa-upload
span.file-label Select a file…
span.file-name {{ file.name }}
div.field.is-grouped(style="margin-top: 1em;")
div.control
div.file.has-name.is-primary
label.file-label
input.file-input(
v-on:change="upload()"
type="file"
ref="file"
name="file"
required
)
span.file-cta.button(v-bind:class="{'is-loading': isLoading}")
span.file-icon
i.fas.fa-upload
span.file-label Select a file…
span.file-name {{ file.name }}
div.control
a.button(
v-bind:href="cloudUploadUrl"
v-bind:class="{'is-loading': isLoading}"
)
span.file-icon
i.fa.fa-cloud-upload-alt
span Browse cloud…

44
app/server/tests/test_api.py

@ -803,6 +803,50 @@ class TestUploader(APITestCase):
expected_status=status.HTTP_201_CREATED) expected_status=status.HTTP_201_CREATED)
@override_settings(
    CLOUD_BROWSER_APACHE_LIBCLOUD_PROVIDER='LOCAL',
    CLOUD_BROWSER_APACHE_LIBCLOUD_ACCOUNT=os.path.dirname(DATA_DIR),
    CLOUD_BROWSER_APACHE_LIBCLOUD_SECRET_KEY='not-used',
)
class TestCloudUploader(TestUploader):
    """Run the uploader tests against the cloud upload endpoint.

    The libcloud settings are overridden so that the parent directory of
    ``DATA_DIR`` is the account and ``DATA_DIR`` itself is the default
    container.
    """

    def upload_test_helper(self, project_id, filename, format, expected_status, **kwargs):
        """GET the cloud uploader with the given parameters and check the status.

        Extra keyword arguments are sent as additional query parameters;
        ``container`` overrides the default container name.
        """
        container = kwargs.pop('container', os.path.basename(DATA_DIR))
        query_params = {
            'project_id': project_id,
            'upload_format': format,
            'container': container,
            'object': filename,
            **kwargs,
        }

        url = reverse('cloud_uploader')
        response = self.client.get(url, query_params)

        self.assertEqual(response.status_code, expected_status)

    def test_cannot_upload_with_missing_file(self):
        """An unknown object name is rejected with 400."""
        self.upload_test_helper(
            project_id=self.classification_project.id,
            filename='does-not-exist',
            format='json',
            expected_status=status.HTTP_400_BAD_REQUEST,
        )

    def test_cannot_upload_with_missing_container(self):
        """An unknown container name is rejected with 400."""
        self.upload_test_helper(
            project_id=self.classification_project.id,
            filename='example.jsonl',
            container='does-not-exist',
            format='json',
            expected_status=status.HTTP_400_BAD_REQUEST,
        )

    def test_cannot_upload_with_missing_query_parameters(self):
        """Omitting required query parameters is rejected with 400."""
        only_project = {'project_id': self.classification_project.id}
        response = self.client.get(reverse('cloud_uploader'), only_project)
        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)

    def test_can_upload_with_redirect(self):
        """Supplying ``next`` turns a successful upload into a 302."""
        self.upload_test_helper(
            project_id=self.classification_project.id,
            filename='example.jsonl',
            next='http://somewhere',
            format='json',
            expected_status=status.HTTP_302_FOUND,
        )
class TestParser(APITestCase): class TestParser(APITestCase):
def parser_helper(self, filename, parser, include_label=True): def parser_helper(self, filename, parser, include_label=True):

17
app/server/tests/test_utils.py

@ -1,10 +1,12 @@
import io
from django.test import TestCase from django.test import TestCase
from seqeval.metrics.sequence_labeling import get_entities from seqeval.metrics.sequence_labeling import get_entities
from ..models import Label, Document from ..models import Label, Document
from ..utils import BaseStorage, ClassificationStorage, SequenceLabelingStorage, Seq2seqStorage, CoNLLParser from ..utils import BaseStorage, ClassificationStorage, SequenceLabelingStorage, Seq2seqStorage, CoNLLParser
from ..utils import Color
from ..utils import Color, iterable_to_io
class TestColor(TestCase): class TestColor(TestCase):
@ -153,3 +155,16 @@ class TestCoNLLParser(TestCase):
'text': 'EU rejects German call', 'text': 'EU rejects German call',
'labels': [[0, 2, 'ORG'], [11, 17, 'MISC']] 'labels': [[0, 2, 'ORG'], [11, 17, 'MISC']]
}) })
class TestIterableToIO(TestCase):
    """Check that chunked bytes are exposed as one continuous stream."""

    def test(self):
        # Chunk boundaries deliberately fall inside and across lines.
        chunks = iter((b'fo', b'o\nbar\n', b'baz\nrest'))

        text_stream = io.TextIOWrapper(iterable_to_io(chunks))
        lines = text_stream.readlines()

        self.assertEqual(lines, ['foo\n', 'bar\n', 'baz\n', 'rest'])

23
app/server/utils.py

@ -343,6 +343,7 @@ class CSVParser(FileParser):
class JSONParser(FileParser): class JSONParser(FileParser):
def parse(self, file): def parse(self, file):
file = io.TextIOWrapper(file, encoding='utf-8')
data = [] data = []
for i, line in enumerate(file, start=1): for i, line in enumerate(file, start=1):
if len(data) >= IMPORT_BATCH_SIZE: if len(data) >= IMPORT_BATCH_SIZE:
@ -443,3 +444,25 @@ class Color:
def random(cls, seed=None): def random(cls, seed=None):
rgb = Random(seed).choices(range(256), k=3) rgb = Random(seed).choices(range(256), k=3)
return cls(*rgb) return cls(*rgb)
def iterable_to_io(iterable, buffer_size=io.DEFAULT_BUFFER_SIZE):
    """Expose an iterable of bytes chunks as a buffered binary stream.

    See https://stackoverflow.com/a/20260030/3817588.

    Args:
        iterable: any iterable (iterator, generator, list, ...) yielding
            bytes-like chunks of arbitrary size; empty chunks are skipped.
        buffer_size: buffer size passed to ``io.BufferedReader``.

    Returns:
        An ``io.BufferedReader`` that reads the chunks back to back.
    """
    # Accept any iterable, not only objects that already support next().
    chunks = iter(iterable)

    class IterStream(io.RawIOBase):
        def __init__(self):
            super().__init__()
            # Tail of the last chunk that did not fit into the caller's buffer.
            self.leftover = None

        def readable(self):
            return True

        def readinto(self, b):
            limit = len(b)  # We're supposed to return at most this much
            chunk = self.leftover
            # Skip empty chunks: returning 0 here would be read as EOF even
            # though the source may still have data to deliver.
            while not chunk:
                try:
                    chunk = next(chunks)
                except StopIteration:
                    self.leftover = None
                    return 0  # indicate EOF
            output, self.leftover = chunk[:limit], chunk[limit:]
            b[:len(output)] = output
            return len(output)

    return io.BufferedReader(IterStream(), buffer_size=buffer_size)

3
requirements.txt

@ -1,3 +1,5 @@
https://github.com/ryan-roemer/django-cloud-browser/archive/bb7ca58.zip
apache-libcloud==2.4.0
applicationinsights==0.11.7 applicationinsights==0.11.7
coverage==4.5.3 coverage==4.5.3
dj-database-url==0.5.0 dj-database-url==0.5.0
@ -17,6 +19,7 @@ Faker==0.8.8
flake8==3.6.0 flake8==3.6.0
furl==2.0.0 furl==2.0.0
gunicorn==19.9.0 gunicorn==19.9.0
lockfile==0.12.2
mixer==6.1.3 mixer==6.1.3
model-mommy==1.6.0 model-mommy==1.6.0
psycopg2-binary==2.7.7 psycopg2-binary==2.7.7

Loading…
Cancel
Save