Browse Source

Merge pull request #213 from CatalystCode/feature/import-data-from-cloud

Feature/Import data from cloud
pull/216/head
Hiroki Nakayama 5 years ago
committed by GitHub
parent
commit
175a29925c
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 326 additions and 74 deletions
  1. 9
      app/app/settings.py
  2. 4
      app/app/urls.py
  3. 101
      app/server/api.py
  4. 6
      app/server/api_urls.py
  5. 2
      app/server/permissions.py
  6. 1
      app/server/static/components/http.js
  7. 39
      app/server/static/components/mixin.js
  8. 13
      app/server/static/components/projects.vue
  9. 50
      app/server/static/components/upload.pug
  10. 132
      app/server/tests/test_api.py
  11. 17
      app/server/tests/test_utils.py
  12. 23
      app/server/utils.py
  13. 3
      requirements.txt

9
app/app/settings.py

@ -57,6 +57,15 @@ INSTALLED_APPS = [
'webpack_loader',
]
# Optional "import from cloud storage" feature.
# The provider, account and secret key are read through env(...) with None
# passed as the second argument (presumably the fallback when the variable is
# unset -- confirm against the env helper used by this settings module).
CLOUD_BROWSER_APACHE_LIBCLOUD_PROVIDER = env('CLOUD_BROWSER_LIBCLOUD_PROVIDER', None)
CLOUD_BROWSER_APACHE_LIBCLOUD_ACCOUNT = env('CLOUD_BROWSER_LIBCLOUD_ACCOUNT', None)
CLOUD_BROWSER_APACHE_LIBCLOUD_SECRET_KEY = env('CLOUD_BROWSER_LIBCLOUD_KEY', None)
# The cloud_browser app is only installed when a provider is configured; the
# redirect URL points at the cloud-upload endpoint served under /v1/.
if CLOUD_BROWSER_APACHE_LIBCLOUD_PROVIDER:
CLOUD_BROWSER_DATASTORE = 'ApacheLibcloud'
CLOUD_BROWSER_OBJECT_REDIRECT_URL = '/v1/cloud-upload'
INSTALLED_APPS.append('cloud_browser')
MIDDLEWARE = [
'django.middleware.security.SecurityMiddleware',
'whitenoise.middleware.WhiteNoiseMiddleware',

4
app/app/urls.py

@ -13,6 +13,7 @@ Including another URLconf
1. Import the include() function: from django.urls import include, path
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
"""
from django.conf import settings
from django.contrib import admin
from django.urls import path, include, re_path
from django.contrib.auth.views import PasswordResetView, LogoutView
@ -29,3 +30,6 @@ urlpatterns = [
path('api-auth/', include('rest_framework.urls')),
path('v1/', include('server.api_urls')),
]
# Mount the cloud browser URLs under cloud-storage/ only when the
# 'cloud_browser' app is listed in INSTALLED_APPS.
if 'cloud_browser' in settings.INSTALLED_APPS:
urlpatterns.append(path('cloud-storage/', include('cloud_browser.urls')))

101
app/server/api.py

@ -1,8 +1,11 @@
from collections import Counter
from django.shortcuts import get_object_or_404
from django.conf import settings
from django.shortcuts import get_object_or_404, redirect
from django_filters.rest_framework import DjangoFilterBackend
from django.db.models import Count
from libcloud.base import DriverType, get_driver
from libcloud.storage.types import ContainerDoesNotExistError, ObjectDoesNotExistError
from rest_framework import generics, filters, status
from rest_framework.exceptions import ParseError, ValidationError
from rest_framework.permissions import IsAuthenticated, IsAdminUser
@ -16,7 +19,7 @@ from .models import Project, Label, Document
from .permissions import IsAdminUserAndWriteOnly, IsProjectUser, IsOwnAnnotation
from .serializers import ProjectSerializer, LabelSerializer, DocumentSerializer, UserSerializer
from .serializers import ProjectPolymorphicSerializer
from .utils import CSVParser, JSONParser, PlainTextParser, CoNLLParser
from .utils import CSVParser, JSONParser, PlainTextParser, CoNLLParser, iterable_to_io
from .utils import JSONLRenderer
from .utils import JSONPainter, CSVPainter
@ -29,6 +32,15 @@ class Me(APIView):
return Response(serializer.data)
class Features(APIView):
    """Report which optional server features are switched on."""

    permission_classes = (IsAuthenticated,)

    def get(self, request, *args, **kwargs):
        """Return a mapping of feature flags.

        ``cloud_upload`` is true exactly when a libcloud provider is set in
        the Django settings.
        """
        cloud_provider = settings.CLOUD_BROWSER_APACHE_LIBCLOUD_PROVIDER
        feature_flags = {'cloud_upload': True if cloud_provider else False}
        return Response(feature_flags)
class ProjectList(generics.ListCreateAPIView):
queryset = Project.objects.all()
serializer_class = ProjectPolymorphicSerializer
@ -180,24 +192,87 @@ class TextUploadAPI(APIView):
def post(self, request, *args, **kwargs):
if 'file' not in request.data:
raise ParseError('Empty content')
project = get_object_or_404(Project, pk=self.kwargs['project_id'])
parser = self.select_parser(request.data['format'])
data = parser.parse(request.data['file'])
storage = project.get_storage(data)
storage.save(self.request.user)
self.save_file(
user=request.user,
file=request.data['file'],
file_format=request.data['format'],
project_id=kwargs['project_id'],
)
return Response(status=status.HTTP_201_CREATED)
def select_parser(self, format):
if format == 'plain':
@classmethod
def save_file(cls, user, file, file_format, project_id):
project = get_object_or_404(Project, pk=project_id)
parser = cls.select_parser(file_format)
data = parser.parse(file)
storage = project.get_storage(data)
storage.save(user)
@classmethod
def select_parser(cls, file_format):
if file_format == 'plain':
return PlainTextParser()
elif format == 'csv':
elif file_format == 'csv':
return CSVParser()
elif format == 'json':
elif file_format == 'json':
return JSONParser()
elif format == 'conll':
elif file_format == 'conll':
return CoNLLParser()
else:
raise ValidationError('format {} is invalid.'.format(format))
raise ValidationError('format {} is invalid.'.format(file_format))
class CloudUploadAPI(APIView):
    """Import a file stored in the configured cloud storage into a project.

    The request carries everything in the query string: ``project_id``,
    ``upload_format``, ``container`` and ``object`` are required, ``next`` is
    optional and controls what is returned after a successful import.
    """

    permission_classes = TextUploadAPI.permission_classes

    # NOTE(review): this handler changes server state on GET, so it is not
    # covered by Django's CSRF protection for unsafe methods. Confirm the
    # cloud browser redirect really needs GET before moving it to POST.
    def get(self, request, *args, **kwargs):
        """Fetch the cloud object, import it, then answer or redirect.

        Raises:
            ValidationError: if a required query parameter is missing, cloud
                upload is not configured, or the container/object does not
                exist.
        """
        try:
            project_id = request.query_params['project_id']
            file_format = request.query_params['upload_format']
            cloud_container = request.query_params['container']
            cloud_object = request.query_params['object']
        except KeyError as ex:
            raise ValidationError('query parameter {} is missing'.format(ex))

        try:
            cloud_file = self.get_cloud_object_as_io(cloud_container, cloud_object)
        except ContainerDoesNotExistError:
            raise ValidationError('cloud container {} does not exist'.format(cloud_container))
        except ObjectDoesNotExistError:
            raise ValidationError('cloud object {} does not exist'.format(cloud_object))

        TextUploadAPI.save_file(
            user=request.user,
            file=cloud_file,
            file_format=file_format,
            project_id=project_id,
        )

        next_url = request.query_params.get('next')

        # 'about:blank' is a sentinel: answer with an empty plain-text body
        # instead of redirecting (presumably so an embedding iframe has
        # nothing to render -- confirm against the upload page).
        if next_url == 'about:blank':
            return Response(data='', content_type='text/plain', status=status.HTTP_201_CREATED)

        # NOTE(review): ``next`` is caller-controlled and is redirected to
        # without validation (open redirect). The existing test suite expects
        # a 302 for an external host, so the behaviour is kept; consider
        # restricting it to same-host URLs.
        if next_url:
            return redirect(next_url)

        return Response(status=status.HTTP_201_CREATED)

    @classmethod
    def get_cloud_object_as_io(cls, container_name, object_name):
        """Return the named cloud object as a buffered binary stream.

        The storage driver is built from the ``CLOUD_BROWSER_APACHE_LIBCLOUD_*``
        settings.

        Raises:
            ValidationError: if no cloud provider is configured.
            ContainerDoesNotExistError, ObjectDoesNotExistError: propagated
                from the libcloud lookups.
        """
        provider = settings.CLOUD_BROWSER_APACHE_LIBCLOUD_PROVIDER
        # The provider setting may be None when the feature is disabled;
        # fail with a client error instead of crashing on ``None.lower()``.
        if not provider:
            raise ValidationError('cloud upload is not configured')

        account = settings.CLOUD_BROWSER_APACHE_LIBCLOUD_ACCOUNT
        key = settings.CLOUD_BROWSER_APACHE_LIBCLOUD_SECRET_KEY

        driver = get_driver(DriverType.STORAGE, provider.lower())
        client = driver(account, key)

        cloud_container = client.get_container(container_name)
        cloud_object = cloud_container.get_object(object_name)

        return iterable_to_io(cloud_object.as_stream())
class TextDownloadAPI(APIView):

6
app/server/api_urls.py

@ -1,17 +1,19 @@
from django.urls import path
from rest_framework.urlpatterns import format_suffix_patterns
from .api import Me
from .api import Me, Features
from .api import ProjectList, ProjectDetail
from .api import LabelList, LabelDetail
from .api import DocumentList, DocumentDetail
from .api import AnnotationList, AnnotationDetail
from .api import TextUploadAPI, TextDownloadAPI
from .api import TextUploadAPI, TextDownloadAPI, CloudUploadAPI
from .api import StatisticsAPI
urlpatterns = [
path('me', Me.as_view(), name='me'),
path('features', Features.as_view(), name='features'),
path('cloud-upload', CloudUploadAPI.as_view(), name='cloud_uploader'),
path('projects', ProjectList.as_view(), name='project_list'),
path('projects/<int:project_id>', ProjectDetail.as_view(), name='project_detail'),
path('projects/<int:project_id>/statistics',

2
app/server/permissions.py

@ -9,7 +9,7 @@ class IsProjectUser(BasePermission):
def has_permission(self, request, view):
user = request.user
project_id = view.kwargs.get('project_id')
project_id = view.kwargs.get('project_id') or request.query_params.get('project_id')
project = get_object_or_404(Project, pk=project_id)
return user in project.users.all()

1
app/server/static/components/http.js

@ -7,4 +7,5 @@ const HTTP = axios.create({
baseURL: `/v1/${baseUrl}`,
});
export const newHttpClient = axios.create;
export default HTTP;

39
app/server/static/components/mixin.js

@ -2,7 +2,7 @@ import * as marked from 'marked';
import hljs from 'highlight.js';
import VueJsonPretty from 'vue-json-pretty';
import isEmpty from 'lodash.isempty';
import HTTP from './http';
import HTTP, { newHttpClient } from './http';
import Messages from './messages.vue';
const getOffsetFromUrl = (url) => {
@ -227,13 +227,48 @@ export const uploadMixin = {
messages: [],
format: 'json',
isLoading: false,
isCloudUploadActive: false,
canUploadFromCloud: false,
}),
mounted() {
hljs.initHighlighting();
},
created() {
newHttpClient().get('/v1/features').then((response) => {
this.canUploadFromCloud = response.data.cloud_upload;
});
},
computed: {
// Third '/'-separated segment of the current pathname (index 2 after split).
projectId() {
return window.location.pathname.split('/')[2];
},
// Current pathname with its last '/'-separated segment removed.
postUploadUrl() {
return window.location.pathname.split('/').slice(0, -1).join('/');
},
// URL of the cloud storage browser, carrying the project id, the selected
// upload format and `next=about:blank` (URI-encoded) as query parameters.
cloudUploadUrl() {
return '/cloud-storage'
+ `?project_id=${this.projectId}`
+ `&upload_format=${this.format}`
+ `&next=${encodeURIComponent('about:blank')}`;
},
},
methods: {
cloudUpload() {
const iframeUrl = this.$refs.cloudUploadPane.contentWindow.location.href;
if (iframeUrl.indexOf('/v1/cloud-upload') > -1) {
this.isCloudUploadActive = false;
this.$nextTick(() => {
window.location.href = this.postUploadUrl;
});
}
},
upload() {
this.isLoading = true;
this.file = this.$refs.file.files[0];
@ -250,7 +285,7 @@ export const uploadMixin = {
.then((response) => {
console.log(response); // eslint-disable-line no-console
this.messages = [];
window.location = window.location.pathname.split('/').slice(0, -1).join('/');
window.location = this.postUploadUrl;
})
.catch((error) => {
this.isLoading = false;

13
app/server/static/components/projects.vue

@ -108,12 +108,11 @@
</template>
<script>
import axios from 'axios';
import { title, daysAgo } from './filter';
import { newHttpClient } from './http';
axios.defaults.xsrfCookieName = 'csrftoken';
axios.defaults.xsrfHeaderName = 'X-CSRFToken';
const baseUrl = window.location.href.split('/').slice(0, 3).join('/');
const httpClient = newHttpClient();
export default {
filters: { title, daysAgo },
@ -142,8 +141,8 @@ export default {
created() {
Promise.all([
axios.get(`${baseUrl}/v1/projects`),
axios.get(`${baseUrl}/v1/me`),
httpClient.get(`${baseUrl}/v1/projects`),
httpClient.get(`${baseUrl}/v1/me`),
]).then(([projects, me]) => {
this.items = projects.data;
this.username = me.data.username;
@ -153,7 +152,7 @@ export default {
methods: {
deleteProject() {
axios.delete(`${baseUrl}/v1/projects/${this.project.id}`).then(() => {
httpClient.delete(`${baseUrl}/v1/projects/${this.project.id}`).then(() => {
this.isDelete = false;
const index = this.items.indexOf(this.project);
this.items.splice(index, 1);
@ -186,7 +185,7 @@ export default {
guideline: 'Please write annotation guideline.',
resourcetype: this.resourceType(),
};
axios.post(`${baseUrl}/v1/projects`, payload)
httpClient.post(`${baseUrl}/v1/projects`, payload)
.then((response) => {
window.location = `${baseUrl}/projects/${response.data.id}/docs/create`;
})

50
app/server/static/components/upload.pug

@ -24,19 +24,37 @@ div.columns(v-cloak="")
block example-format-area
div.control(style="margin-top: 1em;")
div.file.has-name.is-primary
label.file-label
input.file-input(
v-on:change="upload()"
type="file"
ref="file"
name="file"
required
)
span.file-cta.button(v-bind:class="{'is-loading': isLoading}")
span.file-icon
i.fas.fa-upload
span.file-label Select a file…
span.file-name {{ file.name }}
div.field.is-grouped(style="margin-top: 1em;")
div.control
div.file.has-name.is-primary
label.file-label
input.file-input(
v-on:change="upload()"
type="file"
ref="file"
name="file"
required
)
span.file-cta.button(v-bind:class="{'is-loading': isLoading}")
span.file-icon
i.fas.fa-upload
span.file-label Select a file…
span.file-name {{ file.name }}
div.control(v-if="canUploadFromCloud")
button.button(
v-on:click="isCloudUploadActive = !isCloudUploadActive"
v-bind:class="{'is-loading': isLoading}"
)
span.file-icon
i.fa.fa-cloud-upload-alt
span Browse cloud…
div(v-if="isCloudUploadActive")
iframe(
ref="cloudUploadPane"
v-bind:src="cloudUploadUrl"
v-on:load="cloudUpload"
style="width: 100%; height: 20em;"
)

132
app/server/tests/test_api.py

@ -684,22 +684,21 @@ class TestUploader(APITestCase):
cls.labeling_project = mommy.make('server.SequenceLabelingProject',
users=[super_user], project_type=SEQUENCE_LABELING)
cls.seq2seq_project = mommy.make('server.Seq2seqProject', users=[super_user], project_type=SEQ2SEQ)
cls.classification_url = reverse(viewname='doc_uploader', args=[cls.classification_project.id])
cls.classification_labels_url = reverse(viewname='label_list', args=[cls.classification_project.id])
cls.labeling_url = reverse(viewname='doc_uploader', args=[cls.labeling_project.id])
cls.labeling_labels_url = reverse(viewname='label_list', args=[cls.labeling_project.id])
cls.seq2seq_url = reverse(viewname='doc_uploader', args=[cls.seq2seq_project.id])
def setUp(self):
self.client.login(username=self.super_user_name,
password=self.super_user_pass)
def upload_test_helper(self, url, filename, format, expected_status):
def upload_test_helper(self, project_id, filename, file_format, expected_status, **kwargs):
url = reverse(viewname='doc_uploader', args=[project_id])
with open(os.path.join(DATA_DIR, filename)) as f:
response = self.client.post(url, data={'file': f, 'format': format})
response = self.client.post(url, data={'file': f, 'format': file_format})
self.assertEqual(response.status_code, expected_status)
def label_test_helper(self, url, expected_labels, expected_label_keys):
def label_test_helper(self, project_id, expected_labels, expected_label_keys):
url = reverse(viewname='label_list', args=[project_id])
expected_keys = {key for label in expected_labels for key in label}
response = self.client.get(url).json()
@ -714,49 +713,49 @@ class TestUploader(APITestCase):
self.assertIsNotNone(label.get(expected_label_key))
def test_can_upload_conll_format_file(self):
self.upload_test_helper(url=self.labeling_url,
self.upload_test_helper(project_id=self.labeling_project.id,
filename='labeling.conll',
format='conll',
file_format='conll',
expected_status=status.HTTP_201_CREATED)
def test_cannot_upload_wrong_conll_format_file(self):
self.upload_test_helper(url=self.labeling_url,
self.upload_test_helper(project_id=self.labeling_project.id,
filename='labeling.invalid.conll',
format='conll',
file_format='conll',
expected_status=status.HTTP_400_BAD_REQUEST)
def test_can_upload_classification_csv(self):
self.upload_test_helper(url=self.classification_url,
self.upload_test_helper(project_id=self.classification_project.id,
filename='example.csv',
format='csv',
file_format='csv',
expected_status=status.HTTP_201_CREATED)
def test_can_upload_seq2seq_csv(self):
self.upload_test_helper(url=self.seq2seq_url,
self.upload_test_helper(project_id=self.seq2seq_project.id,
filename='example.csv',
format='csv',
file_format='csv',
expected_status=status.HTTP_201_CREATED)
def test_cannot_upload_csv_file_does_not_match_column_and_row(self):
self.upload_test_helper(url=self.classification_url,
self.upload_test_helper(project_id=self.classification_project.id,
filename='example.invalid.1.csv',
format='csv',
file_format='csv',
expected_status=status.HTTP_400_BAD_REQUEST)
def test_cannot_upload_csv_file_has_too_many_columns(self):
self.upload_test_helper(url=self.classification_url,
self.upload_test_helper(project_id=self.classification_project.id,
filename='example.invalid.2.csv',
format='csv',
file_format='csv',
expected_status=status.HTTP_400_BAD_REQUEST)
def test_can_upload_classification_jsonl(self):
self.upload_test_helper(url=self.classification_url,
self.upload_test_helper(project_id=self.classification_project.id,
filename='classification.jsonl',
format='json',
file_format='json',
expected_status=status.HTTP_201_CREATED)
self.label_test_helper(
url=self.classification_labels_url,
project_id=self.classification_project.id,
expected_labels=[
{'text': 'positive', 'suffix_key': 'p', 'prefix_key': None},
{'text': 'negative', 'suffix_key': 'n', 'prefix_key': None},
@ -768,13 +767,13 @@ class TestUploader(APITestCase):
])
def test_can_upload_labeling_jsonl(self):
self.upload_test_helper(url=self.labeling_url,
self.upload_test_helper(project_id=self.labeling_project.id,
filename='labeling.jsonl',
format='json',
file_format='json',
expected_status=status.HTTP_201_CREATED)
self.label_test_helper(
url=self.labeling_labels_url,
project_id=self.labeling_project.id,
expected_labels=[
{'text': 'LOC', 'suffix_key': 'l', 'prefix_key': None},
{'text': 'ORG', 'suffix_key': 'o', 'prefix_key': None},
@ -786,24 +785,93 @@ class TestUploader(APITestCase):
])
def test_can_upload_seq2seq_jsonl(self):
self.upload_test_helper(url=self.seq2seq_url,
self.upload_test_helper(project_id=self.seq2seq_project.id,
filename='seq2seq.jsonl',
format='json',
file_format='json',
expected_status=status.HTTP_201_CREATED)
def test_can_upload_plain_text(self):
self.upload_test_helper(url=self.classification_url,
self.upload_test_helper(project_id=self.classification_project.id,
filename='example.txt',
format='plain',
file_format='plain',
expected_status=status.HTTP_201_CREATED)
def test_can_upload_data_without_label(self):
self.upload_test_helper(url=self.classification_url,
self.upload_test_helper(project_id=self.classification_project.id,
filename='example.jsonl',
format='json',
file_format='json',
expected_status=status.HTTP_201_CREATED)
@override_settings(
    CLOUD_BROWSER_APACHE_LIBCLOUD_PROVIDER='LOCAL',
    CLOUD_BROWSER_APACHE_LIBCLOUD_ACCOUNT=os.path.dirname(DATA_DIR),
    CLOUD_BROWSER_APACHE_LIBCLOUD_SECRET_KEY='not-used',
)
class TestCloudUploader(TestUploader):
    """Re-run the uploader tests through the cloud upload endpoint.

    The settings point the LOCAL provider at the parent of ``DATA_DIR``, and
    the default container is the ``DATA_DIR`` directory name.
    """

    def upload_test_helper(self, project_id, filename, file_format, expected_status, **kwargs):
        """GET the cloud uploader with the given parameters and check the status.

        ``container`` may be passed as a keyword to override the default; any
        other keyword is sent as an extra query parameter.
        """
        container = kwargs.pop('container', os.path.basename(DATA_DIR))
        query_params = dict(
            project_id=project_id,
            upload_format=file_format,
            container=container,
            object=filename,
        )
        query_params.update(kwargs)

        endpoint = reverse('cloud_uploader')
        response = self.client.get(endpoint, query_params)

        self.assertEqual(response.status_code, expected_status)

    def test_cannot_upload_with_missing_file(self):
        self.upload_test_helper(
            project_id=self.classification_project.id,
            filename='does-not-exist',
            file_format='json',
            expected_status=status.HTTP_400_BAD_REQUEST,
        )

    def test_cannot_upload_with_missing_container(self):
        self.upload_test_helper(
            project_id=self.classification_project.id,
            filename='example.jsonl',
            container='does-not-exist',
            file_format='json',
            expected_status=status.HTTP_400_BAD_REQUEST,
        )

    def test_cannot_upload_with_missing_query_parameters(self):
        incomplete_params = {'project_id': self.classification_project.id}
        response = self.client.get(reverse('cloud_uploader'), incomplete_params)
        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)

    def test_can_upload_with_redirect(self):
        self.upload_test_helper(
            project_id=self.classification_project.id,
            filename='example.jsonl',
            next='http://somewhere',
            file_format='json',
            expected_status=status.HTTP_302_FOUND,
        )

    def test_can_upload_with_redirect_to_blank(self):
        self.upload_test_helper(
            project_id=self.classification_project.id,
            filename='example.jsonl',
            next='about:blank',
            file_format='json',
            expected_status=status.HTTP_201_CREATED,
        )
class TestFeatures(APITestCase):
    """Checks for the feature-flag endpoint."""

    @classmethod
    def setUpTestData(cls):
        cls.user_name, cls.user_pass = 'user_name', 'user_pass'
        cls.user = User.objects.create_user(
            username=cls.user_name,
            password=cls.user_pass,
            email='fizz@buzz.com',
        )

    def setUp(self):
        self.client.login(username=self.user_name, password=self.user_pass)

    @override_settings(CLOUD_BROWSER_APACHE_LIBCLOUD_PROVIDER=None)
    def test_no_cloud_upload(self):
        # With no provider set, the flag must be falsy.
        features = self.client.get(reverse('features')).json()
        self.assertFalse(features.get('cloud_upload'))
class TestParser(APITestCase):
def parser_helper(self, filename, parser, include_label=True):

17
app/server/tests/test_utils.py

@ -1,10 +1,12 @@
import io
from django.test import TestCase
from seqeval.metrics.sequence_labeling import get_entities
from ..models import Label, Document
from ..utils import BaseStorage, ClassificationStorage, SequenceLabelingStorage, Seq2seqStorage, CoNLLParser
from ..utils import Color
from ..utils import Color, iterable_to_io
class TestColor(TestCase):
@ -153,3 +155,16 @@ class TestCoNLLParser(TestCase):
'text': 'EU rejects German call',
'labels': [[0, 2, 'ORG'], [11, 17, 'MISC']]
})
class TestIterableToIO(TestCase):
    """Chunks that split lines at arbitrary points must read back as whole lines."""

    def test(self):
        # Chunk boundaries deliberately fall mid-line and mid-word.
        chunks = (piece for piece in (b'fo', b'o\nbar\n', b'baz\nrest'))

        text_stream = io.TextIOWrapper(iterable_to_io(chunks))
        lines = text_stream.readlines()

        self.assertEqual(lines, ['foo\n', 'bar\n', 'baz\n', 'rest'])

23
app/server/utils.py

@ -344,6 +344,7 @@ class CSVParser(FileParser):
class JSONParser(FileParser):
def parse(self, file):
file = io.TextIOWrapper(file, encoding='utf-8')
data = []
for i, line in enumerate(file, start=1):
if len(data) >= IMPORT_BATCH_SIZE:
@ -444,3 +445,25 @@ class Color:
def random(cls, seed=None):
rgb = Random(seed).choices(range(256), k=3)
return cls(*rgb)
def iterable_to_io(iterable, buffer_size=io.DEFAULT_BUFFER_SIZE):
    """Wrap an iterable of bytes chunks in a buffered, readable binary stream.

    See https://stackoverflow.com/a/20260030/3817588.

    Args:
        iterable: any iterable (list, generator, iterator, ...) yielding
            bytes-like chunks. Empty chunks are skipped.
        buffer_size: buffer size handed to ``io.BufferedReader``.

    Returns:
        An ``io.BufferedReader`` that yields the concatenated chunks.
    """
    # Accept plain iterables as well as iterators; ``next`` needs an iterator.
    chunks = iter(iterable)

    class IterStream(io.RawIOBase):
        def __init__(self):
            super().__init__()
            # Tail of the last chunk that did not fit into the caller's buffer.
            self.leftover = None

        def readable(self):
            return True

        def readinto(self, b):
            limit = len(b)  # We're supposed to return at most this much
            chunk = self.leftover
            # Skip empty chunks: returning 0 means EOF to the buffered layer,
            # so an empty chunk must never be reported as a read result.
            while not chunk:
                try:
                    chunk = next(chunks)
                except StopIteration:
                    self.leftover = None
                    return 0  # indicate EOF
            output, self.leftover = chunk[:limit], chunk[limit:]
            b[:len(output)] = output
            return len(output)

    return io.BufferedReader(IterStream(), buffer_size=buffer_size)

3
requirements.txt

@ -1,7 +1,9 @@
apache-libcloud==2.4.0
applicationinsights==0.11.7
coverage==4.5.3
dj-database-url==0.5.0
Django==2.1.7
django-cloud-browser==0.5.0
django-filter==2.0.0
django-heroku==0.3.1
django-webpack-loader==0.6.0
@ -17,6 +19,7 @@ Faker==0.8.8
flake8==3.6.0
furl==2.0.0
gunicorn==19.9.0
lockfile==0.12.2
mixer==6.1.3
model-mommy==1.6.0
psycopg2-binary==2.7.7

Loading…
Cancel
Save