Browse Source

Merge pull request #1640 from doccano/enhancement/createDataImport

[Enhancement] Separate data import app
pull/1641/head
Hiroki Nakayama 3 years ago
committed by GitHub
parent
commit
6c2ffdcb56
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
56 changed files with 136 additions and 200 deletions
  1. 2
      Pipfile
  2. 19
      backend/api/celery_tasks.py
  3. 2
      backend/api/tests/api/utils.py
  4. 19
      backend/api/urls.py
  5. 64
      backend/api/views/import_export.py
  6. 1
      backend/app/settings.py
  7. 1
      backend/app/urls.py
  8. 0
      backend/data_import/__init__.py
  9. 0
      backend/data_import/admin.py
  10. 6
      backend/data_import/apps.py
  11. 23
      backend/data_import/celery_tasks.py
  12. 0
      backend/data_import/migrations/__init__.py
  13. 0
      backend/data_import/models.py
  14. 0
      backend/data_import/pipeline/__init__.py
  15. 4
      backend/data_import/pipeline/builders.py
  16. 6
      backend/data_import/pipeline/catalog.py
  17. 4
      backend/data_import/pipeline/cleaners.py
  18. 2
      backend/data_import/pipeline/data.py
  19. 0
      backend/data_import/pipeline/examples.py
  20. 0
      backend/data_import/pipeline/exceptions.py
  21. 22
      backend/data_import/pipeline/factories.py
  22. 8
      backend/data_import/pipeline/labels.py
  23. 2
      backend/data_import/pipeline/parsers.py
  24. 4
      backend/data_import/pipeline/readers.py
  25. 4
      backend/data_import/pipeline/writers.py
  26. 0
      backend/data_import/tests/__init__.py
  27. 0
      backend/data_import/tests/data/example.txt
  28. 0
      backend/data_import/tests/data/example.utf16.csv
  29. 0
      backend/data_import/tests/data/intent/example.jsonl
  30. 0
      backend/data_import/tests/data/label/invalid_labels.json
  31. 0
      backend/data_import/tests/data/label/valid_labels.json
  32. 0
      backend/data_import/tests/data/seq2seq/example.csv
  33. 0
      backend/data_import/tests/data/seq2seq/example.json
  34. 0
      backend/data_import/tests/data/seq2seq/example.jsonl
  35. 0
      backend/data_import/tests/data/sequence_labeling/example.conll
  36. 0
      backend/data_import/tests/data/sequence_labeling/example.jsonl
  37. 0
      backend/data_import/tests/data/sequence_labeling/example_overlapping.jsonl
  38. 0
      backend/data_import/tests/data/sequence_labeling/labeling.invalid.conll
  39. 0
      backend/data_import/tests/data/sequence_labeling/labeling.trailing.conll
  40. 0
      backend/data_import/tests/data/text_classification/example.csv
  41. 0
      backend/data_import/tests/data/text_classification/example.invalid.2.csv
  42. 0
      backend/data_import/tests/data/text_classification/example.invalid.2.xlsx
  43. 0
      backend/data_import/tests/data/text_classification/example.json
  44. 0
      backend/data_import/tests/data/text_classification/example.jsonl
  45. 0
      backend/data_import/tests/data/text_classification/example.xlsx
  46. 0
      backend/data_import/tests/data/text_classification/example_fasttext.txt
  47. 0
      backend/data_import/tests/data/text_classification/example_one_column_no_header.xlsx
  48. 0
      backend/data_import/tests/data/text_classification/example_out_of_order_columns.csv
  49. 8
      backend/data_import/tests/test_builder.py
  50. 2
      backend/data_import/tests/test_parser.py
  51. 66
      backend/data_import/tests/test_tasks.py
  52. 19
      backend/data_import/tests/test_views.py
  53. 17
      backend/data_import/urls.py
  54. 16
      backend/data_import/views.py
  55. 4
      frontend/i18n/en/projects/dataset.js
  56. 11
      frontend/pages/projects/_id/upload/index.vue

2
Pipfile

@ -60,6 +60,6 @@ python_version = "3.8"
isort = "isort api -c --skip migrations"
flake8 = "flake8 --filename \"*.py\" --extend-exclude \"server,api/migrations,api/views/__init__.py,authentification,api/apps.py\""
wait_for_db = "python manage.py wait_for_db"
test = "python manage.py test api.tests roles.tests members.tests metrics.tests users.tests"
test = "python manage.py test api.tests roles.tests members.tests metrics.tests users.tests data_import.tests"
migrate = "python manage.py migrate"
collectstatic = "python manage.py collectstatic --noinput"

19
backend/api/celery_tasks.py

@ -1,34 +1,15 @@
from celery import shared_task
from celery.utils.log import get_task_logger
from django.conf import settings
from django.contrib.auth import get_user_model
from django.shortcuts import get_object_or_404
from .models import Project
from .views.download.factory import create_repository, create_writer
from .views.download.service import ExportApplicationService
from .views.upload.factories import (create_bulder, create_cleaner,
create_parser)
from .views.upload.readers import Reader
from .views.upload.writers import BulkWriter
logger = get_task_logger(__name__)
@shared_task
def ingest_data(user_id, project_id, filenames, format: str, **kwargs):
    """Import the uploaded files into a project as a Celery shared task.

    Args:
        user_id: Primary key of the user the import is performed for.
        project_id: Primary key of the target project.
        filenames: Paths of the files handed to the reader.
        format: Name of the file format, used to select the parser.
        **kwargs: Extra options forwarded to both the parser and the
            builder factories.

    Returns:
        A dict with a single ``'error'`` key holding the writer's
        collected errors.

    Raises:
        Http404: If the project or the user does not exist
            (raised by ``get_object_or_404``).
    """
    # Resolve the project first, then the user, so a missing project is
    # reported before a missing user.
    project = get_object_or_404(Project, pk=project_id)
    user = get_object_or_404(get_user_model(), pk=user_id)

    reader = Reader(
        filenames=filenames,
        parser=create_parser(format, **kwargs),
        builder=create_bulder(project, **kwargs),
    )
    cleaner = create_cleaner(project)

    writer = BulkWriter(batch_size=settings.IMPORT_BATCH_SIZE)
    writer.save(reader, project, user, cleaner)
    return {'error': writer.errors}
@shared_task
def export_dataset(project_id, format: str, export_approved=False):
project = get_object_or_404(Project, pk=project_id)

2
backend/api/tests/api/utils.py

@ -15,7 +15,7 @@ from ...models import (DOCUMENT_CLASSIFICATION, IMAGE_CLASSIFICATION,
INTENT_DETECTION_AND_SLOT_FILLING, SEQ2SEQ,
SEQUENCE_LABELING, SPEECH2TEXT)
DATA_DIR = os.path.join(os.path.dirname(__file__), '../data')
DATA_DIR = os.path.join(os.path.dirname(__file__), '../../../data_import/tests/data')
ProjectData = namedtuple('ProjectData', ['item', 'users'])

19
backend/api/urls.py

@ -1,21 +1,10 @@
from django.urls import include, path
from .views import (annotation, auto_labeling, comment, example, example_state,
export_dataset, health, import_dataset, import_export,
label, project, tag, task)
export_dataset, health, label, project, tag, task)
from .views.tasks import category, relation, span, text
urlpatterns_project = [
path(
route='upload',
view=import_dataset.UploadAPI.as_view(),
name='upload'
),
path(
route='catalog',
view=import_dataset.DatasetCatalog.as_view(),
name='catalog'
),
path(
route='download-format',
view=export_dataset.DownloadDatasetCatalog.as_view(),
@ -214,12 +203,6 @@ urlpatterns = [
view=health.Health.as_view(),
name='health'
),
path('fp/', include('django_drf_filepond.urls')),
path(
route='features',
view=import_export.Features.as_view(),
name='features'
),
path(
route='projects',
view=project.ProjectList.as_view(),

64
backend/api/views/import_export.py

@ -1,64 +0,0 @@
from django.conf import settings
from rest_framework.permissions import IsAuthenticated
from rest_framework.response import Response
from rest_framework.views import APIView
class Features(APIView):
    """Expose feature flags to authenticated clients."""

    permission_classes = (IsAuthenticated,)

    def get(self, request, *args, **kwargs):
        """Return whether cloud upload is enabled.

        ``cloud_upload`` is true when the
        ``CLOUD_BROWSER_APACHE_LIBCLOUD_PROVIDER`` setting is truthy.
        """
        cloud_upload_enabled = bool(settings.CLOUD_BROWSER_APACHE_LIBCLOUD_PROVIDER)
        return Response({'cloud_upload': cloud_upload_enabled})
# class CloudUploadAPI(APIView):
# permission_classes = TextUploadAPI.permission_classes
#
# def get(self, request, *args, **kwargs):
# try:
# project_id = request.query_params['project_id']
# file_format = request.query_params['upload_format']
# cloud_container = request.query_params['container']
# cloud_object = request.query_params['object']
# except KeyError as ex:
# raise ValidationError('query parameter {} is missing'.format(ex))
#
# try:
# cloud_file = self.get_cloud_object_as_io(cloud_container, cloud_object)
# except ContainerDoesNotExistError:
# raise ValidationError('cloud container {} does not exist'.format(cloud_container))
# except ObjectDoesNotExistError:
# raise ValidationError('cloud object {} does not exist'.format(cloud_object))
#
# TextUploadAPI.save_file(
# user=request.user,
# file=cloud_file,
# file_format=file_format,
# project_id=project_id,
# )
#
# next_url = request.query_params.get('next')
#
# if next_url == 'about:blank':
# return Response(data='', content_type='text/plain', status=status.HTTP_201_CREATED)
#
# if next_url:
# return redirect(next_url)
#
# return Response(status=status.HTTP_201_CREATED)
#
# @classmethod
# def get_cloud_object_as_io(cls, container_name, object_name):
# provider = settings.CLOUD_BROWSER_APACHE_LIBCLOUD_PROVIDER.lower()
# account = settings.CLOUD_BROWSER_APACHE_LIBCLOUD_ACCOUNT
# key = settings.CLOUD_BROWSER_APACHE_LIBCLOUD_SECRET_KEY
#
# driver = get_driver(DriverType.STORAGE, provider)
# client = driver(account, key)
#
# cloud_container = client.get_container(container_name)
# cloud_object = cloud_container.get_object(object_name)
#
# return iterable_to_io(cloud_object.as_stream())

1
backend/app/settings.py

@ -56,6 +56,7 @@ INSTALLED_APPS = [
'members.apps.MembersConfig',
'metrics.apps.MetricsConfig',
'users.apps.UsersConfig',
'data_import.apps.DataImportConfig',
'rest_framework',
'rest_framework.authtoken',
'django_filters',

1
backend/app/urls.py

@ -43,6 +43,7 @@ urlpatterns += [
path('v1/', include('api.urls')),
path('v1/', include('roles.urls')),
path('v1/', include('users.urls')),
path('v1/', include('data_import.urls')),
path('v1/projects/<int:project_id>/', include('members.urls')),
path('v1/projects/<int:project_id>/metrics/', include('metrics.urls')),
path('swagger/', schema_view.with_ui('swagger', cache_timeout=0), name='schema-swagger-ui'),

backend/api/tests/upload/__init__.py → backend/data_import/__init__.py

backend/api/views/upload/__init__.py → backend/data_import/admin.py

6
backend/data_import/apps.py

@ -0,0 +1,6 @@
from django.apps import AppConfig
class DataImportConfig(AppConfig):
    """Django application configuration for the ``data_import`` app."""

    # Dotted path of the application this configuration applies to.
    name = 'data_import'
    # Primary-key field type used for models that do not declare one.
    default_auto_field = 'django.db.models.BigAutoField'

23
backend/data_import/celery_tasks.py

@ -0,0 +1,23 @@
from celery import shared_task
from django.conf import settings
from django.contrib.auth import get_user_model
from django.shortcuts import get_object_or_404
from api.models import Project
from .pipeline.factories import create_parser, create_bulder, create_cleaner
from .pipeline.readers import Reader
from .pipeline.writers import BulkWriter
@shared_task
def import_dataset(user_id, project_id, filenames, file_format: str, **kwargs):
    """Import the uploaded files into a project as a Celery shared task.

    Args:
        user_id: Primary key of the user the import is performed for.
        project_id: Primary key of the target project.
        filenames: Paths of the files handed to the reader.
        file_format: Name of the file format, used to select the parser.
        **kwargs: Extra options forwarded to both the parser and the
            builder factories.

    Returns:
        A dict with a single ``'error'`` key holding the writer's
        collected errors.

    Raises:
        Http404: If the project or the user does not exist
            (raised by ``get_object_or_404``).
    """
    # Resolve the project first, then the user, so a missing project is
    # reported before a missing user.
    project = get_object_or_404(Project, pk=project_id)
    user = get_object_or_404(get_user_model(), pk=user_id)

    reader = Reader(
        filenames=filenames,
        parser=create_parser(file_format, **kwargs),
        builder=create_bulder(project, **kwargs),
    )
    cleaner = create_cleaner(project)

    writer = BulkWriter(batch_size=settings.IMPORT_BATCH_SIZE)
    writer.save(reader, project, user, cleaner)
    return {'error': writer.errors}

0
backend/data_import/migrations/__init__.py

0
backend/data_import/models.py

0
backend/data_import/pipeline/__init__.py

backend/api/views/upload/builders.py → backend/data_import/pipeline/builders.py

@ -5,8 +5,8 @@ from typing import Any, Dict, List, Optional, Type, TypeVar
from pydantic import ValidationError
from .data import BaseData
from .exception import FileParseException
from .label import Label
from .exceptions import FileParseException
from .labels import Label
from .readers import Builder, Record
logger = getLogger(__name__)

backend/api/views/upload/catalog.py → backend/data_import/pipeline/catalog.py

@ -4,9 +4,9 @@ from typing import Dict, List, Type
from pydantic import BaseModel
from typing_extensions import Literal
from ...models import (DOCUMENT_CLASSIFICATION, IMAGE_CLASSIFICATION,
INTENT_DETECTION_AND_SLOT_FILLING, SEQ2SEQ,
SEQUENCE_LABELING, SPEECH2TEXT)
from api.models import (DOCUMENT_CLASSIFICATION, IMAGE_CLASSIFICATION,
INTENT_DETECTION_AND_SLOT_FILLING, SEQ2SEQ,
SEQUENCE_LABELING, SPEECH2TEXT)
from . import examples
encodings = Literal[

backend/api/views/upload/cleaners.py → backend/data_import/pipeline/cleaners.py

@ -1,7 +1,7 @@
from typing import List
from ...models import Project
from .label import CategoryLabel, Label, SpanLabel
from api.models import Project
from .labels import CategoryLabel, Label, SpanLabel
class Cleaner:

backend/api/views/upload/data.py → backend/data_import/pipeline/data.py

@ -4,7 +4,7 @@ from typing import Any, Dict
from pydantic import BaseModel, validator
from ...models import Example, Project
from api.models import Example, Project
class BaseData(BaseModel, abc.ABC):

backend/api/views/upload/examples.py → backend/data_import/pipeline/examples.py

backend/api/views/upload/exception.py → backend/data_import/pipeline/exceptions.py

backend/api/views/upload/factories.py → backend/data_import/pipeline/factories.py

@ -1,7 +1,7 @@
from ...models import (DOCUMENT_CLASSIFICATION, IMAGE_CLASSIFICATION,
INTENT_DETECTION_AND_SLOT_FILLING, SEQ2SEQ,
SEQUENCE_LABELING, SPEECH2TEXT)
from . import builders, catalog, cleaners, data, label, parsers, readers
from api.models import (DOCUMENT_CLASSIFICATION, IMAGE_CLASSIFICATION,
INTENT_DETECTION_AND_SLOT_FILLING, SEQ2SEQ,
SEQUENCE_LABELING, SPEECH2TEXT)
from . import builders, catalog, cleaners, data, labels, parsers, readers
def get_data_class(project_type: str):
@ -37,11 +37,11 @@ def create_parser(file_format: str, **kwargs):
def get_label_class(project_type: str):
mapping = {
DOCUMENT_CLASSIFICATION: label.CategoryLabel,
SEQUENCE_LABELING: label.SpanLabel,
SEQ2SEQ: label.TextLabel,
IMAGE_CLASSIFICATION: label.CategoryLabel,
SPEECH2TEXT: label.TextLabel,
DOCUMENT_CLASSIFICATION: labels.CategoryLabel,
SEQUENCE_LABELING: labels.SpanLabel,
SEQ2SEQ: labels.TextLabel,
IMAGE_CLASSIFICATION: labels.CategoryLabel,
SPEECH2TEXT: labels.TextLabel,
}
if project_type not in mapping:
ValueError(f'Invalid project type: {project_type}')
@ -71,11 +71,11 @@ def create_bulder(project, **kwargs):
label_columns = [
builders.LabelColumn(
name='cats',
value_class=label.CategoryLabel
value_class=labels.CategoryLabel
),
builders.LabelColumn(
name='entities',
value_class=label.SpanLabel
value_class=labels.SpanLabel
)
]
else:

backend/api/views/upload/label.py → backend/data_import/pipeline/labels.py

@ -3,10 +3,10 @@ from typing import Any, Dict, Optional, Union
from pydantic import BaseModel, validator
from ...models import Category, CategoryType
from ...models import Label as LabelModel
from ...models import Project, Span, SpanType
from ...models import TextLabel as TL
from api.models import Category, CategoryType
from api.models import Label as LabelModel
from api.models import Project, Span, SpanType
from api.models import TextLabel as TL
class Label(BaseModel, abc.ABC):

backend/api/views/upload/parsers.py → backend/data_import/pipeline/parsers.py

@ -10,7 +10,7 @@ import pyexcel.exceptions
from chardet import UniversalDetector
from seqeval.scheme import BILOU, IOB2, IOBES, IOE2, Tokens
from .exception import FileParseException
from .exceptions import FileParseException
from .readers import DEFAULT_LABEL_COLUMN, DEFAULT_TEXT_COLUMN, Parser
DEFAULT_ENCODING = 'Auto'

backend/api/views/upload/readers.py → backend/data_import/pipeline/readers.py

@ -4,8 +4,8 @@ from typing import Any, Dict, Iterator, List, Type
from .cleaners import Cleaner
from .data import BaseData
from .exception import FileParseException
from .label import Label
from .exceptions import FileParseException
from .labels import Label
DEFAULT_TEXT_COLUMN = 'text'
DEFAULT_LABEL_COLUMN = 'label'

backend/api/views/upload/writers.py → backend/data_import/pipeline/writers.py

@ -5,8 +5,8 @@ from typing import Any, Dict, List
from django.conf import settings
from ...models import CategoryType, Example, Project, SpanType
from .exception import FileParseException
from api.models import CategoryType, Example, Project, SpanType
from .exceptions import FileParseException
from .readers import BaseReader

0
backend/data_import/tests/__init__.py

backend/api/tests/data/example.txt → backend/data_import/tests/data/example.txt

backend/api/tests/data/example.utf16.csv → backend/data_import/tests/data/example.utf16.csv

backend/api/tests/data/intent/example.jsonl → backend/data_import/tests/data/intent/example.jsonl

backend/api/tests/data/label/invalid_labels.json → backend/data_import/tests/data/label/invalid_labels.json

backend/api/tests/data/label/valid_labels.json → backend/data_import/tests/data/label/valid_labels.json

backend/api/tests/data/seq2seq/example.csv → backend/data_import/tests/data/seq2seq/example.csv

backend/api/tests/data/seq2seq/example.json → backend/data_import/tests/data/seq2seq/example.json

backend/api/tests/data/seq2seq/example.jsonl → backend/data_import/tests/data/seq2seq/example.jsonl

backend/api/tests/data/sequence_labeling/example.conll → backend/data_import/tests/data/sequence_labeling/example.conll

backend/api/tests/data/sequence_labeling/example.jsonl → backend/data_import/tests/data/sequence_labeling/example.jsonl

backend/api/tests/data/sequence_labeling/example_overlapping.jsonl → backend/data_import/tests/data/sequence_labeling/example_overlapping.jsonl

backend/api/tests/data/sequence_labeling/labeling.invalid.conll → backend/data_import/tests/data/sequence_labeling/labeling.invalid.conll

backend/api/tests/data/sequence_labeling/labeling.trailing.conll → backend/data_import/tests/data/sequence_labeling/labeling.trailing.conll

backend/api/tests/data/text_classification/example.csv → backend/data_import/tests/data/text_classification/example.csv

backend/api/tests/data/text_classification/example.invalid.2.csv → backend/data_import/tests/data/text_classification/example.invalid.2.csv

backend/api/tests/data/text_classification/example.invalid.2.xlsx → backend/data_import/tests/data/text_classification/example.invalid.2.xlsx

backend/api/tests/data/text_classification/example.json → backend/data_import/tests/data/text_classification/example.json

backend/api/tests/data/text_classification/example.jsonl → backend/data_import/tests/data/text_classification/example.jsonl

backend/api/tests/data/text_classification/example.xlsx → backend/data_import/tests/data/text_classification/example.xlsx

backend/api/tests/data/text_classification/example_fasttext.txt → backend/data_import/tests/data/text_classification/example_fasttext.txt

backend/api/tests/data/text_classification/example_one_column_no_header.xlsx → backend/data_import/tests/data/text_classification/example_one_column_no_header.xlsx

backend/api/tests/data/text_classification/example_out_of_order_columns.csv → backend/data_import/tests/data/text_classification/example_out_of_order_columns.csv

backend/api/tests/upload/test_builder.py → backend/data_import/tests/test_builder.py

@ -1,10 +1,10 @@
import unittest
from typing import List
from ...views.upload import builders
from ...views.upload.data import TextData
from ...views.upload.exception import FileParseException
from ...views.upload.label import CategoryLabel, SpanLabel
from data_import.pipeline import builders
from data_import.pipeline.data import TextData
from data_import.pipeline.exceptions import FileParseException
from data_import.pipeline.labels import CategoryLabel, SpanLabel
class TestColumnBuilder(unittest.TestCase):

backend/api/tests/upload/test_parser.py → backend/data_import/tests/test_parser.py

@ -4,7 +4,7 @@ import shutil
import tempfile
import unittest
from ...views.upload import parsers
from data_import.pipeline import parsers
class TestParser(unittest.TestCase):

backend/api/tests/test_tasks.py → backend/data_import/tests/test_tasks.py

@ -2,15 +2,15 @@ import pathlib
from django.test import TestCase
from ..celery_tasks import ingest_data
from ..models import (DOCUMENT_CLASSIFICATION,
INTENT_DETECTION_AND_SLOT_FILLING, SEQ2SEQ,
SEQUENCE_LABELING, Category, CategoryType, Example, Span,
SpanType)
from .api.utils import prepare_project
from data_import.celery_tasks import import_dataset
from api.models import (DOCUMENT_CLASSIFICATION,
INTENT_DETECTION_AND_SLOT_FILLING, SEQ2SEQ,
SEQUENCE_LABELING, Category, CategoryType, Example, Span,
SpanType)
from api.tests.api.utils import prepare_project
class TestIngestData(TestCase):
class TestImportData(TestCase):
task = 'Any'
annotation_class = Category
@ -19,13 +19,13 @@ class TestIngestData(TestCase):
self.user = self.project.users[0]
self.data_path = pathlib.Path(__file__).parent / 'data'
def ingest_data(self, filename, file_format, kwargs=None):
def import_dataset(self, filename, file_format, kwargs=None):
filenames = [str(self.data_path / filename)]
kwargs = kwargs or {}
return ingest_data(self.user.id, self.project.item.id, filenames, file_format, **kwargs)
return import_dataset(self.user.id, self.project.item.id, filenames, file_format, **kwargs)
class TestIngestClassificationData(TestIngestData):
class TestImportClassificationData(TestImportData):
task = DOCUMENT_CLASSIFICATION
def assert_examples(self, dataset):
@ -50,7 +50,7 @@ class TestIngestClassificationData(TestIngestData):
('exampleB', ['positive', 'negative']),
('exampleC', [])
]
self.ingest_data(filename, file_format, kwargs)
self.import_dataset(filename, file_format, kwargs)
self.assert_examples(dataset)
def test_csv(self):
@ -60,7 +60,7 @@ class TestIngestClassificationData(TestIngestData):
('exampleA', ['positive']),
('exampleB', [])
]
self.ingest_data(filename, file_format)
self.import_dataset(filename, file_format)
self.assert_examples(dataset)
def test_csv_out_of_order_columns(self):
@ -70,7 +70,7 @@ class TestIngestClassificationData(TestIngestData):
('exampleA', ['positive']),
('exampleB', [])
]
self.ingest_data(filename, file_format)
self.import_dataset(filename, file_format)
self.assert_examples(dataset)
def test_fasttext(self):
@ -81,7 +81,7 @@ class TestIngestClassificationData(TestIngestData):
('exampleB', ['positive', 'negative']),
('exampleC', [])
]
self.ingest_data(filename, file_format)
self.import_dataset(filename, file_format)
self.assert_examples(dataset)
def test_excel(self):
@ -91,7 +91,7 @@ class TestIngestClassificationData(TestIngestData):
('exampleA', ['positive']),
('exampleB', [])
]
self.ingest_data(filename, file_format)
self.import_dataset(filename, file_format)
self.assert_examples(dataset)
def test_json(self):
@ -102,7 +102,7 @@ class TestIngestClassificationData(TestIngestData):
('exampleB', ['positive', 'negative']),
('exampleC', [])
]
self.ingest_data(filename, file_format)
self.import_dataset(filename, file_format)
self.assert_examples(dataset)
def test_textfile(self):
@ -111,7 +111,7 @@ class TestIngestClassificationData(TestIngestData):
dataset = [
('exampleA\nexampleB\n\nexampleC\n', [])
]
self.ingest_data(filename, file_format)
self.import_dataset(filename, file_format)
self.assert_examples(dataset)
def test_textline(self):
@ -122,35 +122,35 @@ class TestIngestClassificationData(TestIngestData):
('exampleB', []),
('exampleC', [])
]
self.ingest_data(filename, file_format)
self.import_dataset(filename, file_format)
self.assert_examples(dataset)
def test_wrong_jsonl(self):
filename = 'text_classification/example.json'
file_format = 'JSONL'
response = self.ingest_data(filename, file_format)
response = self.import_dataset(filename, file_format)
self.assert_parse_error(response)
def test_wrong_json(self):
filename = 'text_classification/example.jsonl'
file_format = 'JSON'
response = self.ingest_data(filename, file_format)
response = self.import_dataset(filename, file_format)
self.assert_parse_error(response)
def test_wrong_excel(self):
filename = 'text_classification/example.jsonl'
file_format = 'Excel'
response = self.ingest_data(filename, file_format)
response = self.import_dataset(filename, file_format)
self.assert_parse_error(response)
def test_wrong_csv(self):
filename = 'text_classification/example.jsonl'
file_format = 'CSV'
response = self.ingest_data(filename, file_format)
response = self.import_dataset(filename, file_format)
self.assert_parse_error(response)
class TestIngestSequenceLabelingData(TestIngestData):
class TestImportSequenceLabelingData(TestImportData):
task = SEQUENCE_LABELING
def assert_examples(self, dataset):
@ -173,7 +173,7 @@ class TestIngestSequenceLabelingData(TestIngestData):
('exampleA', [[0, 1, 'LOC']]),
('exampleB', [])
]
self.ingest_data(filename, file_format)
self.import_dataset(filename, file_format)
self.assert_examples(dataset)
def test_conll(self):
@ -183,23 +183,23 @@ class TestIngestSequenceLabelingData(TestIngestData):
('JAPAN GET', [[0, 5, 'LOC']]),
('Nadim Ladki', [[0, 11, 'PER']])
]
self.ingest_data(filename, file_format)
self.import_dataset(filename, file_format)
self.assert_examples(dataset)
def test_wrong_conll(self):
filename = 'sequence_labeling/example.jsonl'
file_format = 'CoNLL'
response = self.ingest_data(filename, file_format)
response = self.import_dataset(filename, file_format)
self.assert_parse_error(response)
def test_jsonl_with_overlapping(self):
filename = 'sequence_labeling/example_overlapping.jsonl'
file_format = 'JSONL'
response = self.ingest_data(filename, file_format)
response = self.import_dataset(filename, file_format)
self.assertEqual(len(response['error']), 1)
class TestIngestSeq2seqData(TestIngestData):
class TestImportSeq2seqData(TestImportData):
task = SEQ2SEQ
def assert_examples(self, dataset):
@ -216,7 +216,7 @@ class TestIngestSeq2seqData(TestIngestData):
('exampleA', ['label1']),
('exampleB', [])
]
self.ingest_data(filename, file_format)
self.import_dataset(filename, file_format)
self.assert_examples(dataset)
def test_json(self):
@ -226,7 +226,7 @@ class TestIngestSeq2seqData(TestIngestData):
('exampleA', ['label1']),
('exampleB', [])
]
self.ingest_data(filename, file_format)
self.import_dataset(filename, file_format)
self.assert_examples(dataset)
def test_csv(self):
@ -236,11 +236,11 @@ class TestIngestSeq2seqData(TestIngestData):
('exampleA', ['label1']),
('exampleB', [])
]
self.ingest_data(filename, file_format)
self.import_dataset(filename, file_format)
self.assert_examples(dataset)
class TextIngestIntentDetectionAndSlotFillingData(TestIngestData):
class TextImportIntentDetectionAndSlotFillingData(TestImportData):
task = INTENT_DETECTION_AND_SLOT_FILLING
def assert_examples(self, dataset):
@ -261,5 +261,5 @@ class TextIngestIntentDetectionAndSlotFillingData(TestIngestData):
('exampleC', {'cats': [], 'entities': [(0, 1, 'LOC')]}),
('exampleD', {'cats': [], 'entities': []}),
]
self.ingest_data(filename, file_format)
self.import_dataset(filename, file_format)
self.assert_examples(dataset)

backend/api/tests/api/test_upload.py → backend/data_import/tests/test_views.py

@ -1,23 +1,8 @@
from django.test import override_settings
from rest_framework import status
from rest_framework.reverse import reverse
from ...models import DOCUMENT_CLASSIFICATION
from .utils import CRUDMixin, create_default_roles, make_user, prepare_project
class TestFeatures(CRUDMixin):
@classmethod
def setUpTestData(cls):
create_default_roles()
cls.user = make_user()
cls.url = reverse('features')
@override_settings(CLOUD_BROWSER_APACHE_LIBCLOUD_PROVIDER=None)
def test_no_cloud_upload(self):
response = self.assert_fetch(self.user, status.HTTP_200_OK)
self.assertFalse(response.json().get('cloud_upload'))
from api.models import DOCUMENT_CLASSIFICATION
from api.tests.api.utils import CRUDMixin, prepare_project
class TestImportCatalog(CRUDMixin):

17
backend/data_import/urls.py

@ -0,0 +1,17 @@
from django.urls import include, path
from .views import DatasetImportAPI, DatasetCatalog
# Routes of the data_import app: the filepond upload endpoints plus the
# per-project dataset import and format catalog views.
urlpatterns = [
    path('fp/', include('django_drf_filepond.urls')),
    path('projects/<int:project_id>/upload', DatasetImportAPI.as_view(), name='upload'),
    path('projects/<int:project_id>/catalog', DatasetCatalog.as_view(), name='catalog'),
]

backend/api/views/import_dataset.py → backend/data_import/views.py

@ -8,11 +8,11 @@ from rest_framework.permissions import IsAuthenticated
from rest_framework.response import Response
from rest_framework.views import APIView
from members.permissions import IsProjectAdmin
from ..celery_tasks import ingest_data
from ..models import Project
from .upload.catalog import Options
from api.models import Project
from members.permissions import IsProjectAdmin
from .celery_tasks import import_dataset
from .pipeline.catalog import Options
class DatasetCatalog(APIView):
@ -25,13 +25,13 @@ class DatasetCatalog(APIView):
return Response(data=options, status=status.HTTP_200_OK)
class UploadAPI(APIView):
class DatasetImportAPI(APIView):
permission_classes = [IsAuthenticated & IsProjectAdmin]
def post(self, request, *args, **kwargs):
project_id = self.kwargs['project_id']
upload_ids = request.data.pop('uploadIds')
format = request.data.pop('format')
file_format = request.data.pop('format')
tus = [TemporaryUpload.objects.get(upload_id=upload_id) for upload_id in upload_ids]
sus = [
@ -42,11 +42,11 @@ class UploadAPI(APIView):
for tu in tus
]
filenames = [su.file.path for su in sus]
task = ingest_data.delay(
task = import_dataset.delay(
user_id=request.user.id,
project_id=project_id,
filenames=filenames,
format=format,
file_format=file_format,
**request.data
)
return Response({'task_id': task.task_id})

4
frontend/i18n/en/projects/dataset.js

@ -7,11 +7,11 @@ export default {
metadata: 'Metadata',
action: 'Action',
annotate: 'Annotate',
importDataTitle: 'Upload Data',
importDataTitle: 'Import Dataset',
importDataMessage1: 'Select a file format',
importDataMessage2: 'Select file(s)',
importDataPlaceholder: 'File input',
exportDataTitle: 'Export Data',
exportDataTitle: 'Export Dataset',
exportDataMessage: 'Select a file format',
exportDataMessage2: 'Select a file name',
deleteDocumentsTitle: 'Delete Document',

11
frontend/pages/projects/_id/upload/index.vue

@ -4,7 +4,7 @@
{{ $t('dataset.importDataTitle') }}
</v-card-title>
<v-card-text>
<v-overlay :value="taskId">
<v-overlay :value="isImporting">
<v-progress-circular
indeterminate
size="64"
@ -85,9 +85,9 @@
<v-btn
class='text-capitalize me-2 primary'
:disabled="isDisabled"
@click="injest"
@click="importDataset"
>
Ingest
Import
</v-btn>
</v-card-actions>
</v-card>
@ -143,6 +143,7 @@ export default {
},
uploadedFiles: [],
valid: false,
isImporting: false,
}
},
@ -232,7 +233,8 @@ export default {
this.$nextTick()
}
},
async injest() {
async importDataset() {
this.isImporting = true
this.taskId = await this.$services.parse.analyze(
this.$route.params.id,
this.selected,
@ -249,6 +251,7 @@ export default {
this.errors = res.result.error
this.myFiles = []
this.uploadedFiles = []
this.isImporting = false
}
}
}, 3000)

Loading…
Cancel
Save