Browse Source

Move data import's code and data to data_import app

pull/1640/head
Hironsan 2 years ago
parent
commit
fd7bc43947
53 changed files with 97 additions and 79 deletions
  1. 2
      Pipfile
  2. 19
      backend/api/celery_tasks.py
  3. 2
      backend/api/tests/api/utils.py
  4. 15
      backend/api/urls.py
  5. 1
      backend/app/settings.py
  6. 1
      backend/app/urls.py
  7. 0
      backend/data_import/__init__.py
  8. 0
      backend/data_import/admin.py
  9. 6
      backend/data_import/apps.py
  10. 23
      backend/data_import/celery_tasks.py
  11. 0
      backend/data_import/migrations/__init__.py
  12. 0
      backend/data_import/models.py
  13. 0
      backend/data_import/pipeline/__init__.py
  14. 4
      backend/data_import/pipeline/builders.py
  15. 6
      backend/data_import/pipeline/catalog.py
  16. 4
      backend/data_import/pipeline/cleaners.py
  17. 2
      backend/data_import/pipeline/data.py
  18. 0
      backend/data_import/pipeline/examples.py
  19. 0
      backend/data_import/pipeline/exceptions.py
  20. 22
      backend/data_import/pipeline/factories.py
  21. 8
      backend/data_import/pipeline/labels.py
  22. 2
      backend/data_import/pipeline/parsers.py
  23. 4
      backend/data_import/pipeline/readers.py
  24. 4
      backend/data_import/pipeline/writers.py
  25. 0
      backend/data_import/tests/__init__.py
  26. 0
      backend/data_import/tests/data/example.txt
  27. 0
      backend/data_import/tests/data/example.utf16.csv
  28. 0
      backend/data_import/tests/data/intent/example.jsonl
  29. 0
      backend/data_import/tests/data/label/invalid_labels.json
  30. 0
      backend/data_import/tests/data/label/valid_labels.json
  31. 0
      backend/data_import/tests/data/seq2seq/example.csv
  32. 0
      backend/data_import/tests/data/seq2seq/example.json
  33. 0
      backend/data_import/tests/data/seq2seq/example.jsonl
  34. 0
      backend/data_import/tests/data/sequence_labeling/example.conll
  35. 0
      backend/data_import/tests/data/sequence_labeling/example.jsonl
  36. 0
      backend/data_import/tests/data/sequence_labeling/example_overlapping.jsonl
  37. 0
      backend/data_import/tests/data/sequence_labeling/labeling.invalid.conll
  38. 0
      backend/data_import/tests/data/sequence_labeling/labeling.trailing.conll
  39. 0
      backend/data_import/tests/data/text_classification/example.csv
  40. 0
      backend/data_import/tests/data/text_classification/example.invalid.2.csv
  41. 0
      backend/data_import/tests/data/text_classification/example.invalid.2.xlsx
  42. 0
      backend/data_import/tests/data/text_classification/example.json
  43. 0
      backend/data_import/tests/data/text_classification/example.jsonl
  44. 0
      backend/data_import/tests/data/text_classification/example.xlsx
  45. 0
      backend/data_import/tests/data/text_classification/example_fasttext.txt
  46. 0
      backend/data_import/tests/data/text_classification/example_one_column_no_header.xlsx
  47. 0
      backend/data_import/tests/data/text_classification/example_out_of_order_columns.csv
  48. 8
      backend/data_import/tests/test_builder.py
  49. 2
      backend/data_import/tests/test_parser.py
  50. 12
      backend/data_import/tests/test_tasks.py
  51. 4
      backend/data_import/tests/test_views.py
  52. 17
      backend/data_import/urls.py
  53. 8
      backend/data_import/views.py

2
Pipfile

@ -60,6 +60,6 @@ python_version = "3.8"
isort = "isort api -c --skip migrations"
flake8 = "flake8 --filename \"*.py\" --extend-exclude \"server,api/migrations,api/views/__init__.py,authentification,api/apps.py\""
wait_for_db = "python manage.py wait_for_db"
test = "python manage.py test api.tests roles.tests members.tests metrics.tests users.tests"
test = "python manage.py test api.tests roles.tests members.tests metrics.tests users.tests data_import.tests"
migrate = "python manage.py migrate"
collectstatic = "python manage.py collectstatic --noinput"

19
backend/api/celery_tasks.py

@ -1,34 +1,15 @@
from celery import shared_task
from celery.utils.log import get_task_logger
from django.conf import settings
from django.contrib.auth import get_user_model
from django.shortcuts import get_object_or_404
from .models import Project
from .views.download.factory import create_repository, create_writer
from .views.download.service import ExportApplicationService
from .views.upload.factories import (create_bulder, create_cleaner,
create_parser)
from .views.upload.readers import Reader
from .views.upload.writers import BulkWriter
logger = get_task_logger(__name__)
@shared_task
def ingest_data(user_id, project_id, filenames, format: str, **kwargs):
    """Import the given files into a project as a Celery task.

    Looks up the project and the user by primary key, builds a reader from
    the parser and builder factories, and writes the result in batches of
    ``settings.IMPORT_BATCH_SIZE``.

    Args:
        user_id: Primary key of the user performing the import.
        project_id: Primary key of the target project.
        filenames: Files handed to the reader.
        format: Format name passed to ``create_parser``.
        **kwargs: Extra options forwarded to ``create_parser`` and
            ``create_bulder``.

    Returns:
        A dict with a single ``'error'`` key holding ``writer.errors``.
    """
    # Resolve both objects up front; get_object_or_404 raises if either is missing.
    project = get_object_or_404(Project, pk=project_id)
    user = get_object_or_404(get_user_model(), pk=user_id)

    reader = Reader(
        filenames=filenames,
        parser=create_parser(format, **kwargs),
        builder=create_bulder(project, **kwargs),
    )
    cleaner = create_cleaner(project)

    writer = BulkWriter(batch_size=settings.IMPORT_BATCH_SIZE)
    writer.save(reader, project, user, cleaner)
    return {'error': writer.errors}
@shared_task
def export_dataset(project_id, format: str, export_approved=False):
project = get_object_or_404(Project, pk=project_id)

2
backend/api/tests/api/utils.py

@ -15,7 +15,7 @@ from ...models import (DOCUMENT_CLASSIFICATION, IMAGE_CLASSIFICATION,
INTENT_DETECTION_AND_SLOT_FILLING, SEQ2SEQ,
SEQUENCE_LABELING, SPEECH2TEXT)
DATA_DIR = os.path.join(os.path.dirname(__file__), '../data')
DATA_DIR = os.path.join(os.path.dirname(__file__), '../../../data_import/tests/data')
ProjectData = namedtuple('ProjectData', ['item', 'users'])

15
backend/api/urls.py

@ -1,21 +1,11 @@
from django.urls import include, path
from .views import (annotation, auto_labeling, comment, example, example_state,
export_dataset, health, import_dataset, import_export,
label, project, tag, task)
export_dataset, health, import_export, label, project, tag,
task)
from .views.tasks import category, relation, span, text
urlpatterns_project = [
path(
route='upload',
view=import_dataset.UploadAPI.as_view(),
name='upload'
),
path(
route='catalog',
view=import_dataset.DatasetCatalog.as_view(),
name='catalog'
),
path(
route='download-format',
view=export_dataset.DownloadDatasetCatalog.as_view(),
@ -214,7 +204,6 @@ urlpatterns = [
view=health.Health.as_view(),
name='health'
),
path('fp/', include('django_drf_filepond.urls')),
path(
route='features',
view=import_export.Features.as_view(),

1
backend/app/settings.py

@ -56,6 +56,7 @@ INSTALLED_APPS = [
'members.apps.MembersConfig',
'metrics.apps.MetricsConfig',
'users.apps.UsersConfig',
'data_import.apps.DataImportConfig',
'rest_framework',
'rest_framework.authtoken',
'django_filters',

1
backend/app/urls.py

@ -43,6 +43,7 @@ urlpatterns += [
path('v1/', include('api.urls')),
path('v1/', include('roles.urls')),
path('v1/', include('users.urls')),
path('v1/', include('data_import.urls')),
path('v1/projects/<int:project_id>/', include('members.urls')),
path('v1/projects/<int:project_id>/metrics/', include('metrics.urls')),
path('swagger/', schema_view.with_ui('swagger', cache_timeout=0), name='schema-swagger-ui'),

backend/api/tests/upload/__init__.py → backend/data_import/__init__.py

backend/api/views/upload/__init__.py → backend/data_import/admin.py

6
backend/data_import/apps.py

@ -0,0 +1,6 @@
from django.apps import AppConfig
class DataImportConfig(AppConfig):
    """Django application configuration for the ``data_import`` app."""

    # Dotted path of the application this config belongs to.
    name = 'data_import'
    # Field class named as the default for auto-created primary keys.
    default_auto_field = 'django.db.models.BigAutoField'

23
backend/data_import/celery_tasks.py

@ -0,0 +1,23 @@
from celery import shared_task
from django.conf import settings
from django.contrib.auth import get_user_model
from django.shortcuts import get_object_or_404
from api.models import Project
from .pipeline.factories import create_parser, create_bulder, create_cleaner
from .pipeline.readers import Reader
from .pipeline.writers import BulkWriter
@shared_task
def ingest_data(user_id, project_id, filenames, format: str, **kwargs):
    """Import the given files into a project as a Celery task.

    Looks up the project and the user by primary key, builds a reader from
    the parser and builder factories, and writes the result in batches of
    ``settings.IMPORT_BATCH_SIZE``.

    Args:
        user_id: Primary key of the user performing the import.
        project_id: Primary key of the target project.
        filenames: Files handed to the reader.
        format: Format name passed to ``create_parser``.
        **kwargs: Extra options forwarded to ``create_parser`` and
            ``create_bulder``.

    Returns:
        A dict with a single ``'error'`` key holding ``writer.errors``.
    """
    # Resolve both objects up front; get_object_or_404 raises if either is missing.
    project = get_object_or_404(Project, pk=project_id)
    user = get_object_or_404(get_user_model(), pk=user_id)

    reader = Reader(
        filenames=filenames,
        parser=create_parser(format, **kwargs),
        builder=create_bulder(project, **kwargs),
    )
    cleaner = create_cleaner(project)

    writer = BulkWriter(batch_size=settings.IMPORT_BATCH_SIZE)
    writer.save(reader, project, user, cleaner)
    return {'error': writer.errors}

0
backend/data_import/migrations/__init__.py

0
backend/data_import/models.py

0
backend/data_import/pipeline/__init__.py

backend/api/views/upload/builders.py → backend/data_import/pipeline/builders.py

@ -5,8 +5,8 @@ from typing import Any, Dict, List, Optional, Type, TypeVar
from pydantic import ValidationError
from .data import BaseData
from .exception import FileParseException
from .label import Label
from .exceptions import FileParseException
from .labels import Label
from .readers import Builder, Record
logger = getLogger(__name__)

backend/api/views/upload/catalog.py → backend/data_import/pipeline/catalog.py

@ -4,9 +4,9 @@ from typing import Dict, List, Type
from pydantic import BaseModel
from typing_extensions import Literal
from ...models import (DOCUMENT_CLASSIFICATION, IMAGE_CLASSIFICATION,
INTENT_DETECTION_AND_SLOT_FILLING, SEQ2SEQ,
SEQUENCE_LABELING, SPEECH2TEXT)
from api.models import (DOCUMENT_CLASSIFICATION, IMAGE_CLASSIFICATION,
INTENT_DETECTION_AND_SLOT_FILLING, SEQ2SEQ,
SEQUENCE_LABELING, SPEECH2TEXT)
from . import examples
encodings = Literal[

backend/api/views/upload/cleaners.py → backend/data_import/pipeline/cleaners.py

@ -1,7 +1,7 @@
from typing import List
from ...models import Project
from .label import CategoryLabel, Label, SpanLabel
from api.models import Project
from .labels import CategoryLabel, Label, SpanLabel
class Cleaner:

backend/api/views/upload/data.py → backend/data_import/pipeline/data.py

@ -4,7 +4,7 @@ from typing import Any, Dict
from pydantic import BaseModel, validator
from ...models import Example, Project
from api.models import Example, Project
class BaseData(BaseModel, abc.ABC):

backend/api/views/upload/examples.py → backend/data_import/pipeline/examples.py

backend/api/views/upload/exception.py → backend/data_import/pipeline/exceptions.py

backend/api/views/upload/factories.py → backend/data_import/pipeline/factories.py

@ -1,7 +1,7 @@
from ...models import (DOCUMENT_CLASSIFICATION, IMAGE_CLASSIFICATION,
INTENT_DETECTION_AND_SLOT_FILLING, SEQ2SEQ,
SEQUENCE_LABELING, SPEECH2TEXT)
from . import builders, catalog, cleaners, data, label, parsers, readers
from api.models import (DOCUMENT_CLASSIFICATION, IMAGE_CLASSIFICATION,
INTENT_DETECTION_AND_SLOT_FILLING, SEQ2SEQ,
SEQUENCE_LABELING, SPEECH2TEXT)
from . import builders, catalog, cleaners, data, labels, parsers, readers
def get_data_class(project_type: str):
@ -37,11 +37,11 @@ def create_parser(file_format: str, **kwargs):
def get_label_class(project_type: str):
mapping = {
DOCUMENT_CLASSIFICATION: label.CategoryLabel,
SEQUENCE_LABELING: label.SpanLabel,
SEQ2SEQ: label.TextLabel,
IMAGE_CLASSIFICATION: label.CategoryLabel,
SPEECH2TEXT: label.TextLabel,
DOCUMENT_CLASSIFICATION: labels.CategoryLabel,
SEQUENCE_LABELING: labels.SpanLabel,
SEQ2SEQ: labels.TextLabel,
IMAGE_CLASSIFICATION: labels.CategoryLabel,
SPEECH2TEXT: labels.TextLabel,
}
if project_type not in mapping:
ValueError(f'Invalid project type: {project_type}')
@ -71,11 +71,11 @@ def create_bulder(project, **kwargs):
label_columns = [
builders.LabelColumn(
name='cats',
value_class=label.CategoryLabel
value_class=labels.CategoryLabel
),
builders.LabelColumn(
name='entities',
value_class=label.SpanLabel
value_class=labels.SpanLabel
)
]
else:

backend/api/views/upload/label.py → backend/data_import/pipeline/labels.py

@ -3,10 +3,10 @@ from typing import Any, Dict, Optional, Union
from pydantic import BaseModel, validator
from ...models import Category, CategoryType
from ...models import Label as LabelModel
from ...models import Project, Span, SpanType
from ...models import TextLabel as TL
from api.models import Category, CategoryType
from api.models import Label as LabelModel
from api.models import Project, Span, SpanType
from api.models import TextLabel as TL
class Label(BaseModel, abc.ABC):

backend/api/views/upload/parsers.py → backend/data_import/pipeline/parsers.py

@ -10,7 +10,7 @@ import pyexcel.exceptions
from chardet import UniversalDetector
from seqeval.scheme import BILOU, IOB2, IOBES, IOE2, Tokens
from .exception import FileParseException
from .exceptions import FileParseException
from .readers import DEFAULT_LABEL_COLUMN, DEFAULT_TEXT_COLUMN, Parser
DEFAULT_ENCODING = 'Auto'

backend/api/views/upload/readers.py → backend/data_import/pipeline/readers.py

@ -4,8 +4,8 @@ from typing import Any, Dict, Iterator, List, Type
from .cleaners import Cleaner
from .data import BaseData
from .exception import FileParseException
from .label import Label
from .exceptions import FileParseException
from .labels import Label
DEFAULT_TEXT_COLUMN = 'text'
DEFAULT_LABEL_COLUMN = 'label'

backend/api/views/upload/writers.py → backend/data_import/pipeline/writers.py

@ -5,8 +5,8 @@ from typing import Any, Dict, List
from django.conf import settings
from ...models import CategoryType, Example, Project, SpanType
from .exception import FileParseException
from api.models import CategoryType, Example, Project, SpanType
from .exceptions import FileParseException
from .readers import BaseReader

0
backend/data_import/tests/__init__.py

backend/api/tests/data/example.txt → backend/data_import/tests/data/example.txt

backend/api/tests/data/example.utf16.csv → backend/data_import/tests/data/example.utf16.csv

backend/api/tests/data/intent/example.jsonl → backend/data_import/tests/data/intent/example.jsonl

backend/api/tests/data/label/invalid_labels.json → backend/data_import/tests/data/label/invalid_labels.json

backend/api/tests/data/label/valid_labels.json → backend/data_import/tests/data/label/valid_labels.json

backend/api/tests/data/seq2seq/example.csv → backend/data_import/tests/data/seq2seq/example.csv

backend/api/tests/data/seq2seq/example.json → backend/data_import/tests/data/seq2seq/example.json

backend/api/tests/data/seq2seq/example.jsonl → backend/data_import/tests/data/seq2seq/example.jsonl

backend/api/tests/data/sequence_labeling/example.conll → backend/data_import/tests/data/sequence_labeling/example.conll

backend/api/tests/data/sequence_labeling/example.jsonl → backend/data_import/tests/data/sequence_labeling/example.jsonl

backend/api/tests/data/sequence_labeling/example_overlapping.jsonl → backend/data_import/tests/data/sequence_labeling/example_overlapping.jsonl

backend/api/tests/data/sequence_labeling/labeling.invalid.conll → backend/data_import/tests/data/sequence_labeling/labeling.invalid.conll

backend/api/tests/data/sequence_labeling/labeling.trailing.conll → backend/data_import/tests/data/sequence_labeling/labeling.trailing.conll

backend/api/tests/data/text_classification/example.csv → backend/data_import/tests/data/text_classification/example.csv

backend/api/tests/data/text_classification/example.invalid.2.csv → backend/data_import/tests/data/text_classification/example.invalid.2.csv

backend/api/tests/data/text_classification/example.invalid.2.xlsx → backend/data_import/tests/data/text_classification/example.invalid.2.xlsx

backend/api/tests/data/text_classification/example.json → backend/data_import/tests/data/text_classification/example.json

backend/api/tests/data/text_classification/example.jsonl → backend/data_import/tests/data/text_classification/example.jsonl

backend/api/tests/data/text_classification/example.xlsx → backend/data_import/tests/data/text_classification/example.xlsx

backend/api/tests/data/text_classification/example_fasttext.txt → backend/data_import/tests/data/text_classification/example_fasttext.txt

backend/api/tests/data/text_classification/example_one_column_no_header.xlsx → backend/data_import/tests/data/text_classification/example_one_column_no_header.xlsx

backend/api/tests/data/text_classification/example_out_of_order_columns.csv → backend/data_import/tests/data/text_classification/example_out_of_order_columns.csv

backend/api/tests/upload/test_builder.py → backend/data_import/tests/test_builder.py

@ -1,10 +1,10 @@
import unittest
from typing import List
from ...views.upload import builders
from ...views.upload.data import TextData
from ...views.upload.exception import FileParseException
from ...views.upload.label import CategoryLabel, SpanLabel
from data_import.pipeline import builders
from data_import.pipeline.data import TextData
from data_import.pipeline.exceptions import FileParseException
from data_import.pipeline.labels import CategoryLabel, SpanLabel
class TestColumnBuilder(unittest.TestCase):

backend/api/tests/upload/test_parser.py → backend/data_import/tests/test_parser.py

@ -4,7 +4,7 @@ import shutil
import tempfile
import unittest
from ...views.upload import parsers
from data_import.pipeline import parsers
class TestParser(unittest.TestCase):

backend/api/tests/test_tasks.py → backend/data_import/tests/test_tasks.py

@ -2,12 +2,12 @@ import pathlib
from django.test import TestCase
from ..celery_tasks import ingest_data
from ..models import (DOCUMENT_CLASSIFICATION,
INTENT_DETECTION_AND_SLOT_FILLING, SEQ2SEQ,
SEQUENCE_LABELING, Category, CategoryType, Example, Span,
SpanType)
from .api.utils import prepare_project
from data_import.celery_tasks import ingest_data
from api.models import (DOCUMENT_CLASSIFICATION,
INTENT_DETECTION_AND_SLOT_FILLING, SEQ2SEQ,
SEQUENCE_LABELING, Category, CategoryType, Example, Span,
SpanType)
from api.tests.api.utils import prepare_project
class TestIngestData(TestCase):

backend/api/tests/api/test_upload.py → backend/data_import/tests/test_views.py

@ -2,8 +2,8 @@ from django.test import override_settings
from rest_framework import status
from rest_framework.reverse import reverse
from ...models import DOCUMENT_CLASSIFICATION
from .utils import CRUDMixin, create_default_roles, make_user, prepare_project
from api.models import DOCUMENT_CLASSIFICATION
from api.tests.api.utils import CRUDMixin, create_default_roles, make_user, prepare_project
class TestFeatures(CRUDMixin):

17
backend/data_import/urls.py

@ -0,0 +1,17 @@
from django.urls import include, path
from .views import UploadAPI, DatasetCatalog
# Routes owned by the data_import app: the django_drf_filepond URLs mounted
# under ``fp/`` and the per-project upload and catalog views.
urlpatterns = [
    path('fp/', include('django_drf_filepond.urls')),
    path(
        'projects/<int:project_id>/upload',
        UploadAPI.as_view(),
        name='upload',
    ),
    path(
        'projects/<int:project_id>/catalog',
        DatasetCatalog.as_view(),
        name='catalog',
    ),
]

backend/api/views/import_dataset.py → backend/data_import/views.py

@ -8,11 +8,11 @@ from rest_framework.permissions import IsAuthenticated
from rest_framework.response import Response
from rest_framework.views import APIView
from members.permissions import IsProjectAdmin
from ..celery_tasks import ingest_data
from ..models import Project
from .upload.catalog import Options
from api.models import Project
from members.permissions import IsProjectAdmin
from .celery_tasks import ingest_data
from .pipeline.catalog import Options
class DatasetCatalog(APIView):
Loading…
Cancel
Save