mirror of https://github.com/doccano/doccano.git
10 changed files with 264 additions and 36 deletions
- 64 backend/data_export/celery_tasks.py
- 4 backend/data_export/pipeline/catalog.py
- 20 backend/data_export/pipeline/dataset.py
- 43 backend/data_export/pipeline/factories.py
- 9 backend/data_export/pipeline/formatters.py
- 8 backend/data_export/pipeline/labels.py
- 19 backend/data_export/pipeline/services.py
- 12 backend/data_export/pipeline/writers.py
- 117 backend/data_export/tests/test_task.py
- 4 backend/projects/tests/utils.py
backend/data_export/celery_tasks.py
@@ -1,20 +1,70 @@
+import os
+
 from celery import shared_task
 from celery.utils.log import get_task_logger
 from django.conf import settings
 from django.shortcuts import get_object_or_404

-from .pipeline.factories import create_repository, create_writer
+from .pipeline.dataset import Dataset, filter_examples
+from .pipeline.factories import (
+    create_formatter,
+    create_labels,
+    create_writer,
+    select_label_collection,
+)
 from .pipeline.services import ExportApplicationService
-from projects.models import Project
+from .pipeline.writers import zip_files
+from projects.models import Member, Project

 logger = get_task_logger(__name__)


+def create_collaborative_dataset(project: Project, file_format: str, confirmed_only: bool):
+    examples = filter_examples(
+        examples=project.examples.all(),
+        is_collaborative=project.collaborative_annotation,
+        confirmed_only=confirmed_only,
+    )
+    label_collection = select_label_collection(project)
+    labels = create_labels(label_collection, examples=examples)
+    dataset = Dataset(examples, labels)
+    formatter = create_formatter(project, file_format)(target_column=label_collection.field_name)
+    writer = create_writer(file_format)
+    service = ExportApplicationService(dataset, formatter, writer)
+    filepath = os.path.join(settings.MEDIA_URL, f"all.{writer.extension}")
+    service.export(filepath)
+    return filepath
+
+
+def create_individual_dataset(project: Project, file_format: str, confirmed_only: bool):
+    files = []
+    members = Member.objects.filter(project=project)
+    for member in members:
+        examples = filter_examples(
+            examples=project.examples.all(),
+            is_collaborative=project.collaborative_annotation,
+            confirmed_only=confirmed_only,
+            user=member.user,
+        )
+        label_collection = select_label_collection(project)
+        labels = create_labels(label_collection, examples=examples, user=member.user)
+        dataset = Dataset(examples, labels)
+        formatter = create_formatter(project, file_format)(target_column=label_collection.field_name)
+        writer = create_writer(file_format)
+        service = ExportApplicationService(dataset, formatter, writer)
+        filepath = os.path.join(settings.MEDIA_URL, f"{member.username}.{writer.extension}")
+        service.export(filepath)
+        files.append(filepath)
+    zip_file = zip_files(files, settings.MEDIA_URL)
+    for file in files:
+        os.remove(file)
+    return zip_file
+
+
 @shared_task
 def export_dataset(project_id, file_format: str, export_approved=False):
     project = get_object_or_404(Project, pk=project_id)
-    repository = create_repository(project, file_format)
-    writer = create_writer(file_format)(settings.MEDIA_ROOT)
-    service = ExportApplicationService(repository, writer)
-    filepath = service.export(export_approved)
-    return filepath
+    if project.collaborative_annotation:
+        return create_collaborative_dataset(project, file_format, export_approved)
+    else:
+        return create_individual_dataset(project, file_format, export_approved)
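
A minimal usage sketch for the reworked task, assuming a Celery worker and broker are running and a project with id 1 exists; the caller code below is illustrative and not part of this diff:

    # Illustrative caller (e.g. a Django view or shell session), not part of the commit.
    from data_export.celery_tasks import export_dataset

    # Queue the export asynchronously; Celery's shared_task exposes .delay().
    result = export_dataset.delay(1, "CSV", export_approved=False)

    # Wait for the worker and fetch the returned path: a single file for
    # collaborative projects, a zip of per-member files otherwise.
    filepath = result.get(timeout=60)
    print(filepath)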
backend/data_export/pipeline/services.py
@@ -1,13 +1,16 @@
-from .repositories import BaseRepository
-from .writers import BaseWriter
+from .dataset import Dataset
+from .formatters import Formatter
+from .writers import Writer


 class ExportApplicationService:
-    def __init__(self, repository: BaseRepository, writer: BaseWriter):
-        self.repository = repository
+    def __init__(self, dataset: Dataset, formatter: Formatter, writer: Writer):
+        self.dataset = dataset
+        self.formatter = formatter
         self.writer = writer

-    def export(self, export_approved=False) -> str:
-        records = self.repository.list(export_approved=export_approved)
-        filepath = self.writer.write(records)
-        return filepath
+    def export(self, file):
+        dataset = self.dataset.to_dataframe()
+        dataset = self.formatter.format(dataset)
+        self.writer.write(file, dataset)
+        return file
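
For context, a sketch of how the reworked service is now wired together, mirroring create_collaborative_dataset above; the examples, labels, and label_collection values are assumed to come from the pipeline factories and are not defined here:

    # Sketch only: the new three-step pipeline (dataset -> formatter -> writer).
    dataset = Dataset(examples, labels)
    formatter = create_formatter(project, file_format)(target_column=label_collection.field_name)
    writer = create_writer(file_format)

    service = ExportApplicationService(dataset, formatter, writer)
    service.export("all.csv")  # export() now takes the target file path and returns it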
backend/data_export/tests/test_task.py
@@ -0,0 +1,117 @@
+import os
+import zipfile
+
+import numpy as np
+import pandas as pd
+from django.test import TestCase, override_settings
+from model_mommy import mommy
+from pandas.testing import assert_frame_equal
+
+from ..celery_tasks import export_dataset
+from projects.models import DOCUMENT_CLASSIFICATION
+from projects.tests.utils import prepare_project
+
+
+def read_zip_content(file):
+    datasets = {}
+    with zipfile.ZipFile(file) as z:
+        for file in z.filelist:
+            username = file.filename.split(".")[0]
+            with z.open(file) as f:
+                try:
+                    df = pd.read_csv(f)
+                except pd.errors.EmptyDataError:
+                    continue
+                datasets[username] = df
+    return datasets
+
+
+@override_settings(MEDIA_URL=os.path.dirname(__file__))
+class TestExportTask(TestCase):
+    def prepare_data(self, collaborative=False):
+        self.project = prepare_project(DOCUMENT_CLASSIFICATION, collaborative_annotation=collaborative)
+        self.example1 = mommy.make("Example", project=self.project.item, text="confirmed")
+        self.example2 = mommy.make("Example", project=self.project.item, text="unconfirmed")
+        self.category1 = mommy.make("Category", example=self.example1, user=self.project.admin)
+        self.category2 = mommy.make("Category", example=self.example1, user=self.project.annotator)
+        mommy.make("ExampleState", example=self.example1, confirmed_by=self.project.admin)
+
+    def test_unconfirmed_and_non_collaborative(self):
+        self.prepare_data()
+        file = export_dataset(self.project.id, "CSV", False)
+        datasets = read_zip_content(file)
+        os.remove(file)
+        expected_datasets = {
+            self.project.admin.username: pd.DataFrame(
+                [
+                    {"id": self.example1.id, "data": self.example1.text, "categories": self.category1.label.text},
+                    {"id": self.example2.id, "data": self.example2.text, "categories": np.nan},
+                ]
+            ),
+            self.project.approver.username: pd.DataFrame(
+                [
+                    {"id": self.example1.id, "data": self.example1.text, "categories": np.nan},
+                    {"id": self.example2.id, "data": self.example2.text, "categories": np.nan},
+                ]
+            ),
+            self.project.annotator.username: pd.DataFrame(
+                [
+                    {"id": self.example1.id, "data": self.example1.text, "categories": self.category2.label.text},
+                    {"id": self.example2.id, "data": self.example2.text, "categories": np.nan},
+                ]
+            ),
+        }
+        for username, dataset in expected_datasets.items():
+            assert_frame_equal(dataset, datasets[username])
+
+    def test_unconfirmed_and_collaborative(self):
+        self.prepare_data(collaborative=True)
+        file = export_dataset(self.project.id, "CSV", False)
+        dataset = pd.read_csv(file)
+        os.remove(file)
+        expected_dataset = pd.DataFrame(
+            [
+                {
+                    "id": self.example1.id,
+                    "data": self.example1.text,
+                    "categories": "#".join(sorted([self.category1.label.text, self.category2.label.text])),
+                },
+                {"id": self.example2.id, "data": self.example2.text, "categories": np.nan},
+            ]
+        )
+        assert_frame_equal(dataset, expected_dataset)
+
+    def test_confirmed_and_non_collaborative(self):
+        self.prepare_data()
+        file = export_dataset(self.project.id, "CSV", True)
+        datasets = read_zip_content(file)
+        os.remove(file)
+        expected_datasets = {
+            self.project.admin.username: pd.DataFrame(
+                [
+                    {
+                        "id": self.example1.id,
+                        "data": self.example1.text,
+                        "categories": self.category1.label.text,
+                    }
+                ]
+            )
+        }
+        for username, dataset in expected_datasets.items():
+            assert_frame_equal(dataset, datasets[username])
+
+    def test_confirmed_and_collaborative(self):
+        self.prepare_data(collaborative=True)
+        file = export_dataset(self.project.id, "CSV", True)
+        dataset = pd.read_csv(file)
+        os.remove(file)
+        expected_dataset = pd.DataFrame(
+            [
+                {
+                    "id": self.example1.id,
+                    "data": self.example1.text,
+                    "categories": "#".join(sorted([self.category1.label.text, self.category2.label.text])),
+                }
+            ]
+        )
+        assert_frame_equal(dataset, expected_dataset)
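
A hedged way to run just this test module from the backend directory; the settings module name below is an assumption and may differ in the actual project:

    # Roughly equivalent to `python manage.py test data_export.tests.test_task`.
    import os

    import django
    from django.core.management import call_command

    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings.development")  # assumed module path
    django.setup()
    call_command("test", "data_export.tests.test_task")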