diff --git a/backend/data_export/celery_tasks.py b/backend/data_export/celery_tasks.py index 075f69a7..84755c72 100644 --- a/backend/data_export/celery_tasks.py +++ b/backend/data_export/celery_tasks.py @@ -1,4 +1,6 @@ import os +import shutil +import uuid from celery import shared_task from celery.utils.log import get_task_logger @@ -8,7 +10,6 @@ from django.shortcuts import get_object_or_404 from .pipeline.dataset import Dataset from .pipeline.factories import create_formatter, create_labels, create_writer from .pipeline.services import ExportApplicationService -from .pipeline.writers import remove_files, zip_files from data_export.models import ExportedExample from projects.models import Member, Project @@ -27,15 +28,22 @@ def create_collaborative_dataset(project: Project, file_format: str, confirmed_o formatters = create_formatter(project, file_format) writer = create_writer(file_format) service = ExportApplicationService(dataset, formatters, writer) - filepath = os.path.join(settings.MEDIA_ROOT, f"all.{writer.extension}") + dirname = str(uuid.uuid4()) + dirpath = os.path.join(settings.MEDIA_ROOT, dirname) + os.makedirs(dirpath, exist_ok=True) + filepath = os.path.join(dirpath, f"all.{writer.extension}") service.export(filepath) - return filepath + zip_file = shutil.make_archive(dirpath, "zip", dirpath) + shutil.rmtree(dirpath) + return zip_file def create_individual_dataset(project: Project, file_format: str, confirmed_only: bool): - files = [] members = Member.objects.filter(project=project) is_text_project = project.is_text_project + dirname = str(uuid.uuid4()) + dirpath = os.path.join(settings.MEDIA_ROOT, dirname) + os.makedirs(dirpath, exist_ok=True) for member in members: if confirmed_only: examples = ExportedExample.objects.confirmed(project, user=member.user) @@ -47,11 +55,10 @@ def create_individual_dataset(project: Project, file_format: str, confirmed_only formatters = create_formatter(project, file_format) writer = create_writer(file_format) service = ExportApplicationService(dataset, formatters, writer) - filepath = os.path.join(settings.MEDIA_ROOT, f"{member.username}.{writer.extension}") + filepath = os.path.join(dirpath, f"{member.username}.{writer.extension}") service.export(filepath) - files.append(filepath) - zip_file = zip_files(files, settings.MEDIA_ROOT) - remove_files(files) + zip_file = shutil.make_archive(dirpath, "zip", dirpath) + shutil.rmtree(dirpath) return zip_file diff --git a/backend/data_export/pipeline/writers.py b/backend/data_export/pipeline/writers.py index 31ff499f..3377a97c 100644 --- a/backend/data_export/pipeline/writers.py +++ b/backend/data_export/pipeline/writers.py @@ -1,24 +1,8 @@ import abc -import os -import uuid -import zipfile import pandas as pd -def zip_files(files, dirname): - save_file = os.path.join(dirname, f"{uuid.uuid4()}.zip") - with zipfile.ZipFile(save_file, "w", compression=zipfile.ZIP_DEFLATED) as zf: - for file in files: - zf.write(filename=file, arcname=os.path.basename(file)) - return save_file - - -def remove_files(files): - for file in files: - os.remove(file) - - class Writer(abc.ABC): extension = ""