From 1e0f5fc1284d3873865b6b6cba08fe60b74f985e Mon Sep 17 00:00:00 2001
From: Hironsan <light.tree.1.13@gmail.com>
Date: Sun, 24 Apr 2022 09:40:44 +0900
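Subject: [PATCH] Refactor test_task.py

Introduce a DATA constant for the "data" key emitted by
ExportedExample.to_dict() and reuse it in FastTextCategoryFormatter.

Consolidate the export tests around a shared TestExport base class whose
export_dataset() helper always exports JSONL and returns plain lists of
records, so expectations are built from ExportedExample.to_dict() and
compared with assertEqual instead of pandas DataFrame comparisons.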
---
 backend/data_export/models.py              |   4 +-
 backend/data_export/pipeline/formatters.py |   4 +-
 backend/data_export/tests/test_task.py     | 834 ++++++++-------------
 3 files changed, 339 insertions(+), 503 deletions(-)

diff --git a/backend/data_export/models.py b/backend/data_export/models.py
index c90aeb7a..fe808bca 100644
--- a/backend/data_export/models.py
+++ b/backend/data_export/models.py
@@ -5,10 +5,12 @@ from django.db import models
 from examples.models import Example
 from labels.models import Category, Relation, Span, TextLabel
 
+DATA = "data"
+
 
 class ExportedExample(Example):
     def to_dict(self) -> Dict[str, Any]:
-        return {"id": self.id, "data": self.text if self.project.is_text_project else self.upload_name, **self.meta}
+        return {"id": self.id, DATA: self.text if self.project.is_text_project else self.upload_name, **self.meta}
 
     class Meta:
         proxy = True
diff --git a/backend/data_export/pipeline/formatters.py b/backend/data_export/pipeline/formatters.py
index bfe6e460..5c4d758f 100644
--- a/backend/data_export/pipeline/formatters.py
+++ b/backend/data_export/pipeline/formatters.py
@@ -5,6 +5,8 @@ import abc
 
 import pandas as pd
 
+from data_export.models import DATA
+
 
 class Formatter(abc.ABC):
     def __init__(self, target_column: str):
@@ -43,7 +45,7 @@ class FastTextCategoryFormatter(Formatter):
         """Format the label column to `__label__LabelA __label__LabelB` format.
         Also, drop the columns except for `data` and `self.target_column`.
         """
-        dataset = dataset[["data", self.target_column]]
+        dataset = dataset[[DATA, self.target_column]]
         dataset[self.target_column] = dataset[self.target_column].apply(
             lambda labels: sorted(f"__label__{label.to_string()}" for label in labels)
         )
diff --git a/backend/data_export/tests/test_task.py b/backend/data_export/tests/test_task.py
index 111c6322..8b579290 100644
--- a/backend/data_export/tests/test_task.py
+++ b/backend/data_export/tests/test_task.py
@@ -1,11 +1,9 @@
 import os
 import zipfile
 
-import numpy as np
 import pandas as pd
 from django.test import TestCase, override_settings
 from model_mommy import mommy
-from pandas.testing import assert_frame_equal
 
 from ..celery_tasks import export_dataset
 from projects.models import (
@@ -19,203 +17,165 @@ from projects.models import (
 from projects.tests.utils import prepare_project
 
 
-def read_zip_content(file, file_format="csv"):
+def read_zip_content(file):
     datasets = {}
     with zipfile.ZipFile(file) as z:
         for file in z.filelist:
             username = file.filename.split(".")[0]
             with z.open(file) as f:
                 try:
-                    if file_format == "csv":
-                        df = pd.read_csv(f)
-                    elif file_format == "json":
-                        df = pd.read_json(f)
-                    elif file_format == "jsonl":
-                        df = pd.read_json(f, lines=True)
+                    df = pd.read_json(f, lines=True)
                 except pd.errors.EmptyDataError:
                     continue
-            datasets[username] = df
+            datasets[username] = df.to_dict(orient="records")
     return datasets
 
 
 @override_settings(MEDIA_URL=os.path.dirname(__file__))
-class TestExportCategory(TestCase):
+class TestExport(TestCase):
+    def export_dataset(self, confirmed_only=False):
+        file = export_dataset(self.project.id, "JSONL", confirmed_only)
+        if self.project.item.collaborative_annotation:
+            dataset = pd.read_json(file, lines=True).to_dict(orient="records")
+        else:
+            dataset = read_zip_content(file)
+        os.remove(file)
+        return dataset
+
+
+class TestExportCategory(TestExport):
     def prepare_data(self, collaborative=False):
         self.project = prepare_project(DOCUMENT_CLASSIFICATION, collaborative_annotation=collaborative)
-        self.example1 = mommy.make("Example", project=self.project.item, text="confirmed")
-        self.example2 = mommy.make("Example", project=self.project.item, text="unconfirmed")
-        self.category1 = mommy.make("Category", example=self.example1, user=self.project.admin)
-        self.category2 = mommy.make("Category", example=self.example1, user=self.project.annotator)
+        self.example1 = mommy.make("ExportedExample", project=self.project.item, text="example1")
+        self.example2 = mommy.make("ExportedExample", project=self.project.item, text="example2")
+        self.category1 = mommy.make("ExportedCategory", example=self.example1, user=self.project.admin)
+        self.category2 = mommy.make("ExportedCategory", example=self.example1, user=self.project.annotator)
         mommy.make("ExampleState", example=self.example1, confirmed_by=self.project.admin)
 
     def test_unconfirmed_and_non_collaborative(self):
         self.prepare_data()
-        file = export_dataset(self.project.id, "CSV", False)
-        datasets = read_zip_content(file)
-        os.remove(file)
+        datasets = self.export_dataset()
         expected_datasets = {
-            self.project.admin.username: pd.DataFrame(
-                [
-                    {"id": self.example1.id, "data": self.example1.text, "categories": self.category1.label.text},
-                    {"id": self.example2.id, "data": self.example2.text, "categories": np.nan},
-                ]
-            ),
-            self.project.approver.username: pd.DataFrame(
-                [
-                    {"id": self.example1.id, "data": self.example1.text, "categories": np.nan},
-                    {"id": self.example2.id, "data": self.example2.text, "categories": np.nan},
-                ]
-            ),
-            self.project.annotator.username: pd.DataFrame(
-                [
-                    {"id": self.example1.id, "data": self.example1.text, "categories": self.category2.label.text},
-                    {"id": self.example2.id, "data": self.example2.text, "categories": np.nan},
-                ]
-            ),
+            self.project.admin.username: [
+                {**self.example1.to_dict(), "categories": [self.category1.to_string()]},
+                {**self.example2.to_dict(), "categories": []},
+            ],
+            self.project.approver.username: [
+                {**self.example1.to_dict(), "categories": []},
+                {**self.example2.to_dict(), "categories": []},
+            ],
+            self.project.annotator.username: [
+                {**self.example1.to_dict(), "categories": [self.category2.to_string()]},
+                {**self.example2.to_dict(), "categories": []},
+            ],
         }
         for username, dataset in expected_datasets.items():
-            assert_frame_equal(dataset, datasets[username])
+            self.assertEqual(datasets[username], dataset)
 
     def test_unconfirmed_and_collaborative(self):
         self.prepare_data(collaborative=True)
-        file = export_dataset(self.project.id, "CSV", False)
-        dataset = pd.read_csv(file)
-        os.remove(file)
-        expected_dataset = pd.DataFrame(
-            [
-                {
-                    "id": self.example1.id,
-                    "data": self.example1.text,
-                    "categories": "#".join(sorted([self.category1.label.text, self.category2.label.text])),
-                },
-                {"id": self.example2.id, "data": self.example2.text, "categories": np.nan},
-            ]
-        )
-        assert_frame_equal(dataset, expected_dataset)
+        dataset = self.export_dataset()
+        expected_dataset = [
+            {
+                **self.example1.to_dict(),
+                "categories": sorted([self.category1.to_string(), self.category2.to_string()]),
+            },
+            {**self.example2.to_dict(), "categories": []},
+        ]
+        self.assertEqual(dataset, expected_dataset)
 
     def test_confirmed_and_non_collaborative(self):
         self.prepare_data()
-        file = export_dataset(self.project.id, "CSV", True)
-        datasets = read_zip_content(file)
-        os.remove(file)
+        datasets = self.export_dataset(confirmed_only=True)
         expected_datasets = {
-            self.project.admin.username: pd.DataFrame(
-                [{"id": self.example1.id, "data": self.example1.text, "categories": self.category1.label.text}]
-            )
+            self.project.admin.username: [{**self.example1.to_dict(), "categories": [self.category1.to_string()]}]
         }
         for username, dataset in expected_datasets.items():
-            assert_frame_equal(dataset, datasets[username])
+            self.assertEqual(datasets[username], dataset)
 
     def test_confirmed_and_collaborative(self):
         self.prepare_data(collaborative=True)
-        file = export_dataset(self.project.id, "CSV", True)
-        dataset = pd.read_csv(file)
-        os.remove(file)
-        expected_dataset = pd.DataFrame(
-            [
-                {
-                    "id": self.example1.id,
-                    "data": self.example1.text,
-                    "categories": "#".join(sorted([self.category1.label.text, self.category2.label.text])),
-                }
-            ]
-        )
-        assert_frame_equal(dataset, expected_dataset)
+        dataset = self.export_dataset(confirmed_only=True)
+        expected_dataset = [
+            {
+                **self.example1.to_dict(),
+                "categories": sorted([self.category1.to_string(), self.category2.to_string()]),
+            }
+        ]
+        self.assertEqual(dataset, expected_dataset)
 
 
-@override_settings(MEDIA_URL=os.path.dirname(__file__))
-class TestExportSeq2seq(TestCase):
+class TestExportSeq2seq(TestExport):
     def prepare_data(self, collaborative=False):
         self.project = prepare_project(SEQ2SEQ, collaborative_annotation=collaborative)
-        self.example1 = mommy.make("Example", project=self.project.item, text="confirmed")
-        self.example2 = mommy.make("Example", project=self.project.item, text="unconfirmed")
+        self.example1 = mommy.make("ExportedExample", project=self.project.item, text="confirmed")
+        self.example2 = mommy.make("ExportedExample", project=self.project.item, text="unconfirmed")
         self.text1 = mommy.make("TextLabel", example=self.example1, user=self.project.admin)
         self.text2 = mommy.make("TextLabel", example=self.example1, user=self.project.annotator)
         mommy.make("ExampleState", example=self.example1, confirmed_by=self.project.admin)
 
     def test_unconfirmed_and_non_collaborative(self):
         self.prepare_data()
-        file = export_dataset(self.project.id, "CSV", False)
-        datasets = read_zip_content(file)
-        os.remove(file)
+        datasets = self.export_dataset()
         expected_datasets = {
-            self.project.admin.username: pd.DataFrame(
-                [
-                    {"id": self.example1.id, "data": self.example1.text, "labels": self.text1.text},
-                    {"id": self.example2.id, "data": self.example2.text, "labels": np.nan},
-                ]
-            ),
-            self.project.approver.username: pd.DataFrame(
-                [
-                    {"id": self.example1.id, "data": self.example1.text, "labels": np.nan},
-                    {"id": self.example2.id, "data": self.example2.text, "labels": np.nan},
-                ]
-            ),
-            self.project.annotator.username: pd.DataFrame(
-                [
-                    {"id": self.example1.id, "data": self.example1.text, "labels": self.text2.text},
-                    {"id": self.example2.id, "data": self.example2.text, "labels": np.nan},
-                ]
-            ),
+            self.project.admin.username: [
+                {**self.example1.to_dict(), "labels": [self.text1.text]},
+                {**self.example2.to_dict(), "labels": []},
+            ],
+            self.project.approver.username: [
+                {**self.example1.to_dict(), "labels": []},
+                {**self.example2.to_dict(), "labels": []},
+            ],
+            self.project.annotator.username: [
+                {**self.example1.to_dict(), "labels": [self.text2.text]},
+                {**self.example2.to_dict(), "labels": []},
+            ],
         }
         for username, dataset in expected_datasets.items():
-            assert_frame_equal(dataset, datasets[username])
+            self.assertEqual(datasets[username], dataset)
 
     def test_unconfirmed_and_collaborative(self):
         self.prepare_data(collaborative=True)
-        file = export_dataset(self.project.id, "CSV", False)
-        dataset = pd.read_csv(file)
-        os.remove(file)
-        expected_dataset = pd.DataFrame(
-            [
-                {
-                    "id": self.example1.id,
-                    "data": self.example1.text,
-                    "labels": "#".join(sorted([self.text1.text, self.text2.text])),
-                },
-                {"id": self.example2.id, "data": self.example2.text, "labels": np.nan},
-            ]
-        )
-        assert_frame_equal(dataset, expected_dataset)
+        dataset = self.export_dataset()
+        expected_dataset = [
+            {
+                **self.example1.to_dict(),
+                "labels": sorted([self.text1.text, self.text2.text]),
+            },
+            {**self.example2.to_dict(), "labels": []},
+        ]
+        self.assertEqual(dataset, expected_dataset)
 
     def test_confirmed_and_non_collaborative(self):
         self.prepare_data()
-        file = export_dataset(self.project.id, "CSV", True)
-        datasets = read_zip_content(file)
-        os.remove(file)
+        datasets = self.export_dataset(confirmed_only=True)
         expected_datasets = {
-            self.project.admin.username: pd.DataFrame(
-                [
-                    {"id": self.example1.id, "data": self.example1.text, "labels": self.text1.text},
-                ]
-            )
+            self.project.admin.username: [
+                {**self.example1.to_dict(), "labels": [self.text1.text]},
+            ],
+            self.project.approver.username: [],
+            self.project.annotator.username: [],
         }
         for username, dataset in datasets.items():
-            assert_frame_equal(dataset, expected_datasets[username])
+            self.assertEqual(dataset, expected_datasets[username])
 
     def test_confirmed_and_collaborative(self):
         self.prepare_data(collaborative=True)
-        file = export_dataset(self.project.id, "CSV", True)
-        dataset = pd.read_csv(file)
-        os.remove(file)
-        expected_dataset = pd.DataFrame(
-            [
-                {
-                    "id": self.example1.id,
-                    "data": self.example1.text,
-                    "labels": "#".join(sorted([self.text1.text, self.text2.text])),
-                }
-            ]
-        )
-        assert_frame_equal(dataset, expected_dataset)
+        dataset = self.export_dataset(confirmed_only=True)
+        expected_dataset = [
+            {
+                **self.example1.to_dict(),
+                "labels": sorted([self.text1.text, self.text2.text]),
+            }
+        ]
+        self.assertEqual(dataset, expected_dataset)
 
 
-@override_settings(MEDIA_URL=os.path.dirname(__file__))
-class TestExportIntentDetectionAndSlotFilling(TestCase):
+class TestExportIntentDetectionAndSlotFilling(TestExport):
     def prepare_data(self, collaborative=False):
         self.project = prepare_project(INTENT_DETECTION_AND_SLOT_FILLING, collaborative_annotation=collaborative)
-        self.example1 = mommy.make("Example", project=self.project.item, text="confirmed")
-        self.example2 = mommy.make("Example", project=self.project.item, text="unconfirmed")
+        self.example1 = mommy.make("ExportedExample", project=self.project.item, text="confirmed")
+        self.example2 = mommy.make("ExportedExample", project=self.project.item, text="unconfirmed")
         self.category1 = mommy.make("ExportedCategory", example=self.example1, user=self.project.admin)
         self.category2 = mommy.make("ExportedCategory", example=self.example1, user=self.project.annotator)
         self.span = mommy.make(
@@ -225,107 +185,79 @@ class TestExportIntentDetectionAndSlotFilling(TestCase):
 
     def test_unconfirmed_and_non_collaborative(self):
         self.prepare_data()
-        file = export_dataset(self.project.id, "JSONL", False)
-        datasets = read_zip_content(file, "jsonl")
-        os.remove(file)
-
+        datasets = self.export_dataset()
         expected_datasets = {
-            self.project.admin.username: pd.DataFrame(
-                [
-                    {
-                        "id": self.example1.id,
-                        "data": self.example1.text,
-                        "entities": [list(self.span.to_tuple())],
-                        "categories": [self.category1.to_string()],
-                    },
-                    {"id": self.example2.id, "data": self.example2.text, "entities": [], "categories": []},
-                ]
-            ),
-            self.project.annotator.username: pd.DataFrame(
-                [
-                    {
-                        "id": self.example1.id,
-                        "data": self.example1.text,
-                        "entities": [],
-                        "categories": [self.category2.to_string()],
-                    },
-                    {"id": self.example2.id, "data": self.example2.text, "entities": [], "categories": []},
-                ]
-            ),
-            self.project.approver.username: pd.DataFrame(
-                [
-                    {"id": self.example1.id, "data": self.example1.text, "entities": [], "categories": []},
-                    {"id": self.example2.id, "data": self.example2.text, "entities": [], "categories": []},
-                ]
-            ),
+            self.project.admin.username: [
+                {
+                    **self.example1.to_dict(),
+                    "entities": [list(self.span.to_tuple())],
+                    "categories": [self.category1.to_string()],
+                },
+                {**self.example2.to_dict(), "entities": [], "categories": []},
+            ],
+            self.project.annotator.username: [
+                {
+                    **self.example1.to_dict(),
+                    "entities": [],
+                    "categories": [self.category2.to_string()],
+                },
+                {**self.example2.to_dict(), "entities": [], "categories": []},
+            ],
+            self.project.approver.username: [
+                {**self.example1.to_dict(), "entities": [], "categories": []},
+                {**self.example2.to_dict(), "entities": [], "categories": []},
+            ],
         }
         for username, dataset in expected_datasets.items():
-            self.assertEqual(dataset.to_dict(), datasets[username].to_dict())
+            self.assertEqual(dataset, datasets[username])
 
     def test_unconfirmed_and_collaborative(self):
         self.prepare_data(collaborative=True)
-        file = export_dataset(self.project.id, "JSONL", False)
-        dataset = pd.read_json(file, lines=True)
-        os.remove(file)
-        expected_dataset = pd.DataFrame(
-            [
-                {
-                    "id": self.example1.id,
-                    "data": self.example1.text,
-                    "entities": [list(self.span.to_tuple())],
-                    "categories": sorted([self.category1.to_string(), self.category2.to_string()]),
-                },
-                {"id": self.example2.id, "data": self.example2.text, "entities": [], "categories": []},
-            ]
-        )
-        self.assertEqual(dataset.to_dict(), expected_dataset.to_dict())
+        dataset = self.export_dataset()
+        expected_dataset = [
+            {
+                **self.example1.to_dict(),
+                "entities": [list(self.span.to_tuple())],
+                "categories": sorted([self.category1.to_string(), self.category2.to_string()]),
+            },
+            {**self.example2.to_dict(), "entities": [], "categories": []},
+        ]
+        self.assertEqual(dataset, expected_dataset)
 
     def test_confirmed_and_non_collaborative(self):
         self.prepare_data()
-        file = export_dataset(self.project.id, "JSONL", True)
-        datasets = read_zip_content(file, "jsonl")
-        os.remove(file)
-
+        datasets = self.export_dataset(confirmed_only=True)
         expected_datasets = {
-            self.project.admin.username: pd.DataFrame(
-                [
-                    {
-                        "id": self.example1.id,
-                        "data": self.example1.text,
-                        "entities": [list(self.span.to_tuple())],
-                        "categories": [self.category1.to_string()],
-                    },
-                ]
-            ),
-            self.project.annotator.username: pd.DataFrame(),
-            self.project.approver.username: pd.DataFrame(),
+            self.project.admin.username: [
+                {
+                    **self.example1.to_dict(),
+                    "entities": [list(self.span.to_tuple())],
+                    "categories": [self.category1.to_string()],
+                },
+            ],
+            self.project.annotator.username: [],
+            self.project.approver.username: [],
         }
         for username, dataset in expected_datasets.items():
-            self.assertEqual(dataset.to_dict(), datasets[username].to_dict())
+            self.assertEqual(dataset, datasets[username])
 
     def test_confirmed_and_collaborative(self):
         self.prepare_data(collaborative=True)
-        file = export_dataset(self.project.id, "JSONL", True)
-        dataset = pd.read_json(file, lines=True)
-        os.remove(file)
-        expected_dataset = pd.DataFrame(
-            [
-                {
-                    "id": self.example1.id,
-                    "data": self.example1.text,
-                    "entities": [list(self.span.to_tuple())],
-                    "categories": sorted([self.category1.to_string(), self.category2.to_string()]),
-                },
-            ]
-        )
-        self.assertEqual(dataset.to_dict(), expected_dataset.to_dict())
-
-
-@override_settings(MEDIA_URL=os.path.dirname(__file__))
-class TestExportSequenceLabeling(TestCase):
+        dataset = self.export_dataset(confirmed_only=True)
+        expected_dataset = [
+            {
+                **self.example1.to_dict(),
+                "entities": [list(self.span.to_tuple())],
+                "categories": sorted([self.category1.to_string(), self.category2.to_string()]),
+            },
+        ]
+        self.assertEqual(dataset, expected_dataset)
+
+
+class TestExportSequenceLabeling(TestExport):
     def prepare_data(self, collaborative=False):
         self.project = prepare_project(SEQUENCE_LABELING, collaborative_annotation=collaborative)
-        self.example1 = mommy.make("Example", project=self.project.item, text="confirmed")
+        self.example1 = mommy.make("ExportedExample", project=self.project.item, text="confirmed")
         self.span1 = mommy.make(
             "ExportedSpan", example=self.example1, user=self.project.admin, start_offset=0, end_offset=1
         )
@@ -333,277 +265,204 @@ class TestExportSequenceLabeling(TestCase):
             "ExportedSpan", example=self.example1, user=self.project.annotator, start_offset=1, end_offset=2
         )
         mommy.make("ExampleState", example=self.example1, confirmed_by=self.project.admin)
-        self.example2 = mommy.make("Example", project=self.project.item, text="unconfirmed")
+        self.example2 = mommy.make("ExportedExample", project=self.project.item, text="unconfirmed")
 
     def test_unconfirmed_and_non_collaborative(self):
         self.prepare_data()
-        file = export_dataset(self.project.id, "JSONL", False)
-        datasets = read_zip_content(file, "jsonl")
-        os.remove(file)
-
+        datasets = self.export_dataset()
         expected_datasets = {
-            self.project.admin.username: pd.DataFrame(
-                [
-                    {"id": self.example1.id, "data": self.example1.text, "entities": [list(self.span1.to_tuple())]},
-                    {"id": self.example2.id, "data": self.example2.text, "entities": []},
-                ]
-            ),
-            self.project.annotator.username: pd.DataFrame(
-                [
-                    {"id": self.example1.id, "data": self.example1.text, "entities": [list(self.span2.to_tuple())]},
-                    {"id": self.example2.id, "data": self.example2.text, "entities": []},
-                ]
-            ),
-            self.project.approver.username: pd.DataFrame(
-                [
-                    {"id": self.example1.id, "data": self.example1.text, "entities": []},
-                    {"id": self.example2.id, "data": self.example2.text, "entities": []},
-                ]
-            ),
+            self.project.admin.username: [
+                {**self.example1.to_dict(), "entities": [list(self.span1.to_tuple())]},
+                {**self.example2.to_dict(), "entities": []},
+            ],
+            self.project.annotator.username: [
+                {**self.example1.to_dict(), "entities": [list(self.span2.to_tuple())]},
+                {**self.example2.to_dict(), "entities": []},
+            ],
+            self.project.approver.username: [
+                {**self.example1.to_dict(), "entities": []},
+                {**self.example2.to_dict(), "entities": []},
+            ],
         }
         for username, dataset in expected_datasets.items():
-            self.assertEqual(dataset.to_dict(), datasets[username].to_dict())
+            self.assertEqual(dataset, datasets[username])
 
     def test_unconfirmed_and_collaborative(self):
         self.prepare_data(collaborative=True)
-        file = export_dataset(self.project.id, "JSONL", False)
-        dataset = pd.read_json(file, lines=True)
-        os.remove(file)
-        expected_dataset = pd.DataFrame(
-            [
-                {
-                    "id": self.example1.id,
-                    "data": self.example1.text,
-                    "entities": [self.span1.to_tuple(), self.span2.to_tuple()],
-                },
-                {"id": self.example2.id, "data": self.example2.text, "entities": []},
-            ]
-        )
-        assert_frame_equal(dataset, expected_dataset)
+        dataset = self.export_dataset()
+        expected_dataset = [
+            {
+                **self.example1.to_dict(),
+                "entities": [list(self.span1.to_tuple()), list(self.span2.to_tuple())],
+            },
+            {**self.example2.to_dict(), "entities": []},
+        ]
+        self.assertEqual(dataset, expected_dataset)
 
     def test_confirmed_and_non_collaborative(self):
         self.prepare_data()
-        file = export_dataset(self.project.id, "JSONL", True)
-        datasets = read_zip_content(file, "jsonl")
-        os.remove(file)
-
+        datasets = self.export_dataset(confirmed_only=True)
         expected_datasets = {
-            self.project.admin.username: pd.DataFrame(
-                [
-                    {"id": self.example1.id, "data": self.example1.text, "entities": [list(self.span1.to_tuple())]},
-                ]
-            ),
-            self.project.annotator.username: pd.DataFrame(),
-            self.project.approver.username: pd.DataFrame(),
+            self.project.admin.username: [
+                {**self.example1.to_dict(), "entities": [list(self.span1.to_tuple())]},
+            ],
+            self.project.annotator.username: [],
+            self.project.approver.username: [],
         }
         for username, dataset in expected_datasets.items():
-            self.assertEqual(dataset.to_dict(), datasets[username].to_dict())
+            self.assertEqual(dataset, datasets[username])
 
     def test_confirmed_and_collaborative(self):
         self.prepare_data(collaborative=True)
-        file = export_dataset(self.project.id, "JSONL", True)
-        dataset = pd.read_json(file, lines=True)
-        os.remove(file)
-        expected_dataset = pd.DataFrame(
-            [
-                {
-                    "id": self.example1.id,
-                    "data": self.example1.text,
-                    "entities": [self.span1.to_tuple(), self.span2.to_tuple()],
-                },
-            ]
-        )
-        assert_frame_equal(dataset, expected_dataset)
+        dataset = self.export_dataset(confirmed_only=True)
+        expected_dataset = [
+            {
+                **self.example1.to_dict(),
+                "entities": [list(self.span1.to_tuple()), list(self.span2.to_tuple())],
+            },
+        ]
+        self.assertEqual(dataset, expected_dataset)
 
 
-@override_settings(MEDIA_URL=os.path.dirname(__file__))
-class TestExportSpeechToText(TestCase):
+class TestExportSpeechToText(TestExport):
     def prepare_data(self, collaborative=False):
         self.project = prepare_project(SPEECH2TEXT, collaborative_annotation=collaborative)
-        self.example1 = mommy.make("Example", project=self.project.item, text="confirmed")
-        self.example2 = mommy.make("Example", project=self.project.item, text="unconfirmed")
+        self.example1 = mommy.make("ExportedExample", project=self.project.item, text="confirmed")
+        self.example2 = mommy.make("ExportedExample", project=self.project.item, text="unconfirmed")
         self.text1 = mommy.make("TextLabel", example=self.example1, user=self.project.admin)
         self.text2 = mommy.make("TextLabel", example=self.example1, user=self.project.annotator)
         mommy.make("ExampleState", example=self.example1, confirmed_by=self.project.admin)
 
     def test_unconfirmed_and_non_collaborative(self):
         self.prepare_data()
-        file = export_dataset(self.project.id, "JSONL", False)
-        datasets = read_zip_content(file, "jsonl")
-        os.remove(file)
+        datasets = self.export_dataset()
         expected_datasets = {
-            self.project.admin.username: pd.DataFrame(
-                [
-                    {"id": self.example1.id, "data": self.example1.upload_name, "labels": [self.text1.text]},
-                    {"id": self.example2.id, "data": self.example2.upload_name, "labels": []},
-                ]
-            ),
-            self.project.approver.username: pd.DataFrame(
-                [
-                    {"id": self.example1.id, "data": self.example1.upload_name, "labels": []},
-                    {"id": self.example2.id, "data": self.example2.upload_name, "labels": []},
-                ]
-            ),
-            self.project.annotator.username: pd.DataFrame(
-                [
-                    {"id": self.example1.id, "data": self.example1.upload_name, "labels": [self.text2.text]},
-                    {"id": self.example2.id, "data": self.example2.upload_name, "labels": []},
-                ]
-            ),
+            self.project.admin.username: [
+                {**self.example1.to_dict(), "labels": [self.text1.text]},
+                {**self.example2.to_dict(), "labels": []},
+            ],
+            self.project.approver.username: [
+                {**self.example1.to_dict(), "labels": []},
+                {**self.example2.to_dict(), "labels": []},
+            ],
+            self.project.annotator.username: [
+                {**self.example1.to_dict(), "labels": [self.text2.text]},
+                {**self.example2.to_dict(), "labels": []},
+            ],
         }
         for username, dataset in expected_datasets.items():
-            assert_frame_equal(dataset, datasets[username])
+            self.assertEqual(datasets[username], dataset)
 
     def test_unconfirmed_and_collaborative(self):
         self.prepare_data(collaborative=True)
-        file = export_dataset(self.project.id, "JSONL", False)
-        dataset = pd.read_json(file, lines=True)
-        os.remove(file)
-        expected_dataset = pd.DataFrame(
-            [
-                {
-                    "id": self.example1.id,
-                    "data": self.example1.upload_name,
-                    "labels": sorted([self.text1.text, self.text2.text]),
-                },
-                {"id": self.example2.id, "data": self.example2.upload_name, "labels": []},
-            ]
-        )
-        assert_frame_equal(dataset, expected_dataset)
+        dataset = self.export_dataset()
+        expected_dataset = [
+            {
+                **self.example1.to_dict(),
+                "labels": sorted([self.text1.text, self.text2.text]),
+            },
+            {**self.example2.to_dict(), "labels": []},
+        ]
+        self.assertEqual(dataset, expected_dataset)
 
     def test_confirmed_and_non_collaborative(self):
         self.prepare_data()
-        file = export_dataset(self.project.id, "JSONL", True)
-        datasets = read_zip_content(file, "jsonl")
-        os.remove(file)
+        datasets = self.export_dataset(confirmed_only=True)
         expected_datasets = {
-            self.project.admin.username: pd.DataFrame(
-                [
-                    {"id": self.example1.id, "data": self.example1.upload_name, "labels": [self.text1.text]},
-                ]
-            ),
-            self.project.annotator.username: pd.DataFrame(),
-            self.project.approver.username: pd.DataFrame(),
+            self.project.admin.username: [
+                {**self.example1.to_dict(), "labels": [self.text1.text]},
+            ],
+            self.project.annotator.username: [],
+            self.project.approver.username: [],
         }
         for username, dataset in datasets.items():
-            self.assertEqual(dataset.to_dict(), expected_datasets[username].to_dict())
+            self.assertEqual(dataset, expected_datasets[username])
 
     def test_confirmed_and_collaborative(self):
         self.prepare_data(collaborative=True)
-        file = export_dataset(self.project.id, "JSONL", True)
-        dataset = pd.read_json(file, lines=True)
-        os.remove(file)
-        expected_dataset = pd.DataFrame(
-            [
-                {
-                    "id": self.example1.id,
-                    "data": self.example1.upload_name,
-                    "labels": sorted([self.text1.text, self.text2.text]),
-                }
-            ]
-        )
-        assert_frame_equal(dataset, expected_dataset)
+        dataset = self.export_dataset(confirmed_only=True)
+        expected_dataset = [
+            {
+                **self.example1.to_dict(),
+                "labels": sorted([self.text1.text, self.text2.text]),
+            }
+        ]
+        self.assertEqual(dataset, expected_dataset)
 
 
-@override_settings(MEDIA_URL=os.path.dirname(__file__))
-class TestExportImageClassification(TestCase):
+class TestExportImageClassification(TestExport):
     def prepare_data(self, collaborative=False):
         self.project = prepare_project(IMAGE_CLASSIFICATION, collaborative_annotation=collaborative)
-        self.example1 = mommy.make("Example", project=self.project.item, text="confirmed")
-        self.example2 = mommy.make("Example", project=self.project.item, text="unconfirmed")
-        self.category1 = mommy.make("Category", example=self.example1, user=self.project.admin)
-        self.category2 = mommy.make("Category", example=self.example1, user=self.project.annotator)
+        self.example1 = mommy.make("ExportedExample", project=self.project.item, text="confirmed")
+        self.example2 = mommy.make("ExportedExample", project=self.project.item, text="unconfirmed")
+        self.category1 = mommy.make("ExportedCategory", example=self.example1, user=self.project.admin)
+        self.category2 = mommy.make("ExportedCategory", example=self.example1, user=self.project.annotator)
         mommy.make("ExampleState", example=self.example1, confirmed_by=self.project.admin)
 
     def test_unconfirmed_and_non_collaborative(self):
         self.prepare_data()
-        file = export_dataset(self.project.id, "JSONL", False)
-        datasets = read_zip_content(file, "jsonl")
-        os.remove(file)
+        datasets = self.export_dataset()
         expected_datasets = {
-            self.project.admin.username: pd.DataFrame(
-                [
-                    {
-                        "id": self.example1.id,
-                        "data": self.example1.upload_name,
-                        "categories": [self.category1.label.text],
-                    },
-                    {"id": self.example2.id, "data": self.example2.upload_name, "categories": []},
-                ]
-            ),
-            self.project.approver.username: pd.DataFrame(
-                [
-                    {"id": self.example1.id, "data": self.example1.upload_name, "categories": []},
-                    {"id": self.example2.id, "data": self.example2.upload_name, "categories": []},
-                ]
-            ),
-            self.project.annotator.username: pd.DataFrame(
-                [
-                    {
-                        "id": self.example1.id,
-                        "data": self.example1.upload_name,
-                        "categories": [self.category2.label.text],
-                    },
-                    {"id": self.example2.id, "data": self.example2.upload_name, "categories": []},
-                ]
-            ),
+            self.project.admin.username: [
+                {
+                    **self.example1.to_dict(),
+                    "categories": [self.category1.to_string()],
+                },
+                {**self.example2.to_dict(), "categories": []},
+            ],
+            self.project.approver.username: [
+                {**self.example1.to_dict(), "categories": []},
+                {**self.example2.to_dict(), "categories": []},
+            ],
+            self.project.annotator.username: [
+                {
+                    **self.example1.to_dict(),
+                    "categories": [self.category2.to_string()],
+                },
+                {**self.example2.to_dict(), "categories": []},
+            ],
         }
         for username, dataset in expected_datasets.items():
-            assert_frame_equal(dataset, datasets[username])
+            self.assertEqual(datasets[username], dataset)
 
     def test_unconfirmed_and_collaborative(self):
         self.prepare_data(collaborative=True)
-        file = export_dataset(self.project.id, "JSONL", False)
-        dataset = pd.read_json(file, lines=True)
-        os.remove(file)
-        expected_dataset = pd.DataFrame(
-            [
-                {
-                    "id": self.example1.id,
-                    "data": self.example1.upload_name,
-                    "categories": sorted([self.category1.label.text, self.category2.label.text]),
-                },
-                {"id": self.example2.id, "data": self.example2.upload_name, "categories": []},
-            ]
-        )
-        assert_frame_equal(dataset, expected_dataset)
+        dataset = self.export_dataset()
+        expected_dataset = [
+            {
+                **self.example1.to_dict(),
+                "categories": sorted([self.category1.to_string(), self.category2.to_string()]),
+            },
+            {**self.example2.to_dict(), "categories": []},
+        ]
+        self.assertEqual(dataset, expected_dataset)
 
     def test_confirmed_and_non_collaborative(self):
         self.prepare_data()
-        file = export_dataset(self.project.id, "JSONL", True)
-        datasets = read_zip_content(file, "jsonl")
-        os.remove(file)
+        datasets = self.export_dataset(confirmed_only=True)
         expected_datasets = {
-            self.project.admin.username: pd.DataFrame(
-                [{"id": self.example1.id, "data": self.example1.upload_name, "categories": [self.category1.label.text]}]
-            )
+            self.project.admin.username: [{**self.example1.to_dict(), "categories": [self.category1.to_string()]}]
         }
         for username, dataset in expected_datasets.items():
-            assert_frame_equal(dataset, datasets[username])
+            self.assertEqual(datasets[username], dataset)
 
     def test_confirmed_and_collaborative(self):
         self.prepare_data(collaborative=True)
-        file = export_dataset(self.project.id, "JSONL", True)
-        dataset = pd.read_json(file, lines=True)
-        os.remove(file)
-        expected_dataset = pd.DataFrame(
-            [
-                {
-                    "id": self.example1.id,
-                    "data": self.example1.upload_name,
-                    "categories": sorted([self.category1.label.text, self.category2.label.text]),
-                }
-            ]
-        )
-        assert_frame_equal(dataset, expected_dataset)
+        dataset = self.export_dataset(confirmed_only=True)
+        expected_dataset = [
+            {
+                **self.example1.to_dict(),
+                "categories": sorted([self.category1.to_string(), self.category2.to_string()]),
+            }
+        ]
+        self.assertEqual(dataset, expected_dataset)
 
 
-@override_settings(MEDIA_URL=os.path.dirname(__file__))
-class TestExportRelation(TestCase):
+class TestExportRelation(TestExport):
     def prepare_data(self, collaborative=False):
         self.project = prepare_project(SEQUENCE_LABELING, use_relation=True, collaborative_annotation=collaborative)
-        self.example1 = mommy.make("Example", project=self.project.item, text="example")
-        self.example2 = mommy.make("Example", project=self.project.item, text="unconfirmed")
+        self.example1 = mommy.make("ExportedExample", project=self.project.item, text="example")
+        self.example2 = mommy.make("ExportedExample", project=self.project.item, text="unconfirmed")
         self.span1 = mommy.make(
             "ExportedSpan", example=self.example1, user=self.project.admin, start_offset=0, end_offset=1
         )
@@ -620,97 +479,70 @@ class TestExportRelation(TestCase):
 
     def test_unconfirmed_and_non_collaborative(self):
         self.prepare_data()
-        file = export_dataset(self.project.id, "JSONL", False)
-        datasets = read_zip_content(file, "jsonl")
-        os.remove(file)
-
+        datasets = self.export_dataset()
         expected_datasets = {
-            self.project.admin.username: pd.DataFrame(
-                [
-                    {
-                        "id": self.example1.id,
-                        "data": self.example1.text,
-                        "entities": [self.span1.to_dict(), self.span2.to_dict()],
-                        "relations": [self.relation.to_dict()],
-                    },
-                    {"id": self.example2.id, "data": self.example2.text, "entities": [], "relations": []},
-                ]
-            ),
-            self.project.annotator.username: pd.DataFrame(
-                [
-                    {
-                        "id": self.example1.id,
-                        "data": self.example1.text,
-                        "entities": [self.span3.to_dict()],
-                        "relations": [],
-                    },
-                    {"id": self.example2.id, "data": self.example2.text, "entities": [], "relations": []},
-                ]
-            ),
-            self.project.approver.username: pd.DataFrame(
-                [
-                    {"id": self.example1.id, "data": self.example1.text, "entities": [], "relations": []},
-                    {"id": self.example2.id, "data": self.example2.text, "entities": [], "relations": []},
-                ]
-            ),
+            self.project.admin.username: [
+                {
+                    **self.example1.to_dict(),
+                    "entities": [self.span1.to_dict(), self.span2.to_dict()],
+                    "relations": [self.relation.to_dict()],
+                },
+                {**self.example2.to_dict(), "entities": [], "relations": []},
+            ],
+            self.project.annotator.username: [
+                {
+                    **self.example1.to_dict(),
+                    "entities": [self.span3.to_dict()],
+                    "relations": [],
+                },
+                {**self.example2.to_dict(), "entities": [], "relations": []},
+            ],
+            self.project.approver.username: [
+                {**self.example1.to_dict(), "entities": [], "relations": []},
+                {**self.example2.to_dict(), "entities": [], "relations": []},
+            ],
         }
         for username, dataset in expected_datasets.items():
-            self.assertEqual(dataset.to_dict(), datasets[username].to_dict())
+            self.assertEqual(datasets[username], dataset)
 
     def test_unconfirmed_and_collaborative(self):
         self.prepare_data(collaborative=True)
-        file = export_dataset(self.project.id, "JSONL", False)
-        dataset = pd.read_json(file, lines=True)
-        os.remove(file)
-        expected_dataset = pd.DataFrame(
-            [
-                {
-                    "id": self.example1.id,
-                    "data": self.example1.text,
-                    "entities": [self.span1.to_dict(), self.span2.to_dict(), self.span3.to_dict()],
-                    "relations": [self.relation.to_dict()],
-                },
-                {"id": self.example2.id, "data": self.example2.text, "entities": [], "relations": []},
-            ]
-        )
-        assert_frame_equal(dataset, expected_dataset)
+        dataset = self.export_dataset()
+        expected_dataset = [
+            {
+                **self.example1.to_dict(),
+                "entities": [self.span1.to_dict(), self.span2.to_dict(), self.span3.to_dict()],
+                "relations": [self.relation.to_dict()],
+            },
+            {**self.example2.to_dict(), "entities": [], "relations": []},
+        ]
+        self.assertEqual(dataset, expected_dataset)
 
     def test_confirmed_and_non_collaborative(self):
         self.prepare_data()
-        file = export_dataset(self.project.id, "JSONL", True)
-        datasets = read_zip_content(file, "jsonl")
-        os.remove(file)
-
+        datasets = self.export_dataset(confirmed_only=True)
         expected_datasets = {
-            self.project.admin.username: pd.DataFrame(
-                [
-                    {
-                        "id": self.example1.id,
-                        "data": self.example1.text,
-                        "entities": [self.span1.to_dict(), self.span2.to_dict()],
-                        "relations": [self.relation.to_dict()],
-                    },
-                ]
-            ),
-            self.project.annotator.username: pd.DataFrame(),
-            self.project.approver.username: pd.DataFrame(),
+            self.project.admin.username: [
+                {
+                    **self.example1.to_dict(),
+                    "entities": [self.span1.to_dict(), self.span2.to_dict()],
+                    "relations": [self.relation.to_dict()],
+                },
+            ],
+            self.project.annotator.username: [],
+            self.project.approver.username: [],
         }
         for username, dataset in datasets.items():
-            self.assertEqual(dataset.to_dict(), expected_datasets[username].to_dict())
+            self.assertEqual(dataset, expected_datasets[username])
 
     def test_confirmed_and_collaborative(self):
         self.prepare_data(collaborative=True)
-        file = export_dataset(self.project.id, "JSONL", True)
-        dataset = pd.read_json(file, lines=True)
-        os.remove(file)
-        expected_dataset = pd.DataFrame(
-            [
-                {
-                    "id": self.example1.id,
-                    "data": self.example1.text,
-                    "entities": [self.span1.to_dict(), self.span2.to_dict(), self.span3.to_dict()],
-                    "relations": [self.relation.to_dict()],
-                }
-            ]
-        )
-        assert_frame_equal(dataset, expected_dataset)
+        dataset = self.export_dataset(confirmed_only=True)
+        expected_dataset = [
+            {
+                **self.example1.to_dict(),
+                "entities": [self.span1.to_dict(), self.span2.to_dict(), self.span3.to_dict()],
+                "relations": [self.relation.to_dict()],
+            }
+        ]
+        self.assertEqual(dataset, expected_dataset)