Browse Source

Add makers for data import

pull/1823/head
Hironsan 2 years ago
parent
commit
e9d799086b
4 changed files with 196 additions and 155 deletions
  1. 41
      backend/data_import/pipeline/formatters.py
  2. 108
      backend/data_import/pipeline/makers.py
  3. 114
      backend/data_import/tests/test_formatter.py
  4. 88
      backend/data_import/tests/test_makers.py

41
backend/data_import/pipeline/formatters.py

@ -1,41 +0,0 @@
from typing import Type
import pandas as pd
from .data import BaseData
from .labels import Label
from .readers import (
DEFAULT_LABEL_COLUMN,
DEFAULT_TEXT_COLUMN,
LINE_NUM_COLUMN,
UUID_COLUMN,
)
# Column that ``DataFormatter.format`` fills with the objects returned by ``data_class.parse``.
DEFAULT_DATA_COLUMN = "#data"
class DataFormatter:
    """Convert dataframe rows into parsed data objects.

    Every row that has a value in ``column`` is handed to
    ``data_class.parse`` as keyword arguments; the parsed result is stored
    in the ``DEFAULT_DATA_COLUMN`` column.
    """

    def __init__(self, column: str, data_class: Type[BaseData]):
        self.data_class = data_class
        self.column = column

    def format(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a frame holding the UUID and the parsed data of each row.

        Note that ``df`` itself is modified: the line-number column is
        removed, rows without a value in ``self.column`` are dropped, the
        data column is renamed to ``DEFAULT_TEXT_COLUMN`` and a
        ``DEFAULT_DATA_COLUMN`` column is added.
        """

        def parse_row(row):
            # Every remaining column of the row becomes a keyword argument.
            return self.data_class.parse(**row.to_dict())

        # The clean-up steps deliberately operate on the caller's frame.
        df.drop(columns=[LINE_NUM_COLUMN], inplace=True)
        df.dropna(subset=[self.column], inplace=True)
        df.rename(columns={self.column: DEFAULT_TEXT_COLUMN}, inplace=True)

        df[DEFAULT_DATA_COLUMN] = df.apply(parse_row, axis=1)
        output_columns = [UUID_COLUMN, DEFAULT_DATA_COLUMN]
        return df[output_columns]
class LabelFormatter:
    """Expand the label column of a dataframe into one parsed label per row."""

    def __init__(self, column: str, label_class: Type[Label]):
        self.label_class = label_class
        self.column = column

    def format(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return one row per (UUID, label) pair found in ``self.column``.

        List-valued cells are exploded into separate rows, each value is
        passed to ``label_class.parse`` and stored in
        ``DEFAULT_LABEL_COLUMN``. Rows with a missing value, before or
        after parsing, are discarded and the index is renumbered from zero.
        A ``KeyError`` propagates when ``self.column`` is not in ``df``.
        """
        # One row per individual label; rows without any label are removed.
        exploded = df[[UUID_COLUMN, self.column]].explode(self.column).dropna()

        parsed = exploded[self.column].map(self.label_class.parse)
        labelled = exploded.assign(**{DEFAULT_LABEL_COLUMN: parsed})

        # A parser returning None yields a missing value, dropped here.
        return labelled.dropna().reset_index(drop=True)

108
backend/data_import/pipeline/makers.py

@ -0,0 +1,108 @@
from typing import List, Optional, Type
import pandas as pd
from .data import BaseData
from .exceptions import FileParseException
from .label import Label
from .readers import (
DEFAULT_TEXT_COLUMN,
LINE_NUMBER_COLUMN,
UPLOAD_NAME_COLUMN,
UUID_COLUMN,
)
from examples.models import Example
from projects.models import Project
class ExampleMaker:
    """Create ``Example`` objects from the rows of an imported dataframe.

    Problems met along the way (missing column, missing value, data that
    fails to parse) do not raise; they are collected as
    ``FileParseException`` objects and exposed through ``errors``.
    """

    def __init__(
        self,
        project: Project,
        data_class: Type[BaseData],
        column_data: str,
        exclude_columns: Optional[List[str]] = None,
    ):
        """Store the target project, the parser class and column settings.

        Args:
            project: Project passed to ``create`` for every parsed row.
            data_class: Class whose ``parse`` builds the data object.
            column_data: Name of the column that holds the data.
            exclude_columns: Columns removed before parsing a row.
        """
        self.project = project
        self.data_class = data_class
        self.column_data = column_data
        self.exclude_columns = exclude_columns or []
        self._errors = []

    def make(self, df: pd.DataFrame) -> List[Example]:
        """Return the examples built from the rows of ``df``.

        An empty list is returned when the data column is absent. Rows
        with a missing data value are reported and skipped; rows whose
        parsing raises ``ValueError`` are reported as invalid.
        """
        if not self.check_column_existence(df):
            return []
        self.check_value_existence(df)

        # Remove the excluded columns, then the rows without data.
        kept_columns = ~df.columns.isin(self.exclude_columns)
        usable = df.loc[:, kept_columns].dropna(subset=[self.column_data])

        examples = []
        for record in usable.to_dict(orient="records"):
            line_num = record.pop(LINE_NUMBER_COLUMN, 0)
            # The parser expects the data under the default text column name.
            record[DEFAULT_TEXT_COLUMN] = record.pop(self.column_data)
            try:
                parsed = self.data_class.parse(**record)
                examples.append(parsed.create(self.project))
            except ValueError:
                self._errors.append(
                    FileParseException(record[UPLOAD_NAME_COLUMN], line_num, f"Invalid data in line {line_num}")
                )
        return examples

    def check_column_existence(self, df: pd.DataFrame) -> bool:
        """Tell whether ``df`` has the data column.

        When it does not, one error per distinct upload name is recorded.
        """
        if self.column_data in df.columns:
            return True

        message = f"Column {self.column_data} not found in the file"
        self._errors.extend(
            FileParseException(filename, 0, message) for filename in df[UPLOAD_NAME_COLUMN].unique()
        )
        return False

    def check_value_existence(self, df: pd.DataFrame):
        """Record an error for every row whose data value is missing."""
        message = f"Column {self.column_data} not found in record"
        rows_without_data = df[df[self.column_data].isnull()]
        for record in rows_without_data.to_dict(orient="records"):
            line_num = record.get(LINE_NUMBER_COLUMN, 0)
            self._errors.append(FileParseException(record[UPLOAD_NAME_COLUMN], line_num, message))

    @property
    def errors(self) -> List[FileParseException]:
        """Collected errors, sorted in place by line number."""
        self._errors.sort(key=lambda err: err.line_num)
        return self._errors
class LabelMaker:
    """Create label objects from the label column of an imported dataframe."""

    def __init__(self, column: str, label_class: Type[Label]):
        """Store the label column name and the class used to parse values."""
        self.label_class = label_class
        self.column = column
        self._errors = []

    def make(self, df: pd.DataFrame) -> List[Label]:
        """Return the labels parsed from ``self.column`` of ``df``.

        List-valued cells are exploded so that each element becomes its own
        label. Missing values are skipped, and so are values for which
        ``label_class.parse`` raises ``ValueError``. An empty list is
        returned when the column is absent.
        """
        if not self.check_column_existence(df):
            return []

        exploded = df.explode(self.column)
        uuid_label_pairs = exploded[[UUID_COLUMN, self.column]].dropna(subset=[self.column])

        labels = []
        for record in uuid_label_pairs.to_dict(orient="records"):
            try:
                labels.append(self.label_class.parse(record[UUID_COLUMN], record[self.column]))
            except ValueError:
                # Unparsable labels are skipped without recording an error.
                continue
        return labels

    def check_column_existence(self, df: pd.DataFrame) -> bool:
        """Tell whether ``df`` has the label column.

        When it does not, one error per distinct upload name is recorded.
        """
        if self.column in df.columns:
            return True

        message = f"Column {self.column} not found in the file"
        self._errors.extend(
            FileParseException(filename, 0, message) for filename in df[UPLOAD_NAME_COLUMN].unique()
        )
        return False

    @property
    def errors(self) -> List[FileParseException]:
        """Collected errors, sorted in place by line number."""
        self._errors.sort(key=lambda err: err.line_num)
        return self._errors

114
backend/data_import/tests/test_formatter.py

@ -1,114 +0,0 @@
import unittest
from unittest.mock import MagicMock
import pandas as pd
from pandas.testing import assert_frame_equal
from data_import.pipeline.formatters import (
DEFAULT_DATA_COLUMN,
DataFormatter,
LabelFormatter,
)
from data_import.pipeline.readers import (
DEFAULT_LABEL_COLUMN,
DEFAULT_TEXT_COLUMN,
FILE_NAME_COLUMN,
LINE_NUM_COLUMN,
UPLOAD_NAME_COLUMN,
UUID_COLUMN,
)
class TestLabelFormatter(unittest.TestCase):
    """Tests for ``LabelFormatter.format``."""

    def setUp(self):
        self.label_column = "label"
        # Use a mock *instance*. Assigning ``parse`` on the ``MagicMock``
        # class itself would leak the attribute into every other mock
        # created in the same test process.
        self.label_class = MagicMock()
        self.label_class.parse = lambda x: x
        self.df = pd.DataFrame(
            [
                {LINE_NUM_COLUMN: 1, UUID_COLUMN: 1, self.label_column: ["A"]},
                {LINE_NUM_COLUMN: 2, UUID_COLUMN: 2, self.label_column: ["B", "C"]},
            ]
        )

    def test_format(self):
        """Each element of a list-valued label cell becomes its own row."""
        label_formatter = LabelFormatter(column=self.label_column, label_class=self.label_class)
        df = label_formatter.format(self.df)
        expected_df = pd.DataFrame(
            [
                {UUID_COLUMN: 1, DEFAULT_LABEL_COLUMN: "A"},
                {UUID_COLUMN: 2, DEFAULT_LABEL_COLUMN: "B"},
                {UUID_COLUMN: 2, DEFAULT_LABEL_COLUMN: "C"},
            ]
        )
        assert_frame_equal(df, expected_df)

    def test_format_without_specified_column(self):
        """A label column that is absent from the frame raises ``KeyError``."""
        label_formatter = LabelFormatter(column="invalid_column", label_class=self.label_class)
        with self.assertRaises(KeyError):
            label_formatter.format(self.df)

    def test_format_with_partially_correct_column(self):
        """Rows without a value in the label column are dropped."""
        label_formatter = LabelFormatter(column=self.label_column, label_class=self.label_class)
        df = pd.DataFrame(
            [
                {LINE_NUM_COLUMN: 1, UUID_COLUMN: 1, self.label_column: ["A"]},
                {LINE_NUM_COLUMN: 2, UUID_COLUMN: 2, "invalid_column": ["B"]},
                {LINE_NUM_COLUMN: 3, UUID_COLUMN: 3},
            ]
        )
        df_label = label_formatter.format(df)
        expected_df = pd.DataFrame([{UUID_COLUMN: 1, DEFAULT_LABEL_COLUMN: "A"}])
        assert_frame_equal(df_label, expected_df)

    def test_format_with_invalid_label(self):
        """Labels for which the parser returns ``None`` are dropped."""
        # A local mock instance keeps this parser from affecting other tests.
        label_class = MagicMock()
        label_class.parse = lambda x: x if x else None
        label_formatter = LabelFormatter(column=self.label_column, label_class=label_class)
        df = pd.DataFrame(
            [
                {LINE_NUM_COLUMN: 1, UUID_COLUMN: 1, self.label_column: ["A"]},
                {LINE_NUM_COLUMN: 2, UUID_COLUMN: 2, self.label_column: [{}]},
            ]
        )
        df_label = label_formatter.format(df)
        expected_df = pd.DataFrame([{UUID_COLUMN: 1, DEFAULT_LABEL_COLUMN: "A"}])
        assert_frame_equal(df_label, expected_df)
class TestDataFormatter(unittest.TestCase):
    """Tests for ``DataFormatter.format``."""

    def setUp(self):
        self.data_column = "data"
        self.filename = "filename"
        self.upload_name = "upload_name"

    def test_format(self):
        """The parser receives every column except the line number."""
        # Use a mock *instance*. Assigning ``parse`` on the ``MagicMock``
        # class itself would leak the attribute into every other mock
        # created in the same test process.
        data_class = MagicMock()
        data_class.parse = lambda **kwargs: kwargs
        data_formatter = DataFormatter(column=self.data_column, data_class=data_class)
        df = pd.DataFrame(
            [
                {
                    LINE_NUM_COLUMN: 1,
                    UUID_COLUMN: 1,
                    self.data_column: "A",
                    FILE_NAME_COLUMN: self.filename,
                    UPLOAD_NAME_COLUMN: self.upload_name,
                },
            ]
        )
        df_data = data_formatter.format(df)
        expected_df = pd.DataFrame(
            [
                {
                    UUID_COLUMN: 1,
                    DEFAULT_DATA_COLUMN: {
                        UUID_COLUMN: 1,
                        DEFAULT_TEXT_COLUMN: "A",
                        "filename": self.filename,
                        "upload_name": self.upload_name,
                    },
                },
            ]
        )
        assert_frame_equal(df_data, expected_df)

88
backend/data_import/tests/test_makers.py

@ -0,0 +1,88 @@
import uuid

import pandas as pd
from django.test import TestCase

from data_import.pipeline.data import TextData
from data_import.pipeline.label import CategoryLabel

# The makers are defined in pipeline/makers.py, not pipeline/formatters.py.
from data_import.pipeline.makers import ExampleMaker, LabelMaker
from data_import.pipeline.readers import (
    FILE_NAME_COLUMN,
    LINE_NUMBER_COLUMN,
    UPLOAD_NAME_COLUMN,
    UUID_COLUMN,
)
from projects.tests.utils import prepare_project
class TestExamplesMaker(TestCase):
    """Tests for ``ExampleMaker.make`` and the errors it records."""

    def setUp(self):
        self.project = prepare_project()
        self.label_column = "label"
        self.text_column = "text"
        # A single complete record; individual tests alter it as needed.
        self.record = {
            LINE_NUMBER_COLUMN: 1,
            UUID_COLUMN: uuid.uuid4(),
            FILE_NAME_COLUMN: "file1",
            UPLOAD_NAME_COLUMN: "upload1",
            self.text_column: "text1",
            self.label_column: ["A"],
        }
        self.maker = ExampleMaker(
            project=self.project.item,
            data_class=TextData,
            column_data=self.text_column,
            exclude_columns=[self.label_column],
        )

    def _make_from_record(self):
        """Run the maker on a one-row frame built from ``self.record``."""
        frame = pd.DataFrame([self.record])
        return self.maker.make(frame)

    def test_make_examples(self):
        """A complete record yields exactly one example."""
        examples = self._make_from_record()
        self.assertEqual(len(examples), 1)

    def test_check_column_existence(self):
        """A frame without the text column yields no example and one error."""
        del self.record[self.text_column]
        examples = self._make_from_record()
        self.assertEqual(len(examples), 0)
        self.assertEqual(len(self.maker.errors), 1)

    def test_empty_text_raises_error(self):
        """An empty text value yields no example and one error."""
        self.record[self.text_column] = ""
        examples = self._make_from_record()
        self.assertEqual(len(examples), 0)
        self.assertEqual(len(self.maker.errors), 1)
class TestLabelFormatter(TestCase):
    """Tests for ``LabelMaker.make``."""

    def setUp(self):
        self.label_column = "label"
        self.label_class = CategoryLabel
        self.df = pd.DataFrame(
            [
                {LINE_NUMBER_COLUMN: 1, UUID_COLUMN: uuid.uuid4(), self.label_column: ["A"]},
                {LINE_NUMBER_COLUMN: 2, UUID_COLUMN: uuid.uuid4(), self.label_column: ["B", "C"]},
            ]
        )

    def test_make(self):
        """Each element of a list-valued label cell becomes its own label."""
        label_maker = LabelMaker(column=self.label_column, label_class=self.label_class)
        labels = label_maker.make(self.df)
        self.assertEqual(len(labels), 3)
        for label, expected in zip(labels, ["A", "B", "C"]):
            # One sub-test per label, so a mismatch is reported with its
            # expected value and the remaining labels are still checked.
            with self.subTest(expected=expected):
                self.assertEqual(label.label, expected)

    def test_format_without_specified_column(self):
        """Making labels from a column that is not in the frame raises ``KeyError``."""
        # NOTE(review): the frame built in setUp has no upload-name column,
        # which is presumably where the KeyError originates - confirm.
        label_maker = LabelMaker(column="invalid_column", label_class=self.label_class)
        with self.assertRaises(KeyError):
            label_maker.make(self.df)

    def test_format_with_partially_correct_column(self):
        """Only rows carrying a usable label produce a label object."""
        label_maker = LabelMaker(column=self.label_column, label_class=self.label_class)
        df = pd.DataFrame(
            [
                {LINE_NUMBER_COLUMN: 1, UUID_COLUMN: uuid.uuid4(), self.label_column: ["A"]},
                {LINE_NUMBER_COLUMN: 2, UUID_COLUMN: uuid.uuid4(), "invalid_column": ["B"]},
                {LINE_NUMBER_COLUMN: 3, UUID_COLUMN: uuid.uuid4()},
                {LINE_NUMBER_COLUMN: 3, UUID_COLUMN: uuid.uuid4(), self.label_column: [{}]},
            ]
        )
        labels = label_maker.make(df)
        # Only the first row is expected to yield a label: two rows have no
        # value in the label column and the last one holds an empty dict.
        self.assertEqual(len(labels), 1)
Loading…
Cancel
Save