diff --git a/backend/data_import/pipeline/formatters.py b/backend/data_import/pipeline/formatters.py
deleted file mode 100644
index 6fe4d764..00000000
--- a/backend/data_import/pipeline/formatters.py
+++ /dev/null
@@ -1,41 +0,0 @@
-from typing import Type
-
-import pandas as pd
-
-from .data import BaseData
-from .labels import Label
-from .readers import (
-    DEFAULT_LABEL_COLUMN,
-    DEFAULT_TEXT_COLUMN,
-    LINE_NUM_COLUMN,
-    UUID_COLUMN,
-)
-
-DEFAULT_DATA_COLUMN = "#data"
-
-
-class DataFormatter:
-    def __init__(self, column: str, data_class: Type[BaseData]):
-        self.column = column
-        self.data_class = data_class
-
-    def format(self, df: pd.DataFrame) -> pd.DataFrame:
-        df.drop(columns=[LINE_NUM_COLUMN], inplace=True)
-        df.dropna(subset=[self.column], inplace=True)
-        df.rename(columns={self.column: DEFAULT_TEXT_COLUMN}, inplace=True)
-        df[DEFAULT_DATA_COLUMN] = df.apply(lambda row: self.data_class.parse(**row.to_dict()), axis=1)
-        return df[[UUID_COLUMN, DEFAULT_DATA_COLUMN]]
-
-
-class LabelFormatter:
-    def __init__(self, column: str, label_class: Type[Label]):
-        self.column = column
-        self.label_class = label_class
-
-    def format(self, df: pd.DataFrame) -> pd.DataFrame:
-        df_label = df[[UUID_COLUMN, self.column]].explode(self.column)
-        df_label.dropna(inplace=True)
-        df_label[DEFAULT_LABEL_COLUMN] = df_label[self.column].map(self.label_class.parse)
-        df_label.dropna(inplace=True)
-        df_label.reset_index(inplace=True, drop=True)
-        return df_label
diff --git a/backend/data_import/pipeline/makers.py b/backend/data_import/pipeline/makers.py
new file mode 100644
index 00000000..59e033f5
--- /dev/null
+++ b/backend/data_import/pipeline/makers.py
@@ -0,0 +1,129 @@
+from typing import List, Optional, Type
+
+import pandas as pd
+
+from .data import BaseData
+from .exceptions import FileParseException
+from .label import Label
+from .readers import (
+    DEFAULT_TEXT_COLUMN,
+    LINE_NUMBER_COLUMN,
+    UPLOAD_NAME_COLUMN,
+    UUID_COLUMN,
+)
+from examples.models import Example
+from projects.models import Project
+
+
+class ExampleMaker:
+    """Build examples from the rows of an imported DataFrame and collect parse errors."""
+
+    def __init__(
+        self,
+        project: Project,
+        data_class: Type[BaseData],
+        column_data: str,
+        exclude_columns: Optional[List[str]] = None,
+    ):
+        self.project = project
+        self.data_class = data_class
+        self.column_data = column_data
+        self.exclude_columns = exclude_columns or []
+        self._errors = []
+
+    def make(self, df: pd.DataFrame) -> List[Example]:
+        """Return the examples built from ``df``.
+
+        Returns an empty list when the data column is absent. Rows whose data value
+        is null, or that raise ``ValueError`` while being built, are skipped and recorded.
+        """
+        if not self.check_column_existence(df):
+            return []
+        self.check_value_existence(df)
+        # make dataframe without exclude columns and missing data
+        df_with_data_column = df.loc[:, ~df.columns.isin(self.exclude_columns)]
+        df_with_data_column = df_with_data_column.dropna(subset=[self.column_data])
+
+        examples = []
+        for row in df_with_data_column.to_dict(orient="records"):
+            line_num = row.pop(LINE_NUMBER_COLUMN, 0)
+            row[DEFAULT_TEXT_COLUMN] = row.pop(self.column_data)  # Rename column for parsing
+            try:
+                data = self.data_class.parse(**row)
+                example = data.create(self.project)
+                examples.append(example)
+            except ValueError:
+                message = f"Invalid data in line {line_num}"
+                error = FileParseException(row[UPLOAD_NAME_COLUMN], line_num, message)
+                self._errors.append(error)
+        return examples
+
+    def check_column_existence(self, df: pd.DataFrame) -> bool:
+        """Return whether the data column exists; if not, record one error per upload name."""
+        message = f"Column {self.column_data} not found in the file"
+        if self.column_data not in df.columns:
+            for filename in df[UPLOAD_NAME_COLUMN].unique():
+                self._errors.append(FileParseException(filename, 0, message))
+            return False
+        return True
+
+    def check_value_existence(self, df: pd.DataFrame):
+        """Record an error for every row whose data column value is null."""
+        df_without_data_column = df[df[self.column_data].isnull()]
+        for row in df_without_data_column.to_dict(orient="records"):
+            message = f"Column {self.column_data} not found in record"
+            error = FileParseException(row[UPLOAD_NAME_COLUMN], row.get(LINE_NUMBER_COLUMN, 0), message)
+            self._errors.append(error)
+
+    @property
+    def errors(self) -> List[FileParseException]:
+        """Recorded errors, sorted in place by line number."""
+        self._errors.sort(key=lambda error: error.line_num)
+        return self._errors
+
+
+class LabelMaker:
+    """Build labels from the label column of an imported DataFrame."""
+
+    def __init__(self, column: str, label_class: Type[Label]):
+        self.column = column
+        self.label_class = label_class
+        self._errors = []
+
+    def make(self, df: pd.DataFrame) -> List[Label]:
+        """Return one label per non-null entry of the exploded label column.
+
+        Returns an empty list when the column is absent. Entries for which the
+        label class raises ``ValueError`` are skipped without recording an error.
+        """
+        if not self.check_column_existence(df):
+            return []
+
+        df_label = df.explode(self.column)
+        df_label = df_label[[UUID_COLUMN, self.column]]
+        # Rebind rather than drop in place: df_label comes from a column selection,
+        # and mutating such a frame in place can trigger SettingWithCopyWarning.
+        df_label = df_label.dropna(subset=[self.column])
+        labels = []
+        for row in df_label.to_dict(orient="records"):
+            try:
+                label = self.label_class.parse(row[UUID_COLUMN], row[self.column])
+                labels.append(label)
+            except ValueError:
+                # Best effort: an entry the label class rejects is skipped.
+                pass
+        return labels
+
+    def check_column_existence(self, df: pd.DataFrame) -> bool:
+        """Return whether the label column exists; if not, record one error per upload name."""
+        message = f"Column {self.column} not found in the file"
+        if self.column not in df.columns:
+            for filename in df[UPLOAD_NAME_COLUMN].unique():
+                self._errors.append(FileParseException(filename, 0, message))
+            return False
+        return True
+
+    @property
+    def errors(self) -> List[FileParseException]:
+        """Recorded errors, sorted in place by line number."""
+        self._errors.sort(key=lambda error: error.line_num)
+        return self._errors
diff --git a/backend/data_import/tests/test_formatter.py b/backend/data_import/tests/test_formatter.py
deleted file mode 100644
index ddce3a4f..00000000
--- a/backend/data_import/tests/test_formatter.py
+++ /dev/null
@@ -1,114 +0,0 @@
-import unittest
-from unittest.mock import MagicMock
-
-import pandas as pd
-from pandas.testing import assert_frame_equal
-
-from data_import.pipeline.formatters import (
-    DEFAULT_DATA_COLUMN,
-    DataFormatter,
-    LabelFormatter,
-)
-from data_import.pipeline.readers import (
-    DEFAULT_LABEL_COLUMN,
-    DEFAULT_TEXT_COLUMN,
-    FILE_NAME_COLUMN,
-    LINE_NUM_COLUMN,
-    UPLOAD_NAME_COLUMN,
-    UUID_COLUMN,
-)
-
-
-class TestLabelFormatter(unittest.TestCase):
-    def setUp(self):
-        self.label_column = "label"
-        self.label_class = MagicMock
-        self.label_class.parse = lambda x: x
-        self.df = pd.DataFrame(
-            [
-                {LINE_NUM_COLUMN: 1, UUID_COLUMN: 1, self.label_column: ["A"]},
-                {LINE_NUM_COLUMN: 2, UUID_COLUMN: 2, self.label_column: ["B", "C"]},
-            ]
-        )
-
-    def test_format(self):
-        label_formatter = LabelFormatter(column=self.label_column, label_class=self.label_class)
-        df = label_formatter.format(self.df)
-        expected_df = pd.DataFrame(
-            [
-                {UUID_COLUMN: 1, DEFAULT_LABEL_COLUMN: "A"},
-                {UUID_COLUMN: 2, DEFAULT_LABEL_COLUMN: "B"},
-                {UUID_COLUMN: 2, DEFAULT_LABEL_COLUMN: "C"},
-            ]
-        )
-        assert_frame_equal(df, expected_df)
-
-    def test_format_without_specified_column(self):
-        label_formatter = LabelFormatter(column="invalid_column", label_class=self.label_class)
-        with self.assertRaises(KeyError):
-            label_formatter.format(self.df)
-
-    def test_format_with_partially_correct_column(self):
-        label_formatter = LabelFormatter(column=self.label_column, label_class=self.label_class)
-        df = pd.DataFrame(
-            [
-                {LINE_NUM_COLUMN: 1, UUID_COLUMN: 1, self.label_column: ["A"]},
-                {LINE_NUM_COLUMN: 2, UUID_COLUMN: 2, "invalid_column": ["B"]},
-                {LINE_NUM_COLUMN: 3, UUID_COLUMN: 3},
-            ]
-        )
-        df_label = label_formatter.format(df)
-        expected_df = pd.DataFrame([{UUID_COLUMN: 1, DEFAULT_LABEL_COLUMN: "A"}])
-        assert_frame_equal(df_label, expected_df)
-
-    def test_format_with_invalid_label(self):
-        label_class = MagicMock
-        label_class.parse = lambda x: x if x else None
-        label_formatter = LabelFormatter(column=self.label_column, label_class=label_class)
-        df = pd.DataFrame(
-            [
-                {LINE_NUM_COLUMN: 1, UUID_COLUMN: 1, self.label_column: ["A"]},
-                {LINE_NUM_COLUMN: 2, UUID_COLUMN: 2, self.label_column: [{}]},
-            ]
-        )
-        df_label = label_formatter.format(df)
-        expected_df = pd.DataFrame([{UUID_COLUMN: 1, DEFAULT_LABEL_COLUMN: "A"}])
-        assert_frame_equal(df_label, expected_df)
-
-
-class TestDataFormatter(unittest.TestCase):
-    def setUp(self):
-        self.data_column = "data"
-        self.filename = "filename"
-        self.upload_name = "upload_name"
-
-    def test_format(self):
-        data_class = MagicMock
-        data_class.parse = lambda **kwargs: kwargs
-        data_formatter = DataFormatter(column=self.data_column, data_class=data_class)
-        df = pd.DataFrame(
-            [
-                {
-                    LINE_NUM_COLUMN: 1,
-                    UUID_COLUMN: 1,
-                    self.data_column: "A",
-                    FILE_NAME_COLUMN: self.filename,
-                    UPLOAD_NAME_COLUMN: self.upload_name,
-                },
-            ]
-        )
-        df_data = data_formatter.format(df)
-        expected_df = pd.DataFrame(
-            [
-                {
-                    UUID_COLUMN: 1,
-                    DEFAULT_DATA_COLUMN: {
-                        UUID_COLUMN: 1,
-                        DEFAULT_TEXT_COLUMN: "A",
-                        "filename": self.filename,
-                        "upload_name": self.upload_name,
-                    },
-                },
-            ]
-        )
-        assert_frame_equal(df_data, expected_df)
diff --git a/backend/data_import/tests/test_makers.py b/backend/data_import/tests/test_makers.py
new file mode 100644
index 00000000..c463dd51
--- /dev/null
+++ b/backend/data_import/tests/test_makers.py
@@ -0,0 +1,88 @@
+import uuid
+
+import pandas as pd
+from django.test import TestCase
+
+from data_import.pipeline.data import TextData
+from data_import.pipeline.makers import ExampleMaker, LabelMaker
+from data_import.pipeline.label import CategoryLabel
+from data_import.pipeline.readers import (
+    FILE_NAME_COLUMN,
+    LINE_NUMBER_COLUMN,
+    UPLOAD_NAME_COLUMN,
+    UUID_COLUMN,
+)
+from projects.tests.utils import prepare_project
+
+
+class TestExamplesMaker(TestCase):
+    def setUp(self):
+        self.project = prepare_project()
+        self.label_column = "label"
+        self.text_column = "text"
+        self.record = {
+            LINE_NUMBER_COLUMN: 1,
+            UUID_COLUMN: uuid.uuid4(),
+            FILE_NAME_COLUMN: "file1",
+            UPLOAD_NAME_COLUMN: "upload1",
+            self.text_column: "text1",
+            self.label_column: ["A"],
+        }
+        self.maker = ExampleMaker(self.project.item, TextData, self.text_column, [self.label_column])
+
+    def test_make_examples(self):
+        df = pd.DataFrame([self.record])
+        examples = self.maker.make(df)
+        self.assertEqual(len(examples), 1)
+
+    def test_check_column_existence(self):
+        self.record.pop(self.text_column)
+        df = pd.DataFrame([self.record])
+        examples = self.maker.make(df)
+        self.assertEqual(len(examples), 0)
+        self.assertEqual(len(self.maker.errors), 1)
+
+    def test_empty_text_raises_error(self):
+        self.record[self.text_column] = ""
+        df = pd.DataFrame([self.record])
+        examples = self.maker.make(df)
+        self.assertEqual(len(examples), 0)
+        self.assertEqual(len(self.maker.errors), 1)
+
+
+class TestLabelFormatter(TestCase):
+    def setUp(self):
+        self.label_column = "label"
+        self.label_class = CategoryLabel
+        self.df = pd.DataFrame(
+            [
+                {LINE_NUMBER_COLUMN: 1, UUID_COLUMN: uuid.uuid4(), self.label_column: ["A"]},
+                {LINE_NUMBER_COLUMN: 2, UUID_COLUMN: uuid.uuid4(), self.label_column: ["B", "C"]},
+            ]
+        )
+
+    def test_make(self):
+        label_maker = LabelMaker(column=self.label_column, label_class=self.label_class)
+        labels = label_maker.make(self.df)
+        self.assertEqual(len(labels), 3)
+        with self.subTest():
+            for label, expected in zip(labels, ["A", "B", "C"]):
+                self.assertEqual(getattr(label, "label"), expected)
+
+    def test_format_without_specified_column(self):
+        label_maker = LabelMaker(column="invalid_column", label_class=self.label_class)
+        with self.assertRaises(KeyError):
+            label_maker.make(self.df)
+
+    def test_format_with_partially_correct_column(self):
+        label_maker = LabelMaker(column=self.label_column, label_class=self.label_class)
+        df = pd.DataFrame(
+            [
+                {LINE_NUMBER_COLUMN: 1, UUID_COLUMN: uuid.uuid4(), self.label_column: ["A"]},
+                {LINE_NUMBER_COLUMN: 2, UUID_COLUMN: uuid.uuid4(), "invalid_column": ["B"]},
+                {LINE_NUMBER_COLUMN: 3, UUID_COLUMN: uuid.uuid4()},
+                {LINE_NUMBER_COLUMN: 3, UUID_COLUMN: uuid.uuid4(), self.label_column: [{}]},
+            ]
+        )
+        labels = label_maker.make(df)
+        self.assertEqual(len(labels), 1)