Browse Source

Return uuid from formatters

pull/1823/head
Hironsan 3 years ago
parent
commit
e6d222a8f7
2 changed files with 25 additions and 16 deletions
  1. backend/data_import/pipeline/formatters.py (11 lines changed)
  2. backend/data_import/tests/test_formatter.py (30 lines changed)

backend/data_import/pipeline/formatters.py (11 lines changed)

@ -4,7 +4,12 @@ import pandas as pd
from .data import BaseData
from .labels import Label
from .readers import DEFAULT_LABEL_COLUMN, DEFAULT_TEXT_COLUMN, LINE_NUM_COLUMN
from .readers import (
DEFAULT_LABEL_COLUMN,
DEFAULT_TEXT_COLUMN,
LINE_NUM_COLUMN,
UUID_COLUMN,
)
DEFAULT_DATA_COLUMN = "#data"
@ -19,7 +24,7 @@ class DataFormatter:
df.dropna(subset=[self.column], inplace=True)
df.rename(columns={self.column: DEFAULT_TEXT_COLUMN}, inplace=True)
df[DEFAULT_DATA_COLUMN] = df.apply(lambda row: self.data_class.parse(**row.to_dict()), axis=1)
return df[[DEFAULT_DATA_COLUMN]]
return df[[UUID_COLUMN, DEFAULT_DATA_COLUMN]]
class LabelFormatter:
@ -28,7 +33,7 @@ class LabelFormatter:
self.label_class = label_class
def format(self, df: pd.DataFrame) -> pd.DataFrame:
df_label = df[[LINE_NUM_COLUMN, self.column]].explode(self.column)
df_label = df[[UUID_COLUMN, self.column]].explode(self.column)
df_label.dropna(inplace=True)
df_label[DEFAULT_LABEL_COLUMN] = df_label[self.column].map(self.label_class.parse)
df_label.dropna(inplace=True)

backend/data_import/tests/test_formatter.py (30 lines changed)

@ -15,6 +15,7 @@ from data_import.pipeline.readers import (
FILE_NAME_COLUMN,
LINE_NUM_COLUMN,
UPLOAD_NAME_COLUMN,
UUID_COLUMN,
)
@ -25,8 +26,8 @@ class TestLabelFormatter(unittest.TestCase):
self.label_class.parse = lambda x: x
self.df = pd.DataFrame(
[
{LINE_NUM_COLUMN: 1, self.label_column: ["A"]},
{LINE_NUM_COLUMN: 2, self.label_column: ["B", "C"]},
{LINE_NUM_COLUMN: 1, UUID_COLUMN: 1, self.label_column: ["A"]},
{LINE_NUM_COLUMN: 2, UUID_COLUMN: 2, self.label_column: ["B", "C"]},
]
)
@ -35,9 +36,9 @@ class TestLabelFormatter(unittest.TestCase):
df = label_formatter.format(self.df)
expected_df = pd.DataFrame(
[
{LINE_NUM_COLUMN: 1, DEFAULT_LABEL_COLUMN: "A"},
{LINE_NUM_COLUMN: 2, DEFAULT_LABEL_COLUMN: "B"},
{LINE_NUM_COLUMN: 2, DEFAULT_LABEL_COLUMN: "C"},
{UUID_COLUMN: 1, DEFAULT_LABEL_COLUMN: "A"},
{UUID_COLUMN: 2, DEFAULT_LABEL_COLUMN: "B"},
{UUID_COLUMN: 2, DEFAULT_LABEL_COLUMN: "C"},
]
)
assert_frame_equal(df, expected_df)
@ -51,13 +52,13 @@ class TestLabelFormatter(unittest.TestCase):
label_formatter = LabelFormatter(column=self.label_column, label_class=self.label_class)
df = pd.DataFrame(
[
{LINE_NUM_COLUMN: 1, self.label_column: ["A"]},
{LINE_NUM_COLUMN: 2, "invalid_column": ["B"]},
{LINE_NUM_COLUMN: 3},
{LINE_NUM_COLUMN: 1, UUID_COLUMN: 1, self.label_column: ["A"]},
{LINE_NUM_COLUMN: 2, UUID_COLUMN: 2, "invalid_column": ["B"]},
{LINE_NUM_COLUMN: 3, UUID_COLUMN: 3},
]
)
df_label = label_formatter.format(df)
expected_df = pd.DataFrame([{LINE_NUM_COLUMN: 1, DEFAULT_LABEL_COLUMN: "A"}])
expected_df = pd.DataFrame([{UUID_COLUMN: 1, DEFAULT_LABEL_COLUMN: "A"}])
assert_frame_equal(df_label, expected_df)
def test_format_with_invalid_label(self):
@ -66,12 +67,12 @@ class TestLabelFormatter(unittest.TestCase):
label_formatter = LabelFormatter(column=self.label_column, label_class=label_class)
df = pd.DataFrame(
[
{LINE_NUM_COLUMN: 1, self.label_column: ["A"]},
{LINE_NUM_COLUMN: 2, self.label_column: [{}]},
{LINE_NUM_COLUMN: 1, UUID_COLUMN: 1, self.label_column: ["A"]},
{LINE_NUM_COLUMN: 2, UUID_COLUMN: 2, self.label_column: [{}]},
]
)
df_label = label_formatter.format(df)
expected_df = pd.DataFrame([{LINE_NUM_COLUMN: 1, DEFAULT_LABEL_COLUMN: "A"}])
expected_df = pd.DataFrame([{UUID_COLUMN: 1, DEFAULT_LABEL_COLUMN: "A"}])
assert_frame_equal(df_label, expected_df)
@ -89,6 +90,7 @@ class TestDataFormatter(unittest.TestCase):
[
{
LINE_NUM_COLUMN: 1,
UUID_COLUMN: 1,
self.data_column: "A",
FILE_NAME_COLUMN: self.filename,
UPLOAD_NAME_COLUMN: self.upload_name,
@ -99,11 +101,13 @@ class TestDataFormatter(unittest.TestCase):
expected_df = pd.DataFrame(
[
{
UUID_COLUMN: 1,
DEFAULT_DATA_COLUMN: {
UUID_COLUMN: 1,
DEFAULT_TEXT_COLUMN: "A",
"filename": self.filename,
"upload_name": self.upload_name,
}
},
},
]
)

Loading…
Cancel
Save