Browse Source

Add DataFormatter to extract label dataframe

pull/1823/head
Hironsan 3 years ago
parent
commit
839396056f
5 changed files with 73 additions and 54 deletions
  1. 5
      backend/data_import/pipeline/data.py
  2. 14
      backend/data_import/pipeline/formatters.py
  3. 37
      backend/data_import/pipeline/labels.py
  4. 10
      backend/data_import/pipeline/readers.py
  5. 61
      backend/data_import/tests/test_formatter.py

5
backend/data_import/pipeline/data.py

@ -12,14 +12,15 @@ class BaseData(BaseModel, abc.ABC):
filename: str
upload_name: str
uuid: UUID4
meta: Dict[Any, Any] = {}
def __init__(self, **data):
data["uuid"] = uuid.uuid4()
super().__init__(**data)
@classmethod
def parse(cls, **kwargs):
return cls.parse_obj(kwargs)
def parse(cls, filename: str, upload_name: str, text: str = "", **kwargs):
return cls(filename=filename, upload_name=upload_name, text=text, meta=kwargs)
def __hash__(self):
return hash(tuple(self.dict()))

14
backend/data_import/pipeline/formatters.py

@ -4,7 +4,9 @@ import pandas as pd
from .data import BaseData
from .labels import Label
from .readers import LINE_NUM_COLUMN
from .readers import DEFAULT_LABEL_COLUMN, DEFAULT_TEXT_COLUMN, LINE_NUM_COLUMN
DEFAULT_DATA_COLUMN = "#data"
class DataFormatter:
@ -13,9 +15,11 @@ class DataFormatter:
self.data_class = data_class
def format(self, df: pd.DataFrame) -> pd.DataFrame:
df_data = df[[LINE_NUM_COLUMN, self.column]]
df_data.dropna(inplace=True)
return df_data
df.drop(columns=[LINE_NUM_COLUMN], inplace=True)
df.dropna(subset=[self.column], inplace=True)
df.rename(columns={self.column: DEFAULT_TEXT_COLUMN}, inplace=True)
df[DEFAULT_DATA_COLUMN] = df.apply(lambda row: self.data_class.parse(**row.to_dict()), axis=1)
return df[[DEFAULT_DATA_COLUMN]]
class LabelFormatter:
@ -26,7 +30,7 @@ class LabelFormatter:
def format(self, df: pd.DataFrame) -> pd.DataFrame:
df_label = df[[LINE_NUM_COLUMN, self.column]].explode(self.column)
df_label.dropna(inplace=True)
df_label[self.column] = df_label[self.column].map(self.label_class.parse)
df_label[DEFAULT_LABEL_COLUMN] = df_label[self.column].map(self.label_class.parse)
df_label.dropna(inplace=True)
df_label.reset_index(inplace=True, drop=True)
return df_label

37
backend/data_import/pipeline/labels.py

@ -21,15 +21,6 @@ class Label(BaseModel, abc.ABC):
data["uuid"] = uuid.uuid4()
super().__init__(**data)
@abc.abstractmethod
def has_name(self) -> bool:
raise NotImplementedError()
@property
@abc.abstractmethod
def name(self) -> str:
raise NotImplementedError()
@classmethod
def parse(cls, obj: Any):
raise NotImplementedError()
@ -56,13 +47,6 @@ class CategoryLabel(Label):
else:
raise ValueError("is not empty.")
def has_name(self) -> bool:
return True
@property
def name(self) -> str:
return self.label
@classmethod
def parse(cls, obj: Any):
try:
@ -82,13 +66,6 @@ class SpanLabel(Label):
start_offset: int
end_offset: int
def has_name(self) -> bool:
return True
@property
def name(self) -> str:
return self.label
@classmethod
def parse(cls, obj: Any):
try:
@ -118,13 +95,6 @@ class SpanLabel(Label):
class TextLabel(Label):
text: str
def has_name(self) -> bool:
return False
@property
def name(self) -> str:
return self.text
@classmethod
def parse(cls, obj: Any):
try:
@ -144,13 +114,6 @@ class RelationLabel(Label):
to_id: int
type: str
def has_name(self) -> bool:
return True
@property
def name(self) -> str:
return self.type
@classmethod
def parse(cls, obj: Any):
try:

10
backend/data_import/pipeline/readers.py

@ -11,7 +11,8 @@ from .labeled_examples import Record
DEFAULT_TEXT_COLUMN = "text"
DEFAULT_LABEL_COLUMN = "label"
LINE_NUM_COLUMN = "#line_num"
FILE_NAME_COLUMN = "#file_name"
FILE_NAME_COLUMN = "filename"
UPLOAD_NAME_COLUMN = "upload_name"
class BaseReader(collections.abc.Iterable):
@ -77,7 +78,12 @@ class Reader(BaseReader):
rows = self.parser.parse(filename.full_path)
for line_num, row in enumerate(rows, start=1):
try:
yield {LINE_NUM_COLUMN: line_num, FILE_NAME_COLUMN: filename, **row}
yield {
LINE_NUM_COLUMN: line_num,
FILE_NAME_COLUMN: filename.generated_name,
UPLOAD_NAME_COLUMN: filename.upload_name,
**row,
}
except FileParseException as e:
self._errors.append(e)

61
backend/data_import/tests/test_formatter.py

@ -4,11 +4,21 @@ from unittest.mock import MagicMock
import pandas as pd
from pandas.testing import assert_frame_equal
from data_import.pipeline.formatters import LabelFormatter
from data_import.pipeline.readers import LINE_NUM_COLUMN
from data_import.pipeline.formatters import (
DEFAULT_DATA_COLUMN,
DataFormatter,
LabelFormatter,
)
from data_import.pipeline.readers import (
DEFAULT_LABEL_COLUMN,
DEFAULT_TEXT_COLUMN,
FILE_NAME_COLUMN,
LINE_NUM_COLUMN,
UPLOAD_NAME_COLUMN,
)
class TestFormatter(unittest.TestCase):
class TestLabelFormatter(unittest.TestCase):
def setUp(self):
self.label_column = "label"
self.label_class = MagicMock
@ -25,9 +35,9 @@ class TestFormatter(unittest.TestCase):
df = label_formatter.format(self.df)
expected_df = pd.DataFrame(
[
{LINE_NUM_COLUMN: 1, self.label_column: "A"},
{LINE_NUM_COLUMN: 2, self.label_column: "B"},
{LINE_NUM_COLUMN: 2, self.label_column: "C"},
{LINE_NUM_COLUMN: 1, DEFAULT_LABEL_COLUMN: "A"},
{LINE_NUM_COLUMN: 2, DEFAULT_LABEL_COLUMN: "B"},
{LINE_NUM_COLUMN: 2, DEFAULT_LABEL_COLUMN: "C"},
]
)
assert_frame_equal(df, expected_df)
@ -47,7 +57,7 @@ class TestFormatter(unittest.TestCase):
]
)
df_label = label_formatter.format(df)
expected_df = pd.DataFrame([{LINE_NUM_COLUMN: 1, self.label_column: "A"}])
expected_df = pd.DataFrame([{LINE_NUM_COLUMN: 1, DEFAULT_LABEL_COLUMN: "A"}])
assert_frame_equal(df_label, expected_df)
def test_format_with_invalid_label(self):
@ -61,5 +71,40 @@ class TestFormatter(unittest.TestCase):
]
)
df_label = label_formatter.format(df)
expected_df = pd.DataFrame([{LINE_NUM_COLUMN: 1, self.label_column: "A"}])
expected_df = pd.DataFrame([{LINE_NUM_COLUMN: 1, DEFAULT_LABEL_COLUMN: "A"}])
assert_frame_equal(df_label, expected_df)
class TestDataFormatter(unittest.TestCase):
    """Tests for ``DataFormatter.format``."""

    def setUp(self):
        # Name of the input column holding the raw data, plus file metadata values.
        self.data_column = "data"
        self.filename = "filename"
        self.upload_name = "upload_name"

    def test_format(self):
        """The formatted frame has one data column holding the parsed row.

        The expected value shows that the source data column is passed to
        ``parse`` under ``DEFAULT_TEXT_COLUMN``, together with the file name
        and upload name, and that the line number is not passed.
        """
        # Use a MagicMock *instance*. Assigning ``parse`` on the MagicMock
        # class itself would mutate the shared class and leak the stub into
        # every other mock created in the same test process.
        data_class = MagicMock()
        # Echo the keyword arguments back so the test can inspect exactly
        # what the formatter hands to ``parse``.
        data_class.parse = lambda **kwargs: kwargs
        data_formatter = DataFormatter(column=self.data_column, data_class=data_class)
        df = pd.DataFrame(
            [
                {
                    LINE_NUM_COLUMN: 1,
                    self.data_column: "A",
                    FILE_NAME_COLUMN: self.filename,
                    UPLOAD_NAME_COLUMN: self.upload_name,
                },
            ]
        )
        df_data = data_formatter.format(df)
        expected_df = pd.DataFrame(
            [
                {
                    DEFAULT_DATA_COLUMN: {
                        DEFAULT_TEXT_COLUMN: "A",
                        "filename": self.filename,
                        "upload_name": self.upload_name,
                    }
                },
            ]
        )
        assert_frame_equal(df_data, expected_df)
Loading…
Cancel
Save