Add DataFormatter to extract label dataframe

3 years ago · 839396056f
5 changed files with 73 additions and 54 deletions
--- a/backend/data_import/pipeline/data.py
+++ b/backend/data_import/pipeline/data.py
@ -12,14 +12,15 @@ class BaseData(BaseModel, abc.ABC):
    filename: str
    upload_name: str
    uuid: UUID4
+    meta: Dict[Any, Any] = {}

    def __init__(self, **data):
        data["uuid"] = uuid.uuid4()
        super().__init__(**data)

    @classmethod
-    def parse(cls, **kwargs):
-        return cls.parse_obj(kwargs)
+    def parse(cls, filename: str, upload_name: str, text: str = "", **kwargs):
+        return cls(filename=filename, upload_name=upload_name, text=text, meta=kwargs)

    def __hash__(self):
        return hash(tuple(self.dict()))
--- a/backend/data_import/pipeline/formatters.py
+++ b/backend/data_import/pipeline/formatters.py
@ -4,7 +4,9 @@ import pandas as pd

 from .data import BaseData
 from .labels import Label
-from .readers import LINE_NUM_COLUMN
+from .readers import DEFAULT_LABEL_COLUMN, DEFAULT_TEXT_COLUMN, LINE_NUM_COLUMN
+
+DEFAULT_DATA_COLUMN = "#data"


 class DataFormatter:
@ -13,9 +15,11 @@ class DataFormatter:
        self.data_class = data_class

    def format(self, df: pd.DataFrame) -> pd.DataFrame:
-        df_data = df[[LINE_NUM_COLUMN, self.column]]
-        df_data.dropna(inplace=True)
-        return df_data
+        df.drop(columns=[LINE_NUM_COLUMN], inplace=True)
+        df.dropna(subset=[self.column], inplace=True)
+        df.rename(columns={self.column: DEFAULT_TEXT_COLUMN}, inplace=True)
+        df[DEFAULT_DATA_COLUMN] = df.apply(lambda row: self.data_class.parse(**row.to_dict()), axis=1)
+        return df[[DEFAULT_DATA_COLUMN]]


 class LabelFormatter:
@ -26,7 +30,7 @@ class LabelFormatter:
    def format(self, df: pd.DataFrame) -> pd.DataFrame:
        df_label = df[[LINE_NUM_COLUMN, self.column]].explode(self.column)
        df_label.dropna(inplace=True)
-        df_label[self.column] = df_label[self.column].map(self.label_class.parse)
+        df_label[DEFAULT_LABEL_COLUMN] = df_label[self.column].map(self.label_class.parse)
        df_label.dropna(inplace=True)
        df_label.reset_index(inplace=True, drop=True)
        return df_label
--- a/backend/data_import/pipeline/labels.py
+++ b/backend/data_import/pipeline/labels.py
@ -21,15 +21,6 @@ class Label(BaseModel, abc.ABC):
        data["uuid"] = uuid.uuid4()
        super().__init__(**data)

-    @abc.abstractmethod
-    def has_name(self) -> bool:
-        raise NotImplementedError()
-
-    @property
-    @abc.abstractmethod
-    def name(self) -> str:
-        raise NotImplementedError()
-
    @classmethod
    def parse(cls, obj: Any):
        raise NotImplementedError()
@ -56,13 +47,6 @@ class CategoryLabel(Label):
        else:
            raise ValueError("is not empty.")

-    def has_name(self) -> bool:
-        return True
-
-    @property
-    def name(self) -> str:
-        return self.label
-
    @classmethod
    def parse(cls, obj: Any):
        try:
@ -82,13 +66,6 @@ class SpanLabel(Label):
    start_offset: int
    end_offset: int

-    def has_name(self) -> bool:
-        return True
-
-    @property
-    def name(self) -> str:
-        return self.label
-
    @classmethod
    def parse(cls, obj: Any):
        try:
@ -118,13 +95,6 @@ class SpanLabel(Label):
 class TextLabel(Label):
    text: str

-    def has_name(self) -> bool:
-        return False
-
-    @property
-    def name(self) -> str:
-        return self.text
-
    @classmethod
    def parse(cls, obj: Any):
        try:
@ -144,13 +114,6 @@ class RelationLabel(Label):
    to_id: int
    type: str

-    def has_name(self) -> bool:
-        return True
-
-    @property
-    def name(self) -> str:
-        return self.type
-
    @classmethod
    def parse(cls, obj: Any):
        try:
--- a/backend/data_import/pipeline/readers.py
+++ b/backend/data_import/pipeline/readers.py
@ -11,7 +11,8 @@ from .labeled_examples import Record
 DEFAULT_TEXT_COLUMN = "text"
 DEFAULT_LABEL_COLUMN = "label"
 LINE_NUM_COLUMN = "#line_num"
-FILE_NAME_COLUMN = "#file_name"
+FILE_NAME_COLUMN = "filename"
+UPLOAD_NAME_COLUMN = "upload_name"


 class BaseReader(collections.abc.Iterable):
@ -77,7 +78,12 @@ class Reader(BaseReader):
            rows = self.parser.parse(filename.full_path)
            for line_num, row in enumerate(rows, start=1):
                try:
-                    yield {LINE_NUM_COLUMN: line_num, FILE_NAME_COLUMN: filename, **row}
+                    yield {
+                        LINE_NUM_COLUMN: line_num,
+                        FILE_NAME_COLUMN: filename.generated_name,
+                        UPLOAD_NAME_COLUMN: filename.upload_name,
+                        **row,
+                    }
                except FileParseException as e:
                    self._errors.append(e)

--- a/backend/data_import/tests/test_formatter.py
+++ b/backend/data_import/tests/test_formatter.py
@ -4,11 +4,21 @@ from unittest.mock import MagicMock
 import pandas as pd
 from pandas.testing import assert_frame_equal

-from data_import.pipeline.formatters import LabelFormatter
-from data_import.pipeline.readers import LINE_NUM_COLUMN
+from data_import.pipeline.formatters import (
+    DEFAULT_DATA_COLUMN,
+    DataFormatter,
+    LabelFormatter,
+)
+from data_import.pipeline.readers import (
+    DEFAULT_LABEL_COLUMN,
+    DEFAULT_TEXT_COLUMN,
+    FILE_NAME_COLUMN,
+    LINE_NUM_COLUMN,
+    UPLOAD_NAME_COLUMN,
+)


-class TestFormatter(unittest.TestCase):
+class TestLabelFormatter(unittest.TestCase):
    def setUp(self):
        self.label_column = "label"
        self.label_class = MagicMock
@ -25,9 +35,9 @@ class TestFormatter(unittest.TestCase):
        df = label_formatter.format(self.df)
        expected_df = pd.DataFrame(
            [
-                {LINE_NUM_COLUMN: 1, self.label_column: "A"},
-                {LINE_NUM_COLUMN: 2, self.label_column: "B"},
-                {LINE_NUM_COLUMN: 2, self.label_column: "C"},
+                {LINE_NUM_COLUMN: 1, DEFAULT_LABEL_COLUMN: "A"},
+                {LINE_NUM_COLUMN: 2, DEFAULT_LABEL_COLUMN: "B"},
+                {LINE_NUM_COLUMN: 2, DEFAULT_LABEL_COLUMN: "C"},
            ]
        )
        assert_frame_equal(df, expected_df)
@ -47,7 +57,7 @@ class TestFormatter(unittest.TestCase):
            ]
        )
        df_label = label_formatter.format(df)
-        expected_df = pd.DataFrame([{LINE_NUM_COLUMN: 1, self.label_column: "A"}])
+        expected_df = pd.DataFrame([{LINE_NUM_COLUMN: 1, DEFAULT_LABEL_COLUMN: "A"}])
        assert_frame_equal(df_label, expected_df)

    def test_format_with_invalid_label(self):
@ -61,5 +71,40 @@ class TestFormatter(unittest.TestCase):
            ]
        )
        df_label = label_formatter.format(df)
-        expected_df = pd.DataFrame([{LINE_NUM_COLUMN: 1, self.label_column: "A"}])
+        expected_df = pd.DataFrame([{LINE_NUM_COLUMN: 1, DEFAULT_LABEL_COLUMN: "A"}])
        assert_frame_equal(df_label, expected_df)
+
+
+class TestDataFormatter(unittest.TestCase):
+    def setUp(self):
+        self.data_column = "data"
+        self.filename = "filename"
+        self.upload_name = "upload_name"
+
+    def test_format(self):
+        data_class = MagicMock
+        data_class.parse = lambda **kwargs: kwargs
+        data_formatter = DataFormatter(column=self.data_column, data_class=data_class)
+        df = pd.DataFrame(
+            [
+                {
+                    LINE_NUM_COLUMN: 1,
+                    self.data_column: "A",
+                    FILE_NAME_COLUMN: self.filename,
+                    UPLOAD_NAME_COLUMN: self.upload_name,
+                },
+            ]
+        )
+        df_data = data_formatter.format(df)
+        expected_df = pd.DataFrame(
+            [
+                {
+                    DEFAULT_DATA_COLUMN: {
+                        DEFAULT_TEXT_COLUMN: "A",
+                        "filename": self.filename,
+                        "upload_name": self.upload_name,
+                    }
+                },
+            ]
+        )
+        assert_frame_equal(df_data, expected_df)