mirror of https://github.com/doccano/doccano.git
Hironsan
2 years ago
4 changed files with 196 additions and 155 deletions
Split View
Diff Options
- 41 backend/data_import/pipeline/formatters.py
- 108 backend/data_import/pipeline/makers.py
- 114 backend/data_import/tests/test_formatter.py
- 88 backend/data_import/tests/test_makers.py
@@ -1,41 +0,0 @@
|||
from typing import Type |
|||
|
|||
import pandas as pd |
|||
|
|||
from .data import BaseData |
|||
from .labels import Label |
|||
from .readers import ( |
|||
DEFAULT_LABEL_COLUMN, |
|||
DEFAULT_TEXT_COLUMN, |
|||
LINE_NUM_COLUMN, |
|||
UUID_COLUMN, |
|||
) |
|||
|
|||
DEFAULT_DATA_COLUMN = "#data" |
|||
|
|||
|
|||
class DataFormatter:
    """Parse each row of an imported DataFrame into a data object.

    The configured column is renamed to ``DEFAULT_TEXT_COLUMN`` and every
    remaining row is handed to ``data_class.parse`` as keyword arguments.
    """

    def __init__(self, column: str, data_class: Type[BaseData]):
        """Store the source column name and the class used to parse rows.

        Args:
            column: Name of the column that holds the data value.
            data_class: Class whose ``parse(**row)`` is called once per row.
        """
        self.column = column
        self.data_class = data_class

    def format(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a frame with ``UUID_COLUMN`` and the parsed data objects.

        NOTE: ``df`` is modified in place (line-number column dropped, rows
        without a value in ``self.column`` removed, the column renamed and a
        ``DEFAULT_DATA_COLUMN`` column added). This is kept as-is because
        callers outside this block may depend on it -- TODO confirm.

        Args:
            df: Rows to format; expected to contain ``LINE_NUM_COLUMN``,
                ``UUID_COLUMN`` and ``self.column``.

        Returns:
            A two-column frame: ``UUID_COLUMN`` and ``DEFAULT_DATA_COLUMN``.

        Raises:
            KeyError: Raised by pandas when one of the expected columns is
                missing.
        """
        df.drop(columns=[LINE_NUM_COLUMN], inplace=True)
        df.dropna(subset=[self.column], inplace=True)
        df.rename(columns={self.column: DEFAULT_TEXT_COLUMN}, inplace=True)
        if df.empty:
            # With no rows left, DataFrame.apply(axis=1) probes the callable
            # with a synthetic row and may hand back a DataFrame instead of a
            # Series, which cannot be assigned to a single column. Skip the
            # parser entirely and add an empty object column instead.
            df[DEFAULT_DATA_COLUMN] = pd.Series(index=df.index, dtype=object)
        else:
            df[DEFAULT_DATA_COLUMN] = df.apply(lambda row: self.data_class.parse(**row.to_dict()), axis=1)
        return df[[UUID_COLUMN, DEFAULT_DATA_COLUMN]]
|||
|
|||
|
|||
class LabelFormatter:
    """Expand a list-valued label column into one parsed label per row."""

    def __init__(self, column: str, label_class: Type[Label]):
        """Remember which column holds the labels and how to parse them.

        Args:
            column: Name of the column containing the raw labels.
            label_class: Class whose ``parse`` is mapped over each raw label.
        """
        self.column = column
        self.label_class = label_class

    def format(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return one row per successfully parsed label.

        Args:
            df: Frame containing ``UUID_COLUMN`` and ``self.column``.

        Returns:
            A freshly indexed frame holding the UUID, the raw label and the
            parsed label under ``DEFAULT_LABEL_COLUMN``. Rows with a missing
            raw label or a missing parse result are left out.
        """
        # One row per individual label, keeping the UUID alongside it.
        exploded = df[[UUID_COLUMN, self.column]].explode(self.column)
        # Explicit copy so the column assignment below targets an independent frame.
        candidates = exploded.dropna().copy()
        candidates[DEFAULT_LABEL_COLUMN] = candidates[self.column].map(self.label_class.parse)
        # A missing parse result marks the label as unusable.
        parsed = candidates.dropna()
        return parsed.reset_index(drop=True)
@@ -0,0 +1,108 @@
|||
from typing import List, Optional, Type |
|||
|
|||
import pandas as pd |
|||
|
|||
from .data import BaseData |
|||
from .exceptions import FileParseException |
|||
from .label import Label |
|||
from .readers import ( |
|||
DEFAULT_TEXT_COLUMN, |
|||
LINE_NUMBER_COLUMN, |
|||
UPLOAD_NAME_COLUMN, |
|||
UUID_COLUMN, |
|||
) |
|||
from examples.models import Example |
|||
from projects.models import Project |
|||
|
|||
|
|||
class ExampleMaker:
    """Create ``Example`` objects for a project from imported rows.

    Problems found along the way are collected as ``FileParseException``
    instances and exposed through :attr:`errors` instead of being raised.
    """

    def __init__(
        self,
        project: Project,
        data_class: Type[BaseData],
        column_data: str,
        exclude_columns: Optional[List[str]] = None,
    ):
        """Configure the maker.

        Args:
            project: Project passed to ``data.create`` for every example.
            data_class: Class whose ``parse(**row)`` builds the data object.
            column_data: Name of the column holding the data value.
            exclude_columns: Columns that must not be forwarded to ``parse``.
        """
        self.project = project
        self.data_class = data_class
        self.column_data = column_data
        self.exclude_columns = exclude_columns or []
        self._errors: List[FileParseException] = []

    def make(self, df: pd.DataFrame) -> List[Example]:
        """Build one example per row that carries a data value.

        Args:
            df: Imported rows; must contain ``UPLOAD_NAME_COLUMN``.

        Returns:
            The created examples. The list is empty when the data column is
            absent from ``df``.
        """
        if not self.check_column_existence(df):
            return []
        self.check_value_existence(df)

        # Keep only the columns meant for the parser, then discard rows
        # that have no data value.
        keep_mask = ~df.columns.isin(self.exclude_columns)
        usable = df.loc[:, keep_mask].dropna(subset=[self.column_data])

        examples = []
        for record in usable.to_dict(orient="records"):
            line_num = record.pop(LINE_NUMBER_COLUMN, 0)
            # The parser expects the value under the default text key.
            record[DEFAULT_TEXT_COLUMN] = record.pop(self.column_data)
            try:
                example = self.data_class.parse(**record).create(self.project)
            except ValueError:
                message = f"Invalid data in line {line_num}"
                self._errors.append(FileParseException(record[UPLOAD_NAME_COLUMN], line_num, message))
            else:
                examples.append(example)
        return examples

    def check_column_existence(self, df: pd.DataFrame) -> bool:
        """Report whether the data column exists, recording errors if not.

        When the column is missing, one error (line number 0) is recorded for
        each distinct upload name found in ``df``.
        """
        if self.column_data in df.columns:
            return True
        message = f"Column {self.column_data} not found in the file"
        self._errors.extend(FileParseException(filename, 0, message) for filename in df[UPLOAD_NAME_COLUMN].unique())
        return False

    def check_value_existence(self, df: pd.DataFrame):
        """Record an error for every row whose data value is null."""
        message = f"Column {self.column_data} not found in record"
        rows_without_value = df.loc[df[self.column_data].isna()]
        for record in rows_without_value.to_dict(orient="records"):
            line_num = record.get(LINE_NUMBER_COLUMN, 0)
            self._errors.append(FileParseException(record[UPLOAD_NAME_COLUMN], line_num, message))

    @property
    def errors(self) -> List[FileParseException]:
        """Collected errors, sorted in place by line number."""
        self._errors.sort(key=lambda err: err.line_num)
        return self._errors
|||
|
|||
|
|||
class LabelMaker:
    """Parse a list-valued label column into label objects.

    A missing label column is recorded as ``FileParseException`` entries,
    exposed through :attr:`errors`.
    """

    def __init__(self, column: str, label_class: Type[Label]):
        """Configure the maker.

        Args:
            column: Name of the column holding the raw labels.
            label_class: Class whose ``parse(uuid, raw_label)`` builds a label.
        """
        self.column = column
        self.label_class = label_class
        self._errors: List[FileParseException] = []

    def make(self, df: pd.DataFrame) -> List[Label]:
        """Build one label per raw label value found in ``self.column``.

        Args:
            df: Imported rows containing ``UUID_COLUMN``.

        Returns:
            The parsed labels. The list is empty when the label column is
            absent from ``df``.
        """
        if not self.check_column_existence(df):
            return []

        # Select the two needed columns first, then expand to one row per
        # label. dropna is applied without inplace=True: an in-place drop on
        # a column subset makes pandas emit SettingWithCopyWarning.
        df_label = df[[UUID_COLUMN, self.column]].explode(self.column)
        df_label = df_label.dropna(subset=[self.column])

        labels = []
        for row in df_label.to_dict(orient="records"):
            try:
                label = self.label_class.parse(row[UUID_COLUMN], row[self.column])
            except ValueError:
                # Best effort: an unparsable label is skipped. Neither the
                # line number nor the upload name is available at this point,
                # so no FileParseException can be recorded for it.
                continue
            labels.append(label)
        return labels

    def check_column_existence(self, df: pd.DataFrame) -> bool:
        """Report whether the label column exists, recording errors if not.

        When the column is missing, one error (line number 0) is recorded for
        each distinct upload name found in ``df``.
        """
        if self.column in df.columns:
            return True
        message = f"Column {self.column} not found in the file"
        for filename in df[UPLOAD_NAME_COLUMN].unique():
            self._errors.append(FileParseException(filename, 0, message))
        return False

    @property
    def errors(self) -> List[FileParseException]:
        """Collected errors, sorted in place by line number."""
        self._errors.sort(key=lambda error: error.line_num)
        return self._errors
@@ -1,114 +0,0 @@
|||
import unittest |
|||
from unittest.mock import MagicMock |
|||
|
|||
import pandas as pd |
|||
from pandas.testing import assert_frame_equal |
|||
|
|||
from data_import.pipeline.formatters import ( |
|||
DEFAULT_DATA_COLUMN, |
|||
DataFormatter, |
|||
LabelFormatter, |
|||
) |
|||
from data_import.pipeline.readers import ( |
|||
DEFAULT_LABEL_COLUMN, |
|||
DEFAULT_TEXT_COLUMN, |
|||
FILE_NAME_COLUMN, |
|||
LINE_NUM_COLUMN, |
|||
UPLOAD_NAME_COLUMN, |
|||
UUID_COLUMN, |
|||
) |
|||
|
|||
|
|||
class TestLabelFormatter(unittest.TestCase):
    """Tests for ``LabelFormatter.format``."""

    def setUp(self):
        self.label_column = "label"
        # Use a MagicMock *instance*: assigning ``parse`` on the MagicMock
        # class itself would patch every mock in the process.
        self.label_class = MagicMock()
        self.label_class.parse = lambda x: x
        self.df = pd.DataFrame(
            [
                {LINE_NUM_COLUMN: 1, UUID_COLUMN: 1, self.label_column: ["A"]},
                {LINE_NUM_COLUMN: 2, UUID_COLUMN: 2, self.label_column: ["B", "C"]},
            ]
        )

    def test_format(self):
        """Each element of a label list becomes its own row."""
        label_formatter = LabelFormatter(column=self.label_column, label_class=self.label_class)
        df = label_formatter.format(self.df)
        expected_df = pd.DataFrame(
            [
                {UUID_COLUMN: 1, DEFAULT_LABEL_COLUMN: "A"},
                {UUID_COLUMN: 2, DEFAULT_LABEL_COLUMN: "B"},
                {UUID_COLUMN: 2, DEFAULT_LABEL_COLUMN: "C"},
            ]
        )
        assert_frame_equal(df, expected_df)

    def test_format_without_specified_column(self):
        """A label column that is absent from the frame raises KeyError."""
        label_formatter = LabelFormatter(column="invalid_column", label_class=self.label_class)
        with self.assertRaises(KeyError):
            label_formatter.format(self.df)

    def test_format_with_partially_correct_column(self):
        """Rows without a value in the label column are dropped."""
        label_formatter = LabelFormatter(column=self.label_column, label_class=self.label_class)
        df = pd.DataFrame(
            [
                {LINE_NUM_COLUMN: 1, UUID_COLUMN: 1, self.label_column: ["A"]},
                {LINE_NUM_COLUMN: 2, UUID_COLUMN: 2, "invalid_column": ["B"]},
                {LINE_NUM_COLUMN: 3, UUID_COLUMN: 3},
            ]
        )
        df_label = label_formatter.format(df)
        expected_df = pd.DataFrame([{UUID_COLUMN: 1, DEFAULT_LABEL_COLUMN: "A"}])
        assert_frame_equal(df_label, expected_df)

    def test_format_with_invalid_label(self):
        """Labels for which ``parse`` returns None are dropped."""
        # Separate instance so this parser does not affect other tests.
        label_class = MagicMock()
        label_class.parse = lambda x: x if x else None
        label_formatter = LabelFormatter(column=self.label_column, label_class=label_class)
        df = pd.DataFrame(
            [
                {LINE_NUM_COLUMN: 1, UUID_COLUMN: 1, self.label_column: ["A"]},
                {LINE_NUM_COLUMN: 2, UUID_COLUMN: 2, self.label_column: [{}]},
            ]
        )
        df_label = label_formatter.format(df)
        expected_df = pd.DataFrame([{UUID_COLUMN: 1, DEFAULT_LABEL_COLUMN: "A"}])
        assert_frame_equal(df_label, expected_df)
|||
|
|||
|
|||
class TestDataFormatter(unittest.TestCase):
    """Tests for ``DataFormatter.format``."""

    def setUp(self):
        self.data_column = "data"
        self.filename = "filename"
        self.upload_name = "upload_name"

    def test_format(self):
        """The row minus its line number is passed to ``parse`` as kwargs."""
        # Use a MagicMock *instance*: assigning ``parse`` on the MagicMock
        # class itself would patch every mock in the process.
        data_class = MagicMock()
        data_class.parse = lambda **kwargs: kwargs
        data_formatter = DataFormatter(column=self.data_column, data_class=data_class)
        df = pd.DataFrame(
            [
                {
                    LINE_NUM_COLUMN: 1,
                    UUID_COLUMN: 1,
                    self.data_column: "A",
                    FILE_NAME_COLUMN: self.filename,
                    UPLOAD_NAME_COLUMN: self.upload_name,
                },
            ]
        )
        df_data = data_formatter.format(df)
        expected_df = pd.DataFrame(
            [
                {
                    UUID_COLUMN: 1,
                    DEFAULT_DATA_COLUMN: {
                        UUID_COLUMN: 1,
                        DEFAULT_TEXT_COLUMN: "A",
                        "filename": self.filename,
                        "upload_name": self.upload_name,
                    },
                },
            ]
        )
        assert_frame_equal(df_data, expected_df)
@@ -0,0 +1,88 @@
|||
import uuid |
|||
|
|||
import pandas as pd |
|||
from django.test import TestCase |
|||
|
|||
from data_import.pipeline.data import TextData |
|||
from data_import.pipeline.formatters import ExampleMaker, LabelMaker |
|||
from data_import.pipeline.label import CategoryLabel |
|||
from data_import.pipeline.readers import ( |
|||
FILE_NAME_COLUMN, |
|||
LINE_NUMBER_COLUMN, |
|||
UPLOAD_NAME_COLUMN, |
|||
UUID_COLUMN, |
|||
) |
|||
from projects.tests.utils import prepare_project |
|||
|
|||
|
|||
class TestExamplesMaker(TestCase):
    """Tests for ``ExampleMaker.make`` and its error collection."""

    def setUp(self):
        self.project = prepare_project()
        self.label_column = "label"
        self.text_column = "text"
        self.record = {
            LINE_NUMBER_COLUMN: 1,
            UUID_COLUMN: uuid.uuid4(),
            FILE_NAME_COLUMN: "file1",
            UPLOAD_NAME_COLUMN: "upload1",
            self.text_column: "text1",
            self.label_column: ["A"],
        }
        self.maker = ExampleMaker(self.project.item, TextData, self.text_column, [self.label_column])

    def _make_from_record(self):
        """Run the maker on a one-row frame built from ``self.record``."""
        return self.maker.make(pd.DataFrame([self.record]))

    def test_make_examples(self):
        """A complete record yields exactly one example."""
        self.assertEqual(len(self._make_from_record()), 1)

    def test_check_column_existence(self):
        """A frame without the text column yields no example and one error."""
        del self.record[self.text_column]
        made = self._make_from_record()
        self.assertEqual(len(made), 0)
        self.assertEqual(len(self.maker.errors), 1)

    def test_empty_text_raises_error(self):
        """An empty text value yields no example and one error."""
        self.record[self.text_column] = ""
        made = self._make_from_record()
        self.assertEqual(len(made), 0)
        self.assertEqual(len(self.maker.errors), 1)
|||
|
|||
|
|||
class TestLabelFormatter(TestCase):
    """Tests for ``LabelMaker.make`` (the class name is kept for test discovery)."""

    def setUp(self):
        self.label_column = "label"
        self.label_class = CategoryLabel
        self.df = pd.DataFrame(
            [
                {LINE_NUMBER_COLUMN: 1, UUID_COLUMN: uuid.uuid4(), self.label_column: ["A"]},
                {LINE_NUMBER_COLUMN: 2, UUID_COLUMN: uuid.uuid4(), self.label_column: ["B", "C"]},
            ]
        )

    def test_make(self):
        """Every element of every label list becomes one label, in order."""
        label_maker = LabelMaker(column=self.label_column, label_class=self.label_class)
        labels = label_maker.make(self.df)
        self.assertEqual(len(labels), 3)
        for label, expected in zip(labels, ["A", "B", "C"]):
            # One sub-test per label so a mismatch is reported individually
            # and the remaining labels are still checked.
            with self.subTest(expected=expected):
                self.assertEqual(label.label, expected)

    def test_format_without_specified_column(self):
        """An unknown label column on this fixture raises KeyError."""
        # NOTE(review): the fixture has no upload-name column, so the KeyError
        # presumably comes from the maker's missing-column error reporting --
        # confirm this is the intended contract.
        label_maker = LabelMaker(column="invalid_column", label_class=self.label_class)
        with self.assertRaises(KeyError):
            label_maker.make(self.df)

    def test_format_with_partially_correct_column(self):
        """Only the row with a usable label in the label column yields a label."""
        label_maker = LabelMaker(column=self.label_column, label_class=self.label_class)
        df = pd.DataFrame(
            [
                {LINE_NUMBER_COLUMN: 1, UUID_COLUMN: uuid.uuid4(), self.label_column: ["A"]},
                {LINE_NUMBER_COLUMN: 2, UUID_COLUMN: uuid.uuid4(), "invalid_column": ["B"]},
                {LINE_NUMBER_COLUMN: 3, UUID_COLUMN: uuid.uuid4()},
                {LINE_NUMBER_COLUMN: 3, UUID_COLUMN: uuid.uuid4(), self.label_column: [{}]},
            ]
        )
        labels = label_maker.make(df)
        self.assertEqual(len(labels), 1)
Write
Preview
Loading…
Cancel
Save