mirror of https://github.com/doccano/doccano.git
python datasets active-learning text-annotation dataset natural-language-processing data-labeling machine-learning annotation-tool
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
118 lines
4.2 KiB
118 lines
4.2 KiB
from typing import List, Optional, Type
|
|
|
|
import pandas as pd
|
|
|
|
from .data import BaseData
|
|
from .exceptions import FileParseException
|
|
from .label import Label
|
|
from .readers import (
|
|
DEFAULT_TEXT_COLUMN,
|
|
LINE_NUMBER_COLUMN,
|
|
UPLOAD_NAME_COLUMN,
|
|
UUID_COLUMN,
|
|
)
|
|
from examples.models import Example
|
|
from projects.models import Project
|
|
|
|
|
|
class ExampleMaker:
    """Turn the rows of an uploaded dataframe into examples.

    Each row that has a value in ``column_data`` is handed to
    ``data_class.parse`` and the parsed object's ``create(project)`` result
    is collected. Problems found along the way are stored as
    ``FileParseException`` objects and exposed through :attr:`errors`
    instead of being raised.
    """

    def __init__(
        self,
        project: Project,
        data_class: Type[BaseData],
        column_data: str = DEFAULT_TEXT_COLUMN,
        exclude_columns: Optional[List[str]] = None,
    ):
        """Store the configuration used by :meth:`make`.

        Args:
            project: Passed to ``create`` on every parsed data object.
            data_class: Class whose ``parse(**row)`` builds the data object.
            column_data: Name of the dataframe column holding the data.
            exclude_columns: Columns dropped before rows are parsed;
                ``None`` means no column is dropped.
        """
        self._errors: List[FileParseException] = []
        self.exclude_columns = exclude_columns or []
        self.column_data = column_data
        self.data_class = data_class
        self.project = project

    def make(self, df: pd.DataFrame) -> List[Example]:
        """Create one example per row that carries data.

        Args:
            df: Dataframe with one record per row.

        Returns:
            The created examples. The list is empty when the data column is
            absent from ``df``; rows whose parsing raises ``ValueError`` are
            left out and recorded as errors.
        """
        if not self.check_column_existence(df):
            return []
        self.check_value_existence(df)

        # Drop the excluded columns, then the rows that have no data value.
        kept_columns = ~df.columns.isin(self.exclude_columns)
        usable = df.loc[:, kept_columns].dropna(subset=[self.column_data])

        examples = []
        for record in usable.to_dict(orient="records"):
            line_num = record.pop(LINE_NUMBER_COLUMN, 0)
            # The parser receives the data under the default text column name.
            record[DEFAULT_TEXT_COLUMN] = record.pop(self.column_data)
            try:
                parsed = self.data_class.parse(**record)
                examples.append(parsed.create(self.project))
            except ValueError:
                self._errors.append(
                    FileParseException(
                        record[UPLOAD_NAME_COLUMN],
                        line_num,
                        f"Invalid data in line {line_num}",
                    )
                )
        return examples

    def check_column_existence(self, df: pd.DataFrame) -> bool:
        """Return whether ``df`` has the data column.

        When the column is missing, one error with line number 0 is recorded
        for every distinct value in the upload-name column.
        """
        message = f"Column {self.column_data} not found in the file"
        if self.column_data in df.columns:
            return True
        self._errors.extend(
            FileParseException(upload_name, 0, message)
            for upload_name in df[UPLOAD_NAME_COLUMN].unique()
        )
        return False

    def check_value_existence(self, df: pd.DataFrame):
        """Record an error for every row whose data column is null."""
        rows_missing_value = df[df[self.column_data].isna()]
        for record in rows_missing_value.to_dict(orient="records"):
            self._errors.append(
                FileParseException(
                    record[UPLOAD_NAME_COLUMN],
                    record.get(LINE_NUMBER_COLUMN, 0),
                    f"Column {self.column_data} not found in record",
                )
            )

    @property
    def errors(self) -> List[FileParseException]:
        """Recorded errors, ordered by line number.

        The internal list is sorted in place and returned as-is.
        """
        self._errors.sort(key=lambda err: err.line_num)
        return self._errors
|
|
|
|
|
|
class BinaryExampleMaker(ExampleMaker):
    """Example maker that parses every row without column checks."""

    def make(self, df: pd.DataFrame) -> List[Example]:
        """Create one example per row of ``df``.

        Every record is passed to ``data_class.parse`` and the parsed
        object's ``create(project)`` result is collected. No exception is
        caught here and no error is recorded.
        """
        return [
            self.data_class.parse(**record).create(self.project)
            for record in df.to_dict(orient="records")
        ]
|
|
|
|
|
|
class LabelMaker:
    """Build labels from one column of an uploaded dataframe.

    The label column is exploded so that a list-valued cell yields one row
    per element, and each non-null value is handed to ``label_class.parse``
    together with the row's UUID. A missing label column is recorded as
    ``FileParseException`` objects, available through :attr:`errors`.
    """

    def __init__(self, column: str, label_class: Type[Label]):
        """Store the label column name and the class used to parse labels.

        Args:
            column: Name of the dataframe column holding the labels.
            label_class: Class whose ``parse(uuid, value)`` builds a label.
        """
        self.column = column
        self.label_class = label_class
        self._errors: List[FileParseException] = []

    def make(self, df: pd.DataFrame) -> List[Label]:
        """Create the labels found in the label column of ``df``.

        Args:
            df: Dataframe with one record per row.

        Returns:
            The parsed labels. The list is empty when the label column is
            absent from ``df``; values whose parsing raises ``ValueError``
            are left out.
        """
        if not self.check_column_existence(df):
            return []

        df_label = df.explode(self.column)[[UUID_COLUMN, self.column]]
        # Rebind the result rather than calling ``dropna(inplace=True)``:
        # the frame above comes from a column selection, and mutating it in
        # place can trigger pandas' chained-assignment check.
        df_label = df_label.dropna(subset=[self.column])

        labels = []
        for row in df_label.to_dict(orient="records"):
            try:
                label = self.label_class.parse(row[UUID_COLUMN], row[self.column])
            except ValueError:
                # NOTE(review): unparsable values are skipped and no error is
                # recorded for them -- confirm this best-effort is intended.
                continue
            labels.append(label)
        return labels

    def check_column_existence(self, df: pd.DataFrame) -> bool:
        """Return whether ``df`` has the label column.

        When the column is missing, one error with line number 0 is recorded
        for every distinct value in the upload-name column.
        """
        if self.column in df.columns:
            return True
        message = f"Column {self.column} not found in the file"
        for filename in df[UPLOAD_NAME_COLUMN].unique():
            self._errors.append(FileParseException(filename, 0, message))
        return False

    @property
    def errors(self) -> List[FileParseException]:
        """Recorded errors, ordered by line number.

        The internal list is sorted in place and returned as-is.
        """
        self._errors.sort(key=lambda error: error.line_num)
        return self._errors
|