mirror of https://github.com/doccano/doccano.git
python annotation-tool datasets active-learning text-annotation dataset natural-language-processing data-labeling machine-learning
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
125 lines
3.7 KiB
125 lines
3.7 KiB
import abc
import collections.abc
from typing import Any, Dict, Iterator, List, Optional, Type

from .cleaners import Cleaner
from .data import BaseData
from .exceptions import FileParseException
from .labels import Label
|
|
|
|
# Default column names. They are not referenced elsewhere in this module;
# presumably other modules of the import pipeline read them -- TODO confirm.
DEFAULT_TEXT_COLUMN = 'text'    # column expected to hold the example text
DEFAULT_LABEL_COLUMN = 'label'  # column expected to hold the label(s)
|
|
|
|
|
|
class Record:
    """A single imported item: its data, its labels and its metadata.

    A record keeps the parsed data object together with the labels attached
    to it, free-form metadata, and the line number it was read from so that
    cleaning errors can be reported against the source file.
    """

    def __init__(self,
                 data: BaseData,
                 label: Optional[List[Label]] = None,
                 meta: Optional[Dict[Any, Any]] = None,
                 line_num: int = -1) -> None:
        """Stores the parts of the record.

        Args:
            data: The data object of this record. It is used as an instance
                (its `filename` attribute and `create` method are accessed).
            label: Labels attached to the data. Defaults to a new empty list.
            meta: Metadata passed to `data.create`. Defaults to a new empty
                dict.
            line_num: Line number reported in cleaning errors; -1 by default.
        """
        # A fresh container is created per instance so that records never
        # share a mutable default.
        self._data = data
        self._label = [] if label is None else label
        self._meta = {} if meta is None else meta
        self._line_num = line_num

    def __str__(self) -> str:
        return f'{self._data}\t{self._label}'

    def clean(self, cleaner: Cleaner) -> None:
        """Replaces the labels with the cleaned ones and reports changes.

        The cleaned labels are always stored, even when an error is raised.

        Args:
            cleaner: Object providing `clean(labels)` and a `message`.

        Raises:
            FileParseException: If the number of cleaned labels differs from
                the number of entries in the `label` property.
        """
        cleaned = cleaner.clean(self._label)
        # NOTE(review): the cleaned count is compared with the `label`
        # property, which only counts labels reporting a non-empty name, not
        # with the raw label list -- confirm that this asymmetry is intended.
        changed = len(cleaned) != len(self.label)
        self._label = cleaned
        if changed:
            raise FileParseException(
                filename=self._data.filename,
                line_num=self._line_num,
                message=cleaner.message
            )

    @property
    def data(self) -> BaseData:
        """The data object given at construction."""
        return self._data

    def create_data(self, project):
        """Returns `data.create(project, meta)` for this record."""
        return self._data.create(project, self._meta)

    def create_label(self, project):
        """Returns the result of `create(project)` for every label."""
        return [label.create(project) for label in self._label]

    def create_annotation(self, user, example, mapping):
        """Returns `create_annotation(user, example, mapping)` per label."""
        return [label.create_annotation(user, example, mapping) for label in self._label]

    @property
    def label(self) -> List[Any]:
        """`dict()` of every label that has a name and whose name is truthy."""
        return [label.dict() for label in self._label if label.has_name() and label.name]
|
|
|
|
|
|
class BaseReader(collections.abc.Iterable):
    """Abstract iterable of `Record` objects.

    Concrete readers must provide both `__iter__` and the `errors` property;
    the class cannot be instantiated until they do.
    """

    @abc.abstractmethod
    def __iter__(self) -> Iterator[Record]:
        """Iterates over the records of this dataset.

        Returns:
            An iterator yielding one `Record` per element of the dataset.
        """
        raise NotImplementedError('Please implement this method in the subclass.')

    @property
    @abc.abstractmethod
    def errors(self):
        """Errors of the reader; the concrete meaning is up to the subclass."""
        raise NotImplementedError('Please implement this method in the subclass.')
|
|
|
|
|
|
class Parser(abc.ABC):
    """Abstract base class of file parsers.

    Subclasses must implement `parse`. The `errors` property may be
    overridden; by default it reports no errors.
    """

    @abc.abstractmethod
    def parse(self, filename: str) -> Iterator[Dict[Any, Any]]:
        """Parses `filename` and returns an iterator of dictionaries."""
        raise NotImplementedError('Please implement this method in the subclass.')

    @property
    def errors(self) -> List[FileParseException]:
        """Parsing errors; a new empty list on every access by default."""
        return list()
|
|
|
|
|
|
class Builder(abc.ABC):
    """Abstract base class of `Record` builders.

    A builder turns one parsed row into a `Record`; subclasses must
    implement `build`.
    """

    @abc.abstractmethod
    def build(self, row: Dict[Any, Any], filename: str, line_num: int) -> Record:
        """Builds a `Record` from `row`.

        Args:
            row: One dictionary to convert.
            filename: Name of the file the row belongs to.
            line_num: Number of the row within that file.
        """
        raise NotImplementedError('Please implement this method in the subclass.')
|
|
|
|
|
|
class Reader(BaseReader):
    """Reads records from files by combining a parser and a builder.

    Each file is parsed into rows by the parser, and every row is turned
    into a `Record` by the builder. Rows for which a `FileParseException`
    is caught are skipped and the exception is remembered.
    """

    def __init__(self, filenames: List[str], parser: Parser, builder: Builder):
        """Stores the collaborators and starts with no recorded errors.

        Args:
            filenames: Files to read, processed in the given order.
            parser: Produces the rows of a file via `parse(filename)`.
            builder: Converts a row into a record via `build(...)`.
        """
        self.filenames = filenames
        self.parser = parser
        self.builder = builder
        self._errors = []

    def __iter__(self) -> Iterator[Record]:
        """Yields the records of all files, collecting build failures.

        Rows are numbered from 1 within each file. A `FileParseException`
        caught around the build/yield step is appended to the internal
        error list and iteration continues with the next row. The list is
        never cleared here, so iterating again appends to it.
        """
        for path in self.filenames:
            for number, parsed_row in enumerate(self.parser.parse(path), start=1):
                try:
                    yield self.builder.build(parsed_row, path, number)
                except FileParseException as err:
                    self._errors.append(err)

    @property
    def errors(self) -> List[FileParseException]:
        """The parser's errors followed by the errors collected here."""
        return self.parser.errors + self._errors
|