You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

118 lines
3.6 KiB

import abc
import collections.abc
from typing import Any, Dict, Iterator, List, Type
from .cleaners import Cleaner
from .data import BaseData
from .exceptions import FileParseException
from .labels import Label
# Default column names for the text and label fields. Neither constant is
# referenced in this part of the file; presumably they are consumed by
# parser/builder implementations elsewhere (NOTE(review): confirm with callers).
DEFAULT_TEXT_COLUMN = "text"
DEFAULT_LABEL_COLUMN = "label"
class Record:
    """A single parsed example: its data, labels, metadata and source line.

    Args:
        data: The data object of the example. It must expose ``filename`` and
            ``create(project, meta)``.
        label: Labels attached to the example; ``None`` means no labels.
        meta: Metadata handed to ``data.create``; ``None`` means an empty dict.
        line_num: Line number reported in parse errors (``-1`` by default).
    """

    # NOTE(review): ``data`` is annotated ``Type[BaseData]`` (a class), yet it
    # is used like an instance below (``.filename``, ``.create(...)``).
    # Confirm and tighten the annotation if appropriate.
    def __init__(
        self, data: Type[BaseData], label: List[Label] = None, meta: Dict[Any, Any] = None, line_num: int = -1
    ):
        # Fresh containers per instance; never share a mutable default.
        self._data = data
        self._label = [] if label is None else label
        self._meta = {} if meta is None else meta
        self._line_num = line_num

    def __str__(self):
        return f"{self._data}\t{self._label}"

    def clean(self, cleaner: Cleaner):
        """Replace the labels with the result of ``cleaner.clean``.

        The cleaned labels are stored before any error is raised, so the
        record keeps the cleaned state even when the caller catches the error.

        Raises:
            FileParseException: If the cleaner returned a different number of
                labels than it was given.
        """
        # Count the raw labels *before* cleaning. The ``label`` property must
        # not be used here: it is a filtered view (unnamed labels are dropped),
        # so comparing against it reports a "change" the cleaner never made.
        original_count = len(self._label)
        cleaned = cleaner.clean(self._label)
        self._label = cleaned
        if len(cleaned) != original_count:
            raise FileParseException(filename=self._data.filename, line_num=self._line_num, message=cleaner.message)

    @property
    def data(self):
        """The data object passed to the constructor."""
        return self._data

    def create_data(self, project):
        """Delegate to ``data.create`` with the project and this record's metadata."""
        return self._data.create(project, self._meta)

    def create_label(self, project):
        """Return ``label.create(project)`` for every label of the record."""
        return [label.create(project) for label in self._label]

    def create_annotation(self, user, example, mapping):
        """Return ``label.create_annotation(user, example, mapping)`` for every label."""
        return [label.create_annotation(user, example, mapping) for label in self._label]

    @property
    def label(self):
        """Dict form of every label that reports a name and whose name is truthy."""
        return [label.dict() for label in self._label if label.has_name() and label.name]
class BaseReader(collections.abc.Iterable):
    """Abstract iterable that parses files and yields `Record` objects.

    Concrete readers must provide both `__iter__` and the `errors` property.
    """

    @property
    @abc.abstractmethod
    def errors(self):
        """Errors exposed by the reader; must be overridden by subclasses."""
        raise NotImplementedError("Please implement this method in the subclass.")

    @abc.abstractmethod
    def __iter__(self) -> Iterator[Record]:
        """Iterate over the elements of this dataset.

        Returns:
            An iterator of `Record` objects, one per element of the dataset.
        """
        raise NotImplementedError("Please implement this method in the subclass.")
class Parser(abc.ABC):
    """Abstract base class for file parsers."""

    @property
    def errors(self) -> List[FileParseException]:
        """Parsing errors; the base implementation always reports none."""
        return list()

    @abc.abstractmethod
    def parse(self, filename: str) -> Iterator[Dict[Any, Any]]:
        """Parse `filename` and return an iterator of dictionaries."""
        raise NotImplementedError("Please implement this method in the subclass.")
class Builder(abc.ABC):
    """Abstract base class for objects that turn a parsed row into a `Record`."""

    @abc.abstractmethod
    def build(
        self,
        row: Dict[Any, Any],
        filename: str,
        line_num: int,
    ) -> Record:
        """Build a `Record` from one row dictionary.

        Args:
            row: The dictionary to convert.
            filename: Name of the file the row belongs to.
            line_num: Line number of the row.
        """
        raise NotImplementedError("Please implement this method in the subclass.")
class Reader(BaseReader):
    """Reader that feeds every row produced by a parser through a builder.

    Args:
        filenames: Files to read, processed in the given order.
        parser: Turns a filename into an iterator of row dictionaries.
        builder: Turns a row dictionary into a `Record`.
    """

    def __init__(self, filenames: List[str], parser: Parser, builder: Builder):
        self.filenames = filenames
        self.parser = parser
        self.builder = builder
        # Build failures collected while iterating. The list is never reset,
        # so iterating more than once keeps appending to it.
        self._errors: List[FileParseException] = []

    def __iter__(self) -> Iterator[Record]:
        """Yield one `Record` per row of every file.

        Line numbers start at 1 and restart for each file. A row whose build
        raises `FileParseException` is skipped and the error is recorded.
        """
        for path in self.filenames:
            numbered_rows = enumerate(self.parser.parse(path), start=1)
            for position, entry in numbered_rows:
                try:
                    yield self.builder.build(entry, path, position)
                except FileParseException as failure:
                    self._errors.append(failure)

    @property
    def errors(self) -> List[FileParseException]:
        """Parser errors followed by the build errors recorded so far, as a new list."""
        return self.parser.errors + self._errors