|
@ -0,0 +1,76 @@ |
|
|
|
|
|
import abc |
|
|
|
|
|
import collections.abc |
|
|
|
|
|
from typing import Any, Dict, Iterator, List, Type |
|
|
|
|
|
|
|
|
|
|
|
from .data import BaseData |
|
|
|
|
|
from .label import Label |
|
|
|
|
|
|
|
|
|
|
|
DEFAULT_TEXT_COLUMN = 'text' |
|
|
|
|
|
DEFAULT_LABEL_COLUMN = 'labels' |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Record: |
|
|
|
|
|
|
|
|
|
|
|
def __init__(self, data: Type[BaseData], label: List[Label] = None, line_num: int = -1): |
|
|
|
|
|
if label is None: |
|
|
|
|
|
label = [] |
|
|
|
|
|
self._data = data |
|
|
|
|
|
self._label = label |
|
|
|
|
|
self._line_num = line_num |
|
|
|
|
|
|
|
|
|
|
|
def __str__(self): |
|
|
|
|
|
return f'{self._data}\t{self._label}' |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class BaseReader(collections.abc.Iterable): |
|
|
|
|
|
|
|
|
|
|
|
@abc.abstractmethod |
|
|
|
|
|
def __iter__(self) -> Iterator[Record]: |
|
|
|
|
|
"""Creates an iterator for elements of this dataset. |
|
|
|
|
|
|
|
|
|
|
|
Returns: |
|
|
|
|
|
A `Record` for the elements of this dataset. |
|
|
|
|
|
""" |
|
|
|
|
|
raise NotImplementedError('Please implement this method in the subclass.') |
|
|
|
|
|
|
|
|
|
|
|
@property |
|
|
|
|
|
@abc.abstractmethod |
|
|
|
|
|
def errors(self): |
|
|
|
|
|
raise NotImplementedError('Please implement this method in the subclass.') |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Parser(abc.ABC): |
|
|
|
|
|
|
|
|
|
|
|
@abc.abstractmethod |
|
|
|
|
|
def parse(self, filename: str) -> Iterator[Dict[Any, Any]]: |
|
|
|
|
|
"""Parses the file and returns the dictionary.""" |
|
|
|
|
|
raise NotImplementedError('Please implement this method in the subclass.') |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Builder(abc.ABC): |
|
|
|
|
|
|
|
|
|
|
|
@abc.abstractmethod |
|
|
|
|
|
def build(self, row: Dict[Any, Any], filename: str, line_num: int) -> Record: |
|
|
|
|
|
"""Builds the record from the dictionary.""" |
|
|
|
|
|
raise NotImplementedError('Please implement this method in the subclass.') |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Reader(BaseReader): |
|
|
|
|
|
|
|
|
|
|
|
def __init__(self, filenames: List[str], parser: Parser, builder: Builder): |
|
|
|
|
|
self.filenames = filenames |
|
|
|
|
|
self.parser = parser |
|
|
|
|
|
self.builder = builder |
|
|
|
|
|
self._errors = [] |
|
|
|
|
|
|
|
|
|
|
|
def __iter__(self) -> Iterator[Record]: |
|
|
|
|
|
for filename in self.filenames: |
|
|
|
|
|
rows = self.parser.parse(filename) |
|
|
|
|
|
for line_num, row in enumerate(rows, start=1): |
|
|
|
|
|
record = self.builder.build(row, filename, line_num) |
|
|
|
|
|
yield record |
|
|
|
|
|
|
|
|
|
|
|
@property |
|
|
|
|
|
def errors(self): |
|
|
|
|
|
"""Aggregates parser and builder errors.""" |
|
|
|
|
|
return self._errors |