You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

125 lines
3.7 KiB

2 years ago
2 years ago
2 years ago
  1. import abc
  2. import collections.abc
  3. from typing import Any, Dict, Iterator, List, Type
  4. from .cleaners import Cleaner
  5. from .data import BaseData
  6. from .exceptions import FileParseException
  7. from .labels import Label
  # Default column names, judging by the identifiers, for the text and label
  # fields. Neither constant is referenced in the visible part of this file;
  # presumably concrete parsers or builders consume them -- verify against callers.
  8. DEFAULT_TEXT_COLUMN = 'text'
  9. DEFAULT_LABEL_COLUMN = 'label'
  10. class Record:
  11. """Record represents a data."""
  12. def __init__(self,
  13. data: Type[BaseData],
  14. label: List[Label] = None,
  15. meta: Dict[Any, Any] = None,
  16. line_num: int = -1):
  17. if label is None:
  18. label = []
  19. if meta is None:
  20. meta = {}
  21. self._data = data
  22. self._label = label
  23. self._meta = meta
  24. self._line_num = line_num
  25. def __str__(self):
  26. return f'{self._data}\t{self._label}'
  27. def clean(self, cleaner: Cleaner):
  28. label = cleaner.clean(self._label)
  29. changed = len(label) != len(self.label)
  30. self._label = label
  31. if changed:
  32. raise FileParseException(
  33. filename=self._data.filename,
  34. line_num=self._line_num,
  35. message=cleaner.message
  36. )
  37. @property
  38. def data(self):
  39. return self._data
  40. def create_data(self, project):
  41. return self._data.create(project, self._meta)
  42. def create_label(self, project):
  43. return [label.create(project) for label in self._label]
  44. def create_annotation(self, user, example, mapping):
  45. return [label.create_annotation(user, example, mapping) for label in self._label]
  46. @property
  47. def label(self):
  48. return [label.dict() for label in self._label if label.has_name() and label.name]
  49. class BaseReader(collections.abc.Iterable):
  50. """Reader has a role to parse files and return a Record iterator."""
  51. @abc.abstractmethod
  52. def __iter__(self) -> Iterator[Record]:
  53. """Creates an iterator for elements of this dataset.
  54. Returns:
  55. A `Record` for the elements of this dataset.
  56. """
  57. raise NotImplementedError('Please implement this method in the subclass.')
  58. @property
  59. @abc.abstractmethod
  60. def errors(self):
  61. raise NotImplementedError('Please implement this method in the subclass.')
  62. class Parser(abc.ABC):
  63. """The abstract file parser."""
  64. @abc.abstractmethod
  65. def parse(self, filename: str) -> Iterator[Dict[Any, Any]]:
  66. """Parses the file and returns the dictionary."""
  67. raise NotImplementedError('Please implement this method in the subclass.')
  68. @property
  69. def errors(self) -> List[FileParseException]:
  70. """Returns parsing errors."""
  71. return []
  72. class Builder(abc.ABC):
  73. """The abstract Record builder."""
  74. @abc.abstractmethod
  75. def build(self, row: Dict[Any, Any], filename: str, line_num: int) -> Record:
  76. """Builds the record from the dictionary."""
  77. raise NotImplementedError('Please implement this method in the subclass.')
  78. class Reader(BaseReader):
  79. def __init__(self, filenames: List[str], parser: Parser, builder: Builder):
  80. self.filenames = filenames
  81. self.parser = parser
  82. self.builder = builder
  83. self._errors = []
  84. def __iter__(self) -> Iterator[Record]:
  85. for filename in self.filenames:
  86. rows = self.parser.parse(filename)
  87. for line_num, row in enumerate(rows, start=1):
  88. try:
  89. yield self.builder.build(row, filename, line_num)
  90. except FileParseException as e:
  91. self._errors.append(e)
  92. @property
  93. def errors(self) -> List[FileParseException]:
  94. """Aggregates parser and builder errors."""
  95. errors = self.parser.errors + self._errors
  96. return errors