You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

118 lines
3.6 KiB

2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
import abc
import collections.abc
from typing import Any, Dict, Iterator, List, Optional, Type

from .cleaners import Cleaner
from .data import BaseData
from .exceptions import FileParseException
from .labels import Label
# Default column names for the text and label fields. Nothing in the visible
# part of this module reads them; presumably the parsers/builders defined
# elsewhere use them as fallbacks — verify against callers.
DEFAULT_TEXT_COLUMN = "text"
DEFAULT_LABEL_COLUMN = "label"
  10. class Record:
  11. """Record represents a data."""
  12. def __init__(
  13. self, data: Type[BaseData], label: List[Label] = None, meta: Dict[Any, Any] = None, line_num: int = -1
  14. ):
  15. if label is None:
  16. label = []
  17. if meta is None:
  18. meta = {}
  19. self._data = data
  20. self._label = label
  21. self._meta = meta
  22. self._line_num = line_num
  23. def __str__(self):
  24. return f"{self._data}\t{self._label}"
  25. def clean(self, cleaner: Cleaner):
  26. label = cleaner.clean(self._label)
  27. changed = len(label) != len(self.label)
  28. self._label = label
  29. if changed:
  30. raise FileParseException(filename=self._data.filename, line_num=self._line_num, message=cleaner.message)
  31. @property
  32. def data(self):
  33. return self._data
  34. def create_data(self, project):
  35. return self._data.create(project, self._meta)
  36. def create_label(self, project):
  37. return [label.create(project) for label in self._label]
  38. def create_annotation(self, user, example, mapping):
  39. return [label.create_annotation(user, example, mapping) for label in self._label]
  40. @property
  41. def label(self):
  42. return [label.dict() for label in self._label if label.has_name() and label.name]
  43. class BaseReader(collections.abc.Iterable):
  44. """Reader has a role to parse files and return a Record iterator."""
  45. @abc.abstractmethod
  46. def __iter__(self) -> Iterator[Record]:
  47. """Creates an iterator for elements of this dataset.
  48. Returns:
  49. A `Record` for the elements of this dataset.
  50. """
  51. raise NotImplementedError("Please implement this method in the subclass.")
  52. @property
  53. @abc.abstractmethod
  54. def errors(self):
  55. raise NotImplementedError("Please implement this method in the subclass.")
  56. class Parser(abc.ABC):
  57. """The abstract file parser."""
  58. @abc.abstractmethod
  59. def parse(self, filename: str) -> Iterator[Dict[Any, Any]]:
  60. """Parses the file and returns the dictionary."""
  61. raise NotImplementedError("Please implement this method in the subclass.")
  62. @property
  63. def errors(self) -> List[FileParseException]:
  64. """Returns parsing errors."""
  65. return []
  66. class Builder(abc.ABC):
  67. """The abstract Record builder."""
  68. @abc.abstractmethod
  69. def build(self, row: Dict[Any, Any], filename: str, line_num: int) -> Record:
  70. """Builds the record from the dictionary."""
  71. raise NotImplementedError("Please implement this method in the subclass.")
  72. class Reader(BaseReader):
  73. def __init__(self, filenames: List[str], parser: Parser, builder: Builder):
  74. self.filenames = filenames
  75. self.parser = parser
  76. self.builder = builder
  77. self._errors: List[FileParseException] = []
  78. def __iter__(self) -> Iterator[Record]:
  79. for filename in self.filenames:
  80. rows = self.parser.parse(filename)
  81. for line_num, row in enumerate(rows, start=1):
  82. try:
  83. yield self.builder.build(row, filename, line_num)
  84. except FileParseException as e:
  85. self._errors.append(e)
  86. @property
  87. def errors(self) -> List[FileParseException]:
  88. """Aggregates parser and builder errors."""
  89. errors = self.parser.errors + self._errors
  90. return errors