You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

126 lines
3.7 KiB

3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
import abc
import collections.abc
import dataclasses
from typing import Any, Dict, Iterator, List, Optional, Type

from .cleaners import Cleaner
from .data import BaseData
from .exceptions import FileParseException
from .labels import Label
  9. DEFAULT_TEXT_COLUMN = "text"
  10. DEFAULT_LABEL_COLUMN = "label"
  11. class Record:
  12. """Record represents a data."""
  13. def __init__(
  14. self, data: Type[BaseData], label: List[Label] = None, meta: Dict[Any, Any] = None, line_num: int = -1
  15. ):
  16. if label is None:
  17. label = []
  18. if meta is None:
  19. meta = {}
  20. self._data = data
  21. self._label = label
  22. self._meta = meta
  23. self._line_num = line_num
  24. def __str__(self):
  25. return f"{self._data}\t{self._label}"
  26. def clean(self, cleaner: Cleaner):
  27. label = cleaner.clean(self._label)
  28. changed = len(label) != len(self.label)
  29. self._label = label
  30. if changed:
  31. raise FileParseException(filename=self._data.filename, line_num=self._line_num, message=cleaner.message)
  32. @property
  33. def data(self):
  34. return self._data
  35. def create_data(self, project):
  36. return self._data.create(project, self._meta)
  37. def create_label(self, project):
  38. return [label.create(project) for label in self._label]
  39. def create_annotation(self, user, example, mapping):
  40. return [label.create_annotation(user, example, mapping) for label in self._label]
  41. @property
  42. def label(self):
  43. return [label.dict() for label in self._label if label.has_name() and label.name]
  44. class BaseReader(collections.abc.Iterable):
  45. """Reader has a role to parse files and return a Record iterator."""
  46. @abc.abstractmethod
  47. def __iter__(self) -> Iterator[Record]:
  48. """Creates an iterator for elements of this dataset.
  49. Returns:
  50. A `Record` for the elements of this dataset.
  51. """
  52. raise NotImplementedError("Please implement this method in the subclass.")
  53. @property
  54. @abc.abstractmethod
  55. def errors(self):
  56. raise NotImplementedError("Please implement this method in the subclass.")
  57. class Parser(abc.ABC):
  58. """The abstract file parser."""
  59. @abc.abstractmethod
  60. def parse(self, filename: str) -> Iterator[Dict[Any, Any]]:
  61. """Parses the file and returns the dictionary."""
  62. raise NotImplementedError("Please implement this method in the subclass.")
  63. @property
  64. def errors(self) -> List[FileParseException]:
  65. """Returns parsing errors."""
  66. return []
  67. @dataclasses.dataclass
  68. class FileName:
  69. full_path: str
  70. generated_name: str
  71. upload_name: str
  72. class Builder(abc.ABC):
  73. """The abstract Record builder."""
  74. @abc.abstractmethod
  75. def build(self, row: Dict[Any, Any], filename: FileName, line_num: int) -> Record:
  76. """Builds the record from the dictionary."""
  77. raise NotImplementedError("Please implement this method in the subclass.")
  78. class Reader(BaseReader):
  79. def __init__(self, filenames: List[FileName], parser: Parser, builder: Builder):
  80. self.filenames = filenames
  81. self.parser = parser
  82. self.builder = builder
  83. self._errors: List[FileParseException] = []
  84. def __iter__(self) -> Iterator[Record]:
  85. for filename in self.filenames:
  86. rows = self.parser.parse(filename.full_path)
  87. for line_num, row in enumerate(rows, start=1):
  88. try:
  89. yield self.builder.build(row, filename, line_num)
  90. except FileParseException as e:
  91. self._errors.append(e)
  92. @property
  93. def errors(self) -> List[FileParseException]:
  94. """Aggregates parser and builder errors."""
  95. errors = self.parser.errors + self._errors
  96. return errors