You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

348 lines
11 KiB

3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
  1. import csv
  2. import io
  3. import json
  4. import os
  5. from typing import Any, Dict, Iterator, List, Tuple
  6. import chardet
  7. import pyexcel
  8. import pyexcel.exceptions
  9. from chardet import UniversalDetector
  10. from seqeval.scheme import BILOU, IOB2, IOBES, IOE2, Tokens
  11. from .exceptions import FileParseException
  12. from .readers import (
  13. DEFAULT_LABEL_COLUMN,
  14. DEFAULT_TEXT_COLUMN,
  15. LINE_NUMBER_COLUMN,
  16. Parser,
  17. )
# Sentinel encoding name: when a reader/parser is given this value,
# decide_encoding() auto-detects the file's encoding instead of using the
# value literally.
DEFAULT_ENCODING = "Auto"
  19. def detect_encoding(filename: str, buffer_size: int = io.DEFAULT_BUFFER_SIZE) -> str:
  20. """Detects character encoding automatically.
  21. If you want to know the supported encodings, please see the following document:
  22. https://chardet.readthedocs.io/en/latest/supported-encodings.html
  23. Args:
  24. filename: the filename for detecting the encoding.
  25. buffer_size: the buffer size to read file contents incrementally.
  26. Returns:
  27. The character encoding.
  28. """
  29. # For a small file.
  30. if os.path.getsize(filename) < buffer_size:
  31. detected = chardet.detect(open(filename, "rb").read())
  32. return detected.get("encoding", "utf-8")
  33. # For a large file, call the Universal Encoding Detector incrementally.
  34. # It will stop as soon as it is confident enough to report its results.
  35. # See: https://chardet.readthedocs.io/en/latest/usage.html
  36. with open(filename, "rb") as f:
  37. detector = UniversalDetector()
  38. while True:
  39. binary = f.read(buffer_size)
  40. detector.feed(binary)
  41. if binary == b"":
  42. break
  43. if detector.done:
  44. break
  45. if detector.done:
  46. return detector.result["encoding"] or "utf-8"
  47. else:
  48. return "utf-8"
  49. def decide_encoding(filename: str, encoding: str) -> str:
  50. """Decide character encoding automatically.
  51. If the encoding is DEFAULT_ENCODING, detects it automatically.
  52. Otherwise, return it as is.
  53. Args:
  54. filename: The filename for decide the encoding.
  55. encoding: The specified encoding.
  56. Returns:
  57. The character encoding.
  58. """
  59. if encoding == DEFAULT_ENCODING:
  60. return detect_encoding(filename)
  61. else:
  62. return encoding
  63. class LineReader:
  64. """LineReader is a helper class to read a file line by line.
  65. Attributes:
  66. filename: The filename to read.
  67. encoding: The character encoding.
  68. """
  69. def __init__(self, filename: str, encoding: str = DEFAULT_ENCODING):
  70. self.filename = filename
  71. self.encoding = encoding
  72. def __iter__(self) -> Iterator[str]:
  73. encoding = decide_encoding(self.filename, self.encoding)
  74. with open(self.filename, encoding=encoding) as f:
  75. for line in f:
  76. yield line.rstrip()
  77. class PlainParser(Parser):
  78. """PlainParser is a parser simply returns a dictionary.
  79. This is for a task without any text.
  80. """
  81. def __init__(self, **kwargs):
  82. self.kwargs = kwargs
  83. def parse(self, filename: str) -> Iterator[Dict[Any, Any]]:
  84. yield {}
  85. class LineParser(Parser):
  86. """LineParser is a parser to read a file line by line.
  87. Attributes:
  88. encoding: The character encoding.
  89. """
  90. def __init__(self, encoding: str = DEFAULT_ENCODING, **kwargs):
  91. self.encoding = encoding
  92. def parse(self, filename: str) -> Iterator[Dict[Any, Any]]:
  93. reader = LineReader(filename, self.encoding)
  94. for line_num, line in enumerate(reader, start=1):
  95. yield {DEFAULT_TEXT_COLUMN: line, LINE_NUMBER_COLUMN: line_num}
  96. class TextFileParser(Parser):
  97. """TextFileParser is a parser to read an entire file content.
  98. Attributes:
  99. encoding: The character encoding.
  100. """
  101. def __init__(self, encoding: str = DEFAULT_ENCODING, **kwargs):
  102. self.encoding = encoding
  103. def parse(self, filename: str) -> Iterator[Dict[Any, Any]]:
  104. encoding = decide_encoding(filename, self.encoding)
  105. with open(filename, encoding=encoding) as f:
  106. yield {DEFAULT_TEXT_COLUMN: f.read()}
  107. class CSVParser(Parser):
  108. """CSVParser is a parser to read a csv file and return its rows.
  109. Attributes:
  110. encoding: The character encoding.
  111. delimiter: A one-character string used to separate fields. It defaults to ','.
  112. """
  113. def __init__(self, encoding: str = DEFAULT_ENCODING, delimiter: str = ",", **kwargs):
  114. self.encoding = encoding
  115. self.delimiter = delimiter
  116. def parse(self, filename: str) -> Iterator[Dict[Any, Any]]:
  117. encoding = decide_encoding(filename, self.encoding)
  118. with open(filename, encoding=encoding) as f:
  119. reader = csv.DictReader(f, delimiter=self.delimiter)
  120. for line_num, row in enumerate(reader, start=1):
  121. yield {LINE_NUMBER_COLUMN: line_num, **row}
  122. class JSONParser(Parser):
  123. """JSONParser is a parser to read a json file and return its rows.
  124. Attributes:
  125. encoding: The character encoding.
  126. """
  127. def __init__(self, encoding: str = DEFAULT_ENCODING, **kwargs):
  128. self.encoding = encoding
  129. self._errors: List[FileParseException] = []
  130. def parse(self, filename: str) -> Iterator[Dict[Any, Any]]:
  131. encoding = decide_encoding(filename, self.encoding)
  132. with open(filename, encoding=encoding) as f:
  133. try:
  134. rows = json.load(f)
  135. for row in rows:
  136. yield row
  137. except json.decoder.JSONDecodeError as e:
  138. error = FileParseException(filename, line_num=1, message=str(e))
  139. self._errors.append(error)
  140. @property
  141. def errors(self) -> List[FileParseException]:
  142. return self._errors
  143. class JSONLParser(Parser):
  144. """JSONLParser is a parser to read a JSONL file and return its rows.
  145. Attributes:
  146. encoding: The character encoding.
  147. """
  148. def __init__(self, encoding: str = DEFAULT_ENCODING, **kwargs):
  149. self.encoding = encoding
  150. self._errors: List[FileParseException] = []
  151. def parse(self, filename: str) -> Iterator[Dict[Any, Any]]:
  152. reader = LineReader(filename, self.encoding)
  153. for line_num, line in enumerate(reader, start=1):
  154. try:
  155. row = json.loads(line)
  156. yield {LINE_NUMBER_COLUMN: line_num, **row}
  157. except json.decoder.JSONDecodeError as e:
  158. error = FileParseException(filename, line_num, str(e))
  159. self._errors.append(error)
  160. @property
  161. def errors(self) -> List[FileParseException]:
  162. return self._errors
  163. class ExcelParser(Parser):
  164. """ExcelParser is a parser to read a excel file."""
  165. def __init__(self, **kwargs):
  166. self._errors = []
  167. def parse(self, filename: str) -> Iterator[Dict[Any, Any]]:
  168. rows = pyexcel.iget_records(file_name=filename)
  169. try:
  170. for line_num, row in enumerate(rows, start=1):
  171. yield {LINE_NUMBER_COLUMN: line_num, **row}
  172. except pyexcel.exceptions.FileTypeNotSupported as e:
  173. error = FileParseException(filename, line_num=1, message=str(e))
  174. self._errors.append(error)
  175. @property
  176. def errors(self) -> List[FileParseException]:
  177. return self._errors
  178. class FastTextParser(Parser):
  179. """FastTextParser is a parser to read a fastText format and returns a text and labels.
  180. The example format is as follows:
  181. __label__positive I really enjoyed this restaurant.
  182. This format expects the category first, with the prefix __label__ before each category,
  183. and then the input text, like so,
  184. Attributes:
  185. encoding: The character encoding.
  186. label: The label prefix. It defaults to `__label__`.
  187. """
  188. def __init__(self, encoding: str = DEFAULT_ENCODING, label: str = "__label__", **kwargs):
  189. self.encoding = encoding
  190. self.label = label
  191. def parse(self, filename: str) -> Iterator[Dict[Any, Any]]:
  192. reader = LineReader(filename, self.encoding)
  193. for line_num, line in enumerate(reader, start=1):
  194. labels = []
  195. tokens = []
  196. for token in line.rstrip().split(" "):
  197. if token.startswith(self.label):
  198. label_name = token[len(self.label) :]
  199. labels.append(label_name)
  200. else:
  201. tokens.append(token)
  202. text = " ".join(tokens)
  203. yield {DEFAULT_TEXT_COLUMN: text, DEFAULT_LABEL_COLUMN: labels, LINE_NUMBER_COLUMN: line_num}
  204. class CoNLLParser(Parser):
  205. """CoNLLParser is a parser to read conll like format and returns a text and labels.
  206. The example format is as follows:
  207. EU B-ORG
  208. rejects O
  209. German B-MISC
  210. call O
  211. to O
  212. boycott O
  213. British B-MISC
  214. lamb O
  215. . O
  216. Peter B-PER
  217. Blackburn I-PER
  218. This format expects a token in the first column, and a tag in the second column.
  219. The each data is separated by a new line.
  220. Attributes:
  221. encoding: The character encoding.
  222. delimiter: A one-character string used to separate fields. It defaults to ' '.
  223. scheme: The tagging scheme. It supports `IOB2`, `IOE2`, `IOBES`, and `BILOU`.
  224. """
  225. def __init__(self, encoding: str = DEFAULT_ENCODING, delimiter: str = " ", scheme: str = "IOB2", **kwargs):
  226. self.encoding = encoding
  227. self.delimiter = delimiter
  228. mapping = {"IOB2": IOB2, "IOE2": IOE2, "IOBES": IOBES, "BILOU": BILOU}
  229. self._errors: List[FileParseException] = []
  230. if scheme in mapping:
  231. self.scheme = mapping[scheme]
  232. else:
  233. self.scheme = None
  234. @property
  235. def errors(self) -> List[FileParseException]:
  236. return self._errors
  237. def parse(self, filename: str) -> Iterator[Dict[Any, Any]]:
  238. if not self.scheme:
  239. message = "The specified scheme is not supported."
  240. error = FileParseException(filename, line_num=1, message=message)
  241. self._errors.append(error)
  242. return
  243. reader = LineReader(filename, self.encoding)
  244. words, tags = [], []
  245. for line_num, line in enumerate(reader, start=1):
  246. line = line.rstrip()
  247. if line:
  248. tokens = line.split("\t")
  249. if len(tokens) != 2:
  250. message = "A line must be separated by tab and has two columns."
  251. self._errors.append(FileParseException(filename, line_num, message))
  252. return
  253. word, tag = tokens
  254. words.append(word)
  255. tags.append(tag)
  256. else:
  257. yield self.create_record(tags, words)
  258. words, tags = [], []
  259. if words:
  260. yield self.create_record(tags, words)
  261. def create_record(self, tags, words):
  262. text = self.delimiter.join(words)
  263. labels = self.align_span(words, tags)
  264. return {DEFAULT_TEXT_COLUMN: text, DEFAULT_LABEL_COLUMN: labels}
  265. def align_span(self, words: List[str], tags: List[str]) -> List[Tuple[int, int, str]]:
  266. tokens = Tokens(tags, self.scheme)
  267. labels = []
  268. for entity in tokens.entities:
  269. text = self.delimiter.join(words[: entity.start])
  270. start = len(text) + len(self.delimiter) if text else len(text)
  271. chunk = words[entity.start : entity.end]
  272. text = self.delimiter.join(chunk)
  273. end = start + len(text)
  274. labels.append((start, end, entity.tag))
  275. return labels