Browse Source

Return line number from parser

pull/1823/head
Hironsan 3 years ago
parent
commit
0465c4f982
2 changed files with 17 additions and 9 deletions
  1. 24
      backend/data_import/pipeline/parsers.py
  2. 2
      backend/data_import/tests/test_parser.py

24
backend/data_import/pipeline/parsers.py

@ -11,7 +11,12 @@ from chardet import UniversalDetector
from seqeval.scheme import BILOU, IOB2, IOBES, IOE2, Tokens from seqeval.scheme import BILOU, IOB2, IOBES, IOE2, Tokens
from .exceptions import FileParseException from .exceptions import FileParseException
from .readers import DEFAULT_LABEL_COLUMN, DEFAULT_TEXT_COLUMN, Parser
from .readers import (
DEFAULT_LABEL_COLUMN,
DEFAULT_TEXT_COLUMN,
LINE_NUMBER_COLUMN,
Parser,
)
DEFAULT_ENCODING = "Auto" DEFAULT_ENCODING = "Auto"
@ -115,8 +120,8 @@ class LineParser(Parser):
def parse(self, filename: str) -> Iterator[Dict[Any, Any]]: def parse(self, filename: str) -> Iterator[Dict[Any, Any]]:
reader = LineReader(filename, self.encoding) reader = LineReader(filename, self.encoding)
for line in reader:
yield {DEFAULT_TEXT_COLUMN: line}
for line_num, line in enumerate(reader, start=1):
yield {DEFAULT_TEXT_COLUMN: line, LINE_NUMBER_COLUMN: line_num}
class TextFileParser(Parser): class TextFileParser(Parser):
@ -151,8 +156,8 @@ class CSVParser(Parser):
encoding = decide_encoding(filename, self.encoding) encoding = decide_encoding(filename, self.encoding)
with open(filename, encoding=encoding) as f: with open(filename, encoding=encoding) as f:
reader = csv.DictReader(f, delimiter=self.delimiter) reader = csv.DictReader(f, delimiter=self.delimiter)
for row in reader:
yield row
for line_num, row in enumerate(reader, start=1):
yield {LINE_NUMBER_COLUMN: line_num, **row}
class JSONParser(Parser): class JSONParser(Parser):
@ -197,7 +202,8 @@ class JSONLParser(Parser):
reader = LineReader(filename, self.encoding) reader = LineReader(filename, self.encoding)
for line_num, line in enumerate(reader, start=1): for line_num, line in enumerate(reader, start=1):
try: try:
yield json.loads(line)
row = json.loads(line)
yield {LINE_NUMBER_COLUMN: line_num, **row}
except json.decoder.JSONDecodeError as e: except json.decoder.JSONDecodeError as e:
error = FileParseException(filename, line_num, str(e)) error = FileParseException(filename, line_num, str(e))
self._errors.append(error) self._errors.append(error)
@ -217,7 +223,7 @@ class ExcelParser(Parser):
rows = pyexcel.iget_records(file_name=filename) rows = pyexcel.iget_records(file_name=filename)
try: try:
for line_num, row in enumerate(rows, start=1): for line_num, row in enumerate(rows, start=1):
yield row
yield {LINE_NUMBER_COLUMN: line_num, **row}
except pyexcel.exceptions.FileTypeNotSupported as e: except pyexcel.exceptions.FileTypeNotSupported as e:
error = FileParseException(filename, line_num=1, message=str(e)) error = FileParseException(filename, line_num=1, message=str(e))
self._errors.append(error) self._errors.append(error)
@ -246,7 +252,7 @@ class FastTextParser(Parser):
def parse(self, filename: str) -> Iterator[Dict[Any, Any]]: def parse(self, filename: str) -> Iterator[Dict[Any, Any]]:
reader = LineReader(filename, self.encoding) reader = LineReader(filename, self.encoding)
for line in reader:
for line_num, line in enumerate(reader, start=1):
labels = [] labels = []
tokens = [] tokens = []
for token in line.rstrip().split(" "): for token in line.rstrip().split(" "):
@ -256,7 +262,7 @@ class FastTextParser(Parser):
else: else:
tokens.append(token) tokens.append(token)
text = " ".join(tokens) text = " ".join(tokens)
yield {DEFAULT_TEXT_COLUMN: text, DEFAULT_LABEL_COLUMN: labels}
yield {DEFAULT_TEXT_COLUMN: text, DEFAULT_LABEL_COLUMN: labels, LINE_NUMBER_COLUMN: line_num}
class CoNLLParser(Parser): class CoNLLParser(Parser):

2
backend/data_import/tests/test_parser.py

@ -5,6 +5,7 @@ import tempfile
import unittest import unittest
from data_import.pipeline import parsers from data_import.pipeline import parsers
from data_import.pipeline.readers import LINE_NUMBER_COLUMN
class TestParser(unittest.TestCase): class TestParser(unittest.TestCase):
@ -24,6 +25,7 @@ class TestParser(unittest.TestCase):
it = parser.parse(self.test_file) it = parser.parse(self.test_file)
for expect in expected: for expect in expected:
row = next(it) row = next(it)
row.pop(LINE_NUMBER_COLUMN, None)
self.assertEqual(row, expect) self.assertEqual(row, expect)
with self.assertRaises(StopIteration): with self.assertRaises(StopIteration):
next(it) next(it)

Loading…
Cancel
Save