
Add docstring to parsers mainly

pull/1619/head
Hironsan, 3 years ago
commit b874982199
3 changed files with 96 additions and 9 deletions
  1. backend/api/views/upload/parsers.py (91 changed lines)
  2. backend/api/views/upload/readers.py (4 changed lines)
  3. backend/api/views/upload/writers.py (10 changed lines)

backend/api/views/upload/parsers.py (91 changed lines)

@@ -16,15 +16,15 @@ from .readers import DEFAULT_LABEL_COLUMN, DEFAULT_TEXT_COLUMN, Parser
DEFAULT_ENCODING = 'Auto'
def detect_encoding(filename: str, buffer_size=io.DEFAULT_BUFFER_SIZE):
def detect_encoding(filename: str, buffer_size: int = io.DEFAULT_BUFFER_SIZE) -> str:
"""Detects character encoding automatically.
If you want to know the supported encodings, please see the following document:
https://chardet.readthedocs.io/en/latest/supported-encodings.html
Args:
filename (str): the filename for detecting the encoding.
buffer_size (int): the buffer size to read file contents incrementally.
filename: the filename for detecting the encoding.
buffer_size: the buffer size to read file contents incrementally.
Returns:
The character encoding.
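
As a point of reference for this docstring, here is a minimal sketch of incremental detection with chardet's UniversalDetector; it assumes the implementation relies on chardet (as the linked document suggests) and is not the project's actual code:

    import io
    from chardet.universaldetector import UniversalDetector

    def detect_encoding_sketch(filename: str, buffer_size: int = io.DEFAULT_BUFFER_SIZE) -> str:
        # Feed the file to the detector chunk by chunk so large files
        # never have to be read into memory at once.
        detector = UniversalDetector()
        with open(filename, 'rb') as f:
            while not detector.done:
                chunk = f.read(buffer_size)
                if not chunk:
                    break
                detector.feed(chunk)
        detector.close()
        # Fall back to utf-8 when chardet cannot decide, mirroring the
        # `return 'utf-8'` fallback visible in the next hunk.
        return detector.result.get('encoding') or 'utf-8'
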
@@ -52,7 +52,19 @@ def detect_encoding(filename: str, buffer_size=io.DEFAULT_BUFFER_SIZE):
return 'utf-8'
def decide_encoding(filename: str, encoding: str):
def decide_encoding(filename: str, encoding: str) -> str:
"""Decide character encoding automatically.
If the encoding is DEFAULT_ENCODING, detect it automatically.
Otherwise, return it as is.
Args:
filename: The filename for deciding the encoding.
encoding: The specified encoding.
Returns:
The character encoding.
"""
if encoding == DEFAULT_ENCODING:
return detect_encoding(filename)
else:
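
A short usage sketch of the decide/detect pair; the import path and the sample filename are assumptions, not taken from the diff:

    # Assumes backend/ is on sys.path so the upload package is importable.
    from api.views.upload.parsers import DEFAULT_ENCODING, decide_encoding

    encoding = decide_encoding('reviews.txt', DEFAULT_ENCODING)  # 'Auto' -> detected from the file
    encoding = decide_encoding('reviews.txt', 'utf-8')           # explicit value returned as is
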
@@ -60,6 +72,12 @@ def decide_encoding(filename: str, encoding: str):
class LineReader:
"""LineReader is a helper class to read a file line by line.
Attributes:
filename: The filename to read.
encoding: The character encoding.
"""
def __init__(self, filename: str, encoding: str = DEFAULT_ENCODING):
self.filename = filename
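
To make the Attributes section concrete, a rough stand-in for such a helper might look like this; the iteration protocol is assumed from the docstring rather than shown in the diff:

    from api.views.upload.parsers import decide_encoding  # assumed import path

    class LineReaderSketch:
        """Illustrative only; not the class defined above."""

        def __init__(self, filename: str, encoding: str = 'Auto'):
            self.filename = filename
            self.encoding = encoding

        def __iter__(self):
            # Resolve 'Auto' to a concrete encoding, then stream lines lazily.
            encoding = decide_encoding(self.filename, self.encoding)
            with open(self.filename, encoding=encoding) as f:
                for line in f:
                    yield line.rstrip('\n')
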
@@ -73,12 +91,21 @@ class LineReader:
class PlainParser(Parser):
"""PlainParser is a parser that simply returns an empty dictionary.
This is for a task without any text.
"""
def parse(self, filename: str) -> Iterator[Dict[Any, Any]]:
yield {}
class LineParser(Parser):
"""LineParser is a parser to read a file line by line.
Attributes:
encoding: The character encoding.
"""
def __init__(self, encoding: str = DEFAULT_ENCODING, **kwargs):
self.encoding = encoding
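
A minimal sketch of what a line-oriented parse() can yield, assuming each line becomes one record keyed by DEFAULT_TEXT_COLUMN ('text'); the real parser also resolves the encoding first:

    from typing import Any, Dict, Iterator

    def parse_lines(filename: str, encoding: str = 'utf-8') -> Iterator[Dict[Any, Any]]:
        # One dictionary per non-empty line; the key name is an assumption.
        with open(filename, encoding=encoding) as f:
            for line in f:
                line = line.rstrip('\n')
                if line:
                    yield {'text': line}
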
@@ -90,6 +117,11 @@ class LineParser(Parser):
class TextFileParser(Parser):
"""TextFileParser is a parser to read the entire content of a file.
Attributes:
encoding: The character encoding.
"""
def __init__(self, encoding: str = DEFAULT_ENCODING, **kwargs):
self.encoding = encoding
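
By contrast with LineParser, the whole-file variant yields a single record; a sketch under the same assumptions:

    from typing import Any, Dict, Iterator

    def parse_text_file(filename: str, encoding: str = 'utf-8') -> Iterator[Dict[Any, Any]]:
        # The entire file content becomes one record.
        with open(filename, encoding=encoding) as f:
            yield {'text': f.read()}
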
@@ -101,6 +133,12 @@ class TextFileParser(Parser):
class CSVParser(Parser):
"""CSVParser is a parser to read a CSV file and return its rows.
Attributes:
encoding: The character encoding.
delimiter: A one-character string used to separate fields. It defaults to ','.
"""
def __init__(self, encoding: str = DEFAULT_ENCODING, delimiter: str = ',', **kwargs):
self.encoding = encoding
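
A sketch of row-wise CSV parsing with the standard library; whether the real parser uses csv.DictReader or pairs rows with the header manually is not shown in the diff:

    import csv
    from typing import Any, Dict, Iterator

    def parse_csv(filename: str, encoding: str = 'utf-8',
                  delimiter: str = ',') -> Iterator[Dict[Any, Any]]:
        # Each row is yielded as a header -> value mapping.
        with open(filename, encoding=encoding, newline='') as f:
            for row in csv.DictReader(f, delimiter=delimiter):
                yield dict(row)
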
@@ -115,6 +153,11 @@ class CSVParser(Parser):
class JSONParser(Parser):
"""JSONParser is a parser to read a JSON file and return its rows.
Attributes:
encoding: The character encoding.
"""
def __init__(self, encoding: str = DEFAULT_ENCODING, **kwargs):
self.encoding = encoding
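
For the JSON case, "its rows" suggests the file holds an array of objects; a sketch under that assumption:

    import json
    from typing import Any, Dict, Iterator

    def parse_json(filename: str, encoding: str = 'utf-8') -> Iterator[Dict[Any, Any]]:
        # Load the whole document, then yield each object as one row.
        with open(filename, encoding=encoding) as f:
            for row in json.load(f):
                yield row
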
@@ -137,6 +180,11 @@ class JSONParser(Parser):
class JSONLParser(Parser):
"""JSONLParser is a parser to read a JSONL file and return its rows.
Attributes:
encoding: The character encoding.
"""
def __init__(self, encoding: str = DEFAULT_ENCODING, **kwargs):
self.encoding = encoding
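
The JSONL variant decodes one object per line instead of loading the whole file; a sketch:

    import json
    from typing import Any, Dict, Iterator

    def parse_jsonl(filename: str, encoding: str = 'utf-8') -> Iterator[Dict[Any, Any]]:
        # One JSON object per line; blank lines are skipped.
        with open(filename, encoding=encoding) as f:
            for line in f:
                line = line.strip()
                if line:
                    yield json.loads(line)
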
@@ -157,6 +205,7 @@ class JSONLParser(Parser):
class ExcelParser(Parser):
"""ExcelParser is a parser to read an Excel file."""
def __init__(self, **kwargs):
self._errors = []
@@ -176,6 +225,17 @@ class ExcelParser(Parser):
class FastTextParser(Parser):
"""FastTextParser is a parser to read the fastText format and return the text and labels.
The example format is as follows:
__label__positive I really enjoyed this restaurant.
This format expects the categories first, each with the prefix __label__,
and then the input text, as shown above.
Attributes:
encoding: The character encoding.
label: The label prefix. It defaults to `__label__`.
"""
def __init__(self, encoding: str = DEFAULT_ENCODING, label: str = '__label__', **kwargs):
self.encoding = encoding
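
To illustrate the format described above, a sketch of splitting one fastText line into labels and text; the output keys mirror DEFAULT_TEXT_COLUMN and DEFAULT_LABEL_COLUMN but are an assumption about the real parser's output:

    from typing import Any, Dict, List

    def parse_fasttext_line(line: str, label: str = '__label__') -> Dict[str, Any]:
        # Leading tokens carrying the prefix are labels; the remainder is the text.
        tokens = line.rstrip('\n').split()
        labels: List[str] = []
        while tokens and tokens[0].startswith(label):
            labels.append(tokens.pop(0)[len(label):])
        return {'text': ' '.join(tokens), 'label': labels}

    parse_fasttext_line('__label__positive I really enjoyed this restaurant.')
    # -> {'text': 'I really enjoyed this restaurant.', 'label': ['positive']}
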
@@ -197,6 +257,29 @@ class FastTextParser(Parser):
class CoNLLParser(Parser):
"""CoNLLParser is a parser to read a CoNLL-like format and return the text and labels.
The example format is as follows:
EU B-ORG
rejects O
German B-MISC
call O
to O
boycott O
British B-MISC
lamb O
. O
Peter B-PER
Blackburn I-PER
This format expects a token in the first column, and a tag in the second column.
Each example is separated by a blank line.
Attributes:
encoding: The character encoding.
delimiter: A one-character string used to separate fields. It defaults to ' '.
scheme: The tagging scheme. It supports `IOB2`, `IOE2`, `IOBES`, and `BILOU`.
"""
def __init__(self, encoding: str = DEFAULT_ENCODING, delimiter: str = ' ', scheme: str = 'IOB2', **kwargs):
self.encoding = encoding
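
A sketch of the sentence grouping the docstring describes: a blank line closes one example, and each non-blank line contributes a token/tag pair. Converting the tags into span annotations according to the scheme (IOB2 and the others) is left out here:

    from typing import Dict, Iterator, List

    def parse_conll(filename: str, encoding: str = 'utf-8',
                    delimiter: str = ' ') -> Iterator[Dict[str, List[str]]]:
        tokens: List[str] = []
        tags: List[str] = []
        with open(filename, encoding=encoding) as f:
            for line in f:
                line = line.rstrip('\n')
                if not line:
                    # A blank line ends the current example.
                    if tokens:
                        yield {'tokens': tokens, 'tags': tags}
                        tokens, tags = [], []
                    continue
                token, tag = line.split(delimiter)
                tokens.append(token)
                tags.append(tag)
        if tokens:
            yield {'tokens': tokens, 'tags': tags}
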

backend/api/views/upload/readers.py (4 changed lines)

@@ -12,6 +12,7 @@ DEFAULT_LABEL_COLUMN = 'label'
class Record:
"""Record represents a single data record."""
def __init__(self,
data: Type[BaseData],
@@ -65,6 +66,7 @@ class Record:
class BaseReader(collections.abc.Iterable):
"""BaseReader parses files and returns a Record iterator."""
@abc.abstractmethod
def __iter__(self) -> Iterator[Record]:
@@ -82,6 +84,7 @@ class BaseReader(collections.abc.Iterable):
class Parser(abc.ABC):
"""The abstract file parser."""
@abc.abstractmethod
def parse(self, filename: str) -> Iterator[Dict[Any, Any]]:
@@ -95,6 +98,7 @@ class Parser(abc.ABC):
class Builder(abc.ABC):
"""The abstract Record builder."""
@abc.abstractmethod
def build(self, row: Dict[Any, Any], filename: str, line_num: int) -> Record:
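
These one-line docstrings describe the extension points. A rough sketch of what implementing the parser interface looks like, assuming parse() is its only abstract member (the diff does not show the full classes):

    import abc
    from typing import Any, Dict, Iterator

    class ParserSketch(abc.ABC):
        """Stand-in for the abstract Parser above."""

        @abc.abstractmethod
        def parse(self, filename: str) -> Iterator[Dict[Any, Any]]:
            raise NotImplementedError

    class TSVParser(ParserSketch):
        """Hypothetical tab-separated parser built on that interface."""

        def parse(self, filename: str) -> Iterator[Dict[Any, Any]]:
            with open(filename, encoding='utf-8') as f:
                for line in f:
                    text, label = line.rstrip('\n').split('\t')
                    yield {'text': text, 'label': label}
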

backend/api/views/upload/writers.py (10 changed lines)

@@ -27,7 +27,7 @@ def group_by_class(instances):
class Examples:
def __init__(self, buffer_size=settings.IMPORT_BATCH_SIZE):
def __init__(self, buffer_size: int = settings.IMPORT_BATCH_SIZE):
self.buffer_size = buffer_size
self.buffer = []
@@ -59,7 +59,7 @@ class Examples:
examples = [example.create_data(project) for example in self.buffer]
return Example.objects.bulk_create(examples)
def save_annotation(self, project, user, examples):
def save_annotation(self, project: Project, user, examples):
mapping = {(label.text, label.task_type): label for label in project.labels.all()}
annotations = list(itertools.chain.from_iterable([
data.create_annotation(user, example, mapping) for data, example in zip(self.buffer, examples)
@@ -71,11 +71,11 @@ class Examples:
class BulkWriter(Writer):
def __init__(self, batch_size):
def __init__(self, batch_size: int):
self.examples = Examples(batch_size)
self._errors = []
def save(self, reader: BaseReader, project, user, cleaner):
def save(self, reader: BaseReader, project: Project, user, cleaner):
it = iter(reader)
while True:
try:
@@ -101,7 +101,7 @@ class BulkWriter(Writer):
self._errors.sort(key=lambda e: e.line_num)
return self._errors
def create(self, project, user):
def create(self, project: Project, user):
self.examples.save_label(project)
ids = self.examples.save_data(project)
self.examples.save_annotation(project, user, ids)
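
The annotated methods above implement a buffered bulk save: records accumulate until the buffer is full and are then written in one batch. A generic sketch of that pattern (names and the default size are illustrative; the real default comes from settings.IMPORT_BATCH_SIZE):

    from typing import Any, Callable, List

    class BufferedSaver:
        def __init__(self, flush: Callable[[List[Any]], None], buffer_size: int = 250):
            self.flush = flush
            self.buffer_size = buffer_size
            self.buffer: List[Any] = []

        def add(self, record: Any) -> None:
            self.buffer.append(record)
            if len(self.buffer) >= self.buffer_size:
                self.drain()

        def drain(self) -> None:
            # Write everything currently buffered in a single batch,
            # e.g. via Django's bulk_create, then reset the buffer.
            if self.buffer:
                self.flush(self.buffer)
                self.buffer = []

    saver = BufferedSaver(flush=lambda batch: print(f'saved {len(batch)} records'))
    for i in range(1000):
        saver.add(i)
    saver.drain()  # flush the remainder
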