From b8749821995deb0b5a19aefef2992b4a416d2757 Mon Sep 17 00:00:00 2001 From: Hironsan Date: Mon, 20 Dec 2021 11:25:37 +0900 Subject: [PATCH] Add docstring to parsers mainly --- backend/api/views/upload/parsers.py | 91 +++++++++++++++++++++++++++-- backend/api/views/upload/readers.py | 4 ++ backend/api/views/upload/writers.py | 10 ++-- 3 files changed, 96 insertions(+), 9 deletions(-) diff --git a/backend/api/views/upload/parsers.py b/backend/api/views/upload/parsers.py index bbfe06de..d1cd84b8 100644 --- a/backend/api/views/upload/parsers.py +++ b/backend/api/views/upload/parsers.py @@ -16,15 +16,15 @@ from .readers import DEFAULT_LABEL_COLUMN, DEFAULT_TEXT_COLUMN, Parser DEFAULT_ENCODING = 'Auto' -def detect_encoding(filename: str, buffer_size=io.DEFAULT_BUFFER_SIZE): +def detect_encoding(filename: str, buffer_size: int = io.DEFAULT_BUFFER_SIZE) -> str: """Detects character encoding automatically. If you want to know the supported encodings, please see the following document: https://chardet.readthedocs.io/en/latest/supported-encodings.html Args: - filename (str): the filename for detecting the encoding. - buffer_size (int): the buffer size to read file contents incrementally. + filename: the filename for detecting the encoding. + buffer_size: the buffer size to read file contents incrementally. Returns: The character encoding. @@ -52,7 +52,19 @@ def detect_encoding(filename: str, buffer_size=io.DEFAULT_BUFFER_SIZE): return 'utf-8' -def decide_encoding(filename: str, encoding: str): +def decide_encoding(filename: str, encoding: str) -> str: + """Decides character encoding automatically. + + If the encoding is DEFAULT_ENCODING, detects it automatically. + Otherwise, returns it as is. + + Args: + filename: The filename for deciding the encoding. + encoding: The specified encoding. + + Returns: + The character encoding. 
+ """ if encoding == DEFAULT_ENCODING: return detect_encoding(filename) else: @@ -60,6 +72,12 @@ def decide_encoding(filename: str, encoding: str): class LineReader: + """LineReader is a helper class to read a file line by line. + + Attributes: + filename: The filename to read. + encoding: The character encoding. + """ def __init__(self, filename: str, encoding: str = DEFAULT_ENCODING): self.filename = filename @@ -73,12 +91,21 @@ class LineReader: class PlainParser(Parser): + """PlainParser is a parser that simply returns an empty dictionary. + + This is for a task without any text. + """ def parse(self, filename: str) -> Iterator[Dict[Any, Any]]: yield {} class LineParser(Parser): + """LineParser is a parser to read a file line by line. + + Attributes: + encoding: The character encoding. + """ def __init__(self, encoding: str = DEFAULT_ENCODING, **kwargs): self.encoding = encoding @@ -90,6 +117,11 @@ class LineParser(Parser): class TextFileParser(Parser): + """TextFileParser is a parser to read an entire file content. + + Attributes: + encoding: The character encoding. + """ def __init__(self, encoding: str = DEFAULT_ENCODING, **kwargs): self.encoding = encoding @@ -101,6 +133,12 @@ class TextFileParser(Parser): class CSVParser(Parser): + """CSVParser is a parser to read a CSV file and return its rows. + + Attributes: + encoding: The character encoding. + delimiter: A one-character string used to separate fields. It defaults to ','. + """ def __init__(self, encoding: str = DEFAULT_ENCODING, delimiter: str = ',', **kwargs): self.encoding = encoding @@ -115,6 +153,11 @@ class CSVParser(Parser): class JSONParser(Parser): + """JSONParser is a parser to read a JSON file and return its rows. + + Attributes: + encoding: The character encoding. + """ def __init__(self, encoding: str = DEFAULT_ENCODING, **kwargs): self.encoding = encoding @@ -137,6 +180,11 @@ class JSONParser(Parser): class JSONLParser(Parser): + """JSONLParser is a parser to read a JSONL file and return its rows. 
+ + Attributes: + encoding: The character encoding. + """ def __init__(self, encoding: str = DEFAULT_ENCODING, **kwargs): self.encoding = encoding @@ -157,6 +205,7 @@ class JSONLParser(Parser): class ExcelParser(Parser): + """ExcelParser is a parser to read an Excel file.""" def __init__(self, **kwargs): self._errors = [] @@ -176,6 +225,17 @@ class ExcelParser(Parser): class FastTextParser(Parser): + """FastTextParser is a parser to read a fastText format and return a text and labels. + + The example format is as follows: + __label__positive I really enjoyed this restaurant. + This format expects the category first, with the prefix `__label__` before each category, + and then the input text. + + Attributes: + encoding: The character encoding. + label: The label prefix. It defaults to `__label__`. + """ def __init__(self, encoding: str = DEFAULT_ENCODING, label: str = '__label__', **kwargs): self.encoding = encoding @@ -197,6 +257,29 @@ class FastTextParser(Parser): class CoNLLParser(Parser): + """CoNLLParser is a parser to read a CoNLL-like format and return a text and labels. + + The example format is as follows: + EU B-ORG + rejects O + German B-MISC + call O + to O + boycott O + British B-MISC + lamb O + . O + + Peter B-PER + Blackburn I-PER + This format expects a token in the first column, and a tag in the second column. + Each data entry is separated by an empty line. + + Attributes: + encoding: The character encoding. + delimiter: A one-character string used to separate fields. It defaults to ' '. + scheme: The tagging scheme. It supports `IOB2`, `IOE2`, `IOBES`, and `BILOU`. 
+ """ def __init__(self, encoding: str = DEFAULT_ENCODING, delimiter: str = ' ', scheme: str = 'IOB2', **kwargs): self.encoding = encoding diff --git a/backend/api/views/upload/readers.py b/backend/api/views/upload/readers.py index c8e9786b..0a551859 100644 --- a/backend/api/views/upload/readers.py +++ b/backend/api/views/upload/readers.py @@ -12,6 +12,7 @@ DEFAULT_LABEL_COLUMN = 'label' class Record: + """Record represents a data entry.""" def __init__(self, data: Type[BaseData], @@ -65,6 +66,7 @@ class Record: class BaseReader(collections.abc.Iterable): + """BaseReader is responsible for parsing files and returning a Record iterator.""" @abc.abstractmethod def __iter__(self) -> Iterator[Record]: @@ -82,6 +84,7 @@ class BaseReader(collections.abc.Iterable): class Parser(abc.ABC): + """The abstract file parser.""" @abc.abstractmethod def parse(self, filename: str) -> Iterator[Dict[Any, Any]]: @@ -95,6 +98,7 @@ class Parser(abc.ABC): class Builder(abc.ABC): + """The abstract Record builder.""" @abc.abstractmethod def build(self, row: Dict[Any, Any], filename: str, line_num: int) -> Record: diff --git a/backend/api/views/upload/writers.py b/backend/api/views/upload/writers.py index 9b6785e9..9e6eab8c 100644 --- a/backend/api/views/upload/writers.py +++ b/backend/api/views/upload/writers.py @@ -27,7 +27,7 @@ def group_by_class(instances): class Examples: - def __init__(self, buffer_size=settings.IMPORT_BATCH_SIZE): + def __init__(self, buffer_size: int = settings.IMPORT_BATCH_SIZE): self.buffer_size = buffer_size self.buffer = [] @@ -59,7 +59,7 @@ class Examples: examples = [example.create_data(project) for example in self.buffer] return Example.objects.bulk_create(examples) - def save_annotation(self, project, user, examples): + def save_annotation(self, project: Project, user, examples): mapping = {(label.text, label.task_type): label for label in project.labels.all()} annotations = list(itertools.chain.from_iterable([ data.create_annotation(user, example, mapping) for data, example in 
zip(self.buffer, examples) @@ -71,11 +71,11 @@ class Examples: class BulkWriter(Writer): - def __init__(self, batch_size): + def __init__(self, batch_size: int): self.examples = Examples(batch_size) self._errors = [] - def save(self, reader: BaseReader, project, user, cleaner): + def save(self, reader: BaseReader, project: Project, user, cleaner): it = iter(reader) while True: try: @@ -101,7 +101,7 @@ class BulkWriter(Writer): self._errors.sort(key=lambda e: e.line_num) return self._errors - def create(self, project, user): + def create(self, project: Project, user): self.examples.save_label(project) ids = self.examples.save_data(project) self.examples.save_annotation(project, user, ids)