Add docstrings, mainly to parsers

3 years ago · b874982199
3 changed files with 96 additions and 9 deletions
--- a/backend/api/views/upload/parsers.py
+++ b/backend/api/views/upload/parsers.py
@ -16,15 +16,15 @@ from .readers import DEFAULT_LABEL_COLUMN, DEFAULT_TEXT_COLUMN, Parser
 DEFAULT_ENCODING = 'Auto'


-def detect_encoding(filename: str, buffer_size=io.DEFAULT_BUFFER_SIZE):
+def detect_encoding(filename: str, buffer_size: int = io.DEFAULT_BUFFER_SIZE) -> str:
    """Detects character encoding automatically.

    If you want to know the supported encodings, please see the following document:
    https://chardet.readthedocs.io/en/latest/supported-encodings.html

    Args:
-        filename (str): the filename for detecting the encoding.
-        buffer_size (int): the buffer size to read file contents incrementally.
+        filename: the filename for detecting the encoding.
+        buffer_size: the buffer size to read file contents incrementally.

    Returns:
        The character encoding.
@ -52,7 +52,19 @@ def detect_encoding(filename: str, buffer_size=io.DEFAULT_BUFFER_SIZE):
            return 'utf-8'


-def decide_encoding(filename: str, encoding: str):
+def decide_encoding(filename: str, encoding: str) -> str:
+    """Decide character encoding automatically.
+
+    If the encoding is DEFAULT_ENCODING, detects it automatically.
+    Otherwise, return it as is.
+
+    Args:
+         filename: The filename for deciding the encoding.
+         encoding: The specified encoding.
+
+    Returns:
+        The character encoding.
+    """
    if encoding == DEFAULT_ENCODING:
        return detect_encoding(filename)
    else:
@ -60,6 +72,12 @@ def decide_encoding(filename: str, encoding: str):


 class LineReader:
+    """LineReader is a helper class to read a file line by line.
+
+    Attributes:
+        filename: The filename to read.
+        encoding: The character encoding.
+    """

    def __init__(self, filename: str, encoding: str = DEFAULT_ENCODING):
        self.filename = filename
@ -73,12 +91,21 @@ class LineReader:


 class PlainParser(Parser):
+    """PlainParser is a parser that simply yields an empty dictionary.
+
+    This is for a task without any text.
+    """

    def parse(self, filename: str) -> Iterator[Dict[Any, Any]]:
        yield {}


 class LineParser(Parser):
+    """LineParser is a parser to read a file line by line.
+
+    Attributes:
+        encoding: The character encoding.
+    """

    def __init__(self, encoding: str = DEFAULT_ENCODING, **kwargs):
        self.encoding = encoding
@ -90,6 +117,11 @@ class LineParser(Parser):


 class TextFileParser(Parser):
+    """TextFileParser is a parser to read an entire file content.
+
+    Attributes:
+        encoding: The character encoding.
+    """

    def __init__(self, encoding: str = DEFAULT_ENCODING, **kwargs):
        self.encoding = encoding
@ -101,6 +133,12 @@ class TextFileParser(Parser):


 class CSVParser(Parser):
+    """CSVParser is a parser to read a csv file and return its rows.
+
+    Attributes:
+        encoding: The character encoding.
+        delimiter: A one-character string used to separate fields. It defaults to ','.
+    """

    def __init__(self, encoding: str = DEFAULT_ENCODING, delimiter: str = ',', **kwargs):
        self.encoding = encoding
@ -115,6 +153,11 @@ class CSVParser(Parser):


 class JSONParser(Parser):
+    """JSONParser is a parser to read a json file and return its rows.
+
+    Attributes:
+        encoding: The character encoding.
+    """

    def __init__(self, encoding: str = DEFAULT_ENCODING, **kwargs):
        self.encoding = encoding
@ -137,6 +180,11 @@ class JSONParser(Parser):


 class JSONLParser(Parser):
+    """JSONLParser is a parser to read a JSONL file and return its rows.
+
+    Attributes:
+        encoding: The character encoding.
+    """

    def __init__(self, encoding: str = DEFAULT_ENCODING, **kwargs):
        self.encoding = encoding
@ -157,6 +205,7 @@ class JSONLParser(Parser):


 class ExcelParser(Parser):
+    """ExcelParser is a parser to read an Excel file."""

    def __init__(self, **kwargs):
        self._errors = []
@ -176,6 +225,17 @@ class ExcelParser(Parser):


 class FastTextParser(Parser):
+    """FastTextParser is a parser that reads the fastText format and returns a text and labels.
+
+    The example format is as follows:
+        __label__positive I really enjoyed this restaurant.
+    This format expects the category first, with the prefix `__label__` before each category,
+    and then the input text.
+
+    Attributes:
+        encoding: The character encoding.
+        label: The label prefix. It defaults to `__label__`.
+    """

    def __init__(self, encoding: str = DEFAULT_ENCODING, label: str = '__label__', **kwargs):
        self.encoding = encoding
@ -197,6 +257,29 @@ class FastTextParser(Parser):


 class CoNLLParser(Parser):
+    """CoNLLParser is a parser that reads a CoNLL-like format and returns a text and labels.
+
+    The example format is as follows:
+        EU  B-ORG
+        rejects O
+        German  B-MISC
+        call  O
+        to  O
+        boycott O
+        British B-MISC
+        lamb  O
+        . O
+
+        Peter B-PER
+        Blackburn I-PER
+    This format expects a token in the first column, and a tag in the second column.
+    Each example is separated by a blank line.
+
+    Attributes:
+        encoding: The character encoding.
+        delimiter: A one-character string used to separate fields. It defaults to ' '.
+        scheme: The tagging scheme. It supports `IOB2`, `IOE2`, `IOBES`, and `BILOU`.
+    """

    def __init__(self, encoding: str = DEFAULT_ENCODING, delimiter: str = ' ', scheme: str = 'IOB2', **kwargs):
        self.encoding = encoding
--- a/backend/api/views/upload/readers.py
+++ b/backend/api/views/upload/readers.py
@ -12,6 +12,7 @@ DEFAULT_LABEL_COLUMN = 'label'


 class Record:
+    """Record represents a piece of data."""

    def __init__(self,
                 data: Type[BaseData],
@ -65,6 +66,7 @@ class Record:


 class BaseReader(collections.abc.Iterable):
+    """A reader parses files and returns a Record iterator."""

    @abc.abstractmethod
    def __iter__(self) -> Iterator[Record]:
@ -82,6 +84,7 @@ class BaseReader(collections.abc.Iterable):


 class Parser(abc.ABC):
+    """The abstract file parser."""

    @abc.abstractmethod
    def parse(self, filename: str) -> Iterator[Dict[Any, Any]]:
@ -95,6 +98,7 @@ class Parser(abc.ABC):


 class Builder(abc.ABC):
+    """The abstract Record builder."""

    @abc.abstractmethod
    def build(self, row: Dict[Any, Any], filename: str, line_num: int) -> Record:
--- a/backend/api/views/upload/writers.py
+++ b/backend/api/views/upload/writers.py
@ -27,7 +27,7 @@ def group_by_class(instances):

 class Examples:

-    def __init__(self, buffer_size=settings.IMPORT_BATCH_SIZE):
+    def __init__(self, buffer_size: int = settings.IMPORT_BATCH_SIZE):
        self.buffer_size = buffer_size
        self.buffer = []

@ -59,7 +59,7 @@ class Examples:
        examples = [example.create_data(project) for example in self.buffer]
        return Example.objects.bulk_create(examples)

-    def save_annotation(self, project, user, examples):
+    def save_annotation(self, project: Project, user, examples):
        mapping = {(label.text, label.task_type): label for label in project.labels.all()}
        annotations = list(itertools.chain.from_iterable([
            data.create_annotation(user, example, mapping) for data, example in zip(self.buffer, examples)
@ -71,11 +71,11 @@ class Examples:

 class BulkWriter(Writer):

-    def __init__(self, batch_size):
+    def __init__(self, batch_size: int):
        self.examples = Examples(batch_size)
        self._errors = []

-    def save(self, reader: BaseReader, project, user, cleaner):
+    def save(self, reader: BaseReader, project: Project, user, cleaner):
        it = iter(reader)
        while True:
            try:
@ -101,7 +101,7 @@ class BulkWriter(Writer):
        self._errors.sort(key=lambda e: e.line_num)
        return self._errors

-    def create(self, project, user):
+    def create(self, project: Project, user):
        self.examples.save_label(project)
        ids = self.examples.save_data(project)
        self.examples.save_annotation(project, user, ids)