From b8749821995deb0b5a19aefef2992b4a416d2757 Mon Sep 17 00:00:00 2001
From: Hironsan <light.tree.1.13@gmail.com>
Date: Mon, 20 Dec 2021 11:25:37 +0900
Subject: [PATCH] Add docstring to parsers mainly

---
 backend/api/views/upload/parsers.py | 91 +++++++++++++++++++++++++++--
 backend/api/views/upload/readers.py |  4 ++
 backend/api/views/upload/writers.py | 10 ++--
 3 files changed, 96 insertions(+), 9 deletions(-)

diff --git a/backend/api/views/upload/parsers.py b/backend/api/views/upload/parsers.py
index bbfe06de..d1cd84b8 100644
--- a/backend/api/views/upload/parsers.py
+++ b/backend/api/views/upload/parsers.py
@@ -16,15 +16,15 @@ from .readers import DEFAULT_LABEL_COLUMN, DEFAULT_TEXT_COLUMN, Parser
 DEFAULT_ENCODING = 'Auto'
 
 
-def detect_encoding(filename: str, buffer_size=io.DEFAULT_BUFFER_SIZE):
+def detect_encoding(filename: str, buffer_size: int = io.DEFAULT_BUFFER_SIZE) -> str:
     """Detects character encoding automatically.
 
     If you want to know the supported encodings, please see the following document:
     https://chardet.readthedocs.io/en/latest/supported-encodings.html
 
     Args:
-        filename (str): the filename for detecting the encoding.
-        buffer_size (int): the buffer size to read file contents incrementally.
+        filename: the filename for detecting the encoding.
+        buffer_size: the buffer size to read file contents incrementally.
 
     Returns:
         The character encoding.
@@ -52,7 +52,19 @@ def detect_encoding(filename: str, buffer_size=io.DEFAULT_BUFFER_SIZE):
             return 'utf-8'
 
 
-def decide_encoding(filename: str, encoding: str):
+def decide_encoding(filename: str, encoding: str) -> str:
+    """Decide character encoding automatically.
+
+    If the encoding is DEFAULT_ENCODING, detect it automatically.
+    Otherwise, return it as is.
+
+    Args:
+        filename: The filename used to decide the encoding.
+        encoding: The specified encoding.
+
+    Returns:
+        The character encoding.
+    """
     if encoding == DEFAULT_ENCODING:
         return detect_encoding(filename)
     else:
@@ -60,6 +72,12 @@ def decide_encoding(filename: str, encoding: str):
 
 
 class LineReader:
+    """LineReader is a helper class to read a file line by line.
+
+    Attributes:
+        filename: The filename to read.
+        encoding: The character encoding.
+    """
 
     def __init__(self, filename: str, encoding: str = DEFAULT_ENCODING):
         self.filename = filename
@@ -73,12 +91,21 @@ class LineReader:
 
 
 class PlainParser(Parser):
+    """PlainParser is a parser that simply returns an empty dictionary.
+
+    This is for a task without any text.
+    """
 
     def parse(self, filename: str) -> Iterator[Dict[Any, Any]]:
         yield {}
 
 
 class LineParser(Parser):
+    """LineParser is a parser to read a file line by line.
+
+    Attributes:
+        encoding: The character encoding.
+    """
 
     def __init__(self, encoding: str = DEFAULT_ENCODING, **kwargs):
         self.encoding = encoding
@@ -90,6 +117,11 @@ class LineParser(Parser):
 
 
 class TextFileParser(Parser):
+    """TextFileParser is a parser to read an entire file content.
+
+    Attributes:
+        encoding: The character encoding.
+    """
 
     def __init__(self, encoding: str = DEFAULT_ENCODING, **kwargs):
         self.encoding = encoding
@@ -101,6 +133,12 @@ class TextFileParser(Parser):
 
 
 class CSVParser(Parser):
+    """CSVParser is a parser to read a CSV file and return its rows.
+
+    Attributes:
+        encoding: The character encoding.
+        delimiter: A one-character string used to separate fields. It defaults to ','.
+    """
 
     def __init__(self, encoding: str = DEFAULT_ENCODING, delimiter: str = ',', **kwargs):
         self.encoding = encoding
@@ -115,6 +153,11 @@ class CSVParser(Parser):
 
 
 class JSONParser(Parser):
+    """JSONParser is a parser to read a JSON file and return its rows.
+
+    Attributes:
+        encoding: The character encoding.
+    """
 
     def __init__(self, encoding: str = DEFAULT_ENCODING, **kwargs):
         self.encoding = encoding
@@ -137,6 +180,11 @@ class JSONParser(Parser):
 
 
 class JSONLParser(Parser):
+    """JSONLParser is a parser to read a JSONL file and return its rows.
+
+    Attributes:
+        encoding: The character encoding.
+    """
 
     def __init__(self, encoding: str = DEFAULT_ENCODING, **kwargs):
         self.encoding = encoding
@@ -157,6 +205,7 @@ class JSONLParser(Parser):
 
 
 class ExcelParser(Parser):
+    """ExcelParser is a parser to read an Excel file."""
 
     def __init__(self, **kwargs):
         self._errors = []
@@ -176,6 +225,17 @@ class ExcelParser(Parser):
 
 
 class FastTextParser(Parser):
+    """FastTextParser is a parser to read the fastText format and return a text and labels.
+
+    The example format is as follows:
+        __label__positive I really enjoyed this restaurant.
+    This format expects the category first, with the prefix `__label__` before each category,
+    and then the input text.
+
+    Attributes:
+        encoding: The character encoding.
+        label: The label prefix. It defaults to `__label__`.
+    """
 
     def __init__(self, encoding: str = DEFAULT_ENCODING, label: str = '__label__', **kwargs):
         self.encoding = encoding
@@ -197,6 +257,29 @@ class FastTextParser(Parser):
 
 
 class CoNLLParser(Parser):
+    """CoNLLParser is a parser to read a CoNLL-like format and return a text and labels.
+
+    The example format is as follows:
+        EU  B-ORG
+        rejects O
+        German  B-MISC
+        call  O
+        to  O
+        boycott O
+        British B-MISC
+        lamb  O
+        . O
+
+        Peter B-PER
+        Blackburn I-PER
+    This format expects a token in the first column, and a tag in the second column.
+    Each example is separated by an empty line, as shown above.
+
+    Attributes:
+        encoding: The character encoding.
+        delimiter: A one-character string used to separate fields. It defaults to ' '.
+        scheme: The tagging scheme. It supports `IOB2`, `IOE2`, `IOBES`, and `BILOU`.
+    """
 
     def __init__(self, encoding: str = DEFAULT_ENCODING, delimiter: str = ' ', scheme: str = 'IOB2', **kwargs):
         self.encoding = encoding
diff --git a/backend/api/views/upload/readers.py b/backend/api/views/upload/readers.py
index c8e9786b..0a551859 100644
--- a/backend/api/views/upload/readers.py
+++ b/backend/api/views/upload/readers.py
@@ -12,6 +12,7 @@ DEFAULT_LABEL_COLUMN = 'label'
 
 
 class Record:
+    """Record represents a piece of data."""
 
     def __init__(self,
                  data: Type[BaseData],
@@ -65,6 +66,7 @@ class Record:
 
 
 class BaseReader(collections.abc.Iterable):
+    """BaseReader is responsible for parsing files and returning a Record iterator."""
 
     @abc.abstractmethod
     def __iter__(self) -> Iterator[Record]:
@@ -82,6 +84,7 @@ class BaseReader(collections.abc.Iterable):
 
 
 class Parser(abc.ABC):
+    """The abstract file parser."""
 
     @abc.abstractmethod
     def parse(self, filename: str) -> Iterator[Dict[Any, Any]]:
@@ -95,6 +98,7 @@ class Parser(abc.ABC):
 
 
 class Builder(abc.ABC):
+    """The abstract Record builder."""
 
     @abc.abstractmethod
     def build(self, row: Dict[Any, Any], filename: str, line_num: int) -> Record:
diff --git a/backend/api/views/upload/writers.py b/backend/api/views/upload/writers.py
index 9b6785e9..9e6eab8c 100644
--- a/backend/api/views/upload/writers.py
+++ b/backend/api/views/upload/writers.py
@@ -27,7 +27,7 @@ def group_by_class(instances):
 
 class Examples:
 
-    def __init__(self, buffer_size=settings.IMPORT_BATCH_SIZE):
+    def __init__(self, buffer_size: int = settings.IMPORT_BATCH_SIZE):
         self.buffer_size = buffer_size
         self.buffer = []
 
@@ -59,7 +59,7 @@ class Examples:
         examples = [example.create_data(project) for example in self.buffer]
         return Example.objects.bulk_create(examples)
 
-    def save_annotation(self, project, user, examples):
+    def save_annotation(self, project: Project, user, examples):
         mapping = {(label.text, label.task_type): label for label in project.labels.all()}
         annotations = list(itertools.chain.from_iterable([
             data.create_annotation(user, example, mapping) for data, example in zip(self.buffer, examples)
@@ -71,11 +71,11 @@ class Examples:
 
 class BulkWriter(Writer):
 
-    def __init__(self, batch_size):
+    def __init__(self, batch_size: int):
         self.examples = Examples(batch_size)
         self._errors = []
 
-    def save(self, reader: BaseReader, project, user, cleaner):
+    def save(self, reader: BaseReader, project: Project, user, cleaner):
         it = iter(reader)
         while True:
             try:
@@ -101,7 +101,7 @@ class BulkWriter(Writer):
         self._errors.sort(key=lambda e: e.line_num)
         return self._errors
 
-    def create(self, project, user):
+    def create(self, project: Project, user):
         self.examples.save_label(project)
         ids = self.examples.save_data(project)
         self.examples.save_annotation(project, user, ids)