# doccano/backend/data_import/pipeline/readers.py


								import abc

								import collections.abc

								import dataclasses

								import uuid

								from typing import Any, Dict, Iterator, List


								import pandas as pd


								from .exceptions import FileParseException

								from .labeled_examples import Record


								# Default column names for text and label data. They are not used in the
								# visible part of this module; presumably other modules import them.
								DEFAULT_TEXT_COLUMN = "text"

								DEFAULT_LABEL_COLUMN = "label"

								# The four keys below are added by Reader.__iter__ to every row it yields.
								# 1-based position of the row in the parser's output for its file.
								LINE_NUM_COLUMN = "#line_num"

								# Holds FileName.generated_name of the file the row came from.
								FILE_NAME_COLUMN = "filename"

								# Holds FileName.upload_name of the file the row came from.
								UPLOAD_NAME_COLUMN = "upload_name"

								# Holds a uuid.uuid4() value created for the row.
								UUID_COLUMN = "uuid"


								class BaseReader(collections.abc.Iterable):
								    """Abstract interface of a reader that iterates over parsed file contents.

								    Concrete readers have to supply ``__iter__``, ``batch`` and the ``errors``
								    property; the class cannot be instantiated until all three are overridden.
								    """

								    @abc.abstractmethod
								    def batch(self, batch_size: int) -> Iterator[pd.DataFrame]:
								        """Return an iterator of data frames for the given ``batch_size``.

								        Raises:
								            NotImplementedError: Always, unless overridden by the subclass.
								        """
								        raise NotImplementedError("Please implement this method in the subclass.")

								    @property
								    @abc.abstractmethod
								    def errors(self):
								        """Expose the errors of this reader.

								        Raises:
								            NotImplementedError: Always, unless overridden by the subclass.
								        """
								        raise NotImplementedError("Please implement this method in the subclass.")

								    @abc.abstractmethod
								    def __iter__(self) -> Iterator[Dict[Any, Any]]:
								        """Return an iterator over the elements of this dataset.

								        Returns:
								            An iterator whose items are ``dict`` objects, one per element.

								        Raises:
								            NotImplementedError: Always, unless overridden by the subclass.
								        """
								        raise NotImplementedError("Please implement this method in the subclass.")


								class Parser(abc.ABC):
								    """Abstract base for file parsers.

								    Subclasses must override ``parse``; ``errors`` has a default that reports
								    no errors and may be overridden as well.
								    """

								    @property
								    def errors(self) -> List[FileParseException]:
								        """Return the parsing errors; this default yields a new empty list."""
								        return list()

								    @abc.abstractmethod
								    def parse(self, filename: str) -> Iterator[Dict[Any, Any]]:
								        """Parse the file called ``filename`` into an iterator of dictionaries.

								        Raises:
								            NotImplementedError: Always, unless overridden by the subclass.
								        """
								        raise NotImplementedError("Please implement this method in the subclass.")


								@dataclasses.dataclass

								class FileName:
								    """The names under which one file to import is known."""

								    # Passed to Parser.parse by Reader.__iter__ to read the file.
								    full_path: str

								    # Emitted by Reader.__iter__ under FILE_NAME_COLUMN for every row.
								    generated_name: str

								    # Emitted by Reader.__iter__ under UPLOAD_NAME_COLUMN for every row.
								    upload_name: str


								class Builder(abc.ABC):
								    """Abstract base for objects that turn a parsed row into a ``Record``."""

								    @abc.abstractmethod
								    def build(
								        self,
								        row: Dict[Any, Any],
								        filename: FileName,
								        line_num: int,
								    ) -> Record:
								        """Create the record that corresponds to one row dictionary.

								        Args:
								            row: The dictionary to build the record from.
								            filename: The file the row belongs to.
								            line_num: The line number associated with the row.

								        Raises:
								            NotImplementedError: Always, unless overridden by the subclass.
								        """
								        raise NotImplementedError("Please implement this method in the subclass.")


								class Reader(BaseReader):
								    """Reads a list of files with one parser and yields their rows.

								    Every yielded row is the parser's dictionary extended with the row's
								    line number, a fresh UUID and the generated and uploaded file names.
								    A ``FileParseException`` raised while a file is parsed is recorded and
								    reported through ``errors`` instead of aborting the whole read.
								    """

								    def __init__(self, filenames: List[FileName], parser: Parser):
								        """Store the files to read and the parser applied to each of them.

								        Args:
								            filenames: The files to read, in iteration order.
								            parser: Parser whose ``parse`` is called with each ``full_path``.
								        """
								        self.filenames = filenames
								        self.parser = parser
								        # Errors caught by the reader itself. They are never cleared, so
								        # iterating more than once accumulates them.
								        self._errors: List[FileParseException] = []

								    def __iter__(self) -> Iterator[Dict[Any, Any]]:
								        """Yield one dictionary per parsed row of every file.

								        Returns:
								            An iterator of dictionaries holding the metadata columns
								            (``LINE_NUM_COLUMN``, ``UUID_COLUMN``, ``FILE_NAME_COLUMN``,
								            ``UPLOAD_NAME_COLUMN``) followed by the parsed row's own keys.
								        """
								        for filename in self.filenames:
								            # The try has to enclose the parse call and the loop header:
								            # a lazy parser raises while rows are pulled from it, not at
								            # the yield statement, so a narrower try never sees the error.
								            try:
								                rows = self.parser.parse(filename.full_path)
								                for line_num, row in enumerate(rows, start=1):
								                    # NOTE: the row is unpacked last, so a parsed key that
								                    # equals a metadata key replaces the metadata value.
								                    yield {
								                        LINE_NUM_COLUMN: line_num,
								                        UUID_COLUMN: uuid.uuid4(),
								                        FILE_NAME_COLUMN: filename.generated_name,
								                        UPLOAD_NAME_COLUMN: filename.upload_name,
								                        **row,
								                    }
								            except FileParseException as e:
								                # Record the failure and carry on with the next file; the
								                # rows already yielded for this file stay valid.
								                self._errors.append(e)

								    def batch(self, batch_size: int) -> Iterator[pd.DataFrame]:
								        """Yield the rows grouped into data frames of ``batch_size`` rows.

								        The last frame holds the remaining rows and may be smaller. Nothing
								        is yielded when there are no rows. A ``batch_size`` below 1 never
								        matches the length check, so all rows end up in one frame.

								        Args:
								            batch_size: Number of rows per data frame.

								        Returns:
								            An iterator of ``pandas.DataFrame`` objects.
								        """
								        batch: List[Dict[Any, Any]] = []
								        for record in self:
								            batch.append(record)
								            if len(batch) == batch_size:
								                yield pd.DataFrame(batch)
								                batch = []
								        # Flush the incomplete trailing batch, if any.
								        if batch:
								            yield pd.DataFrame(batch)

								    @property
								    def errors(self) -> List[Any]:
								        """Return the parser's and the reader's errors, ordered by line number.

								        Returns:
								            The result of calling ``dict()`` on every error, sorted by the
								            error's ``line_num``. The sort is stable, so errors on the same
								            line keep their order (parser errors first).
								        """
								        # `+` builds a new list, so the parser's own list is left untouched.
								        combined = sorted(
								            self.parser.errors + self._errors,
								            key=lambda error: error.line_num,
								        )
								        return [error.dict() for error in combined]