You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
import abc import collections.abc import dataclasses import uuid from typing import Any, Dict, Iterator, List
import pandas as pd
from .exceptions import FileParseException
# Column names used for the records produced by this module.

# NOTE(review): the two defaults below are not referenced in the visible part
# of this file; presumably they name the text/label columns when the caller
# does not specify any -- confirm against the parser implementations.
DEFAULT_TEXT_COLUMN = "text"
DEFAULT_LABEL_COLUMN = "label"

# Metadata keys that Reader.__iter__ adds to every parsed row.
FILE_NAME_COLUMN = "filename"
UPLOAD_NAME_COLUMN = "upload_name"
UUID_COLUMN = "example_uuid"

# NOTE(review): not referenced in the visible part of this file; presumably
# holds the source line number of a row -- confirm against the parsers.
LINE_NUMBER_COLUMN = "#line_number"
class BaseReader(collections.abc.Iterable):
    """Abstract interface for objects that parse files into records.

    Iterating a reader yields one ``dict`` per record. Concrete subclasses
    must provide ``__iter__``, ``batch`` and the ``errors`` property; the
    bodies here only raise so that an accidental ``super()`` call fails loudly.
    """

    @abc.abstractmethod
    def __iter__(self) -> Iterator[Dict[Any, Any]]:
        """Iterate over the records of this dataset.

        Returns:
            An iterator yielding one ``dict`` per record.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError("Please implement this method in the subclass.")

    @abc.abstractmethod
    def batch(self, batch_size: int) -> Iterator[pd.DataFrame]:
        """Iterate over the records grouped into ``pandas.DataFrame`` chunks.

        Args:
            batch_size: Requested number of records per chunk.

        Returns:
            An iterator of DataFrames.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError("Please implement this method in the subclass.")

    @property
    @abc.abstractmethod
    def errors(self):
        """Errors reported by the reader.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError("Please implement this method in the subclass.")
class Parser(abc.ABC):
    """Abstract base class for file parsers.

    Subclasses must implement ``parse``. ``errors`` has a default that
    reports nothing and may be overridden.
    """

    @abc.abstractmethod
    def parse(self, filename: str) -> Iterator[Dict[Any, Any]]:
        """Parse one file into dictionaries.

        Args:
            filename: Identifies the file to parse.

        Returns:
            An iterator yielding one ``dict`` per parsed element.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError("Please implement this method in the subclass.")

    @property
    def errors(self) -> List[FileParseException]:
        """Parsing errors; this base implementation returns a new empty list."""
        return list()
@dataclasses.dataclass
class FileName:
    """The names associated with one file handed to a reader."""

    # Passed to Parser.parse when Reader iterates over this file.
    full_path: str
    # Stored under FILE_NAME_COLUMN in every record Reader yields for this file.
    generated_name: str
    # Stored under UPLOAD_NAME_COLUMN in every record Reader yields for this file.
    upload_name: str
class Reader(BaseReader):
    """Reads a list of files with one parser and yields rows tagged with file metadata."""

    def __init__(self, filenames: List[FileName], parser: Parser):
        """Store the files to read and the parser used to read them.

        Args:
            filenames: Files to read; they are visited in list order.
            parser: Parser whose ``parse`` is called with each file's
                ``full_path``.
        """
        self.parser = parser
        self.filenames = filenames

    def __iter__(self) -> Iterator[Dict[Any, Any]]:
        """Yield every parsed row of every file as a record ``dict``.

        Each record carries a fresh ``uuid.uuid4()`` under ``UUID_COLUMN`` and
        the file's generated/upload names, followed by the parsed row itself.
        The row is unpacked last, so a row key equal to one of the metadata
        column names replaces that metadata value.
        """
        for source in self.filenames:
            parsed_rows = self.parser.parse(source.full_path)
            # Identical for every row of this file, so build it once per file.
            file_columns = {
                FILE_NAME_COLUMN: source.generated_name,
                UPLOAD_NAME_COLUMN: source.upload_name,
            }
            for row in parsed_rows:
                yield {UUID_COLUMN: uuid.uuid4(), **file_columns, **row}

    def batch(self, batch_size: int) -> Iterator[pd.DataFrame]:
        """Yield the records in ``pandas.DataFrame`` chunks.

        Args:
            batch_size: Number of records per chunk. The check is an equality
                test, so a value below 1 never matches and all records end up
                in a single DataFrame.

        Yields:
            DataFrames of ``batch_size`` records, then one smaller DataFrame
            holding the remainder if there is any. Nothing is yielded when
            there are no records.
        """
        pending = []
        for record in self:
            pending.append(record)
            if len(pending) != batch_size:
                continue
            yield pd.DataFrame(pending)
            pending = []
        # Flush the records left over after the last full chunk.
        if pending:
            yield pd.DataFrame(pending)

    @property
    def errors(self) -> List[FileParseException]:
        """Return the errors reported by the underlying parser."""
        return self.parser.errors
|