You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

90 lines
2.6 KiB

import abc
import collections.abc
import dataclasses
import uuid
from typing import Any, Dict, Iterator, List
import pandas as pd
from .exceptions import FileParseException
# Column / key names shared by the readers in this module.
# NOTE(review): the two DEFAULT_* names are not referenced in this section;
# presumably they are the default text and label column names used by
# parsers or callers elsewhere -- confirm.
DEFAULT_TEXT_COLUMN = "text"
DEFAULT_LABEL_COLUMN = "label"
# Keys that Reader.__iter__ adds to every record it yields: the file's
# generated name, the file's upload name, and a fresh UUID respectively.
FILE_NAME_COLUMN = "filename"
UPLOAD_NAME_COLUMN = "upload_name"
UUID_COLUMN = "example_uuid"
# NOTE(review): not referenced in this section; presumably the key under which
# parsers record a row's source line number -- confirm against the parsers.
LINE_NUMBER_COLUMN = "#line_number"
class BaseReader(collections.abc.Iterable):
    """Abstract interface of a dataset reader.

    A concrete reader must be iterable over `dict` records, expose the
    errors it collected through `errors`, and be able to hand its records
    out as `pandas.DataFrame` batches through `batch`.
    """

    @abc.abstractmethod
    def __iter__(self) -> Iterator[Dict[Any, Any]]:
        """Iterates over the elements of this dataset.

        Returns:
            An iterator whose items are `dict` records.
        """
        raise NotImplementedError("Please implement this method in the subclass.")

    @property
    @abc.abstractmethod
    def errors(self):
        """Errors collected by the reader; subclasses define what is reported."""
        raise NotImplementedError("Please implement this method in the subclass.")

    @abc.abstractmethod
    def batch(self, batch_size: int) -> Iterator[pd.DataFrame]:
        """Iterates over the dataset as DataFrames.

        Args:
            batch_size: Requested number of records per DataFrame.

        Returns:
            An iterator of `pandas.DataFrame` objects.
        """
        raise NotImplementedError("Please implement this method in the subclass.")
class Parser(abc.ABC):
    """Abstract base class of file parsers.

    Subclasses must implement `parse`; `errors` has a default implementation
    that reports nothing.
    """

    @property
    def errors(self) -> List[FileParseException]:
        """Returns parsing errors.

        The base implementation has none to report and returns a new empty
        list on every access.
        """
        return []

    @abc.abstractmethod
    def parse(self, filename: str) -> Iterator[Dict[Any, Any]]:
        """Parses the given file into dictionaries.

        Args:
            filename: Name of the file to parse.

        Returns:
            An iterator of `dict` objects produced from the file.
        """
        raise NotImplementedError("Please implement this method in the subclass.")
@dataclasses.dataclass
class FileName:
    """The three names under which a single input file is known."""

    # Path that Reader passes to Parser.parse.
    full_path: str
    # Name that Reader stores under FILE_NAME_COLUMN in every record.
    generated_name: str
    # Name that Reader stores under UPLOAD_NAME_COLUMN in every record.
    upload_name: str
class Reader(BaseReader):
    """Reads records from a list of files through a single parser.

    Iterating yields one `dict` per parsed row, tagged with a fresh UUID and
    with the generated and upload names of the file the row came from.
    `batch` groups those records into `pandas.DataFrame` objects.
    """

    def __init__(self, filenames: List[FileName], parser: Parser):
        """Stores the files to read and the parser to read them with.

        Args:
            filenames: Files to parse, visited in list order.
            parser: Parser applied to each file's `full_path`.
        """
        self.filenames = filenames
        self.parser = parser

    def __iter__(self) -> Iterator[Dict[Any, Any]]:
        """Yields one record per row parsed from each file, in order.

        Yields:
            The parsed row plus the `UUID_COLUMN`, `FILE_NAME_COLUMN` and
            `UPLOAD_NAME_COLUMN` keys.
        """
        for filename in self.filenames:
            for row in self.parser.parse(filename.full_path):
                yield {
                    UUID_COLUMN: uuid.uuid4(),
                    FILE_NAME_COLUMN: filename.generated_name,
                    UPLOAD_NAME_COLUMN: filename.upload_name,
                    # Unpacked last, so a row key that collides with one of
                    # the metadata keys above takes precedence.
                    **row,
                }

    def batch(self, batch_size: int) -> Iterator[pd.DataFrame]:
        """Groups the records into DataFrames of at most `batch_size` rows.

        Args:
            batch_size: Number of records per DataFrame; must be at least 1.

        Yields:
            One DataFrame per full batch, followed by a final smaller one
            when records are left over.

        Raises:
            ValueError: If `batch_size` is less than 1. Because this method
                is a generator, the error surfaces when iteration starts.
        """
        if batch_size < 1:
            # A non-positive size can never equal the buffer length below, so
            # every record would silently pile up in a single DataFrame.
            raise ValueError(
                f"batch_size must be a positive integer, got {batch_size!r}."
            )
        records = []
        for record in self:
            records.append(record)
            if len(records) == batch_size:
                yield pd.DataFrame(records)
                records = []
        # Flush the final, partially filled batch.
        if records:
            yield pd.DataFrame(records)

    @property
    def errors(self) -> List[FileParseException]:
        """Returns the errors reported by the underlying parser."""
        return self.parser.errors