"""Dataset readers for uploaded files (``app/api/views/upload/dataset.py``).

Reconstructed from the patch "Add datasets for upload" (commit
55f73e0cda3cb146af4a317c81c5e267f4c96cc7, Hironsan, 2021-04-02), which also
adds an empty ``app/api/views/upload/__init__.py``.

Every ``Dataset`` subclass turns one file format into a stream of ``Record``
objects; iterating a dataset walks its files in order.
"""
import csv
import json
from typing import Any, Dict, Iterator, List, Optional


class Record:
    """One example read from a file.

    Args:
        filename: Path of the file the example came from.
        data: The example's content (text, or a path for ``FileBaseDataset``).
        label: The label value read alongside the data, if any.
        metadata: Remaining columns/keys of the row; defaults to a new dict.
    """

    def __init__(self,
                 filename: str,
                 data: str = '',
                 label: Any = None,
                 metadata: Optional[Dict] = None):
        if metadata is None:
            # A fresh dict per record, never a shared mutable default.
            metadata = {}
        self.filename = filename
        self.data = data
        self.label = label
        self.metadata = metadata

    def __repr__(self) -> str:
        return (f'{type(self).__name__}(filename={self.filename!r}, '
                f'data={self.data!r}, label={self.label!r}, '
                f'metadata={self.metadata!r})')


class Dataset:
    """Base class: iterates over ``filenames`` and delegates to ``load``.

    Args:
        filenames: Paths of the files to read.
        column_data: Key/column that holds the example's data.
        column_label: Key/column that holds the example's label.
        **kwargs: Reader options. ``encoding`` (default ``'utf-8'``) is used
            by every reader that opens a text file; ``delimiter`` (default
            ``','``) is used by ``CsvDataset``.
    """

    def __init__(self,
                 filenames: List[str],
                 column_data: str = 'text',
                 column_label: str = 'label',
                 **kwargs):
        self.filenames = filenames
        self.column_data = column_data
        self.column_label = column_label
        self.kwargs = kwargs

    @property
    def encoding(self) -> str:
        """Text encoding for reading files.

        Defaults to UTF-8 so decoding does not depend on the process locale.
        """
        return self.kwargs.get('encoding', 'utf-8')

    def __iter__(self) -> Iterator[Record]:
        for filename in self.filenames:
            yield from self.load(filename)

    def load(self, filename: str) -> Iterator[Record]:
        """Loads a file content."""
        raise NotImplementedError()

    def from_row(self, filename: str, row: Dict) -> Record:
        """Build a ``Record`` from a mapping of column name to value.

        The data column is required; the label column is optional and yields
        ``label=None`` when absent. All other entries become the metadata.

        Raises:
            KeyError: If ``column_data`` is not present in ``row``.
        """
        # Work on a copy so the caller's mapping is left untouched.
        fields = dict(row)
        data = fields.pop(self.column_data)
        label = fields.pop(self.column_label, None)
        return Record(filename=filename, data=data, label=label, metadata=fields)


class FileBaseDataset(Dataset):
    """Yields one record per file whose data is the file path itself."""

    def load(self, filename: str) -> Iterator[Record]:
        yield Record(filename=filename, data=filename)


class TextFileDataset(Dataset):
    """Yields one record per file containing the whole file content."""

    def load(self, filename: str) -> Iterator[Record]:
        with open(filename, encoding=self.encoding) as f:
            yield Record(filename=filename, data=f.read())


class TextLineDataset(Dataset):
    """Yields one record per line, with trailing whitespace stripped."""

    def load(self, filename: str) -> Iterator[Record]:
        with open(filename, encoding=self.encoding) as f:
            for line in f:
                yield Record(filename=filename, data=line.rstrip())


class CsvDataset(Dataset):
    """Reads delimited text whose first row is the header."""

    def load(self, filename: str) -> Iterator[Record]:
        delimiter = self.kwargs.get('delimiter', ',')
        # newline='' lets the csv module handle newlines inside quoted fields.
        with open(filename, encoding=self.encoding, newline='') as f:
            reader = csv.reader(f, delimiter=delimiter)
            # A bare next() on an empty file would leak StopIteration out of
            # this generator, which Python turns into a RuntimeError.
            header = next(reader, None)
            if header is None:
                return
            for values in reader:
                if not values:
                    # csv.reader returns [] for blank lines; nothing to load.
                    continue
                yield self.from_row(filename, dict(zip(header, values)))


class JSONDataset(Dataset):
    """Reads a JSON document and treats each element as one row."""

    def load(self, filename: str) -> Iterator[Record]:
        with open(filename, encoding=self.encoding) as f:
            dataset = json.load(f)
            for row in dataset:
                yield self.from_row(filename, row)


class JSONLDataset(Dataset):
    """Reads JSON Lines: one JSON object per non-blank line."""

    def load(self, filename: str) -> Iterator[Record]:
        with open(filename, encoding=self.encoding) as f:
            for line in f:
                if not line.strip():
                    # Blank (often trailing) lines are not valid JSON.
                    continue
                yield self.from_row(filename, json.loads(line))


class ExcelDataset(Dataset):
    """Reads spreadsheet rows through ``pyexcel``."""

    def load(self, filename: str) -> Iterator[Record]:
        # Imported here so the module is usable without pyexcel unless an
        # Excel file is actually loaded.
        import pyexcel

        # The path must be passed as ``file_name=``; the first positional
        # parameter of iget_records is ``custom_headers``, not the source.
        # NOTE(review): pyexcel documents ``pyexcel.free_resources()`` for
        # closing handles opened by ``iget_*``; it is not called here because
        # it appears to be process-wide -- confirm before adding.
        for row in pyexcel.iget_records(file_name=filename):
            yield self.from_row(filename, row)


class FastTextDataset(Dataset):
    """Reads fastText-style lines: ``__label__x`` tokens plus the text."""

    LABEL_PREFIX = '__label__'

    def load(self, filename: str) -> Iterator[Record]:
        prefix = self.LABEL_PREFIX
        with open(filename, encoding=self.encoding) as f:
            for line in f:
                labels = []
                tokens = []
                for token in line.rstrip().split(' '):
                    if token.startswith(prefix):
                        labels.append(token[len(prefix):])
                    else:
                        tokens.append(token)
                yield Record(filename=filename,
                             data=' '.join(tokens),
                             label=labels)


class ConllDataset(Dataset):
    """Placeholder for CoNLL files; parsing is not implemented yet."""

    def load(self, filename: str) -> Iterator[Record]:
        # Fail explicitly, like the base class, rather than returning None
        # and breaking ``yield from`` in ``__iter__`` with a TypeError.
        raise NotImplementedError('CoNLL loading is not implemented.')