Browse Source

Add datasets for upload

pull/1310/head
Hironsan 3 years ago
parent
commit
55f73e0cda
2 changed files with 133 additions and 0 deletions
  1. 0
      app/api/views/upload/__init__.py
  2. 133
      app/api/views/upload/dataset.py

0
app/api/views/upload/__init__.py

133
app/api/views/upload/dataset.py

@ -0,0 +1,133 @@
import csv
import json
from typing import Any, Dict, Iterator, List, Optional
import pyexcel
class Record:
def __init__(self,
filename: str,
data: str = '',
label: Any = None,
metadata: Optional[Dict] = None):
if metadata is None:
metadata = {}
self.filename = filename
self.data = data
self.label = label
self.metadata = metadata
class Dataset:
def __init__(self,
filenames: List[str],
column_data: str = 'text',
column_label: str = 'label',
**kwargs):
self.filenames = filenames
self.column_data = column_data
self.column_label = column_label
self.kwargs = kwargs
def __iter__(self) -> Iterator[Record]:
for filename in self.filenames:
yield from self.load(filename)
def load(self, filename: str) -> Iterator[Record]:
"""Loads a file content."""
raise NotImplementedError()
def from_row(self, filename: str, row: Dict) -> Record:
data = row.pop(self.column_data)
label = row.pop(self.column_label)
record = Record(filename=filename, data=data, label=label, metadata=row)
return record
class FileBaseDataset(Dataset):
def load(self, filename: str) -> Iterator[Record]:
record = Record(filename=filename, data=filename)
yield record
class TextFileDataset(Dataset):
def load(self, filename: str) -> Iterator[Record]:
with open(filename) as f:
record = Record(filename=filename, data=f.read())
yield record
class TextLineDataset(Dataset):
def load(self, filename: str) -> Iterator[Record]:
with open(filename) as f:
for line in f:
record = Record(filename=filename, data=line.rstrip())
yield record
class CsvDataset(Dataset):
def load(self, filename: str) -> Iterator[Record]:
with open(filename) as f:
delimiter = self.kwargs.get('delimiter', ',')
reader = csv.reader(f, delimiter=delimiter)
header = next(reader)
for row in reader:
row = dict(zip(header, row))
yield self.from_row(filename, row)
class JSONDataset(Dataset):
def load(self, filename: str) -> Iterator[Record]:
with open(filename) as f:
dataset = json.load(f)
for row in dataset:
yield self.from_row(filename, row)
class JSONLDataset(Dataset):
def load(self, filename: str) -> Iterator[Record]:
with open(filename) as f:
for line in f:
row = json.loads(line)
yield self.from_row(filename, row)
class ExcelDataset(Dataset):
def load(self, filename: str) -> Iterator[Record]:
records = pyexcel.iget_records(filename)
for row in records:
yield self.from_row(filename, row)
class FastTextDataset(Dataset):
def load(self, filename: str) -> Iterator[Record]:
with open(filename) as f:
for i, line in enumerate(f, start=1):
labels = []
tokens = []
for token in line.rstrip().split(' '):
if token.startswith('__label__'):
labels.append(token[len('__label__'):])
else:
tokens.append(token)
data = ' '.join(tokens)
record = Record(filename=filename, data=data, label=labels)
yield record
class ConllDataset(Dataset):
def load(self, filename: str) -> Iterator[Record]:
with open(filename) as f:
pass
Loading…
Cancel
Save