mirror of https://github.com/doccano/doccano.git
Hironsan
3 years ago
2 changed files with 133 additions and 0 deletions
Split View
Diff Options
@ -0,0 +1,133 @@ |
|||
import csv |
|||
import json |
|||
from typing import Any, Dict, Iterator, List, Optional |
|||
|
|||
import pyexcel |
|||
|
|||
|
|||
class Record:
    """A single imported example plus the file it came from.

    The constructor stores its arguments as public attributes:

    * ``filename`` -- name of the file the example was read from.
    * ``data`` -- the example payload (an empty string by default).
    * ``label`` -- the annotation attached to the example, or ``None``.
    * ``metadata`` -- additional key/value pairs; a new empty dict when
      the argument is omitted or ``None``.
    """

    def __init__(self,
                 filename: str,
                 data: str = '',
                 label: Any = None,
                 metadata: Optional[Dict] = None):
        self.filename = filename
        self.data = data
        self.label = label
        # Create the dict per instance so records never share one default.
        self.metadata = {} if metadata is None else metadata
|||
|
|||
|
|||
class Dataset:
    """Base class for iterating over the records of one or more files.

    Subclasses implement :meth:`load` for a concrete file format. Iterating
    over a dataset yields the records of every file, in ``filenames`` order.
    """

    def __init__(self,
                 filenames: List[str],
                 column_data: str = 'text',
                 column_label: str = 'label',
                 **kwargs):
        """Store the file list and the options used while loading.

        Args:
            filenames: Files to read, in iteration order.
            column_data: Key of the row field that holds the example data.
            column_label: Key of the row field that holds the label.
            **kwargs: Format-specific options, kept verbatim in
                ``self.kwargs`` for the loaders to consult.
        """
        self.filenames = filenames
        self.column_data = column_data
        self.column_label = column_label
        self.kwargs = kwargs

    def __iter__(self) -> Iterator[Record]:
        """Yield the records of each file in turn."""
        for filename in self.filenames:
            yield from self.load(filename)

    def load(self, filename: str) -> Iterator[Record]:
        """Loads a file content.

        Raises:
            NotImplementedError: always; subclasses must override this.
        """
        raise NotImplementedError()

    def from_row(self, filename: str, row: Dict) -> Record:
        """Build a record from a mapping of column names to values.

        The ``column_data`` field becomes the record's data, the
        ``column_label`` field its label, and every remaining field is
        kept as metadata.

        Args:
            filename: File the row was read from.
            row: Mapping of column name to value; it is not modified.

        Returns:
            The record built from ``row``. Its label is ``None`` when the
            row has no ``column_label`` field.

        Raises:
            KeyError: If ``row`` has no ``column_data`` field.
        """
        # Work on a copy so the caller's mapping is left untouched.
        fields = dict(row)
        data = fields.pop(self.column_data)
        # Unlabeled rows are legitimate input; fall back to "no label".
        label = fields.pop(self.column_label, None)
        return Record(filename=filename, data=data, label=label, metadata=fields)
|||
|
|||
|
|||
class FileBaseDataset(Dataset):
    """Dataset whose records point at files instead of holding their content."""

    def load(self, filename: str) -> Iterator[Record]:
        """Yield a single record whose data is the file name itself.

        The file is not opened.
        """
        yield Record(filename=filename, data=filename)
|||
|
|||
|
|||
class TextFileDataset(Dataset):
    """Dataset that turns each text file into exactly one record."""

    def load(self, filename: str) -> Iterator[Record]:
        """Yield one record holding the entire content of ``filename``."""
        with open(filename) as handle:
            content = handle.read()
            yield Record(filename=filename, data=content)
|||
|
|||
|
|||
class TextLineDataset(Dataset):
    """Dataset that produces one record per line of a text file."""

    def load(self, filename: str) -> Iterator[Record]:
        """Yield a record for every line, with trailing whitespace removed."""
        with open(filename) as handle:
            for raw_line in handle:
                stripped = raw_line.rstrip()
                yield Record(filename=filename, data=stripped)
|||
|
|||
|
|||
class CsvDataset(Dataset):
    """Dataset reading delimited text files whose first row is the header."""

    def load(self, filename: str) -> Iterator[Record]:
        """Yield one record per data row of ``filename``.

        The field separator is the ``delimiter`` option passed to the
        constructor (``','`` when absent). Each row is paired with the
        header names via ``zip``, so cells beyond the header length, and
        header names beyond the row length, are dropped.

        An empty file yields nothing, and rows without any cell (blank
        lines) are skipped.
        """
        delimiter = self.kwargs.get('delimiter', ',')
        # newline='' hands newline handling to the csv module, which it
        # needs to parse quoted fields containing line breaks correctly.
        with open(filename, newline='') as f:
            reader = csv.reader(f, delimiter=delimiter)
            # A bare next() raises StopIteration on an empty file, which a
            # generator converts into RuntimeError (PEP 479).
            header = next(reader, None)
            if header is None:
                return
            for values in reader:
                if not values:
                    # csv.reader returns [] for a blank line; there is
                    # nothing to build a record from.
                    continue
                yield self.from_row(filename, dict(zip(header, values)))
|||
|
|||
|
|||
class JSONDataset(Dataset):
    """Dataset reading a JSON document that contains a sequence of rows."""

    def load(self, filename: str) -> Iterator[Record]:
        """Parse the whole file, then yield one record per top-level item."""
        with open(filename) as handle:
            rows = json.load(handle)
            for item in rows:
                yield self.from_row(filename, item)
|||
|
|||
|
|||
class JSONLDataset(Dataset):
    """Dataset reading JSON Lines files: one JSON document per line."""

    def load(self, filename: str) -> Iterator[Record]:
        """Yield one record per non-blank line of ``filename``.

        Lines containing only whitespace are skipped.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        with open(filename) as f:
            for line in f:
                # json.loads rejects empty input, so a blank line (typically
                # a trailing one) would otherwise abort the whole import.
                if not line.strip():
                    continue
                yield self.from_row(filename, json.loads(line))
|||
|
|||
|
|||
class ExcelDataset(Dataset):
    """Dataset reading spreadsheet files through pyexcel."""

    def load(self, filename: str) -> Iterator[Record]:
        """Yield one record per row returned by ``pyexcel.iget_records``."""
        # The data source must be given by keyword: the first positional
        # parameter of iget_records is ``custom_headers``, not the path.
        # NOTE(review): pyexcel documents free_resources() for releasing
        # handles opened by the i* functions; it acts globally, so confirm
        # where it is safe to call before adding it here.
        records = pyexcel.iget_records(file_name=filename)
        for row in records:
            yield self.from_row(filename, row)
|||
|
|||
|
|||
class FastTextDataset(Dataset):
    """Dataset reading lines whose labels are ``__label__``-prefixed tokens."""

    def load(self, filename: str) -> Iterator[Record]:
        """Yield one record per line of ``filename``.

        Each line is stripped of trailing whitespace and split on single
        spaces. Tokens starting with ``__label__`` become labels, with the
        prefix removed; the remaining tokens are re-joined with single
        spaces to form the record's data.
        """
        prefix = '__label__'
        with open(filename) as handle:
            for raw_line in handle:
                words = raw_line.rstrip().split(' ')
                labels = [w[len(prefix):] for w in words if w.startswith(prefix)]
                text = ' '.join(w for w in words if not w.startswith(prefix))
                yield Record(filename=filename, data=text, label=labels)
|||
|
|||
|
|||
class ConllDataset(Dataset):
    """Placeholder for a CoNLL-format loader; parsing is not implemented."""

    def load(self, filename: str) -> Iterator[Record]:
        """Reject the call, because no CoNLL parser exists yet.

        Raises:
            NotImplementedError: always.
        """
        # Returning None from a stub makes ``yield from self.load(...)``
        # fail with an opaque TypeError; fail explicitly, as the base
        # class does.
        raise NotImplementedError('CoNLL loading is not implemented yet.')
Write
Preview
Loading…
Cancel
Save