Browse Source

Remove unnecessary code

pull/1619/head
Hironsan 3 years ago
parent
commit
24ce34f7f2
7 changed files with 1 addition and 543 deletions
  1. 48
      backend/api/tests/upload/test_conll.py
  2. 56
      backend/api/tests/upload/test_csv.py
  3. 41
      backend/api/tests/upload/test_dataset.py
  4. 36
      backend/api/tests/upload/test_fasttext.py
  5. 346
      backend/api/views/upload/dataset.py
  6. 16
      backend/api/views/upload/exception.py
  7. 1
      backend/api/views/upload/parsers.py

48
backend/api/tests/upload/test_conll.py

@@ -1,48 +0,0 @@
import os
import shutil
import tempfile
import unittest
from ...views.upload.data import TextData
from ...views.upload.dataset import CoNLLDataset
from ...views.upload.label import OffsetLabel
class TestCoNLLDataset(unittest.TestCase):
    """Tests loading of CoNLL-formatted (token<TAB>tag) files."""

    def setUp(self):
        # A throwaway directory keeps each test's file isolated.
        self.test_dir = tempfile.mkdtemp()
        self.test_file = os.path.join(self.test_dir, 'test_file.txt')
        self.content = """EU\tB-ORG
rejects\tO
German\tB-MISC
call\tO
to\tO
boycott\tO
British\tB-MISC
lamb\tO
.\tO
Peter\tB-PER
Blackburn\tI-PER
"""

    def tearDown(self):
        shutil.rmtree(self.test_dir)

    def create_file(self, encoding=None):
        with open(self.test_file, 'w', encoding=encoding) as f:
            f.write(self.content)

    def test_can_load(self):
        self.create_file()
        dataset = CoNLLDataset(
            filenames=[self.test_file],
            label_class=OffsetLabel,
            data_class=TextData
        )
        # Tokens of a sentence are rejoined with spaces into one text.
        record = next(dataset.load(self.test_file))
        expected = 'EU rejects German call to boycott British lamb .'
        self.assertEqual(record.data.text, expected)

56
backend/api/tests/upload/test_csv.py

@@ -1,56 +0,0 @@
import os
import shutil
import tempfile
import unittest
from ...views.upload.data import TextData
from ...views.upload.dataset import CsvDataset
from ...views.upload.label import CategoryLabel
class TestCsvDataset(unittest.TestCase):
    """Tests CSV parsing: delimiters, custom columns, and short rows."""

    def setUp(self):
        self.test_dir = tempfile.mkdtemp()
        self.test_file = os.path.join(self.test_dir, 'test_file.csv')

    def tearDown(self):
        shutil.rmtree(self.test_dir)

    def create_file(self, content):
        with open(self.test_file, 'w') as f:
            f.write(content)

    def assert_record(self, content, dataset, data='Text', label=None):
        # None stands in for the common one-label expectation
        # (avoids a mutable default argument).
        if label is None:
            label = [{'text': 'Label'}]
        self.create_file(content)
        record = next(dataset.load(self.test_file))
        self.assertEqual(record.data.text, data)
        self.assertEqual(record.label, label)

    def test_can_load_default_column_names(self):
        dataset = CsvDataset(filenames=[], label_class=CategoryLabel, data_class=TextData)
        self.assert_record('label,text\nLabel,Text', dataset)

    def test_can_change_delimiter(self):
        dataset = CsvDataset(filenames=[], label_class=CategoryLabel, data_class=TextData, delimiter='\t')
        self.assert_record('label\ttext\nLabel\tText', dataset)

    def test_can_specify_column_name(self):
        dataset = CsvDataset(filenames=[], label_class=CategoryLabel, data_class=TextData,
                             column_data='body', column_label='star')
        self.assert_record('star,body\nLabel,Text', dataset)

    def test_can_load_only_text_column(self):
        # No column named 'label' means no labels at all.
        dataset = CsvDataset(filenames=[], label_class=CategoryLabel, data_class=TextData)
        self.assert_record('star,text\nLabel,Text', dataset, label=[])

    def test_does_not_match_column_and_row(self):
        # A row shorter than the header yields no label value.
        dataset = CsvDataset(filenames=[], label_class=CategoryLabel, data_class=TextData)
        self.assert_record('text,label\nText', dataset, label=[])

41
backend/api/tests/upload/test_dataset.py

@@ -1,41 +0,0 @@
import os
import shutil
import tempfile
import unittest
from ...views.upload.data import TextData
from ...views.upload.dataset import Dataset
from ...views.upload.label import Label
class TestDataset(unittest.TestCase):
    """Tests encoding handling of the base Dataset loader."""

    def setUp(self):
        self.test_dir = tempfile.mkdtemp()
        self.test_file = os.path.join(self.test_dir, 'test_file.txt')
        self.content = 'こんにちは、世界!'

    def tearDown(self):
        shutil.rmtree(self.test_dir)

    def create_file(self, encoding=None):
        with open(self.test_file, 'w', encoding=encoding) as f:
            f.write(self.content)

    def _make_dataset(self, **kwargs):
        # All cases use the same classes; only the encoding varies.
        return Dataset(filenames=[], label_class=Label, data_class=TextData, **kwargs)

    def test_can_load_utf8(self):
        self.create_file()
        record = next(self._make_dataset().load(self.test_file))
        self.assertEqual(record.data.filename, self.test_file)

    def test_cannot_load_shiftjis_without_specifying_encoding(self):
        self.create_file('shift_jis')
        with self.assertRaises(UnicodeDecodeError):
            next(self._make_dataset().load(self.test_file))

    def test_can_load_shiftjis_with_specifying_encoding(self):
        self.create_file('shift_jis')
        record = next(self._make_dataset(encoding='shift_jis').load(self.test_file))
        self.assertEqual(record.data.filename, self.test_file)

36
backend/api/tests/upload/test_fasttext.py

@@ -1,36 +0,0 @@
import os
import shutil
import tempfile
import unittest
from ...views.upload.data import TextData
from ...views.upload.dataset import FastTextDataset
from ...views.upload.label import CategoryLabel
class TestFastTextDataset(unittest.TestCase):
    """Tests parsing of fastText-style `__label__X ... text` lines."""

    def setUp(self):
        self.test_dir = tempfile.mkdtemp()
        self.test_file = os.path.join(self.test_dir, 'test_file.txt')

    def tearDown(self):
        shutil.rmtree(self.test_dir)

    def create_file(self, content):
        with open(self.test_file, 'w') as f:
            f.write(content)

    def assert_record(self, content, dataset, data='Text', label=None):
        # None stands in for the common one-label expectation
        # (avoids a mutable default argument).
        if label is None:
            label = [{'text': 'Label'}]
        self.create_file(content)
        record = next(dataset.load(self.test_file))
        self.assertEqual(record.data.text, data)
        self.assertEqual(record.label, label)

    def test_can_load_default_column_names(self):
        dataset = FastTextDataset(filenames=[], label_class=CategoryLabel, data_class=TextData)
        # Multiple __label__ prefixes on one line each become a label.
        self.assert_record('__label__sauce __label__cheese Text',
                           dataset,
                           label=[{'text': 'sauce'}, {'text': 'cheese'}])

346
backend/api/views/upload/dataset.py

@@ -1,346 +0,0 @@
import csv
import io
import json
import os
from typing import Any, Dict, Iterator, List, Optional, Type
import chardet
import pyexcel
import pyexcel.exceptions
from chardet.universaldetector import UniversalDetector
from pydantic import ValidationError
from seqeval.scheme import BILOU, IOB2, IOBES, IOE2, Tokens
from .cleaners import Cleaner
from .data import BaseData
from .exception import FileParseException, FileParseExceptions
from .label import Label
class Record:
    """A single parsed example: its data, labels, metadata and source line."""

    def __init__(self,
                 data: Type[BaseData],
                 label: List[Label] = None,
                 meta: Dict[Any, Any] = None,
                 line_num: int = -1):
        self._data = data
        self._label = [] if label is None else label
        self._meta = {} if meta is None else meta
        self._line_num = line_num

    def __str__(self):
        return f'{self._data}\t{self._label}'

    def clean(self, cleaner: Cleaner):
        """Run the labels through *cleaner*; raise if anything was dropped."""
        cleaned = cleaner.clean(self._label)
        # NOTE: compared against the `label` property (named labels only),
        # not the raw list — preserved from the original implementation.
        dropped = len(cleaned) != len(self.label)
        self._label = cleaned
        if dropped:
            raise FileParseException(
                filename=self._data.filename,
                line_num=self._line_num,
                message=cleaner.message
            )

    @property
    def data(self):
        return self._data

    def create_data(self, project):
        return self._data.create(project, self._meta)

    def create_label(self, project):
        return [label.create(project) for label in self._label]

    def create_annotation(self, user, example, mapping):
        return [label.create_annotation(user, example, mapping) for label in self._label]

    @property
    def label(self):
        """Named labels as serializable dicts; unnamed labels are skipped."""
        return [{'text': label.name}
                for label in self._label
                if label.has_name() and label.name]
class Dataset:
    """Base loader: iterates over files and turns their content into Records.

    Subclasses override ``load`` to parse specific formats; the helpers
    ``detect_encoding`` and ``from_row`` are shared by several of them.
    """

    def __init__(self,
                 filenames: List[str],
                 data_class: Type[BaseData],
                 label_class: Type[Label],
                 encoding: Optional[str] = None,
                 **kwargs):
        self.filenames = filenames
        self.data_class = data_class
        self.label_class = label_class
        # 'Auto' is a sentinel meaning "detect with chardet"; anything
        # else (including None) is passed straight to open().
        self.encoding = encoding
        # Format-specific options (delimiter, column_data, column_label, ...).
        self.kwargs = kwargs

    def __iter__(self) -> Iterator[Record]:
        errors = []
        for filename in self.filenames:
            try:
                yield from self.load(filename)
            except (UnicodeDecodeError, FileParseException) as err:
                # A whole-file failure aborts iteration immediately.
                message = str(err)
                raise FileParseException(filename, line_num=-1, message=message)
            except FileParseExceptions as err:
                # Per-line failures are collected; continue with the next file.
                errors.extend(err.exceptions)
        if errors:
            raise FileParseExceptions(errors)

    def load(self, filename: str) -> Iterator[Record]:
        """Loads a file content."""
        encoding = self.detect_encoding(filename)
        with open(filename, encoding=encoding) as f:
            data = self.data_class.parse(filename=filename, text=f.read())
            record = Record(data=data)
            yield record

    def detect_encoding(self, filename: str, buffer_size=io.DEFAULT_BUFFER_SIZE):
        """Return the encoding to open *filename* with.

        Anything other than the sentinel 'Auto' is returned as-is;
        otherwise the encoding is guessed with chardet, falling back
        to utf-8 when detection is inconclusive.
        """
        if self.encoding != 'Auto':
            return self.encoding
        # For a small file: one-shot detection over the whole content.
        if os.path.getsize(filename) < buffer_size:
            # Use a context manager so the handle is closed (it leaked before).
            with open(filename, 'rb') as f:
                detected = chardet.detect(f.read())
            # chardet reports {'encoding': None} when it has no guess, so a
            # dict default alone is not enough — coalesce None to utf-8.
            return detected.get('encoding') or 'utf-8'
        # For a large file: feed chunks until the detector is confident.
        detector = UniversalDetector()
        with open(filename, 'rb') as f:
            while True:
                binary = f.read(buffer_size)
                detector.feed(binary)
                if binary == b'' or detector.done:
                    break
        # close() finalizes the result even when the detector never reached
        # a confident state mid-stream (per the chardet API).
        detector.close()
        return detector.result['encoding'] or 'utf-8'

    def from_row(self, filename: str, row: Dict, line_num: int) -> Record:
        """Build a Record from one mapping-shaped row (CSV/JSON/Excel).

        Raises FileParseException when the data column is missing or the
        text is empty; unparsable labels are silently dropped.
        """
        column_data = self.kwargs.get('column_data', 'text')
        if column_data not in row:
            message = f'{column_data} does not exist.'
            raise FileParseException(filename, line_num, message)
        text = row.pop(column_data)
        label = row.pop(self.kwargs.get('column_label', 'label'), [])
        # A single string label is promoted to a one-element list.
        label = [label] if isinstance(label, str) else label
        try:
            label = [self.label_class.parse(o) for o in label]
        except (ValidationError, TypeError):
            # Best-effort: bad labels don't fail the row.
            label = []
        try:
            data = self.data_class.parse(text=text, filename=filename)
        except ValidationError:
            message = 'The empty text is not allowed.'
            raise FileParseException(filename, line_num, message)
        # Whatever remains in the row becomes metadata.
        record = Record(data=data, label=label, line_num=line_num, meta=row)
        return record
class FileBaseDataset(Dataset):
    """Dataset whose records carry only the file path, not its content."""

    def load(self, filename: str) -> Iterator[Record]:
        parsed = self.data_class.parse(filename=filename)
        yield Record(data=parsed)
class TextFileDataset(Dataset):
    """One record per file: the whole file content becomes the text."""

    def load(self, filename: str) -> Iterator[Record]:
        codec = self.detect_encoding(filename)
        with open(filename, encoding=codec) as f:
            parsed = self.data_class.parse(filename=filename, text=f.read())
            yield Record(data=parsed)
class TextLineDataset(Dataset):
    """One record per line; empty lines are collected as parse errors."""

    def load(self, filename: str) -> Iterator[Record]:
        codec = self.detect_encoding(filename)
        failures = []
        with open(filename, encoding=codec) as f:
            for line_num, line in enumerate(f, start=1):
                try:
                    parsed = self.data_class.parse(filename=filename, text=line.rstrip())
                except ValidationError:
                    message = 'The empty text is not allowed.'
                    failures.append(FileParseException(filename, line_num, message))
                else:
                    yield Record(data=parsed, line_num=line_num)
        # All bad lines are reported together, after the good ones.
        if failures:
            raise FileParseExceptions(failures)
class CsvDataset(Dataset):
    """One record per CSV row; the first row is treated as the header."""

    def load(self, filename: str) -> Iterator[Record]:
        codec = self.detect_encoding(filename)
        failures = []
        with open(filename, encoding=codec) as f:
            sep = self.kwargs.get('delimiter', ',')
            reader = csv.reader(f, delimiter=sep)
            header = next(reader)
            column_data = self.kwargs.get('column_data', 'text')
            # Without the data column there is nothing to import at all.
            if column_data not in header:
                message = f'Column `{column_data}` does not exist in the header: {header}'
                raise FileParseException(filename, 1, message)
            # Data rows start on line 2, right after the header.
            for line_num, values in enumerate(reader, start=2):
                try:
                    yield self.from_row(filename, dict(zip(header, values)), line_num)
                except FileParseException as err:
                    failures.append(err)
        if failures:
            raise FileParseExceptions(failures)
class JSONDataset(Dataset):
    """Loads a whole JSON array; each element becomes one record."""

    def load(self, filename: str) -> Iterator[Record]:
        codec = self.detect_encoding(filename)
        with open(filename, encoding=codec) as f:
            try:
                rows = json.load(f)
                for line_num, row in enumerate(rows, start=1):
                    yield self.from_row(filename, row, line_num)
            except json.decoder.JSONDecodeError:
                # The file as a whole is unreadable; no line to point at.
                message = 'Failed to decode the json file.'
                raise FileParseException(filename, line_num=-1, message=message)
class JSONLDataset(Dataset):
    """JSON Lines: every line is an independent JSON object."""

    def load(self, filename: str) -> Iterator[Record]:
        codec = self.detect_encoding(filename)
        failures = []
        with open(filename, encoding=codec) as f:
            for line_num, line in enumerate(f, start=1):
                try:
                    row = json.loads(line)
                except json.decoder.JSONDecodeError:
                    message = 'Failed to decode the line.'
                    failures.append(FileParseException(filename, line_num, message))
                else:
                    yield self.from_row(filename, row, line_num)
        if failures:
            raise FileParseExceptions(failures)
class ExcelDataset(Dataset):
    """Loads spreadsheet rows via pyexcel; each row becomes one record."""

    def load(self, filename: str) -> Iterator[Record]:
        # iget_records is lazy, so format errors surface while iterating.
        rows = pyexcel.iget_records(file_name=filename)
        failures = []
        try:
            for line_num, row in enumerate(rows, start=1):
                try:
                    yield self.from_row(filename, row, line_num)
                except FileParseException as err:
                    failures.append(err)
        except pyexcel.exceptions.FileTypeNotSupported:
            message = 'This file type is not supported.'
            raise FileParseException(filename, line_num=-1, message=message)
        if failures:
            raise FileParseExceptions(failures)
class FastTextDataset(Dataset):
    """Parses fastText classification files: `__label__X ... text` per line.

    Tokens starting with `__label__` become labels; the remaining tokens,
    joined by spaces, become the text. Per-line failures are collected and
    raised together as FileParseExceptions after the good records.
    """

    def load(self, filename: str) -> Iterator[Record]:
        encoding = self.detect_encoding(filename)
        errors = []
        with open(filename, encoding=encoding) as f:
            for line_num, line in enumerate(f, start=1):
                labels = []
                tokens = []
                malformed = False
                for token in line.rstrip().split(' '):
                    if token.startswith('__label__'):
                        if token == '__label__':
                            message = 'Label name is empty.'
                            errors.append(FileParseException(filename, line_num, message))
                            malformed = True
                            break
                        label_name = token[len('__label__'):]
                        labels.append(self.label_class.parse(label_name))
                    else:
                        tokens.append(token)
                # BUG FIX: the original `break` only left the token loop, so a
                # line with an empty label name still yielded a half-parsed
                # Record alongside its error. Skip the whole line instead.
                if malformed:
                    continue
                text = ' '.join(tokens)
                try:
                    data = self.data_class.parse(filename=filename, text=text)
                    record = Record(data=data, label=labels, line_num=line_num)
                    yield record
                except ValidationError:
                    message = 'The empty text is not allowed.'
                    errors.append(FileParseException(filename, line_num, message))
        if errors:
            raise FileParseExceptions(errors)
class CoNLLDataset(Dataset):
    """Parses CoNLL-style files: one `word<TAB>tag` pair per line, with
    sentences separated by blank lines. Each sentence becomes one Record
    whose labels carry character offsets into the space-joined text."""

    def load(self, filename: str) -> Iterator[Record]:
        encoding = self.detect_encoding(filename)
        with open(filename, encoding=encoding) as f:
            words, tags = [], []
            for line_num, line in enumerate(f, start=1):
                line = line.rstrip()
                if line:
                    tokens = line.split('\t')
                    if len(tokens) != 2:
                        message = 'A line must be separated by tab and has two columns.'
                        raise FileParseException(filename, line_num, message)
                    word, tag = tokens
                    words.append(word)
                    tags.append(tag)
                else:
                    # A blank line closes the current sentence.
                    yield self.create_record(filename, tags, words)
                    words, tags = [], []
            # Flush a trailing sentence that is not followed by a blank line.
            if words:
                yield self.create_record(filename, tags, words)

    def create_record(self, filename, tags, words):
        # The delimiter used to join words here must match the one used for
        # offset computation in get_label, or the spans will be wrong.
        delimiter = self.kwargs.get('delimiter', ' ')
        text = delimiter.join(words)
        data = self.data_class.parse(filename=filename, text=text)
        labels = self.get_label(words, tags, delimiter)
        record = Record(data=data, label=labels)
        return record

    def get_scheme(self, scheme: str):
        """Map a scheme name to its seqeval tagging-scheme class.

        Raises KeyError for an unknown scheme name."""
        mapping = {
            'IOB2': IOB2,
            'IOE2': IOE2,
            'IOBES': IOBES,
            'BILOU': BILOU
        }
        return mapping[scheme]

    def get_label(self, words: List[str], tags: List[str], delimiter: str) -> List[Label]:
        """Convert the tag sequence into character-offset labels over the
        delimiter-joined text (offsets are in characters, not tokens)."""
        scheme = self.get_scheme(self.kwargs.get('scheme', 'IOB2'))
        tokens = Tokens(tags, scheme)
        labels = []
        for entity in tokens.entities:
            # Characters occupied by everything before the entity...
            text = delimiter.join(words[:entity.start])
            # ...plus one delimiter, unless the entity starts the sentence.
            start = len(text) + len(delimiter) if text else len(text)
            chunk = words[entity.start: entity.end]
            text = delimiter.join(chunk)
            end = start + len(text)
            labels.append(self.label_class.parse((start, end, entity.tag)))
        return labels

16
backend/api/views/upload/exception.py

@@ -1,6 +1,3 @@
from typing import List
class FileParseException(Exception):
def __init__(self, filename: str, line_num: int, message: str):
@@ -17,16 +14,3 @@ class FileParseException(Exception):
'line': self.line_num,
'message': self.message
}
class FileParseExceptions(Exception):
    """Aggregates the per-line FileParseExceptions raised while loading."""

    def __init__(self, exceptions: List[FileParseException]):
        self.exceptions = exceptions

    def __str__(self) -> str:
        return f'ParseErrors: you failed to parse {len(self.exceptions)} lines.'

    def __iter__(self) -> FileParseException:
        # Iterating yields each wrapped error's dict form, ready to serialize.
        for exc in self.exceptions:
            yield exc.dict()

1
backend/api/views/upload/parsers.py

@@ -6,6 +6,7 @@ from typing import Any, Dict, Iterator, List, Tuple
import chardet
import pyexcel
import pyexcel.exceptions
from chardet import UniversalDetector
from seqeval.scheme import BILOU, IOB2, IOBES, IOE2, Tokens

Loading…
Cancel
Save