Browse Source

Enable ingesting error-free lines even if an exception occurs while parsing other lines

pull/1573/head
Hironsan 3 years ago
parent
commit
b81c5e4aab
4 changed files with 58 additions and 11 deletions
  1. 5
      backend/api/tasks.py
  2. 2
      backend/api/tests/data/seq2seq/example.csv
  3. 46
      backend/api/views/upload/dataset.py
  4. 16
      backend/api/views/upload/exception.py

5
backend/api/tasks.py

@ -10,7 +10,7 @@ from django.shortcuts import get_object_or_404
from .models import Example, Label, Project
from .views.download.factory import create_repository, create_writer
from .views.download.service import ExportApplicationService
from .views.upload.exception import FileParseException
from .views.upload.exception import FileParseException, FileParseExceptions
from .views.upload.factory import (get_data_class, get_dataset_class,
get_label_class)
from .views.upload.utils import append_field
@ -117,6 +117,9 @@ def ingest_data(user_id, project_id, filenames, format: str, **kwargs):
except FileParseException as err:
response['error'].append(err.dict())
continue
except FileParseExceptions as err:
response['error'].extend(list(err))
continue
buffer.add(example)
if buffer.is_full():

2
backend/api/tests/data/seq2seq/example.csv

@ -1,5 +1,5 @@
text,label
,label2
exampleA,label1
exampleB,
,label2
,

46
backend/api/views/upload/dataset.py

@ -12,7 +12,7 @@ from pydantic import ValidationError
from seqeval.scheme import BILOU, IOB2, IOBES, IOE2, Tokens
from .data import BaseData
from .exception import FileParseException
from .exception import FileParseException, FileParseExceptions
from .label import Label
from .labels import Labels
@ -64,12 +64,17 @@ class Dataset:
self.kwargs = kwargs
def __iter__(self) -> Iterator[Record]:
errors = []
for filename in self.filenames:
try:
yield from self.load(filename)
except (UnicodeDecodeError, ValidationError) as err:
except (UnicodeDecodeError, FileParseException) as err:
message = str(err)
raise FileParseException(filename, line_num=-1, message=message)
except FileParseExceptions as err:
errors.extend(err.exceptions)
if errors:
raise FileParseExceptions(errors)
def load(self, filename: str) -> Iterator[Record]:
"""Loads a file content."""
@ -159,6 +164,7 @@ class CsvDataset(Dataset):
def load(self, filename: str) -> Iterator[Record]:
encoding = self.detect_encoding(filename)
errors = []
with open(filename, encoding=encoding) as f:
delimiter = self.kwargs.get('delimiter', ',')
reader = csv.reader(f, delimiter=delimiter)
@ -171,7 +177,12 @@ class CsvDataset(Dataset):
for line_num, row in enumerate(reader, start=2):
row = dict(zip(header, row))
yield self.from_row(filename, row, line_num)
try:
yield self.from_row(filename, row, line_num)
except FileParseException as err:
errors.append(err)
if errors:
raise FileParseExceptions(errors)
class JSONDataset(Dataset):
@ -192,6 +203,7 @@ class JSONLDataset(Dataset):
def load(self, filename: str) -> Iterator[Record]:
encoding = self.detect_encoding(filename)
errors = []
with open(filename, encoding=encoding) as f:
for line_num, line in enumerate(f, start=1):
try:
@ -199,25 +211,34 @@ class JSONLDataset(Dataset):
yield self.from_row(filename, row, line_num)
except json.decoder.JSONDecodeError:
message = 'Failed to decode the line.'
raise FileParseException(filename, line_num, message)
errors.append(FileParseException(filename, line_num, message))
if errors:
raise FileParseExceptions(errors)
class ExcelDataset(Dataset):
def load(self, filename: str) -> Iterator[Record]:
records = pyexcel.iget_records(file_name=filename)
errors = []
try:
for line_num, row in enumerate(records, start=1):
yield self.from_row(filename, row, line_num)
try:
yield self.from_row(filename, row, line_num)
except FileParseException as err:
errors.append(err)
except pyexcel.exceptions.FileTypeNotSupported:
message = 'This file type is not supported.'
raise FileParseException(filename, line_num=-1, message=message)
if errors:
raise FileParseExceptions(errors)
class FastTextDataset(Dataset):
def load(self, filename: str) -> Iterator[Record]:
encoding = self.detect_encoding(filename)
errors = []
with open(filename, encoding=encoding) as f:
for line_num, line in enumerate(f, start=1):
labels = []
@ -226,15 +247,22 @@ class FastTextDataset(Dataset):
if token.startswith('__label__'):
if token == '__label__':
message = 'Label name is empty.'
raise FileParseException(filename, line_num, message)
errors.append(FileParseException(filename, line_num, message))
break
label_name = token[len('__label__'):]
labels.append(self.label_class.parse(label_name))
else:
tokens.append(token)
text = ' '.join(tokens)
data = self.data_class.parse(filename=filename, text=text)
record = Record(data=data, label=labels)
yield record
try:
data = self.data_class.parse(filename=filename, text=text)
record = Record(data=data, label=labels)
yield record
except ValidationError:
message = 'The empty text is not allowed.'
errors.append(FileParseException(filename, line_num, message))
if errors:
raise FileParseExceptions(errors)
class CoNLLDataset(Dataset):

16
backend/api/views/upload/exception.py

@ -1,3 +1,6 @@
from typing import Iterator, List
class FileParseException(Exception):
def __init__(self, filename: str, line_num: int, message: str):
@ -14,3 +17,16 @@ class FileParseException(Exception):
'line': self.line_num,
'message': self.message
}
class FileParseExceptions(Exception):
    """Aggregate of several per-line parse failures raised as one exception.

    The collected ``FileParseException`` objects stay available on
    ``exceptions``. Iterating over an instance does NOT yield those objects:
    it yields the result of calling ``dict()`` on each of them, so that
    ``list(err)`` can be appended directly to an error response.
    """

    def __init__(self, exceptions: List['FileParseException']):
        """Store the collected exceptions.

        Args:
            exceptions: the parse failures to bundle; the list is kept as
                given (it is not copied).
        """
        self.exceptions = exceptions

    def __str__(self) -> str:
        """Return a summary stating how many lines failed to parse."""
        return f'ParseErrors: you failed to parse {len(self.exceptions)} lines.'

    def __iter__(self) -> Iterator[dict]:
        """Yield the ``dict()`` form of each collected exception, in order."""
        for e in self.exceptions:
            yield e.dict()
Loading…
Cancel
Save