From b81c5e4aab1b14b85de87e0c47fa7ef7d4f1dea2 Mon Sep 17 00:00:00 2001 From: Hironsan Date: Fri, 19 Nov 2021 14:18:23 +0900 Subject: [PATCH] Enable ingesting error-free lines even if an exception occurs during parsing --- backend/api/tasks.py | 5 ++- backend/api/tests/data/seq2seq/example.csv | 2 +- backend/api/views/upload/dataset.py | 46 +++++++++++++++++----- backend/api/views/upload/exception.py | 16 ++++++++ 4 files changed, 58 insertions(+), 11 deletions(-) diff --git a/backend/api/tasks.py b/backend/api/tasks.py index 1c6ec0ea..09c35d71 100644 --- a/backend/api/tasks.py +++ b/backend/api/tasks.py @@ -10,7 +10,7 @@ from django.shortcuts import get_object_or_404 from .models import Example, Label, Project from .views.download.factory import create_repository, create_writer from .views.download.service import ExportApplicationService -from .views.upload.exception import FileParseException +from .views.upload.exception import FileParseException, FileParseExceptions from .views.upload.factory import (get_data_class, get_dataset_class, get_label_class) from .views.upload.utils import append_field @@ -117,6 +117,9 @@ def ingest_data(user_id, project_id, filenames, format: str, **kwargs): except FileParseException as err: response['error'].append(err.dict()) continue + except FileParseExceptions as err: + response['error'].extend(list(err)) + continue buffer.add(example) if buffer.is_full(): diff --git a/backend/api/tests/data/seq2seq/example.csv b/backend/api/tests/data/seq2seq/example.csv index 41bae2ef..a5902201 100644 --- a/backend/api/tests/data/seq2seq/example.csv +++ b/backend/api/tests/data/seq2seq/example.csv @@ -1,5 +1,5 @@ text,label +,label2 exampleA,label1 exampleB, -,label2 , diff --git a/backend/api/views/upload/dataset.py b/backend/api/views/upload/dataset.py index 89a235fc..28ff8f48 100644 --- a/backend/api/views/upload/dataset.py +++ b/backend/api/views/upload/dataset.py @@ -12,7 +12,7 @@ from pydantic import ValidationError from seqeval.scheme 
import BILOU, IOB2, IOBES, IOE2, Tokens from .data import BaseData -from .exception import FileParseException +from .exception import FileParseException, FileParseExceptions from .label import Label from .labels import Labels @@ -64,12 +64,17 @@ class Dataset: self.kwargs = kwargs def __iter__(self) -> Iterator[Record]: + errors = [] for filename in self.filenames: try: yield from self.load(filename) - except (UnicodeDecodeError, ValidationError) as err: + except (UnicodeDecodeError, FileParseException) as err: message = str(err) raise FileParseException(filename, line_num=-1, message=message) + except FileParseExceptions as err: + errors.extend(err.exceptions) + if errors: + raise FileParseExceptions(errors) def load(self, filename: str) -> Iterator[Record]: """Loads a file content.""" @@ -159,6 +164,7 @@ class CsvDataset(Dataset): def load(self, filename: str) -> Iterator[Record]: encoding = self.detect_encoding(filename) + errors = [] with open(filename, encoding=encoding) as f: delimiter = self.kwargs.get('delimiter', ',') reader = csv.reader(f, delimiter=delimiter) @@ -171,7 +177,12 @@ class CsvDataset(Dataset): for line_num, row in enumerate(reader, start=2): row = dict(zip(header, row)) - yield self.from_row(filename, row, line_num) + try: + yield self.from_row(filename, row, line_num) + except FileParseException as err: + errors.append(err) + if errors: + raise FileParseExceptions(errors) class JSONDataset(Dataset): @@ -192,6 +203,7 @@ class JSONLDataset(Dataset): def load(self, filename: str) -> Iterator[Record]: encoding = self.detect_encoding(filename) + errors = [] with open(filename, encoding=encoding) as f: for line_num, line in enumerate(f, start=1): try: @@ -199,25 +211,34 @@ class JSONLDataset(Dataset): yield self.from_row(filename, row, line_num) except json.decoder.JSONDecodeError: message = 'Failed to decode the line.' 
- raise FileParseException(filename, line_num, message) + errors.append(FileParseException(filename, line_num, message)) + if errors: + raise FileParseExceptions(errors) class ExcelDataset(Dataset): def load(self, filename: str) -> Iterator[Record]: records = pyexcel.iget_records(file_name=filename) + errors = [] try: for line_num, row in enumerate(records, start=1): - yield self.from_row(filename, row, line_num) + try: + yield self.from_row(filename, row, line_num) + except FileParseException as err: + errors.append(err) except pyexcel.exceptions.FileTypeNotSupported: message = 'This file type is not supported.' raise FileParseException(filename, line_num=-1, message=message) + if errors: + raise FileParseExceptions(errors) class FastTextDataset(Dataset): def load(self, filename: str) -> Iterator[Record]: encoding = self.detect_encoding(filename) + errors = [] with open(filename, encoding=encoding) as f: for line_num, line in enumerate(f, start=1): labels = [] @@ -226,15 +247,22 @@ class FastTextDataset(Dataset): if token.startswith('__label__'): if token == '__label__': message = 'Label name is empty.' - raise FileParseException(filename, line_num, message) + errors.append(FileParseException(filename, line_num, message)) + break label_name = token[len('__label__'):] labels.append(self.label_class.parse(label_name)) else: tokens.append(token) text = ' '.join(tokens) - data = self.data_class.parse(filename=filename, text=text) - record = Record(data=data, label=labels) - yield record + try: + data = self.data_class.parse(filename=filename, text=text) + record = Record(data=data, label=labels) + yield record + except ValidationError: + message = 'The empty text is not allowed.' 
+ errors.append(FileParseException(filename, line_num, message)) + if errors: + raise FileParseExceptions(errors) class CoNLLDataset(Dataset): diff --git a/backend/api/views/upload/exception.py b/backend/api/views/upload/exception.py index 6c0d7a57..b00edd5a 100644 --- a/backend/api/views/upload/exception.py +++ b/backend/api/views/upload/exception.py @@ -1,3 +1,6 @@ +from typing import List + + class FileParseException(Exception): def __init__(self, filename: str, line_num: int, message: str): @@ -14,3 +17,16 @@ class FileParseException(Exception): 'line': self.line_num, 'message': self.message } + + +class FileParseExceptions(Exception): + + def __init__(self, exceptions: List[FileParseException]): + self.exceptions = exceptions + + def __str__(self) -> str: + return f'ParseErrors: you failed to parse {len(self.exceptions)} lines.' + + def __iter__(self) -> FileParseException: + for e in self.exceptions: + yield e.dict()