From e81ebb3f0f140ba5effaae716b9d4be1db2a7d6f Mon Sep 17 00:00:00 2001 From: Clemens Wolff Date: Sun, 13 Oct 2019 21:18:50 -0400 Subject: [PATCH] Fix data import from non UTF-8 files --- app/api/tests/data/example.utf16.csv | Bin 0 -> 106 bytes app/api/tests/test_api.py | 6 ++++ app/api/utils.py | 44 ++++++++++++++++++++++++--- requirements.txt | 1 + 4 files changed, 47 insertions(+), 4 deletions(-) create mode 100644 app/api/tests/data/example.utf16.csv diff --git a/app/api/tests/data/example.utf16.csv b/app/api/tests/data/example.utf16.csv new file mode 100644 index 0000000000000000000000000000000000000000..9e10f7a67fbdd3f7722fed4e009f76db1c620d1d GIT binary patch literal 106 zcmezWuY@6$p@N}=L5CrSA(0^o$jf2iW#D3P1VbH$0ET>qVunnhicE$wpe#tv2@J8z SIWvHu4uc<1Z933Im_7he021Q> literal 0 HcmV?d00001 diff --git a/app/api/tests/test_api.py b/app/api/tests/test_api.py index 703abbe1..adc6b009 100644 --- a/app/api/tests/test_api.py +++ b/app/api/tests/test_api.py @@ -841,6 +841,12 @@ class TestUploader(APITestCase): file_format='csv', expected_status=status.HTTP_201_CREATED) + def test_can_upload_csv_with_non_utf8_encoding(self): + self.upload_test_helper(project_id=self.classification_project.id, + filename='example.utf16.csv', + file_format='csv', + expected_status=status.HTTP_201_CREATED) + def test_can_upload_seq2seq_csv(self): self.upload_test_helper(project_id=self.seq2seq_project.id, filename='example.csv', diff --git a/app/api/utils.py b/app/api/utils.py index cac7a249..267e0b8f 100644 --- a/app/api/utils.py +++ b/app/api/utils.py @@ -7,6 +7,7 @@ from collections import defaultdict from random import Random import conllu +from chardet import UniversalDetector from django.db import transaction from django.conf import settings import pyexcel @@ -245,7 +246,8 @@ class CoNLLParser(FileParser): """ def parse(self, file): data = [] - file = io.TextIOWrapper(file, encoding='utf-8') + file = EncodedIO(file) + file = io.TextIOWrapper(file, encoding=file.encoding) # Add check exception @@ -300,7 +302,8 @@ class PlainTextParser(FileParser): ``` """ def parse(self, file): - file = io.TextIOWrapper(file, encoding='utf-8') + file = EncodedIO(file) + file = io.TextIOWrapper(file, encoding=file.encoding) while True: batch = list(itertools.islice(file, settings.IMPORT_BATCH_SIZE)) if not batch: @@ -323,7 +326,8 @@ class CSVParser(FileParser): ``` """ def parse(self, file): - file = io.TextIOWrapper(file, encoding='utf-8') + file = EncodedIO(file) + file = io.TextIOWrapper(file, encoding=file.encoding) reader = csv.reader(file) yield from ExcelParser.parse_excel_csv_reader(reader) @@ -364,7 +368,8 @@ class ExcelParser(FileParser): class JSONParser(FileParser): def parse(self, file): - file = io.TextIOWrapper(file, encoding='utf-8') + file = EncodedIO(file) + file = io.TextIOWrapper(file, encoding=file.encoding) data = [] for i, line in enumerate(file, start=1): if len(data) >= settings.IMPORT_BATCH_SIZE: @@ -506,3 +511,34 @@ def iterable_to_io(iterable, buffer_size=io.DEFAULT_BUFFER_SIZE): return 0 # indicate EOF return io.BufferedReader(IterStream(), buffer_size=buffer_size) + + +class EncodedIO(io.RawIOBase): + def __init__(self, fobj, buffer_size=io.DEFAULT_BUFFER_SIZE, default_encoding='utf-8'): + buffer = b'' + detector = UniversalDetector() + + while True: + read = fobj.read(buffer_size) + detector.feed(read) + buffer += read + if detector.done or len(read) < buffer_size: + break + + if detector.done: + self.encoding = detector.result['encoding'] + else: + self.encoding = default_encoding + + self._fobj = fobj + self._buffer = buffer + + def readable(self): + return self._fobj.readable() + + def readinto(self, b): + l = len(b) + chunk = self._buffer or self._fobj.read(l) + output, self._buffer = chunk[:l], chunk[l:] + b[:len(output)] = output + return len(output) diff --git a/requirements.txt b/requirements.txt index 6625b265..0067b48d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ apache-libcloud==2.4.0 applicationinsights==0.11.7 +chardet==3.0.4 coverage==4.5.3 dj-database-url==0.5.0 Django==2.1.7