Browse Source

Fix data import from non UTF-8 files

pull/399/head
Clemens Wolff 5 years ago
parent
commit
e81ebb3f0f
4 changed files with 47 additions and 4 deletions
  1. BIN
      app/api/tests/data/example.utf16.csv
  2. 6
      app/api/tests/test_api.py
  3. 44
      app/api/utils.py
  4. 1
      requirements.txt

BIN
app/api/tests/data/example.utf16.csv

6
app/api/tests/test_api.py

@ -841,6 +841,12 @@ class TestUploader(APITestCase):
file_format='csv',
expected_status=status.HTTP_201_CREATED)
def test_can_upload_csv_with_non_utf8_encoding(self):
    """Upload the ``example.utf16.csv`` fixture and expect HTTP 201 Created."""
    # Gather the helper arguments first so the call itself stays on one line.
    upload_args = {
        'project_id': self.classification_project.id,
        'filename': 'example.utf16.csv',
        'file_format': 'csv',
        'expected_status': status.HTTP_201_CREATED,
    }
    self.upload_test_helper(**upload_args)
def test_can_upload_seq2seq_csv(self):
self.upload_test_helper(project_id=self.seq2seq_project.id,
filename='example.csv',

44
app/api/utils.py

@ -7,6 +7,7 @@ from collections import defaultdict
from random import Random
import conllu
from chardet import UniversalDetector
from django.db import transaction
from django.conf import settings
import pyexcel
@ -245,7 +246,8 @@ class CoNLLParser(FileParser):
"""
def parse(self, file):
data = []
file = io.TextIOWrapper(file, encoding='utf-8')
file = EncodedIO(file)
file = io.TextIOWrapper(file, encoding=file.encoding)
# Add check exception
@ -300,7 +302,8 @@ class PlainTextParser(FileParser):
```
"""
def parse(self, file):
file = io.TextIOWrapper(file, encoding='utf-8')
file = EncodedIO(file)
file = io.TextIOWrapper(file, encoding=file.encoding)
while True:
batch = list(itertools.islice(file, settings.IMPORT_BATCH_SIZE))
if not batch:
@ -323,7 +326,8 @@ class CSVParser(FileParser):
```
"""
def parse(self, file):
file = io.TextIOWrapper(file, encoding='utf-8')
file = EncodedIO(file)
file = io.TextIOWrapper(file, encoding=file.encoding)
reader = csv.reader(file)
yield from ExcelParser.parse_excel_csv_reader(reader)
@ -364,7 +368,8 @@ class ExcelParser(FileParser):
class JSONParser(FileParser):
def parse(self, file):
file = io.TextIOWrapper(file, encoding='utf-8')
file = EncodedIO(file)
file = io.TextIOWrapper(file, encoding=file.encoding)
data = []
for i, line in enumerate(file, start=1):
if len(data) >= settings.IMPORT_BATCH_SIZE:
@ -506,3 +511,34 @@ def iterable_to_io(iterable, buffer_size=io.DEFAULT_BUFFER_SIZE):
return 0 # indicate EOF
return io.BufferedReader(IterStream(), buffer_size=buffer_size)
class EncodedIO(io.RawIOBase):
    """Raw byte stream that sniffs the text encoding of ``fobj`` up front.

    The constructor feeds chunks of ``fobj`` to a ``UniversalDetector`` until
    the detector reports it is done or the input is exhausted.  Every byte
    consumed while sniffing is kept and replayed by ``readinto`` before any
    further data is pulled from ``fobj``, so a consumer such as
    ``io.TextIOWrapper(stream, encoding=stream.encoding)`` still sees the
    complete content.

    Attributes:
        encoding: Encoding reported by the detector, or ``default_encoding``
            when the detector did not reach a verdict.
    """

    def __init__(self, fobj, buffer_size=io.DEFAULT_BUFFER_SIZE, default_encoding='utf-8'):
        """Sniff the encoding of ``fobj`` and remember the bytes read so far.

        Args:
            fobj: Readable binary file-like object.
            buffer_size: Number of bytes requested from ``fobj`` per sniffing read.
            default_encoding: Encoding used when detection is inconclusive.
        """
        super().__init__()
        detector = UniversalDetector()
        chunks = []

        while not detector.done:
            read = fobj.read(buffer_size)
            # Only an empty read marks EOF; a short read may simply be a
            # partially filled chunk.
            if not read:
                break
            detector.feed(read)
            chunks.append(read)

        # NOTE(review): only a verdict reached while feeding (detector.done)
        # is used; the detector is not finalised, so inputs it is unsure
        # about fall back to default_encoding.
        detected = detector.result.get('encoding') if detector.done else None
        self.encoding = detected or default_encoding

        self._fobj = fobj
        # A memoryview lets readinto() hand out slices without re-copying the
        # remaining sniffed bytes on every call.
        self._buffer = memoryview(b''.join(chunks))

    def readable(self):
        """Return whether the wrapped file object is readable."""
        return self._fobj.readable()

    def readinto(self, b):
        """Fill ``b`` with replayed sniffed bytes first, then data from ``fobj``.

        Returns:
            The number of bytes written into ``b``; 0 signals EOF.
        """
        size = len(b)
        # Drain what was consumed during detection before touching fobj again.
        chunk = self._buffer or self._fobj.read(size)
        output, self._buffer = chunk[:size], chunk[size:]
        b[:len(output)] = output
        return len(output)

1
requirements.txt

@ -1,5 +1,6 @@
apache-libcloud==2.4.0
applicationinsights==0.11.7
chardet==3.0.4
coverage==4.5.3
dj-database-url==0.5.0
Django==2.1.7

Loading…
Cancel
Save