Fix data import from non-UTF-8 files

5 years ago · e81ebb3f0f
4 changed files with 47 additions and 4 deletions
--- a/app/api/tests/data/example.utf16.csv
+++ b/app/api/tests/data/example.utf16.csv
--- a/app/api/tests/test_api.py
+++ b/app/api/tests/test_api.py
@ -841,6 +841,12 @@ class TestUploader(APITestCase):
                                file_format='csv',
                                expected_status=status.HTTP_201_CREATED)

+    def test_can_upload_csv_with_non_utf8_encoding(self):
+        """Uploading the ``example.utf16.csv`` fixture as CSV returns 201 Created."""
+        project = self.classification_project
+        self.upload_test_helper(
+            project_id=project.id,
+            filename='example.utf16.csv',
+            file_format='csv',
+            expected_status=status.HTTP_201_CREATED,
+        )
+
    def test_can_upload_seq2seq_csv(self):
        self.upload_test_helper(project_id=self.seq2seq_project.id,
                                filename='example.csv',
--- a/app/api/utils.py
+++ b/app/api/utils.py
@ -7,6 +7,7 @@ from collections import defaultdict
 from random import Random

 import conllu
+from chardet import UniversalDetector
 from django.db import transaction
 from django.conf import settings
 import pyexcel
@ -245,7 +246,8 @@ class CoNLLParser(FileParser):
    """
    def parse(self, file):
        data = []
-        file = io.TextIOWrapper(file, encoding='utf-8')
+        file = EncodedIO(file)
+        file = io.TextIOWrapper(file, encoding=file.encoding)

        # Add check exception

@ -300,7 +302,8 @@ class PlainTextParser(FileParser):
    ```
    """
    def parse(self, file):
-        file = io.TextIOWrapper(file, encoding='utf-8')
+        file = EncodedIO(file)
+        file = io.TextIOWrapper(file, encoding=file.encoding)
        while True:
            batch = list(itertools.islice(file, settings.IMPORT_BATCH_SIZE))
            if not batch:
@ -323,7 +326,8 @@ class CSVParser(FileParser):
    ```
    """
    def parse(self, file):
-        file = io.TextIOWrapper(file, encoding='utf-8')
+        file = EncodedIO(file)
+        file = io.TextIOWrapper(file, encoding=file.encoding)
        reader = csv.reader(file)
        yield from ExcelParser.parse_excel_csv_reader(reader)

@ -364,7 +368,8 @@ class ExcelParser(FileParser):
 class JSONParser(FileParser):

    def parse(self, file):
-        file = io.TextIOWrapper(file, encoding='utf-8')
+        file = EncodedIO(file)
+        file = io.TextIOWrapper(file, encoding=file.encoding)
        data = []
        for i, line in enumerate(file, start=1):
            if len(data) >= settings.IMPORT_BATCH_SIZE:
@ -506,3 +511,34 @@ def iterable_to_io(iterable, buffer_size=io.DEFAULT_BUFFER_SIZE):
                return 0    # indicate EOF

    return io.BufferedReader(IterStream(), buffer_size=buffer_size)
+
+
+class EncodedIO(io.RawIOBase):
+    """Raw byte stream that sniffs the character encoding of ``fobj``.
+
+    The constructor feeds ``buffer_size``-byte chunks of ``fobj`` to a
+    chardet ``UniversalDetector`` until the detector reports ``done`` or
+    ``fobj`` returns no more data. The bytes consumed while sniffing are
+    retained and replayed by ``readinto`` before reading continues from
+    ``fobj``, so wrapping a stream does not lose its leading bytes.
+
+    The resulting ``encoding`` attribute holds the detector's result when
+    detection finished, and ``default_encoding`` otherwise.
+
+    NOTE(review): if the detector never reports ``done`` the entire stream
+    ends up buffered in memory, and ``detector.close()`` is never called, so
+    any final guess chardet might offer is presumably ignored -- confirm
+    that falling back to ``default_encoding`` is the intended policy.
+    """
+
+    def __init__(self, fobj, buffer_size=io.DEFAULT_BUFFER_SIZE, default_encoding='utf-8'):
+        """Sniff the encoding of ``fobj`` and remember the bytes read so far.
+
+        Args:
+            fobj: binary file-like object exposing ``read(size)`` and
+                ``readable()``.
+            buffer_size: number of bytes requested from ``fobj`` per
+                detection step.
+            default_encoding: encoding reported when detection does not
+                complete.
+        """
+        super().__init__()
+        chunks = []
+        detector = UniversalDetector()
+
+        while not detector.done:
+            chunk = fobj.read(buffer_size)
+            # Only an empty read means EOF; a short read may simply be a
+            # partially filled chunk, so keep going in that case.
+            if not chunk:
+                break
+            detector.feed(chunk)
+            chunks.append(chunk)
+
+        detected = detector.result['encoding'] if detector.done else None
+        # Guard against a missing encoding name so TextIOWrapper never
+        # receives a falsy encoding from this class.
+        self.encoding = detected or default_encoding
+
+        self._fobj = fobj
+        # Join once instead of growing a bytes object chunk by chunk.
+        self._buffer = b''.join(chunks)
+        # Read offset into ``_buffer``; avoids re-copying the remainder of
+        # the prefetched data on every ``readinto`` call.
+        self._pos = 0
+
+    def readable(self):
+        """Return whether the wrapped stream is readable."""
+        return self._fobj.readable()
+
+    def readinto(self, b):
+        """Fill ``b`` with up to ``len(b)`` bytes and return the count.
+
+        Prefetched bytes are served first; once they are used up, data is
+        pulled from the wrapped stream. A return value of 0 signals EOF.
+        """
+        size = len(b)
+        if self._pos >= len(self._buffer):
+            # Prefetched data exhausted: refill from the source. Routing the
+            # result through ``_buffer`` keeps any excess bytes for later if
+            # the source returns more than requested.
+            self._buffer = self._fobj.read(size)
+            self._pos = 0
+
+        chunk = self._buffer[self._pos:self._pos + size]
+        self._pos += len(chunk)
+        b[:len(chunk)] = chunk
+        return len(chunk)
--- a/requirements.txt
+++ b/requirements.txt
@ -1,5 +1,6 @@
 apache-libcloud==2.4.0
 applicationinsights==0.11.7
+chardet==3.0.4
 coverage==4.5.3
 dj-database-url==0.5.0
 Django==2.1.7