From e81ebb3f0f140ba5effaae716b9d4be1db2a7d6f Mon Sep 17 00:00:00 2001
From: Clemens Wolff <clewolff@microsoft.com>
Date: Sun, 13 Oct 2019 21:18:50 -0400
Subject: [PATCH] Fix data import from non UTF-8 files

---
 app/api/tests/data/example.utf16.csv | Bin 0 -> 106 bytes
 app/api/tests/test_api.py            |   6 ++++
 app/api/utils.py                     |  44 ++++++++++++++++++++++++---
 requirements.txt                     |   1 +
 4 files changed, 47 insertions(+), 4 deletions(-)
 create mode 100644 app/api/tests/data/example.utf16.csv

diff --git a/app/api/tests/data/example.utf16.csv b/app/api/tests/data/example.utf16.csv
new file mode 100644
index 0000000000000000000000000000000000000000..9e10f7a67fbdd3f7722fed4e009f76db1c620d1d
GIT binary patch
literal 106
zcmezWuY@6$p@N}=L5CrSA(0^o$jf2iW#D3P1VbH$0ET>qVunnhicE$wpe#tv2@J8z
SIWvHu4uc<1Z933Im_7he021Q>

literal 0
HcmV?d00001

diff --git a/app/api/tests/test_api.py b/app/api/tests/test_api.py
index 703abbe1..adc6b009 100644
--- a/app/api/tests/test_api.py
+++ b/app/api/tests/test_api.py
@@ -841,6 +841,12 @@ class TestUploader(APITestCase):
                                 file_format='csv',
                                 expected_status=status.HTTP_201_CREATED)
 
+    def test_can_upload_csv_with_non_utf8_encoding(self):
+        # The fixture name indicates UTF-16 content, i.e. a non UTF-8 upload.
+        project_id = self.classification_project.id
+        self.upload_test_helper(project_id=project_id, filename='example.utf16.csv',
+                                file_format='csv', expected_status=status.HTTP_201_CREATED)
+
     def test_can_upload_seq2seq_csv(self):
         self.upload_test_helper(project_id=self.seq2seq_project.id,
                                 filename='example.csv',
diff --git a/app/api/utils.py b/app/api/utils.py
index cac7a249..267e0b8f 100644
--- a/app/api/utils.py
+++ b/app/api/utils.py
@@ -7,6 +7,7 @@ from collections import defaultdict
 from random import Random
 
 import conllu
+from chardet import UniversalDetector
 from django.db import transaction
 from django.conf import settings
 import pyexcel
@@ -245,7 +246,8 @@ class CoNLLParser(FileParser):
     """
     def parse(self, file):
         data = []
-        file = io.TextIOWrapper(file, encoding='utf-8')
+        file = EncodedIO(file)
+        file = io.TextIOWrapper(file, encoding=file.encoding)
 
         # Add check exception
 
@@ -300,7 +302,8 @@ class PlainTextParser(FileParser):
     ```
     """
     def parse(self, file):
-        file = io.TextIOWrapper(file, encoding='utf-8')
+        file = EncodedIO(file)
+        file = io.TextIOWrapper(file, encoding=file.encoding)
         while True:
             batch = list(itertools.islice(file, settings.IMPORT_BATCH_SIZE))
             if not batch:
@@ -323,7 +326,8 @@ class CSVParser(FileParser):
     ```
     """
     def parse(self, file):
-        file = io.TextIOWrapper(file, encoding='utf-8')
+        encoded = EncodedIO(file)  # sniffs the encoding and replays the bytes it consumed
+        file = io.TextIOWrapper(encoded, encoding=encoded.encoding)
         reader = csv.reader(file)
         yield from ExcelParser.parse_excel_csv_reader(reader)
 
@@ -364,7 +368,8 @@ class ExcelParser(FileParser):
 class JSONParser(FileParser):
 
     def parse(self, file):
-        file = io.TextIOWrapper(file, encoding='utf-8')
+        file = EncodedIO(file)
+        file = io.TextIOWrapper(file, encoding=file.encoding)
         data = []
         for i, line in enumerate(file, start=1):
             if len(data) >= settings.IMPORT_BATCH_SIZE:
@@ -506,3 +511,63 @@ def iterable_to_io(iterable, buffer_size=io.DEFAULT_BUFFER_SIZE):
                 return 0    # indicate EOF
 
     return io.BufferedReader(IterStream(), buffer_size=buffer_size)
+
+
+class EncodedIO(io.RawIOBase):
+    """Raw byte stream that sniffs the text encoding of a wrapped file object.
+
+    The constructor reads ``fobj`` in ``buffer_size`` chunks and feeds them to
+    chardet's ``UniversalDetector`` until the detector reports it is done or a
+    short read occurs. The bytes consumed while sniffing are kept and replayed
+    by ``readinto`` before any further data is read from ``fobj``.
+
+    Attributes:
+        encoding: Encoding name reported by the detector, or
+            ``default_encoding`` when detection did not complete.
+    """
+
+    def __init__(self, fobj, buffer_size=io.DEFAULT_BUFFER_SIZE, default_encoding='utf-8'):
+        """Sniff the encoding of ``fobj`` and buffer the bytes read meanwhile.
+
+        Args:
+            fobj: Binary file-like object; ``read(size)`` and ``readable()`` are used.
+            buffer_size: Number of bytes requested from ``fobj`` per sniffing read.
+            default_encoding: Encoding used when the detector yields no result.
+        """
+        super().__init__()
+        chunks = []
+        detector = UniversalDetector()
+
+        while True:
+            chunk = fobj.read(buffer_size)
+            detector.feed(chunk)
+            chunks.append(chunk)
+            # A short read is treated as the end of the input.
+            if detector.done or len(chunk) < buffer_size:
+                break
+
+        # NOTE(review): detector.close() is never called, so only a result the
+        # detector settles on while being fed is used -- confirm this is intended.
+        detected = detector.result['encoding'] if detector.done else None
+        # Fall back when detection did not finish or reported no encoding name.
+        self.encoding = detected or default_encoding
+
+        self._fobj = fobj
+        # Join once rather than re-copying a growing bytes object on every read.
+        self._buffer = b''.join(chunks)
+
+    def readable(self):
+        """Return whether the wrapped file object is readable."""
+        return self._fobj.readable()
+
+    def readinto(self, b):
+        """Copy buffered sniffed bytes into ``b`` first, then fresh reads from ``fobj``.
+
+        Returns:
+            Number of bytes written into ``b``.
+        """
+        size = len(b)
+        chunk = self._buffer or self._fobj.read(size)
+        output, self._buffer = chunk[:size], chunk[size:]
+        b[:len(output)] = output
+        return len(output)
diff --git a/requirements.txt b/requirements.txt
index 6625b265..0067b48d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 apache-libcloud==2.4.0
 applicationinsights==0.11.7
+chardet==3.0.4
 coverage==4.5.3
 dj-database-url==0.5.0
 Django==2.1.7