From a883f3d1c29f9f1eeab2fcfb2529a95f5b957fb6 Mon Sep 17 00:00:00 2001 From: Hironsan Date: Sun, 25 Apr 2021 12:31:38 +0900 Subject: [PATCH] Fix infinite loop if the specified encoding is Auto and the file is small --- backend/api/views/upload/dataset.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/backend/api/views/upload/dataset.py b/backend/api/views/upload/dataset.py index a7da19fb..0293231a 100644 --- a/backend/api/views/upload/dataset.py +++ b/backend/api/views/upload/dataset.py @@ -1,8 +1,10 @@ import csv import io import json +import os from typing import Dict, Iterator, List, Optional, Type +import chardet import pydantic.error_wrappers import pyexcel from chardet.universaldetector import UniversalDetector @@ -79,11 +81,20 @@ class Dataset: def detect_encoding(self, filename: str, buffer_size=io.DEFAULT_BUFFER_SIZE): if self.encoding != 'Auto': return self.encoding + + # For a small file. + if os.path.getsize(filename) < buffer_size: + detected = chardet.detect(open(filename, 'rb').read()) + return detected.get('encoding', 'utf-8') + + # For a large file. with open(filename, 'rb') as f: detector = UniversalDetector() while True: - read = f.read(buffer_size) - detector.feed(read) + binary = f.read(buffer_size) + detector.feed(binary) + if binary == b'': + break if detector.done: break if detector.done: