Browse Source

Fix infinite loop if the specified encoding is Auto and the file is small

pull/1331/head
Hironsan 3 years ago
parent
commit
a883f3d1c2
1 changed file with 13 additions and 2 deletions
  1. 15
      backend/api/views/upload/dataset.py

15
backend/api/views/upload/dataset.py

@ -1,8 +1,10 @@
import csv import csv
import io import io
import json import json
import os
from typing import Dict, Iterator, List, Optional, Type from typing import Dict, Iterator, List, Optional, Type
import chardet
import pydantic.error_wrappers import pydantic.error_wrappers
import pyexcel import pyexcel
from chardet.universaldetector import UniversalDetector from chardet.universaldetector import UniversalDetector
@ -79,11 +81,20 @@ class Dataset:
def detect_encoding(self, filename: str, buffer_size=io.DEFAULT_BUFFER_SIZE): def detect_encoding(self, filename: str, buffer_size=io.DEFAULT_BUFFER_SIZE):
if self.encoding != 'Auto': if self.encoding != 'Auto':
return self.encoding return self.encoding
# For a small file.
if os.path.getsize(filename) < buffer_size:
detected = chardet.detect(open(filename, 'rb').read())
return detected.get('encoding', 'utf-8')
# For a large file.
with open(filename, 'rb') as f: with open(filename, 'rb') as f:
detector = UniversalDetector() detector = UniversalDetector()
while True: while True:
read = f.read(buffer_size)
detector.feed(read)
binary = f.read(buffer_size)
detector.feed(binary)
if binary == b'':
break
if detector.done: if detector.done:
break break
if detector.done: if detector.done:

Loading…
Cancel
Save