|
@ -1,8 +1,10 @@ |
|
|
import csv |
|
|
import csv |
|
|
import io |
|
|
import io |
|
|
import json |
|
|
import json |
|
|
|
|
|
import os |
|
|
from typing import Dict, Iterator, List, Optional, Type |
|
|
from typing import Dict, Iterator, List, Optional, Type |
|
|
|
|
|
|
|
|
|
|
|
import chardet |
|
|
import pydantic.error_wrappers |
|
|
import pydantic.error_wrappers |
|
|
import pyexcel |
|
|
import pyexcel |
|
|
from chardet.universaldetector import UniversalDetector |
|
|
from chardet.universaldetector import UniversalDetector |
|
@ -79,11 +81,20 @@ class Dataset: |
|
|
def detect_encoding(self, filename: str, buffer_size=io.DEFAULT_BUFFER_SIZE): |
|
|
def detect_encoding(self, filename: str, buffer_size=io.DEFAULT_BUFFER_SIZE): |
|
|
if self.encoding != 'Auto': |
|
|
if self.encoding != 'Auto': |
|
|
return self.encoding |
|
|
return self.encoding |
|
|
|
|
|
|
|
|
|
|
|
# For a small file. |
|
|
|
|
|
if os.path.getsize(filename) < buffer_size: |
|
|
|
|
|
detected = chardet.detect(open(filename, 'rb').read()) |
|
|
|
|
|
return detected.get('encoding', 'utf-8') |
|
|
|
|
|
|
|
|
|
|
|
# For a large file. |
|
|
with open(filename, 'rb') as f: |
|
|
with open(filename, 'rb') as f: |
|
|
detector = UniversalDetector() |
|
|
detector = UniversalDetector() |
|
|
while True: |
|
|
while True: |
|
|
read = f.read(buffer_size) |
|
|
|
|
|
detector.feed(read) |
|
|
|
|
|
|
|
|
binary = f.read(buffer_size) |
|
|
|
|
|
detector.feed(binary) |
|
|
|
|
|
if binary == b'': |
|
|
|
|
|
break |
|
|
if detector.done: |
|
|
if detector.done: |
|
|
break |
|
|
break |
|
|
if detector.done: |
|
|
if detector.done: |
|
|