mirror of https://github.com/doccano/doccano.git
python dataset natural-language-processing data-labeling machine-learning annotation-tool datasets active-learning text-annotation
You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
47 lines
1.1 KiB
47 lines
1.1 KiB
import abc
|
|
import os
|
|
import uuid
|
|
import zipfile
|
|
|
|
import pandas as pd
|
|
|
|
|
|
def zip_files(files, dirname):
    """Bundle *files* into a freshly named ZIP archive inside *dirname*.

    The archive is named with a random UUID4 plus a ``.zip`` suffix and is
    compressed with DEFLATE. Every input is stored under its base name only,
    so the archive is flat: directory components of the input paths are
    dropped.

    Args:
        files: Iterable of paths of the files to add to the archive.
        dirname: Directory in which the archive is created.

    Returns:
        The path of the archive that was written.
    """
    archive_name = f"{uuid.uuid4()}.zip"
    archive_path = os.path.join(dirname, archive_name)
    archive = zipfile.ZipFile(archive_path, "w", compression=zipfile.ZIP_DEFLATED)
    with archive:
        for path in files:
            # Only the final path component is kept as the entry name.
            entry_name = os.path.basename(path)
            archive.write(filename=path, arcname=entry_name)
    return archive_path
|
|
|
|
|
|
class Writer(abc.ABC):
    """Abstract base class for objects that write a DataFrame to a file.

    Concrete subclasses override the ``extension`` class attribute and
    provide a static ``write`` implementation.
    """

    # Overridden by subclasses; presumably the file suffix of the output format.
    extension = ""

    @staticmethod
    @abc.abstractmethod
    def write(file, dataset: pd.DataFrame):
        """Write *dataset* to *file*.

        Args:
            file: Destination handed to the concrete writer.
            dataset: The data to serialize.

        Raises:
            NotImplementedError: Always, when the base implementation is
                called directly.
        """
        message = "Please implement this method in the subclass."
        raise NotImplementedError(message)
|
|
|
|
|
|
class CsvWriter(Writer):
    """Writer that serializes a DataFrame as CSV."""

    extension = "csv"

    @staticmethod
    def write(file, dataset: pd.DataFrame):
        """Write *dataset* to *file* as UTF-8 encoded CSV.

        The DataFrame index is not written.

        Args:
            file: Destination passed straight to ``DataFrame.to_csv``.
            dataset: The data to serialize.
        """
        csv_options = {"index": False, "encoding": "utf-8"}
        dataset.to_csv(file, **csv_options)
|
|
|
|
|
|
class JsonWriter(Writer):
    """Writer that serializes a DataFrame as a JSON array of records."""

    extension = "json"

    @staticmethod
    def write(file, dataset: pd.DataFrame):
        """Write *dataset* to *file* as JSON in ``records`` orientation.

        Non-ASCII characters are written as-is rather than escaped.

        Args:
            file: Destination passed straight to ``DataFrame.to_json``.
            dataset: The data to serialize.
        """
        json_options = {"orient": "records", "force_ascii": False}
        dataset.to_json(file, **json_options)
|
|
|
|
|
|
class JsonlWriter(Writer):
    """Writer that serializes a DataFrame as line-delimited JSON."""

    extension = "jsonl"

    @staticmethod
    def write(file, dataset: pd.DataFrame):
        """Write *dataset* to *file* as line-delimited JSON records.

        Uses the ``records`` orientation with ``lines=True``; non-ASCII
        characters are written as-is rather than escaped.

        Args:
            file: Destination passed straight to ``DataFrame.to_json``.
            dataset: The data to serialize.
        """
        jsonl_options = {"orient": "records", "force_ascii": False, "lines": True}
        dataset.to_json(file, **jsonl_options)
|