diff --git a/app/classifier/__init__.py b/app/classifier/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/app/classifier/model.py b/app/classifier/model.py
deleted file mode 100644
index 62b0d59d..00000000
--- a/app/classifier/model.py
+++ /dev/null
@@ -1,12 +0,0 @@
-"""
-Baseline model.
-"""
-
-from sklearn.calibration import CalibratedClassifierCV
-from sklearn.svm import LinearSVC
-
-
-def build_model():
-    estimator = CalibratedClassifierCV(base_estimator=LinearSVC())
-
-    return estimator
diff --git a/app/classifier/preprocess.py b/app/classifier/preprocess.py
deleted file mode 100644
index 548f6e8f..00000000
--- a/app/classifier/preprocess.py
+++ /dev/null
@@ -1,27 +0,0 @@
-"""
-Preprocessor.
-"""
-import MeCab
-from sklearn.feature_extraction.text import TfidfVectorizer
-
-t = MeCab.Tagger('-Owakati')
-
-
-def tokenize(text):
-    """Tokenize Japanese text.
-
-    Args:
-        text: Japanese string.
-
-    Returns:
-        A list of words.
-    """
-    words = t.parse(text).rstrip().split()
-
-    return words
-
-
-def build_vectorizer():
-    vectorizer = TfidfVectorizer(tokenizer=tokenize)
-
-    return vectorizer
diff --git a/app/classifier/task.py b/app/classifier/task.py
deleted file mode 100644
index 32e6b88e..00000000
--- a/app/classifier/task.py
+++ /dev/null
@@ -1,34 +0,0 @@
-"""
-Task runner.
-"""
-import numpy as np
-
-from doccano.app.classifier.model import build_model
-from doccano.app.classifier import build_vectorizer
-from doccano.app.classifier import load_dataset, save_dataset, make_output, train_test_split
-
-
-def run(filename):
-    print('Loading dataset...')
-    data = load_dataset(filename)
-    x_train, x_test, y_train, ids = train_test_split(data)
-
-    print('Building vectorizer and model...')
-    vectorizer = build_vectorizer()
-    clf = build_model()
-
-    print('Vectorizing...')
-    x_train = vectorizer.fit_transform(x_train)
-    x_test = vectorizer.transform(x_test)
-
-    print('Fitting...')
-    clf.fit(x_train, y_train)
-
-    print('Predicting...')
-    y_pred = clf.predict(x_test)
-    y_prob = clf.predict_proba(x_test)
-    y_prob = np.max(y_prob, axis=-1)
-
-    print('Saving...')
-    data = make_output(data, ids, y_pred, y_prob)
-    save_dataset(data, filename)
diff --git a/app/classifier/utils.py b/app/classifier/utils.py
deleted file mode 100644
index 068c3183..00000000
--- a/app/classifier/utils.py
+++ /dev/null
@@ -1,45 +0,0 @@
-"""
-Utilities.
-"""
-import json
-
-
-def train_test_split(data):
-    x_train, x_test, y_train, ids = [], [], [], []
-    for d in data:
-        text = d['text']
-        label = d['label']
-        if d['manual']:
-            x_train.append(text)
-            y_train.append(label)
-        else:
-            x_test.append(text)
-            ids.append(d['id'])
-
-    return x_train, x_test, y_train, ids
-
-
-def load_dataset(filename):
-    with open(filename) as f:
-        data = [json.loads(line) for line in f]
-
-    return data
-
-
-def save_dataset(obj, filename):
-    with open(filename, 'w') as f:
-        for line in obj:
-            f.write('{}\n'.format(json.dumps(line)))
-
-
-def make_output(data, ids, y_pred, y_prob):
-    i = 0
-    for d in data:
-        if i == len(ids):
-            break
-        if d['id'] == ids[i]:
-            d['label'] = str(y_pred[i])
-            d['prob'] = float(y_prob[i])
-            i += 1
-
-    return data
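Note on the removed code: the deleted task.py imports build_vectorizer and the dataset helpers from the package root (doccano.app.classifier) even though they live in preprocess.py and utils.py, and the removed __init__.py is empty (blob e69de29b is the empty-file hash), so those imports could not have resolved as written. For reference, below is a minimal self-contained sketch of the same pipeline with the imports flattened into one module. It is an approximation, not the deleted code: it assumes a recent scikit-learn (the estimator is passed positionally because base_estimator= has since been removed from CalibratedClassifierCV), and it substitutes TfidfVectorizer's default tokenizer for the MeCab wakati tokenizer for portability. The record fields ('text', 'label', 'manual', 'id', 'prob') follow the deleted utils.py.

    import json

    import numpy as np
    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.svm import LinearSVC


    def run(filename):
        # One JSON record per line; hand-labeled records ('manual': True)
        # become training data, the rest get predicted labels written back.
        with open(filename) as f:
            data = [json.loads(line) for line in f]

        x_train = [d['text'] for d in data if d['manual']]
        y_train = [d['label'] for d in data if d['manual']]
        unlabeled = [d for d in data if not d['manual']]

        # TF-IDF features over a calibrated linear SVM, mirroring
        # build_vectorizer()/build_model() from the deleted files.
        # CalibratedClassifierCV's default 5-fold CV needs at least
        # five training examples per class.
        vectorizer = TfidfVectorizer()
        clf = CalibratedClassifierCV(LinearSVC())
        clf.fit(vectorizer.fit_transform(x_train), y_train)

        features = vectorizer.transform([d['text'] for d in unlabeled])
        y_pred = clf.predict(features)
        # Keep only the probability of the predicted (argmax) class,
        # as the deleted task.py did with np.max(..., axis=-1).
        y_prob = np.max(clf.predict_proba(features), axis=-1)

        # The dicts in `unlabeled` alias entries of `data`, so mutating
        # them updates the full dataset before it is rewritten.
        for d, label, prob in zip(unlabeled, y_pred, y_prob):
            d['label'] = str(label)
            d['prob'] = float(prob)

        with open(filename, 'w') as f:
            for d in data:
                f.write(json.dumps(d) + '\n')

Usage matches the deleted task runner: run('dataset.jsonl') reads the JSONL file in place, trains on the manually labeled records, and annotates the rest with a predicted label and its calibrated probability.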