Browse Source

Delete classifier

pull/1075/head
Hironsan 4 years ago
parent
commit
1adf8644b8
5 changed files with 0 additions and 118 deletions
  1. 0
      app/classifier/__init__.py
  2. 12
      app/classifier/model.py
  3. 27
      app/classifier/preprocess.py
  4. 34
      app/classifier/task.py
  5. 45
      app/classifier/utils.py

0
app/classifier/__init__.py

12
app/classifier/model.py

@@ -1,12 +0,0 @@
"""
Baseline model.
"""
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
def build_model():
    """Build the baseline classifier.

    Returns:
        An unfitted CalibratedClassifierCV that wraps a LinearSVC.
    """
    # The wrapped estimator is passed positionally on purpose: its keyword
    # was renamed from ``base_estimator`` to ``estimator`` in newer
    # scikit-learn releases, while it stays the first positional argument
    # in both, so this call works regardless of the installed version.
    return CalibratedClassifierCV(LinearSVC())

27
app/classifier/preprocess.py

@@ -1,27 +0,0 @@
"""
Preprocessor.
"""
import MeCab
from sklearn.feature_extraction.text import TfidfVectorizer
# Module-level MeCab tagger, created once at import time with the
# '-Owakati' option and reused by every tokenize() call.
t = MeCab.Tagger('-Owakati')
def tokenize(text):
    """Split Japanese text into words using the module-level MeCab tagger.

    Args:
        text: Japanese string.

    Returns:
        A list of words.
    """
    parsed = t.parse(text)
    # str.split() without a separator already discards leading and trailing
    # whitespace, so no separate strip step is required.
    return parsed.split()
def build_vectorizer():
    """Create the TF-IDF vectorizer used to turn texts into features.

    Returns:
        An unfitted TfidfVectorizer configured with tokenize() as its
        tokenizer.
    """
    return TfidfVectorizer(tokenizer=tokenize)

34
app/classifier/task.py

@@ -1,34 +0,0 @@
"""
Task runner.
"""
import numpy as np
from doccano.app.classifier.model import build_model
from doccano.app.classifier import build_vectorizer
from doccano.app.classifier import load_dataset, save_dataset, make_output, train_test_split
def run(filename):
    """Fit a classifier on one part of a dataset and label the other part.

    The dataset is loaded from ``filename``, split with train_test_split(),
    vectorized, and used to fit the model. Predictions and their highest
    class probability are merged back into the records with make_output(),
    and the result is saved to the same file.

    Args:
        filename: Path of the dataset file. It is both read and overwritten.
    """
    print('Loading dataset...')
    records = load_dataset(filename)
    train_texts, test_texts, train_labels, test_ids = train_test_split(records)

    print('Building vectorizer and model...')
    vectorizer = build_vectorizer()
    model = build_model()

    print('Vectorizing...')
    # Fit the vocabulary on the training texts only, then reuse it for the
    # texts that are going to be predicted.
    train_features = vectorizer.fit_transform(train_texts)
    test_features = vectorizer.transform(test_texts)

    print('Fitting...')
    model.fit(train_features, train_labels)

    print('Predicting...')
    predicted = model.predict(test_features)
    # Keep only the largest class probability of each prediction.
    confidence = np.max(model.predict_proba(test_features), axis=-1)

    print('Saving...')
    labelled = make_output(records, test_ids, predicted, confidence)
    save_dataset(labelled, filename)

45
app/classifier/utils.py

@@ -1,45 +0,0 @@
"""
Utilities.
"""
import json
def train_test_split(data):
    """Split records into training examples and examples to be predicted.

    Records whose 'manual' value is truthy contribute their text and label
    to the training set. All other records contribute their text and id to
    the set to be predicted.

    Args:
        data: Iterable of dicts with 'text' and 'manual' keys. Records with
            a truthy 'manual' must also have 'label'; the others must have
            'id'.

    Returns:
        A tuple ``(x_train, x_test, y_train, ids)`` of lists, each in the
        order the records appear in ``data``.

    Raises:
        KeyError: If a record lacks a key required for its branch.
    """
    x_train, x_test, y_train, ids = [], [], [], []
    for d in data:
        text = d['text']
        if d['manual']:
            x_train.append(text)
            # The label is looked up only for training records, so records
            # awaiting prediction are not required to carry a 'label' key.
            y_train.append(d['label'])
        else:
            x_test.append(text)
            ids.append(d['id'])
    return x_train, x_test, y_train, ids
def load_dataset(filename):
    """Load a JSON Lines file into a list.

    Args:
        filename: Path of a UTF-8 text file holding one JSON document per
            line.

    Returns:
        A list with the decoded value of every non-blank line, in file
        order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so non-ASCII text is read the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(filename, encoding='utf-8') as f:
        # Blank lines (for example a stray empty line at the end of the
        # file) hold no record and would make json.loads() fail.
        return [json.loads(line) for line in f if line.strip()]
def save_dataset(obj, filename):
    """Write records to a file as JSON Lines.

    Args:
        obj: Iterable of JSON-serializable values; each one becomes a line.
        filename: Path of the file to write. Existing content is replaced.
    """
    with open(filename, 'w') as out:
        # One JSON document per line, each terminated by a newline.
        out.writelines(json.dumps(record) + '\n' for record in obj)
def make_output(data, ids, y_pred, y_prob):
    """Write predictions back into the records they belong to.

    ``data`` is walked once while a cursor moves through ``ids``. Whenever
    a record's 'id' equals the id under the cursor, the record receives the
    prediction at that position and the cursor advances. The ids therefore
    have to appear in the same relative order as the records in ``data``.

    Args:
        data: List of dicts with an 'id' key; updated in place.
        ids: Ids of the records that were predicted.
        y_pred: Predicted labels, aligned with ``ids``.
        y_prob: Prediction probabilities, aligned with ``ids``.

    Returns:
        The same ``data`` object, with 'label' (as str) and 'prob' (as
        float) set on every matched record.
    """
    cursor = 0
    total = len(ids)
    for record in data:
        if cursor == total:
            # Every prediction has been assigned; nothing left to do.
            break
        if record['id'] != ids[cursor]:
            continue
        record['label'] = str(y_pred[cursor])
        record['prob'] = float(y_prob[cursor])
        cursor += 1
    return data
Loading…
Cancel
Save