You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

34 lines
910 B

"""
Task runner.
"""
import numpy as np
from doccano.app.classifier.model import build_model
from doccano.app.classifier import build_vectorizer
from doccano.app.classifier import load_dataset, save_dataset, make_output, train_test_split
def run(filename):
    """Train a classifier on a dataset and save its predictions.

    Loads the dataset named by ``filename``, fits the vectorizer and the
    model on the training split, predicts a label and a score for every
    test sample, and hands the combined output to ``save_dataset``.

    Args:
        filename: Dataset location. It is passed to both ``load_dataset``
            and ``save_dataset``, so the output is presumably written over
            the input -- TODO confirm against ``save_dataset``.
    """
    print('Loading dataset...')
    data = load_dataset(filename)
    # NOTE(review): the split yields no test labels; ``ids`` presumably
    # identifies the rows of ``data`` that make up the test split -- confirm
    # against ``train_test_split``.
    x_train, x_test, y_train, ids = train_test_split(data)

    print('Building vectorizer and model...')
    vectorizer = build_vectorizer()
    clf = build_model()

    print('Vectorizing...')
    # Fit the vectorizer on the training split only, then reuse the fitted
    # vectorizer to transform the test split.
    x_train = vectorizer.fit_transform(x_train)
    x_test = vectorizer.transform(x_test)

    print('Fitting...')
    clf.fit(x_train, y_train)

    print('Predicting...')
    y_pred = clf.predict(x_test)
    y_prob = clf.predict_proba(x_test)
    # Keep only the largest value along the last axis -- presumably the
    # probability of the predicted class for each sample.
    y_prob = np.max(y_prob, axis=-1)

    print('Saving...')
    data = make_output(data, ids, y_pred, y_prob)
    save_dataset(data, filename)