mirror of https://github.com/doccano/doccano.git
pythonannotation-tooldatasetsactive-learningtext-annotationdatasetnatural-language-processingdata-labelingmachine-learning
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
34 lines
910 B
34 lines
910 B
"""
|
|
Task runner.
|
|
"""
|
|
import numpy as np
|
|
|
|
from doccano.app.classifier.model import build_model
|
|
from doccano.app.classifier import build_vectorizer
|
|
from doccano.app.classifier import load_dataset, save_dataset, make_output, train_test_split
|
|
|
|
|
|
def run(filename):
|
|
print('Loading dataset...')
|
|
data = load_dataset(filename)
|
|
x_train, x_test, y_train, ids = train_test_split(data)
|
|
|
|
print('Building vectorizer and model...')
|
|
vectorizer = build_vectorizer()
|
|
clf = build_model()
|
|
|
|
print('Vectorizing...')
|
|
x_train = vectorizer.fit_transform(x_train)
|
|
x_test = vectorizer.transform(x_test)
|
|
|
|
print('Fitting...')
|
|
clf.fit(x_train, y_train)
|
|
|
|
print('Predicting...')
|
|
y_pred = clf.predict(x_test)
|
|
y_prob = clf.predict_proba(x_test)
|
|
y_prob = np.max(y_prob, axis=-1)
|
|
|
|
print('Saving...')
|
|
data = make_output(data, ids, y_pred, y_prob)
|
|
save_dataset(data, filename)
|