You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

34 lines
910 B

  1. """
  2. Task runner.
  3. """
  4. import numpy as np
  5. from doccano.app.classifier.model import build_model
  6. from doccano.app.classifier import build_vectorizer
  7. from doccano.app.classifier import load_dataset, save_dataset, make_output, train_test_split
  8. def run(filename):
  9. print('Loading dataset...')
  10. data = load_dataset(filename)
  11. x_train, x_test, y_train, ids = train_test_split(data)
  12. print('Building vectorizer and model...')
  13. vectorizer = build_vectorizer()
  14. clf = build_model()
  15. print('Vectorizing...')
  16. x_train = vectorizer.fit_transform(x_train)
  17. x_test = vectorizer.transform(x_test)
  18. print('Fitting...')
  19. clf.fit(x_train, y_train)
  20. print('Predicting...')
  21. y_pred = clf.predict(x_test)
  22. y_prob = clf.predict_proba(x_test)
  23. y_prob = np.max(y_prob, axis=-1)
  24. print('Saving...')
  25. data = make_output(data, ids, y_pred, y_prob)
  26. save_dataset(data, filename)