Implement classifier

7 years ago · 0ec45e48e9
8 changed files with 142 additions and 0 deletions
--- a/doccano/classifier/model.py
+++ b/doccano/classifier/model.py
@ -0,0 +1,12 @@
+"""
+Baseline model.
+"""
+
+from sklearn.calibration import CalibratedClassifierCV
+from sklearn.svm import LinearSVC
+
+
+def build_model():
+    """Build the baseline classifier.
+
+    Returns:
+        An unfitted CalibratedClassifierCV wrapping a LinearSVC, so that the
+        fitted model offers predict_proba() in addition to predict().
+    """
+    # The wrapped estimator is passed positionally because its keyword name
+    # differs between scikit-learn releases (``base_estimator`` was later
+    # renamed to ``estimator``). ``cv=3`` spells out the default of the
+    # pinned 0.19.x release so that upgrading scikit-learn does not silently
+    # change the number of calibration folds.
+    return CalibratedClassifierCV(LinearSVC(), cv=3)
--- a/doccano/classifier/preprocess.py
+++ b/doccano/classifier/preprocess.py
@ -0,0 +1,27 @@
+"""
+Preprocessor.
+"""
+import MeCab
+from sklearn.feature_extraction.text import TfidfVectorizer
+
+# Module-level tagger shared by every tokenize() call. '-Owakati' is the
+# option string handed to MeCab (presumably word-segmented output, which
+# tokenize() then splits on whitespace -- confirm against the MeCab docs).
+t = MeCab.Tagger('-Owakati')
+
+
+def tokenize(text):
+    """Split Japanese text into words with the module-level MeCab tagger.
+
+    Args:
+        text: Japanese string.
+
+    Returns:
+        A list of words.
+    """
+    # str.split() without a separator already discards leading and trailing
+    # whitespace, so no separate strip of the tagger output is needed.
+    return t.parse(text).split()
+
+
+def build_vectorizer():
+    """Create an unfitted TF-IDF vectorizer that tokenizes with tokenize()."""
+    return TfidfVectorizer(tokenizer=tokenize)
--- a/doccano/classifier/task.py
+++ b/doccano/classifier/task.py
@ -0,0 +1,35 @@
+"""
+Task runner.
+"""
+import numpy as np
+
+from doccano.classifier.model import build_model
+from doccano.classifier.preprocess import build_vectorizer
+from doccano.classifier.utils import load_dataset, save_dataset, train_test_split
+
+
+def run(filename):
+    """Train on the manually labelled rows and predict the remaining ones.
+
+    Progress messages are printed to stdout. The predictions are computed
+    but are neither persisted nor returned yet.
+
+    Args:
+        filename: Path to a JSON Lines dataset readable by load_dataset().
+    """
+    print('Loading dataset...')
+    data = load_dataset(filename)
+    x_train, x_test, y_train, ids = train_test_split(data)
+
+    print('Building vectorizer and model...')
+    vectorizer = build_vectorizer()
+    clf = build_model()
+
+    print('Vectorizing...')
+    # Fit the vectorizer on the training texts only, then reuse it for the
+    # texts that are to be predicted.
+    x_train = vectorizer.fit_transform(x_train)
+    x_test = vectorizer.transform(x_test)
+
+    print('Fitting...')
+    clf.fit(x_train, y_train)
+
+    print('Predicting...')
+    y_pred = clf.predict(x_test)
+    # Keep only the highest class probability of each predicted row.
+    y_prob = np.max(clf.predict_proba(x_test), axis=-1)
+
+    print('Saving...')
+    # TODO: build one record per predicted row from ids, y_pred and y_prob
+    # and hand the records to save_dataset(). Not enabled yet; note that
+    # writing to ``filename`` would overwrite the input dataset.
--- a/doccano/classifier/utils.py
+++ b/doccano/classifier/utils.py
@ -0,0 +1,32 @@
+"""
+Utilities.
+"""
+import json
+
+
+def train_test_split(data):
+    """Split records into labelled training data and rows to be predicted.
+
+    Rows whose ``manual`` flag is truthy go to the training set; all other
+    rows go to the test set together with their ``id``.
+
+    Args:
+        data: Iterable of dicts with ``text`` and ``manual`` keys. Manual
+            rows also need ``label``; the other rows need ``id``.
+
+    Returns:
+        A tuple ``(x_train, x_test, y_train, ids)`` of lists.
+
+    Raises:
+        KeyError: If a row lacks a key that is required for its kind.
+    """
+    x_train, x_test, y_train, ids = [], [], [], []
+    for d in data:
+        text = d['text']
+        if d['manual']:
+            x_train.append(text)
+            # Only labelled rows have to carry a label, so it is looked up
+            # here rather than for every row.
+            y_train.append(d['label'])
+        else:
+            x_test.append(text)
+            ids.append(d['id'])
+
+    return x_train, x_test, y_train, ids
+
+
+def load_dataset(filename):
+    """Load a JSON Lines file.
+
+    Args:
+        filename: Path to a UTF-8 encoded file with one JSON value per line.
+
+    Returns:
+        A list with the decoded value of every non-blank line, in file order.
+
+    Raises:
+        ValueError: If a non-blank line is not valid JSON.
+    """
+    # Read as UTF-8 explicitly instead of depending on the locale default.
+    with open(filename, encoding='utf-8') as f:
+        # Blank lines (e.g. an extra newline at the end of the file) carry no
+        # record and would make json.loads() raise.
+        return [json.loads(line) for line in f if line.strip()]
+
+
+def save_dataset(obj, filename):
+    """Write every item of ``obj`` to ``filename``, one JSON document per line.
+
+    Args:
+        obj: Iterable of JSON-serialisable items.
+        filename: Path of the file to write; opened in 'w' mode.
+    """
+    with open(filename, 'w') as out:
+        out.writelines(json.dumps(item) + '\n' for item in obj)
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,4 @@
+mecab-python3==0.7
+numpy==1.14.3
+scikit-learn==0.19.1
+scipy==1.1.0
--- a/tests/.gitkeep
+++ b/tests/.gitkeep
--- a/tests/data/testdata.jsonl
+++ b/tests/data/testdata.jsonl
@ -0,0 +1,18 @@
+{"id": 5, "label": 2, "text": "\u30a2\u30f3\u30d1\u30b5\u30f3\u30c9", "manual": true, "prob": 0.5}
+{"id": 10, "label": 2, "text": "\u8a00\u8a9e", "manual": true, "prob": 0.5}
+{"id": 11, "label": 1, "text": "\u65e5\u672c\u8a9e", "manual": true, "prob": 0.5}
+{"id": 12, "label": 1, "text": "\u5730\u7406\u5b66", "manual": true, "prob": 0.5}
+{"id": 23, "label": 2, "text": "\u56fd\u306e\u4e00\u89a7", "manual": false, "prob": 0.5}
+{"id": 31, "label": 2, "text": "\u30d1\u30ea", "manual": false, "prob": 0.5}
+{"id": 32, "label": 3, "text": "\u30e8\u30fc\u30ed\u30c3\u30d1", "manual": false, "prob": 0.5}
+{"id": 42, "label": 2, "text": "\u751f\u7269", "manual": true, "prob": 0.5}
+{"id": 43, "label": 3, "text": "\u30b3\u30b1\u690d\u7269", "manual": false, "prob": 0.5}
+{"id": 47, "label": 3, "text": "\u793e\u4f1a\u5b66", "manual": true, "prob": 0.5}
+{"id": 111, "label": 1, "text": "\u65e5\u672c\u8a9e", "manual": true, "prob": 0.5}
+{"id": 112, "label": 1, "text": "\u5730\u7406\u5b66", "manual": true, "prob": 0.5}
+{"id": 123, "label": 2, "text": "\u56fd\u306e\u4e00\u89a7", "manual": false, "prob": 0.5}
+{"id": 131, "label": 2, "text": "\u30d1\u30ea", "manual": false, "prob": 0.5}
+{"id": 132, "label": 3, "text": "\u30e8\u30fc\u30ed\u30c3\u30d1", "manual": true, "prob": 0.5}
+{"id": 142, "label": 2, "text": "\u751f\u7269", "manual": true, "prob": 0.5}
+{"id": 143, "label": 1, "text": "\u30b3\u30b1\u690d\u7269", "manual": false, "prob": 0.5}
+{"id": 147, "label": 3, "text": "\u793e\u4f1a\u5b66", "manual": true, "prob": 0.5}
--- a/tests/test_classifier.py
+++ b/tests/test_classifier.py
@ -0,0 +1,14 @@
+import os
+import unittest
+
+from doccano.classifier.task import run
+
+
+class TestClassifier(unittest.TestCase):
+    """Smoke test for the classifier task runner."""
+
+    @classmethod
+    def setUpClass(cls):
+        # The fixture is resolved relative to this test module.
+        tests_dir = os.path.dirname(__file__)
+        cls.filename = os.path.join(tests_dir, 'data/testdata.jsonl')
+
+    def test_task_runner(self):
+        # Passes as long as the whole pipeline runs without raising.
+        run(self.filename)