# ---------------------------------------------------------------------------
# doccano/classifier/model.py -- Baseline model.
# ---------------------------------------------------------------------------
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC


def build_model():
    """Create the (unfitted) baseline classifier.

    Returns:
        A CalibratedClassifierCV wrapping a default LinearSVC.
    """
    svm = LinearSVC()
    return CalibratedClassifierCV(base_estimator=svm)


# ---------------------------------------------------------------------------
# doccano/classifier/preprocess.py -- Preprocessor.
# ---------------------------------------------------------------------------
import MeCab
from sklearn.feature_extraction.text import TfidfVectorizer

# Single module-level tagger, created once at import time and shared by every
# tokenize() call. '-Owakati' selects MeCab's space-separated output format.
t = MeCab.Tagger('-Owakati')


def tokenize(text):
    """Split Japanese text into words using the shared MeCab tagger.

    Args:
        text: Japanese string.

    Returns:
        A list of words.
    """
    wakati = t.parse(text)
    # str.split() with no arguments already discards leading/trailing
    # whitespace, including the newline the tagger appends.
    return wakati.split()


def build_vectorizer():
    """Create an (unfitted) TF-IDF vectorizer that tokenizes with `tokenize`.

    Returns:
        A TfidfVectorizer configured with the MeCab-based tokenizer.
    """
    return TfidfVectorizer(tokenizer=tokenize)
# ---------------------------------------------------------------------------
# doccano/classifier/task.py -- Task runner.
# ---------------------------------------------------------------------------
import numpy as np

from doccano.classifier.model import build_model
from doccano.classifier.preprocess import build_vectorizer
from doccano.classifier.utils import load_dataset, save_dataset, train_test_split


def run(filename):
    """Fit the baseline model on labelled records and predict the others.

    The dataset is loaded from `filename`, split by `train_test_split`, the
    training texts are vectorized and used to fit the model, and predictions
    plus their highest class probability are computed for the remaining
    texts. Progress is reported on stdout.

    Args:
        filename: Path of the dataset file passed to `load_dataset`.

    Returns:
        None. The predictions are currently computed but not persisted.
    """
    print('Loading dataset...')
    records = load_dataset(filename)
    train_texts, test_texts, train_labels, ids = train_test_split(records)

    print('Building vectorizer and model...')
    vectorizer, clf = build_vectorizer(), build_model()

    print('Vectorizing...')
    # Fit the vocabulary on the training texts only, then reuse it for the
    # texts to be predicted.
    train_matrix = vectorizer.fit_transform(train_texts)
    test_matrix = vectorizer.transform(test_texts)

    print('Fitting...')
    clf.fit(train_matrix, train_labels)

    print('Predicting...')
    y_pred = clf.predict(test_matrix)
    # Keep only the probability of the most likely class for each sample.
    y_prob = np.max(clf.predict_proba(test_matrix), axis=-1)

    print('Saving...')
    outputs = {}
    # TODO: saving is not implemented yet. The original note suggests the
    # intent is to build `outputs` from data, ids, y_pred and y_prob and then
    # call save_dataset(outputs, filename) -- confirm before enabling, since
    # that would overwrite the input file.
# ---------------------------------------------------------------------------
# doccano/classifier/utils.py -- Utilities.
# ---------------------------------------------------------------------------
import json


def train_test_split(data):
    """Partition records by their 'manual' flag.

    Records whose 'manual' value is truthy contribute their text and label to
    the training set; all other records contribute their text and id to the
    set that still needs a prediction.

    Args:
        data: Iterable of dicts. Every record needs 'text' and 'manual';
            manual records also need 'label', the others need 'id'.

    Returns:
        A tuple (x_train, x_test, y_train, ids) of four lists.

    Raises:
        KeyError: If a record lacks a key required for its branch.
    """
    x_train, x_test, y_train, ids = [], [], [], []
    for d in data:
        text = d['text']
        if d['manual']:
            x_train.append(text)
            # The label is only read here, so records that have not been
            # labelled yet are not required to carry a 'label' key.
            y_train.append(d['label'])
        else:
            x_test.append(text)
            ids.append(d['id'])

    return x_train, x_test, y_train, ids


def load_dataset(filename):
    """Read a JSON Lines file into a list of decoded objects.

    Args:
        filename: Path of a UTF-8 encoded file with one JSON value per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(filename, encoding='utf-8') as f:
        # Blank lines (e.g. a trailing empty line) are skipped instead of
        # being handed to json.loads, which would reject them.
        return [json.loads(line) for line in f if line.strip()]


def save_dataset(obj, filename):
    """Write an iterable of JSON-serializable items as JSON Lines.

    Args:
        obj: Iterable of items; each one is serialized onto its own line.
        filename: Destination path. An existing file is overwritten.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        f.writelines('{}\n'.format(json.dumps(line)) for line in obj)


# ---------------------------------------------------------------------------
# requirements.txt (pinned dependencies added by the same change)
# ---------------------------------------------------------------------------
# mecab-python3==0.7
# numpy==1.14.3
# scikit-learn==0.19.1
# scipy==1.1.0

# ---------------------------------------------------------------------------
# tests/data/testdata.jsonl (fixture records, first part; the last record
# below is cut off at the end of this source line)
# ---------------------------------------------------------------------------
# {"id": 5, "label": 2, "text": "\u30a2\u30f3\u30d1\u30b5\u30f3\u30c9", "manual": true, "prob": 0.5, "manual": true, "prob": 0.5}
# {"id": 10, "label": 2, "text": "\u8a00\u8a9e", "manual": true, "prob": 0.5}
# {"id": 11, "label": 1, "text": "\u65e5\u672c\u8a9e", "manual": true, "prob": 0.5}
# {"id": 12, "label": 1, "text": "\u5730\u7406\u5b66", "manual": true, "prob": 0.5}
# {"id": 23, "label": 2, "text": "\u56fd\u306e\u4e00\u89a7", "manual": false, "prob": 0.5}
# {"id": 31, "label": 2, "text": "\u30d1\u30ea", "manual": false, "prob": 0.5}
# {"id": 32, "label": 3, "text": "\u30e8\u30fc\u30ed\u30c3\u30d1", "manual": false, "prob": 0.5}
# {"id": 42, "label": 2, "text": "\u751f\u7269", "manual": true, "prob": 0.5}
# {"id": 43, "label": 3, "text": "\u30b3\u30b1\u690d\u7269", "manual": false, "prob": 0.5}
# {"id": 47, "label": 3, "text": "\u793e\u4f1a\u5b66", "manual": true, "prob": 0.5}
# {"id": 111, "label": 1, "text": "\u65e5\u672c\u8a9e",
"manual": true, "prob": 0.5} +{"id": 112, "label": 1, "text": "\u5730\u7406\u5b66", "manual": true, "prob": 0.5} +{"id": 123, "label": 2, "text": "\u56fd\u306e\u4e00\u89a7", "manual": false, "prob": 0.5} +{"id": 131, "label": 2, "text": "\u30d1\u30ea", "manual": false, "prob": 0.5} +{"id": 132, "label": 3, "text": "\u30e8\u30fc\u30ed\u30c3\u30d1", "manual": true, "prob": 0.5} +{"id": 142, "label": 2, "text": "\u751f\u7269", "manual": true, "prob": 0.5} +{"id": 143, "label": 1, "text": "\u30b3\u30b1\u690d\u7269", "manual": false, "prob": 0.5} +{"id": 147, "label": 3, "text": "\u793e\u4f1a\u5b66", "manual": true, "prob": 0.5} \ No newline at end of file diff --git a/tests/test_classifier.py b/tests/test_classifier.py new file mode 100644 index 00000000..5c765fa7 --- /dev/null +++ b/tests/test_classifier.py @@ -0,0 +1,14 @@ +import os +import unittest + +from doccano.classifier.task import run + + +class TestClassifier(unittest.TestCase): + + @classmethod + def setUpClass(cls): + cls.filename = os.path.join(os.path.dirname(__file__), 'data/testdata.jsonl') + + def test_task_runner(self): + run(self.filename)