Browse Source

Implement classifier

pull/10/head
Hironsan 6 years ago
parent
commit
0ec45e48e9
8 changed files with 142 additions and 0 deletions
  1. 12
      doccano/classifier/model.py
  2. 27
      doccano/classifier/preprocess.py
  3. 35
      doccano/classifier/task.py
  4. 32
      doccano/classifier/utils.py
  5. 4
      requirements.txt
  6. 0
      tests/.gitkeep
  7. 18
      tests/data/testdata.jsonl
  8. 14
      tests/test_classifier.py

12
doccano/classifier/model.py

@ -0,0 +1,12 @@
"""
Baseline model.
"""
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
def build_model():
    """Create the baseline classifier.

    Returns:
        An unfitted ``CalibratedClassifierCV`` that wraps a ``LinearSVC``.
    """
    svm = LinearSVC()
    return CalibratedClassifierCV(base_estimator=svm)

27
doccano/classifier/preprocess.py

@ -0,0 +1,27 @@
"""
Preprocessor.
"""
import MeCab
from sklearn.feature_extraction.text import TfidfVectorizer
# Module-level MeCab tagger, created once at import time and reused by
# tokenize(), which splits its parse output on whitespace.
# NOTE(review): '-Owakati' presumably selects MeCab's space-separated
# (wakati-gaki) output format -- confirm against the MeCab documentation.
t = MeCab.Tagger('-Owakati')
def tokenize(text):
    """Split Japanese text into words with the module-level MeCab tagger.

    Args:
        text: Japanese string to segment.

    Returns:
        A list of word strings, in the order the tagger emitted them.
    """
    # The tagger output is stripped of trailing whitespace and then split
    # on runs of whitespace to obtain the individual tokens.
    return t.parse(text).rstrip().split()
def build_vectorizer():
    """Create the TF-IDF vectorizer used for feature extraction.

    Returns:
        An unfitted ``TfidfVectorizer`` configured to use ``tokenize`` as
        its tokenizer.
    """
    return TfidfVectorizer(tokenizer=tokenize)

35
doccano/classifier/task.py

@ -0,0 +1,35 @@
"""
Task runner.
"""
import numpy as np
from doccano.classifier.model import build_model
from doccano.classifier.preprocess import build_vectorizer
from doccano.classifier.utils import load_dataset, save_dataset, train_test_split
def run(filename):
    """Train on manually labelled records and predict the remaining ones.

    Loads a JSON-lines dataset, fits the TF-IDF vectorizer and the baseline
    classifier on the records flagged ``manual``, then predicts a label and
    a confidence for every other record.

    Args:
        filename: Path to the JSON-lines dataset.

    Returns:
        A list of dicts, one per non-manual record and in dataset order,
        with the keys ``id``, ``label`` (predicted label) and ``prob``
        (highest predicted class probability). The list is empty when the
        dataset contains no record to predict.
    """
    print('Loading dataset...')
    data = load_dataset(filename)
    x_train, x_test, y_train, ids = train_test_split(data)
    # Remember this before x_test is rebound to a sparse matrix, whose
    # truth value cannot be evaluated.
    has_targets = bool(x_test)

    print('Building vectorizer and model...')
    vectorizer = build_vectorizer()
    clf = build_model()

    print('Vectorizing...')
    x_train = vectorizer.fit_transform(x_train)
    if has_targets:
        x_test = vectorizer.transform(x_test)

    print('Fitting...')
    clf.fit(x_train, y_train)

    print('Predicting...')
    outputs = []
    if has_targets:
        # Skipped when there is nothing to predict: scikit-learn estimators
        # generally reject zero-sample input.
        y_pred = clf.predict(x_test)
        # Keep only the probability of the most likely class per sample.
        y_prob = np.max(clf.predict_proba(x_test), axis=-1)
        # tolist() converts numpy scalars to plain Python values so the
        # records can be serialised with json.
        outputs = [
            {'id': record_id, 'label': label, 'prob': prob}
            for record_id, label, prob
            in zip(ids, y_pred.tolist(), y_prob.tolist())
        ]

    print('Saving...')
    # TODO: persistence is not wired up yet. Calling
    # save_dataset(outputs, filename) here would overwrite the input
    # dataset, so the predictions are returned to the caller instead.
    return outputs

32
doccano/classifier/utils.py

@ -0,0 +1,32 @@
"""
Utilities.
"""
import json
def train_test_split(data):
    """Split records into a training set and a set to be predicted.

    Records whose ``manual`` flag is truthy become training samples; all
    other records become prediction targets.

    Args:
        data: Iterable of dicts. Every record needs ``text`` and ``manual``;
            manual records also need ``label`` and the others need ``id``.

    Returns:
        A tuple ``(x_train, x_test, y_train, ids)`` of lists: training
        texts, texts to predict, training labels, and the ids of the
        records to predict (aligned with ``x_test``).
    """
    x_train, x_test, y_train, ids = [], [], [], []
    for d in data:
        text = d['text']
        if d['manual']:
            x_train.append(text)
            # Only read the label here: records awaiting prediction are
            # not required to carry one.
            y_train.append(d['label'])
        else:
            x_test.append(text)
            ids.append(d['id'])
    return x_train, x_test, y_train, ids
def load_dataset(filename):
    """Load a JSON-lines file into a list of records.

    Args:
        filename: Path to a UTF-8 text file holding one JSON document per
            line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        ValueError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so non-ASCII (e.g. Japanese) text loads the
    # same way regardless of the platform's default encoding.
    with open(filename, encoding='utf-8') as f:
        # Blank lines (such as a trailing newline at the end of the file)
        # carry no record and are skipped instead of failing to parse.
        return [json.loads(line) for line in f if line.strip()]
def save_dataset(obj, filename):
    """Write records to a file in JSON-lines form.

    Args:
        obj: Iterable of JSON-serialisable records.
        filename: Destination path; any existing file is overwritten.
    """
    with open(filename, 'w') as out:
        # One JSON document per line, each terminated by a newline.
        out.writelines(json.dumps(record) + '\n' for record in obj)

4
requirements.txt

@ -0,0 +1,4 @@
mecab-python3==0.7
numpy==1.14.3
scikit-learn==0.19.1
scipy==1.1.0

0
tests/.gitkeep

18
tests/data/testdata.jsonl

@ -0,0 +1,18 @@
{"id": 5, "label": 2, "text": "\u30a2\u30f3\u30d1\u30b5\u30f3\u30c9", "manual": true, "prob": 0.5}
{"id": 10, "label": 2, "text": "\u8a00\u8a9e", "manual": true, "prob": 0.5}
{"id": 11, "label": 1, "text": "\u65e5\u672c\u8a9e", "manual": true, "prob": 0.5}
{"id": 12, "label": 1, "text": "\u5730\u7406\u5b66", "manual": true, "prob": 0.5}
{"id": 23, "label": 2, "text": "\u56fd\u306e\u4e00\u89a7", "manual": false, "prob": 0.5}
{"id": 31, "label": 2, "text": "\u30d1\u30ea", "manual": false, "prob": 0.5}
{"id": 32, "label": 3, "text": "\u30e8\u30fc\u30ed\u30c3\u30d1", "manual": false, "prob": 0.5}
{"id": 42, "label": 2, "text": "\u751f\u7269", "manual": true, "prob": 0.5}
{"id": 43, "label": 3, "text": "\u30b3\u30b1\u690d\u7269", "manual": false, "prob": 0.5}
{"id": 47, "label": 3, "text": "\u793e\u4f1a\u5b66", "manual": true, "prob": 0.5}
{"id": 111, "label": 1, "text": "\u65e5\u672c\u8a9e", "manual": true, "prob": 0.5}
{"id": 112, "label": 1, "text": "\u5730\u7406\u5b66", "manual": true, "prob": 0.5}
{"id": 123, "label": 2, "text": "\u56fd\u306e\u4e00\u89a7", "manual": false, "prob": 0.5}
{"id": 131, "label": 2, "text": "\u30d1\u30ea", "manual": false, "prob": 0.5}
{"id": 132, "label": 3, "text": "\u30e8\u30fc\u30ed\u30c3\u30d1", "manual": true, "prob": 0.5}
{"id": 142, "label": 2, "text": "\u751f\u7269", "manual": true, "prob": 0.5}
{"id": 143, "label": 1, "text": "\u30b3\u30b1\u690d\u7269", "manual": false, "prob": 0.5}
{"id": 147, "label": 3, "text": "\u793e\u4f1a\u5b66", "manual": true, "prob": 0.5}

14
tests/test_classifier.py

@ -0,0 +1,14 @@
import os
import unittest
from doccano.classifier.task import run
class TestClassifier(unittest.TestCase):
    """Smoke test for the classifier task runner."""

    @classmethod
    def setUpClass(cls):
        # Resolve the fixture relative to this module so the suite can be
        # started from any working directory.
        here = os.path.dirname(__file__)
        cls.filename = os.path.join(here, 'data/testdata.jsonl')

    def test_task_runner(self):
        # Passes as long as the full pipeline runs without raising.
        run(self.filename)
Loading…
Cancel
Save