mirror of https://github.com/doccano/doccano.git
python, annotation-tool, datasets, active-learning, text-annotation, dataset, natural-language-processing, data-labeling, machine-learning
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
27 lines
430 B
27 lines
430 B
"""
|
|
Preprocessor.
|
|
"""
|
|
import MeCab
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
|
# Module-level MeCab tagger, created once at import time and reused by
# tokenize() for every call.
# NOTE(review): '-Owakati' presumably selects MeCab's space-separated
# word output, which tokenize() then splits on whitespace -- confirm
# against the MeCab documentation.
t = MeCab.Tagger('-Owakati')
|
|
|
|
|
|
def tokenize(text):
    """Split Japanese text into words using the module-level MeCab tagger.

    Args:
        text: Japanese string to segment.

    Returns:
        A list of the whitespace-separated tokens found in the tagger's
        output for ``text``.
    """
    segmented = t.parse(text)
    # Drop trailing whitespace from the tagger output, then split on
    # runs of whitespace to obtain the individual tokens.
    trimmed = segmented.rstrip()
    return trimmed.split()
|
|
|
|
|
|
def build_vectorizer():
    """Create a TF-IDF vectorizer that tokenizes with ``tokenize``.

    Returns:
        A new, unfitted ``TfidfVectorizer`` whose ``tokenizer`` is this
        module's ``tokenize`` function.
    """
    return TfidfVectorizer(tokenizer=tokenize)
|