"""
Preprocessor.
"""
import MeCab
from sklearn.feature_extraction.text import TfidfVectorizer

# '-Owakati' puts MeCab in wakati-gaki mode: it outputs space-separated surface forms.
t = MeCab.Tagger('-Owakati')


def tokenize(text):
    """Tokenize Japanese text.

    Args:
        text: Japanese string.

    Returns:
        A list of words.
    """
    # MeCab's parse() returns the space-separated tokens with a trailing newline.
    words = t.parse(text).rstrip().split()
    return words
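
# Usage sketch (hypothetical input; the exact segmentation depends on the
# installed MeCab dictionary, e.g. ipadic):
#   tokenize('すもももももももものうち')
#   -> ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち']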


def build_vectorizer():
    # token_pattern=None silences scikit-learn's warning that token_pattern
    # is ignored when a custom tokenizer is supplied; behavior is unchanged.
    vectorizer = TfidfVectorizer(tokenizer=tokenize, token_pattern=None)
    return vectorizer
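

# A minimal usage sketch (assumption: the two-sentence corpus below is
# illustrative only, not part of the original module). fit_transform learns
# the vocabulary and returns the TF-IDF matrix; get_feature_names_out
# requires scikit-learn >= 1.0.
if __name__ == '__main__':
    corpus = [
        '私は猫が好きです。',
        '私は犬が好きです。',
    ]
    vectorizer = build_vectorizer()
    tfidf = vectorizer.fit_transform(corpus)
    print(tfidf.shape)
    print(vectorizer.get_feature_names_out())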