You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

27 lines
430 B

"""
Preprocessor.
"""
import MeCab
from sklearn.feature_extraction.text import TfidfVectorizer
# Module-level MeCab tagger, constructed once at import time and reused by
# every tokenize() call. The '-Owakati' option asks MeCab for its "wakati"
# output format, which tokenize() then splits on whitespace.
t = MeCab.Tagger('-Owakati')
def tokenize(text):
    """Split Japanese text into words with the module-level MeCab tagger.

    Args:
        text: Japanese string to segment.

    Returns:
        A list of the whitespace-separated tokens found in the tagger's
        output for ``text``.
    """
    # Run the shared tagger over the text.
    segmented = t.parse(text)
    # Drop trailing whitespace from the tagger output, then split on
    # whitespace to get the individual words.
    trimmed = segmented.rstrip()
    return trimmed.split()
def build_vectorizer():
    """Create a TF-IDF vectorizer that tokenizes text with ``tokenize``.

    Returns:
        A newly constructed ``TfidfVectorizer`` whose ``tokenizer`` is the
        module's ``tokenize`` function. All other settings keep the library
        defaults, and the vectorizer is returned without being fitted.
    """
    return TfidfVectorizer(tokenizer=tokenize)