You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

27 lines
430 B

  1. """
  2. Preprocessor.
  3. """
  4. import MeCab
  5. from sklearn.feature_extraction.text import TfidfVectorizer
  6. t = MeCab.Tagger('-Owakati')
  7. def tokenize(text):
  8. """Tokenize Japanese text.
  9. Args:
  10. text: Japanese string.
  11. Returns:
  12. A list of words.
  13. """
  14. words = t.parse(text).rstrip().split()
  15. return words
  16. def build_vectorizer():
  17. vectorizer = TfidfVectorizer(tokenizer=tokenize)
  18. return vectorizer