From 73b03673635d734a4887d5d054fce216c5348902 Mon Sep 17 00:00:00 2001
From: Casey
Date: Fri, 8 Jul 2022 14:09:19 -0700
Subject: [PATCH] fix fasttext

---
 backend/data_export/pipeline/formatters.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/backend/data_export/pipeline/formatters.py b/backend/data_export/pipeline/formatters.py
index 56b9144e..41efa19d 100644
--- a/backend/data_export/pipeline/formatters.py
+++ b/backend/data_export/pipeline/formatters.py
@@ -46,12 +46,15 @@ class FastTextCategoryFormatter(Formatter):
         """Format the label column to `__label__LabelA __label__LabelB` format.
         Also, drop the columns except for `data` and `self.target_column`.
         """
-        dataset = dataset[[DATA, self.target_column]]
+        dataset = dataset[[DATA, self.target_column, "Comments"]]
         dataset[self.target_column] = dataset[self.target_column].apply(
             lambda labels: " ".join(sorted(f"__label__{label.to_string()}" for label in labels))
         )
         dataset[self.target_column] = dataset[self.target_column].fillna("")
-        dataset = dataset[self.target_column] + " " + dataset[DATA]
+        dataset["Comments"] = dataset["Comments"].apply(
+            lambda comments: "#".join(comment.to_string() for comment in comments)
+        )
+        dataset = dataset[self.target_column] + " " + dataset[DATA] + " " + dataset["Comments"]
         return dataset
 
 