From 6781e26d33b0c67acfc556ab6bcfc727c6c4141a Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 17 Nov 2020 19:17:50 +0100 Subject: [PATCH 1/2] FastText import functionality --- app/api/tests/data/example_fasttext.txt | 3 ++ ...xample_fasttext_label_tag_without_name.txt | 1 + .../data/example_fasttext_without_text.txt | 2 + app/api/tests/test_api.py | 15 ++++++- app/api/utils.py | 43 +++++++++++++++++++ app/api/views.py | 4 +- frontend/store/projects.js | 12 +++++- 7 files changed, 76 insertions(+), 4 deletions(-) create mode 100644 app/api/tests/data/example_fasttext.txt create mode 100644 app/api/tests/data/example_fasttext_label_tag_without_name.txt create mode 100644 app/api/tests/data/example_fasttext_without_text.txt diff --git a/app/api/tests/data/example_fasttext.txt b/app/api/tests/data/example_fasttext.txt new file mode 100644 index 00000000..30b49938 --- /dev/null +++ b/app/api/tests/data/example_fasttext.txt @@ -0,0 +1,3 @@ +__label__house mansion home +__label__president __label__american __label__us Obama Trump Kennedy +dog cat \ No newline at end of file diff --git a/app/api/tests/data/example_fasttext_label_tag_without_name.txt b/app/api/tests/data/example_fasttext_label_tag_without_name.txt new file mode 100644 index 00000000..41457ae4 --- /dev/null +++ b/app/api/tests/data/example_fasttext_label_tag_without_name.txt @@ -0,0 +1 @@ +__label__ house cat dog \ No newline at end of file diff --git a/app/api/tests/data/example_fasttext_without_text.txt b/app/api/tests/data/example_fasttext_without_text.txt new file mode 100644 index 00000000..a85dc28d --- /dev/null +++ b/app/api/tests/data/example_fasttext_without_text.txt @@ -0,0 +1,2 @@ +__label__cat ex ex ex +__label__dog \ No newline at end of file diff --git a/app/api/tests/test_api.py b/app/api/tests/test_api.py index 743dca3c..e30f8515 100644 --- a/app/api/tests/test_api.py +++ b/app/api/tests/test_api.py @@ -9,7 +9,7 @@ from model_mommy import mommy from ..models import User, SequenceAnnotation, 
Document, Role, RoleMapping from ..models import DOCUMENT_CLASSIFICATION, SEQUENCE_LABELING, SEQ2SEQ, SPEECH2TEXT -from ..utils import PlainTextParser, CoNLLParser, JSONParser, CSVParser +from ..utils import PlainTextParser, CoNLLParser, JSONParser, CSVParser, FasttextParser from ..exceptions import FileParseException DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') @@ -1435,7 +1435,18 @@ class TestParser(APITestCase): def test_give_data_without_label_to_json_parser(self): self.parser_helper(filename='example.jsonl', parser=JSONParser(), include_label=False) - + + def test_give_labeling_data_to_fasttext_parser(self): + self.parser_helper(filename='example_fasttext.txt', parser=FasttextParser()) + + def test_give_data_without_label_name_to_fasttext_parser(self): + with self.assertRaises(FileParseException): + self.parser_helper(filename='example_fasttext_label_tag_without_name.txt', parser=FasttextParser()) + + def test_give_data_without_text_to_fasttext_parser(self): + with self.assertRaises(FileParseException): + self.parser_helper(filename='example_fasttext_without_text.txt', parser=FasttextParser()) + class TestDownloader(APITestCase): diff --git a/app/api/utils.py b/app/api/utils.py index 03bbe118..7d522867 100644 --- a/app/api/utils.py +++ b/app/api/utils.py @@ -423,6 +423,49 @@ class JSONParser(FileParser): yield data +class FasttextParser(FileParser): + """ + Parse files in fastText format. 
+ Labels are marked with the __label__ prefix + and the corresponding text comes afterwards in the same line + For example: + ``` + __label__dog poodle + __label__house mansion + ``` + """ + def parse(self, file): + file = EncodedIO(file) + file = io.TextIOWrapper(file, encoding=file.encoding) + data = [] + for i, line in enumerate(file, start=0): + if len(data) >= settings.IMPORT_BATCH_SIZE: + yield data + data = [] + + # Search Labels, check correct syntax and append + labels = [] + tokens = line.rstrip().split(" ") + for token in tokens: + if token.startswith('__label__'): + if token == '__label__': + raise FileParseException(line_num=i, line=line) + labels.append(token[len('__label__'):]) + else: + break + + # Check if text for labels is given + if len(tokens) == len(labels): + raise FileParseException(line_num=i, line=line) + + text = " ".join(tokens[len(labels):]) + data.append({'text': text, 'labels': labels}) + + if data: + yield data + + + class AudioParser(FileParser): def parse(self, file): file_type, _ = mimetypes.guess_type(file.name, strict=False) diff --git a/app/api/views.py b/app/api/views.py index bd1c8bfe..55f026ee 100644 --- a/app/api/views.py +++ b/app/api/views.py @@ -22,7 +22,7 @@ from .models import Project, Label, Document, RoleMapping, Role from .permissions import IsProjectAdmin, IsAnnotatorAndReadOnly, IsAnnotator, IsAnnotationApproverAndReadOnly, IsOwnAnnotation, IsAnnotationApprover from .serializers import ProjectSerializer, LabelSerializer, DocumentSerializer, UserSerializer, ApproverSerializer from .serializers import ProjectPolymorphicSerializer, RoleMappingSerializer, RoleSerializer -from .utils import CSVParser, ExcelParser, JSONParser, PlainTextParser, CoNLLParser, AudioParser, iterable_to_io +from .utils import CSVParser, ExcelParser, JSONParser, PlainTextParser, CoNLLParser, AudioParser, FasttextParser, iterable_to_io from .utils import JSONLRenderer from .utils import JSONPainter, CSVPainter @@ -296,6 +296,8 @@ class 
TextUploadAPI(APIView): return ExcelParser() elif file_format == 'audio': return AudioParser() + elif file_format == 'fastText': + return FasttextParser() else: raise ValidationError('format {} is invalid.'.format(file_format)) diff --git a/frontend/store/projects.js b/frontend/store/projects.js index be6fe2c6..6fdd05f6 100644 --- a/frontend/store/projects.js +++ b/frontend/store/projects.js @@ -78,6 +78,11 @@ export const getters = { text: 'Excel', accept: '.xlsx' } + const fastText = { + type: 'fastText', + text: 'Fasttext', + accept: '.txt' + } if (state.current.project_type === 'DocumentClassification') { json.examples = [ '{"text": "Terrible customer service.", "labels": ["negative"]}\n', @@ -96,11 +101,16 @@ export const getters = { '"Really great transaction.","positive"\n', '"Great price.","positive"' ] + fastText.examples = [ + '__label__[label name] text \n', + '__label__president Obama Trump' + ] return [ plain, csv, json, - excel + excel, + fastText ] } else if (state.current.project_type === 'SequenceLabeling') { json.examples = [ From ba3b2ab0455d77a8c3c2e0038a0368753f167e8c Mon Sep 17 00:00:00 2001 From: Paul Date: Fri, 27 Nov 2020 19:14:32 +0100 Subject: [PATCH 2/2] FastText import: supports label tags everywhere in line --- app/api/tests/data/example_fasttext.txt | 3 ++- app/api/tests/test_api.py | 16 ++++++++-------- app/api/utils.py | 23 +++++++++++------------ app/api/views.py | 4 ++-- frontend/store/projects.js | 2 +- 5 files changed, 24 insertions(+), 24 deletions(-) diff --git a/app/api/tests/data/example_fasttext.txt b/app/api/tests/data/example_fasttext.txt index 30b49938..d994c9fd 100644 --- a/app/api/tests/data/example_fasttext.txt +++ b/app/api/tests/data/example_fasttext.txt @@ -1,3 +1,4 @@ __label__house mansion home __label__president __label__american __label__us Obama Trump Kennedy -dog cat \ No newline at end of file +VW __label__car BMW +dog cat \ No newline at end of file diff --git a/app/api/tests/test_api.py 
b/app/api/tests/test_api.py index e30f8515..b0c2cf31 100644 --- a/app/api/tests/test_api.py +++ b/app/api/tests/test_api.py @@ -9,7 +9,7 @@ from model_mommy import mommy from ..models import User, SequenceAnnotation, Document, Role, RoleMapping from ..models import DOCUMENT_CLASSIFICATION, SEQUENCE_LABELING, SEQ2SEQ, SPEECH2TEXT -from ..utils import PlainTextParser, CoNLLParser, JSONParser, CSVParser, FasttextParser +from ..utils import PlainTextParser, CoNLLParser, JSONParser, CSVParser, FastTextParser from ..exceptions import FileParseException DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') @@ -1435,18 +1435,18 @@ class TestParser(APITestCase): def test_give_data_without_label_to_json_parser(self): self.parser_helper(filename='example.jsonl', parser=JSONParser(), include_label=False) - + def test_give_labeling_data_to_fasttext_parser(self): - self.parser_helper(filename='example_fasttext.txt', parser=FasttextParser()) - + self.parser_helper(filename='example_fasttext.txt', parser=FastTextParser()) + def test_give_data_without_label_name_to_fasttext_parser(self): with self.assertRaises(FileParseException): - self.parser_helper(filename='example_fasttext_label_tag_without_name.txt', parser=FasttextParser()) - + self.parser_helper(filename='example_fasttext_label_tag_without_name.txt', parser=FastTextParser()) + def test_give_data_without_text_to_fasttext_parser(self): with self.assertRaises(FileParseException): - self.parser_helper(filename='example_fasttext_without_text.txt', parser=FasttextParser()) - + self.parser_helper(filename='example_fasttext_without_text.txt', parser=FastTextParser()) + class TestDownloader(APITestCase): diff --git a/app/api/utils.py b/app/api/utils.py index 7d522867..977cba5a 100644 --- a/app/api/utils.py +++ b/app/api/utils.py @@ -423,15 +423,15 @@ class JSONParser(FileParser): yield data -class FasttextParser(FileParser): +class FastTextParser(FileParser): """ Parse files in fastText format. 
- Labels are marked with the __label__ prefix + Labels are marked with the __label__ prefix and the corresponding text comes afterwards in the same line For example: ``` __label__dog poodle - __label__house mansion + __label__house mansion ``` """ def parse(self, file): @@ -443,27 +443,26 @@ class FasttextParser(FileParser): yield data data = [] - # Search Labels, check correct syntax and append + # Search labels and text, check correct syntax and append labels = [] - tokens = line.rstrip().split(" ") - for token in tokens: + text = [] + for token in line.rstrip().split(" "): if token.startswith('__label__'): if token == '__label__': - raise FileParseException(line_num=i, line=line) + raise FileParseException(line_num=i, line=line) labels.append(token[len('__label__'):]) else: - break + text.append(token) # Check if text for labels is given - if len(tokens) == len(labels): + if not text: raise FileParseException(line_num=i, line=line) - text = " ".join(tokens[len(labels):]) - data.append({'text': text, 'labels': labels}) + data.append({'text': " ".join(text), 'labels': labels}) if data: yield data - + class AudioParser(FileParser): diff --git a/app/api/views.py b/app/api/views.py index 55f026ee..a6e2d1ea 100644 --- a/app/api/views.py +++ b/app/api/views.py @@ -22,7 +22,7 @@ from .models import Project, Label, Document, RoleMapping, Role from .permissions import IsProjectAdmin, IsAnnotatorAndReadOnly, IsAnnotator, IsAnnotationApproverAndReadOnly, IsOwnAnnotation, IsAnnotationApprover from .serializers import ProjectSerializer, LabelSerializer, DocumentSerializer, UserSerializer, ApproverSerializer from .serializers import ProjectPolymorphicSerializer, RoleMappingSerializer, RoleSerializer -from .utils import CSVParser, ExcelParser, JSONParser, PlainTextParser, CoNLLParser, AudioParser, FasttextParser, iterable_to_io +from .utils import CSVParser, ExcelParser, JSONParser, PlainTextParser, CoNLLParser, AudioParser, FastTextParser, iterable_to_io from .utils import 
JSONLRenderer from .utils import JSONPainter, CSVPainter @@ -297,7 +297,7 @@ class TextUploadAPI(APIView): elif file_format == 'audio': return AudioParser() elif file_format == 'fastText': - return FasttextParser() + return FastTextParser() else: raise ValidationError('format {} is invalid.'.format(file_format)) diff --git a/frontend/store/projects.js b/frontend/store/projects.js index 6fdd05f6..be2508a6 100644 --- a/frontend/store/projects.js +++ b/frontend/store/projects.js @@ -80,7 +80,7 @@ export const getters = { } const fastText = { type: 'fastText', - text: 'Fasttext', + text: 'FastText', accept: '.txt' } if (state.current.project_type === 'DocumentClassification') {