diff --git a/app/api/tests/data/example_fasttext.txt b/app/api/tests/data/example_fasttext.txt new file mode 100644 index 00000000..d994c9fd --- /dev/null +++ b/app/api/tests/data/example_fasttext.txt @@ -0,0 +1,4 @@ +__label__house mansion home +__label__president __label__american __label__us Obama Trump Kennedy +VW __label__car BMW +dog cat \ No newline at end of file diff --git a/app/api/tests/data/example_fasttext_label_tag_without_name.txt b/app/api/tests/data/example_fasttext_label_tag_without_name.txt new file mode 100644 index 00000000..41457ae4 --- /dev/null +++ b/app/api/tests/data/example_fasttext_label_tag_without_name.txt @@ -0,0 +1 @@ +__label__ house cat dog \ No newline at end of file diff --git a/app/api/tests/data/example_fasttext_without_text.txt b/app/api/tests/data/example_fasttext_without_text.txt new file mode 100644 index 00000000..a85dc28d --- /dev/null +++ b/app/api/tests/data/example_fasttext_without_text.txt @@ -0,0 +1,2 @@ +__label__cat ex ex ex +__label__dog \ No newline at end of file diff --git a/app/api/tests/test_api.py b/app/api/tests/test_api.py index 743dca3c..b0c2cf31 100644 --- a/app/api/tests/test_api.py +++ b/app/api/tests/test_api.py @@ -9,7 +9,7 @@ from model_mommy import mommy from ..models import User, SequenceAnnotation, Document, Role, RoleMapping from ..models import DOCUMENT_CLASSIFICATION, SEQUENCE_LABELING, SEQ2SEQ, SPEECH2TEXT -from ..utils import PlainTextParser, CoNLLParser, JSONParser, CSVParser +from ..utils import PlainTextParser, CoNLLParser, JSONParser, CSVParser, FastTextParser from ..exceptions import FileParseException DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') @@ -1436,6 +1436,17 @@ class TestParser(APITestCase): def test_give_data_without_label_to_json_parser(self): self.parser_helper(filename='example.jsonl', parser=JSONParser(), include_label=False) + def test_give_labeling_data_to_fasttext_parser(self): + self.parser_helper(filename='example_fasttext.txt', parser=FastTextParser()) + 
class FastTextParser(FileParser):
    """
    Parse files in fastText format.

    Each non-blank line describes one document. Whitespace-separated
    tokens that start with the ``__label__`` prefix are labels; every
    other token belongs to the text. Labels may appear anywhere in the
    line, and a line without any label is accepted.
    For example:
    ```
    __label__dog poodle
    __label__house mansion
    ```
    """

    LABEL_PREFIX = '__label__'

    def parse(self, file):
        """
        Lazily parse ``file`` and yield its documents in batches.

        Args:
            file: The uploaded file object. It is wrapped in ``EncodedIO``
                and decoded using the encoding that wrapper exposes.

        Yields:
            list[dict]: Batches of ``{'text': str, 'labels': list[str]}``
            items. A batch is emitted once it has reached
            ``settings.IMPORT_BATCH_SIZE`` items; a final, possibly
            smaller, batch follows if anything is left over.

        Raises:
            FileParseException: If a line contains a bare ``__label__``
                token (prefix without a label name) or consists of
                labels only, with no text.
        """
        file = EncodedIO(file)
        file = io.TextIOWrapper(file, encoding=file.encoding)
        prefix = self.LABEL_PREFIX
        data = []
        # Count from 1 so the reported line number matches what a text
        # editor shows for the offending line.
        for line_num, line in enumerate(file, start=1):
            if len(data) >= settings.IMPORT_BATCH_SIZE:
                yield data
                data = []

            # split() without arguments treats any run of whitespace
            # (spaces, tabs, line endings) as a single separator and never
            # produces empty tokens. Splitting on a single space instead
            # would fuse tab-separated labels and let empty strings pass
            # the "text is present" check below.
            tokens = line.split()
            if not tokens:
                # Blank line (e.g. a trailing newline at the end of the
                # file): there is nothing to import, so skip it rather
                # than create an empty document.
                continue

            labels = []
            words = []
            for token in tokens:
                if not token.startswith(prefix):
                    words.append(token)
                elif token == prefix:
                    # Label marker without a label name.
                    raise FileParseException(line_num=line_num, line=line)
                else:
                    labels.append(token[len(prefix):])

            # Labels without any accompanying text are not a document.
            if not words:
                raise FileParseException(line_num=line_num, line=line)

            data.append({'text': ' '.join(words), 'labels': labels})

        if data:
            yield data
IsProjectAdmin, IsAnnotatorAndReadOnly, IsAnnotator, IsAnnotationApproverAndReadOnly, IsOwnAnnotation, IsAnnotationApprover from .serializers import ProjectSerializer, LabelSerializer, DocumentSerializer, UserSerializer, ApproverSerializer from .serializers import ProjectPolymorphicSerializer, RoleMappingSerializer, RoleSerializer -from .utils import CSVParser, ExcelParser, JSONParser, PlainTextParser, CoNLLParser, AudioParser, iterable_to_io +from .utils import CSVParser, ExcelParser, JSONParser, PlainTextParser, CoNLLParser, AudioParser, FastTextParser, iterable_to_io from .utils import JSONLRenderer from .utils import JSONPainter, CSVPainter @@ -299,6 +299,8 @@ class TextUploadAPI(APIView): return ExcelParser() elif file_format == 'audio': return AudioParser() + elif file_format == 'fastText': + return FastTextParser() else: raise ValidationError('format {} is invalid.'.format(file_format)) diff --git a/frontend/store/projects.js b/frontend/store/projects.js index be6fe2c6..be2508a6 100644 --- a/frontend/store/projects.js +++ b/frontend/store/projects.js @@ -78,6 +78,11 @@ export const getters = { text: 'Excel', accept: '.xlsx' } + const fastText = { + type: 'fastText', + text: 'FastText', + accept: '.txt' + } if (state.current.project_type === 'DocumentClassification') { json.examples = [ '{"text": "Terrible customer service.", "labels": ["negative"]}\n', @@ -96,11 +101,16 @@ export const getters = { '"Really great transaction.","positive"\n', '"Great price.","positive"' ] + fastText.examples = [ + '__label__[label name] text \n', + '__label__president Obama Trump' + ] return [ plain, csv, json, - excel + excel, + fastText ] } else if (state.current.project_type === 'SequenceLabeling') { json.examples = [