Browse Source

FastText import: support label tags anywhere in a line

pull/1073/head
Paul 4 years ago
parent
commit
ba3b2ab045
5 changed files with 24 additions and 24 deletions
  1. 3
      app/api/tests/data/example_fasttext.txt
  2. 16
      app/api/tests/test_api.py
  3. 23
      app/api/utils.py
  4. 4
      app/api/views.py
  5. 2
      frontend/store/projects.js

3
app/api/tests/data/example_fasttext.txt

@@ -1,3 +1,4 @@
__label__house mansion home
__label__president __label__american __label__us Obama Trump Kennedy
dog cat
VW __label__car BMW
dog cat

16
app/api/tests/test_api.py

@@ -9,7 +9,7 @@ from model_mommy import mommy
from ..models import User, SequenceAnnotation, Document, Role, RoleMapping
from ..models import DOCUMENT_CLASSIFICATION, SEQUENCE_LABELING, SEQ2SEQ, SPEECH2TEXT
from ..utils import PlainTextParser, CoNLLParser, JSONParser, CSVParser, FasttextParser
from ..utils import PlainTextParser, CoNLLParser, JSONParser, CSVParser, FastTextParser
from ..exceptions import FileParseException
DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
@@ -1435,18 +1435,18 @@ class TestParser(APITestCase):
def test_give_data_without_label_to_json_parser(self):
self.parser_helper(filename='example.jsonl', parser=JSONParser(), include_label=False)
def test_give_labeling_data_to_fasttext_parser(self):
self.parser_helper(filename='example_fasttext.txt', parser=FasttextParser())
self.parser_helper(filename='example_fasttext.txt', parser=FastTextParser())
def test_give_data_without_label_name_to_fasttext_parser(self):
with self.assertRaises(FileParseException):
self.parser_helper(filename='example_fasttext_label_tag_without_name.txt', parser=FasttextParser())
self.parser_helper(filename='example_fasttext_label_tag_without_name.txt', parser=FastTextParser())
def test_give_data_without_text_to_fasttext_parser(self):
with self.assertRaises(FileParseException):
self.parser_helper(filename='example_fasttext_without_text.txt', parser=FasttextParser())
self.parser_helper(filename='example_fasttext_without_text.txt', parser=FastTextParser())
class TestDownloader(APITestCase):

23
app/api/utils.py

@@ -423,15 +423,15 @@ class JSONParser(FileParser):
yield data
class FasttextParser(FileParser):
class FastTextParser(FileParser):
"""
Parse files in fastText format.
Labels are marked with the __label__ prefix
Labels are marked with the __label__ prefix
and the corresponding text comes afterwards in the same line
For example:
```
__label__dog poodle
__label__house mansion
__label__house mansion
```
"""
def parse(self, file):
@@ -443,27 +443,26 @@ class FasttextParser(FileParser):
yield data
data = []
# Search Labels, check correct syntax and append
# Search labels and text, check correct syntax and append
labels = []
tokens = line.rstrip().split(" ")
for token in tokens:
text = []
for token in line.rstrip().split(" "):
if token.startswith('__label__'):
if token == '__label__':
raise FileParseException(line_num=i, line=line)
raise FileParseException(line_num=i, line=line)
labels.append(token[len('__label__'):])
else:
break
text.append(token)
# Check if text for labels is given
if len(tokens) == len(labels):
if not text:
raise FileParseException(line_num=i, line=line)
text = " ".join(tokens[len(labels):])
data.append({'text': text, 'labels': labels})
data.append({'text': " ".join(text), 'labels': labels})
if data:
yield data
class AudioParser(FileParser):

4
app/api/views.py

@@ -22,7 +22,7 @@ from .models import Project, Label, Document, RoleMapping, Role
from .permissions import IsProjectAdmin, IsAnnotatorAndReadOnly, IsAnnotator, IsAnnotationApproverAndReadOnly, IsOwnAnnotation, IsAnnotationApprover
from .serializers import ProjectSerializer, LabelSerializer, DocumentSerializer, UserSerializer, ApproverSerializer
from .serializers import ProjectPolymorphicSerializer, RoleMappingSerializer, RoleSerializer
from .utils import CSVParser, ExcelParser, JSONParser, PlainTextParser, CoNLLParser, AudioParser, FasttextParser, iterable_to_io
from .utils import CSVParser, ExcelParser, JSONParser, PlainTextParser, CoNLLParser, AudioParser, FastTextParser, iterable_to_io
from .utils import JSONLRenderer
from .utils import JSONPainter, CSVPainter
@@ -297,7 +297,7 @@ class TextUploadAPI(APIView):
elif file_format == 'audio':
return AudioParser()
elif file_format == 'fastText':
return FasttextParser()
return FastTextParser()
else:
raise ValidationError('format {} is invalid.'.format(file_format))

2
frontend/store/projects.js

@@ -80,7 +80,7 @@ export const getters = {
}
const fastText = {
type: 'fastText',
text: 'Fasttext',
text: 'FastText',
accept: '.txt'
}
if (state.current.project_type === 'DocumentClassification') {

Loading…
Cancel
Save