Browse Source

Merge pull request #1073 from SwiftPredator/feature/fasttext_import

FastText import functionality
pull/1156/head
Hiroki Nakayama 4 years ago
committed by GitHub
parent
commit
70b2c6f0e7
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 75 additions and 3 deletions
  1. 4
      app/api/tests/data/example_fasttext.txt
  2. 1
      app/api/tests/data/example_fasttext_label_tag_without_name.txt
  3. 2
      app/api/tests/data/example_fasttext_without_text.txt
  4. 13
      app/api/tests/test_api.py
  5. 42
      app/api/utils.py
  6. 4
      app/api/views.py
  7. 12
      frontend/store/projects.js

4
app/api/tests/data/example_fasttext.txt

@ -0,0 +1,4 @@
__label__house mansion home
__label__president __label__american __label__us Obama Trump Kennedy
VW __label__car BMW
dog cat

1
app/api/tests/data/example_fasttext_label_tag_without_name.txt

@ -0,0 +1 @@
__label__ house cat dog

2
app/api/tests/data/example_fasttext_without_text.txt

@ -0,0 +1,2 @@
__label__cat ex ex ex
__label__dog

13
app/api/tests/test_api.py

@ -9,7 +9,7 @@ from model_mommy import mommy
from ..models import User, SequenceAnnotation, Document, Role, RoleMapping
from ..models import DOCUMENT_CLASSIFICATION, SEQUENCE_LABELING, SEQ2SEQ, SPEECH2TEXT
from ..utils import PlainTextParser, CoNLLParser, JSONParser, CSVParser
from ..utils import PlainTextParser, CoNLLParser, JSONParser, CSVParser, FastTextParser
from ..exceptions import FileParseException
DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
@ -1436,6 +1436,17 @@ class TestParser(APITestCase):
def test_give_data_without_label_to_json_parser(self):
    """Run parser_helper on example.jsonl with a JSONParser and include_label=False."""
    helper_kwargs = {
        'filename': 'example.jsonl',
        'parser': JSONParser(),
        'include_label': False,
    }
    self.parser_helper(**helper_kwargs)
def test_give_labeling_data_to_fasttext_parser(self):
    """Run parser_helper on the well-formed fastText fixture with a FastTextParser."""
    helper_kwargs = {
        'filename': 'example_fasttext.txt',
        'parser': FastTextParser(),
    }
    self.parser_helper(**helper_kwargs)
def test_give_data_without_label_name_to_fasttext_parser(self):
    """A bare `__label__` token (no label name) must raise FileParseException."""
    fixture = 'example_fasttext_label_tag_without_name.txt'
    # The parser is built inside the context manager, as before, so that any
    # exception from construction is covered by the same assertion.
    with self.assertRaises(FileParseException):
        self.parser_helper(filename=fixture, parser=FastTextParser())
def test_give_data_without_text_to_fasttext_parser(self):
    """A line that carries labels but no text must raise FileParseException."""
    fixture = 'example_fasttext_without_text.txt'
    # The parser is built inside the context manager, as before, so that any
    # exception from construction is covered by the same assertion.
    with self.assertRaises(FileParseException):
        self.parser_helper(filename=fixture, parser=FastTextParser())
class TestDownloader(APITestCase):

42
app/api/utils.py

@ -423,6 +423,48 @@ class JSONParser(FileParser):
yield data
class FastTextParser(FileParser):
    """
    Parse files in fastText format.

    Each line is one example. Tokens are separated by single spaces; every
    token that starts with the ``__label__`` prefix contributes a label
    (the part after the prefix) and all remaining tokens, in their original
    order, form the text. Label tokens may appear anywhere in the line.

    For example:
    ```
    __label__dog poodle
    __label__house mansion
    ```
    """

    # Marker that introduces a label token in the fastText format.
    LABEL_PREFIX = '__label__'

    def parse(self, file):
        """Yield lists of ``{'text': str, 'labels': list[str]}`` dicts.

        The input is wrapped in ``EncodedIO`` and decoded with the encoding
        that wrapper reports. A batch is yielded once it has reached
        ``settings.IMPORT_BATCH_SIZE`` entries; a final partial batch is
        yielded at end of input.

        Lines that contain only whitespace are skipped.

        Raises:
            FileParseException: if a line contains a bare ``__label__``
                token (no label name) or has no text besides its labels.
                ``line_num`` is 1-based.
        """
        file = EncodedIO(file)
        file = io.TextIOWrapper(file, encoding=file.encoding)
        prefix = self.LABEL_PREFIX
        prefix_len = len(prefix)
        data = []
        # Count from 1 so the reported line number matches what a text
        # editor shows for the offending line.
        for i, line in enumerate(file, start=1):
            if len(data) >= settings.IMPORT_BATCH_SIZE:
                yield data
                data = []

            stripped = line.rstrip()
            if not stripped:
                # ''.split(' ') returns [''], which would otherwise slip past
                # the missing-text check below and create an empty document.
                continue

            # Separate label tokens from text tokens and validate the labels.
            labels = []
            text = []
            for token in stripped.split(" "):
                if token.startswith(prefix):
                    if token == prefix:
                        raise FileParseException(line_num=i, line=line)
                    labels.append(token[prefix_len:])
                else:
                    text.append(token)

            # Leading or repeated spaces yield empty-string tokens, so test
            # the joined text rather than the token list for emptiness.
            content = " ".join(text)
            if not content.strip():
                raise FileParseException(line_num=i, line=line)

            data.append({'text': content, 'labels': labels})

        if data:
            yield data
class AudioParser(FileParser):
def parse(self, file):
file_type, _ = mimetypes.guess_type(file.name, strict=False)

4
app/api/views.py

@ -24,7 +24,7 @@ from .models import Project, Label, Document, RoleMapping, Role
from .permissions import IsProjectAdmin, IsAnnotatorAndReadOnly, IsAnnotator, IsAnnotationApproverAndReadOnly, IsOwnAnnotation, IsAnnotationApprover
from .serializers import ProjectSerializer, LabelSerializer, DocumentSerializer, UserSerializer, ApproverSerializer
from .serializers import ProjectPolymorphicSerializer, RoleMappingSerializer, RoleSerializer
from .utils import CSVParser, ExcelParser, JSONParser, PlainTextParser, CoNLLParser, AudioParser, iterable_to_io
from .utils import CSVParser, ExcelParser, JSONParser, PlainTextParser, CoNLLParser, AudioParser, FastTextParser, iterable_to_io
from .utils import JSONLRenderer
from .utils import JSONPainter, CSVPainter
@ -299,6 +299,8 @@ class TextUploadAPI(APIView):
return ExcelParser()
elif file_format == 'audio':
return AudioParser()
elif file_format == 'fastText':
return FastTextParser()
else:
raise ValidationError('format {} is invalid.'.format(file_format))

12
frontend/store/projects.js

@ -78,6 +78,11 @@ export const getters = {
text: 'Excel',
accept: '.xlsx'
}
const fastText = {
type: 'fastText',
text: 'FastText',
accept: '.txt'
}
if (state.current.project_type === 'DocumentClassification') {
json.examples = [
'{"text": "Terrible customer service.", "labels": ["negative"]}\n',
@ -96,11 +101,16 @@ export const getters = {
'"Really great transaction.","positive"\n',
'"Great price.","positive"'
]
// Example lines for the fastText import format. A label token needs the
// full `__label__` prefix (two underscores on each side of "label").
fastText.examples = [
  '__label__[label name] text \n',
  '__label__president Obama Trump'
]
return [
plain,
csv,
json,
excel
excel,
fastText
]
} else if (state.current.project_type === 'SequenceLabeling') {
json.examples = [

Loading…
Cancel
Save