Browse Source

Auto-generate label shortkeys on corpus import

pull/199/head
Clemens Wolff 5 years ago
parent
commit
e37e16bfad
5 changed files with 71 additions and 20 deletions
  1. 2
      app/server/models.py
  2. 17
      app/server/serializers.py
  3. 1
      app/server/tests/data/classification.jsonl
  4. 11
      app/server/tests/test_api.py
  5. 60
      app/server/utils.py

2
app/server/models.py

@ -142,7 +142,7 @@ class Label(models.Model):
('shift', 'shift'),
('ctrl shift', 'ctrl shift')
)
SUFFIX_KEYS = (
SUFFIX_KEYS = tuple(
(c, c) for c in string.ascii_lowercase
)

17
app/server/serializers.py

@ -34,12 +34,17 @@ class LabelSerializer(serializers.ModelSerializer):
raise ValidationError('Shortcut key may not have a suffix key.')
# Don't allow saving a duplicate shortcut key when prefix_key is null.
context = self.context['request'].parser_context
project_id = context['kwargs'].get('project_id')
if Label.objects.filter(suffix_key=suffix_key,
prefix_key__isnull=True,
project=project_id).exists():
raise ValidationError('Duplicate key.')
try:
context = self.context['request'].parser_context
project_id = context['kwargs']['project_id']
except (AttributeError, KeyError):
pass # unit tests don't always have the correct context set up
else:
if Label.objects.filter(suffix_key=suffix_key,
prefix_key__isnull=True,
project=project_id).exists():
raise ValidationError('Duplicate key.')
return super().validate(attrs)
class Meta:

1
app/server/tests/data/classification.jsonl

@ -1,3 +1,4 @@
{"text": "example", "labels": ["positive"], "meta": {"wikiPageID": 1}}
{"text": "example", "labels": ["positive", "negative"], "meta": {"wikiPageID": 2}}
{"text": "example", "labels": ["negative"], "meta": {"wikiPageID": 3}}
{"text": "example", "labels": ["neutral"], "meta": {"wikiPageID": 4}}

11
app/server/tests/test_api.py

@ -747,8 +747,9 @@ class TestUploader(APITestCase):
expected_status=status.HTTP_201_CREATED)
self.label_test_helper(self.classification_labels_url, expected_labels=[
{'text': 'positive'},
{'text': 'negative'},
{'text': 'positive', 'suffix_key': 'p', 'prefix_key': None},
{'text': 'negative', 'suffix_key': 'n', 'prefix_key': None},
{'text': 'neutral', 'suffix_key': 'n', 'prefix_key': 'ctrl'},
])
def test_can_upload_labeling_jsonl(self):
@ -758,9 +759,9 @@ class TestUploader(APITestCase):
expected_status=status.HTTP_201_CREATED)
self.label_test_helper(self.labeling_labels_url, expected_labels=[
{'text': 'LOC'},
{'text': 'ORG'},
{'text': 'PER'},
{'text': 'LOC', 'suffix_key': 'l', 'prefix_key': None},
{'text': 'ORG', 'suffix_key': 'o', 'prefix_key': None},
{'text': 'PER', 'suffix_key': 'p', 'prefix_key': None},
])
def test_can_upload_seq2seq_jsonl(self):

60
app/server/utils.py

@ -74,16 +74,60 @@ class BaseStorage(object):
"""
return [label for label in labels if label not in created]
def to_serializer_format(self, labels):
"""Exclude created labels.
@classmethod
def to_serializer_format(cls, labels, created):
"""Convert a label to model dictionary.
Also assigns each label a shortkey that doesn't clash with existing
label shortkeys.
Example:
>>> labels = ["positive"]
>>> self.to_serializer_format(labels)
[{"text": "negative"}]
```
>>> created = {}
>>> BaseStorage.to_serializer_format(labels, created)
[{"text": "positive", "suffix_key": "p", "prefix_key": None}]
"""
existing_shortkeys = {(label.suffix_key, label.prefix_key)
for label in created.values()}
serializer_labels = []
for label in sorted(labels):
serializer_label = {'text': label}
shortkey = cls.get_shortkey(label, existing_shortkeys)
if shortkey:
serializer_label['suffix_key'] = shortkey[0]
serializer_label['prefix_key'] = shortkey[1]
existing_shortkeys.add(shortkey)
serializer_labels.append(serializer_label)
return serializer_labels
@classmethod
def get_shortkey(cls, label, existing_shortkeys):
"""Find the first non existing shortkey for the label.
Example without existing shortkey:
>>> BaseStorage.get_shortkey("positive", set())
("p", None)
Example with existing shortkey:
>>> BaseStorage.get_shortkey("positive", {("p", None)})
("p", "ctrl")
"""
return [{'text': label} for label in labels]
model_prefix_keys = [key for (key, _) in Label.PREFIX_KEYS]
prefix_keys = [None] + model_prefix_keys
model_suffix_keys = {key for (key, _) in Label.SUFFIX_KEYS}
suffix_keys = [key for key in label.lower() if key in model_suffix_keys]
for shortkey in itertools.product(suffix_keys, prefix_keys):
if shortkey not in existing_shortkeys:
return shortkey
return None
def update_saved_labels(self, saved, new):
"""Update saved labels.
@ -120,7 +164,7 @@ class ClassificationStorage(BaseStorage):
labels = self.extract_label(data)
unique_labels = self.extract_unique_labels(labels)
unique_labels = self.exclude_created_labels(unique_labels, saved_labels)
unique_labels = self.to_serializer_format(unique_labels)
unique_labels = self.to_serializer_format(unique_labels, saved_labels)
new_labels = self.save_label(unique_labels)
saved_labels = self.update_saved_labels(saved_labels, new_labels)
annotations = self.make_annotations(docs, labels, saved_labels)
@ -170,7 +214,7 @@ class SequenceLabelingStorage(BaseStorage):
labels = self.extract_label(data)
unique_labels = self.extract_unique_labels(labels)
unique_labels = self.exclude_created_labels(unique_labels, saved_labels)
unique_labels = self.to_serializer_format(unique_labels)
unique_labels = self.to_serializer_format(unique_labels, saved_labels)
new_labels = self.save_label(unique_labels)
saved_labels = self.update_saved_labels(saved_labels, new_labels)
annotations = self.make_annotations(docs, labels, saved_labels)

Loading…
Cancel
Save