You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

446 lines
14 KiB

import csv
import io
import itertools
import json
import re
from collections import defaultdict
from django.db import transaction
from rest_framework.renderers import JSONRenderer
from seqeval.metrics.sequence_labeling import get_entities
from app.settings import IMPORT_BATCH_SIZE
from .exceptions import FileParseException
from .models import Label
from .serializers import DocumentSerializer, LabelSerializer
def extract_label(tag):
    """Strip the positional prefix from a sequence-labeling tag.

    A tag that starts with ``B-``, ``I-``, ``E-`` or ``S-`` followed by at
    least one character is reduced to the part after the dash; any other
    tag is returned unchanged.

    Example:
        >>> extract_label('B-ORG')
        'ORG'
        >>> extract_label('O')
        'O'
    """
    matched = re.match(r'(B|I|E|S)-(.+)', tag)
    if matched is None:
        return tag
    return matched.group(2)
class BaseStorage:
    """Shared persistence helpers for the project-specific storages.

    An instance keeps the parsed upload (``data``) and the ``project`` it
    belongs to. Subclasses implement ``save``.
    """

    def __init__(self, data, project):
        self.project = project
        self.data = data

    @transaction.atomic
    def save(self, user):
        """Persist ``self.data`` for ``user``. Must be overridden."""
        raise NotImplementedError()

    def save_doc(self, data):
        """Validate ``data`` with ``DocumentSerializer`` (``many=True``) and
        save it under ``self.project``.

        Invalid data raises, because ``is_valid`` is called with
        ``raise_exception=True``.
        """
        doc_serializer = DocumentSerializer(data=data, many=True)
        doc_serializer.is_valid(raise_exception=True)
        return doc_serializer.save(project=self.project)

    def save_label(self, data):
        """Validate ``data`` with ``LabelSerializer`` (``many=True``) and
        save it under ``self.project``.
        """
        label_serializer = LabelSerializer(data=data, many=True)
        label_serializer.is_valid(raise_exception=True)
        return label_serializer.save(project=self.project)

    def save_annotation(self, data, user):
        """Validate ``data`` with the project's annotation serializer and
        save it with ``user`` as the owner.
        """
        serializer_class = self.project.get_annotation_serializer()
        annotation_serializer = serializer_class(data=data, many=True)
        annotation_serializer.is_valid(raise_exception=True)
        return annotation_serializer.save(user=user)

    def extract_label(self, data):
        """Collect the ``labels`` entry of every parsed item.

        Items without a ``labels`` key contribute an empty list.

        Example:
            >>> data = [{"labels": ["positive"]}, {"labels": ["negative"]}]
            >>> self.extract_label(data)
            [["positive"], ["negative"]]
        """
        collected = []
        for item in data:
            collected.append(item.get('labels', []))
        return collected

    def exclude_created_labels(self, labels, created):
        """Return the labels that are not yet present in ``created``.

        Example:
            >>> labels = ["positive", "negative"]
            >>> created = {"positive": ...}
            >>> self.exclude_created_labels(labels, created)
            ["negative"]
        """
        missing = []
        for name in labels:
            if name in created:
                continue
            missing.append(name)
        return missing

    def to_serializer_format(self, labels):
        """Wrap each label name in the dict shape passed to ``save_label``.

        Example:
            >>> labels = ["positive"]
            >>> self.to_serializer_format(labels)
            [{"text": "positive"}]
        """
        return [dict(text=name) for name in labels]

    def update_saved_labels(self, saved, new):
        """Register freshly created labels in ``saved`` (keyed by their text).

        ``saved`` is updated in place and also returned.

        Example:
            >>> saved = {'positive': ...}
            >>> new = [<Label: positive>]
        """
        saved.update((label.text, label) for label in new)
        return saved
class PlainStorage(BaseStorage):
    """Storage that saves documents only; no labels or annotations are created."""

    @transaction.atomic
    def save(self, user):
        """Pass every item of ``self.data`` to ``save_doc``.

        ``user`` is accepted for interface compatibility and is not used.
        """
        for docs in self.data:
            self.save_doc(docs)
class ClassificationStorage(BaseStorage):
    """Store json for text classification.

    The format is as follows:
    {"text": "Python is awesome!", "labels": ["positive"]}
    ...
    """

    @transaction.atomic
    def save(self, user):
        """Save documents, create any labels not yet in the project, and
        attach the annotations to ``user``.
        """
        known_labels = {}
        for existing in self.project.labels.all():
            known_labels[existing.text] = existing

        for batch in self.data:
            docs = self.save_doc(batch)
            labels = self.extract_label(batch)

            # Only label names the project does not have yet are created.
            pending = self.extract_unique_labels(labels)
            pending = self.exclude_created_labels(pending, known_labels)
            created = self.save_label(self.to_serializer_format(pending))
            known_labels = self.update_saved_labels(known_labels, created)

            annotations = self.make_annotations(docs, labels, known_labels)
            self.save_annotation(annotations, user)

    def extract_unique_labels(self, labels):
        """Return the set of distinct label names across all documents.

        Example:
            >>> labels = [["positive"], ["positive", "negative"], ["negative"]]
            >>> self.extract_unique_labels(labels)
            {"positive", "negative"}
        """
        return set(itertools.chain.from_iterable(labels))

    def make_annotations(self, docs, labels, saved_labels):
        """Build the list of annotation dicts handed to the serializer.

        ``docs`` and ``labels`` are paired positionally; each label name is
        resolved through ``saved_labels``.

        Example:
            >>> docs = ["<Document: a>", "<Document: b>", "<Document: c>"]
            >>> labels = [["positive"], ["positive", "negative"], ["negative"]]
            >>> saved_labels = {"positive": "<Label: positive>", 'negative': "<Label: negative>"}
            >>> self.make_annotations(docs, labels, saved_labels)
            [{"document": 1, "label": 1}, {"document": 2, "label": 1},
             {"document": 2, "label": 2}, {"document": 3, "label": 2}]
        """
        annotations = []
        for doc, names in zip(docs, labels):
            for name in names:
                saved = saved_labels[name]
                annotations.append({'document': doc.id, 'label': saved.id})
        return annotations
class SequenceLabelingStorage(BaseStorage):
    """Upload jsonl for sequence labeling.

    The format is as follows:
    {"text": "Python is awesome!", "labels": [[0, 6, "Product"],]}
    ...
    """

    @transaction.atomic
    def save(self, user):
        """Save documents, create any labels not yet in the project, and
        attach the span annotations to ``user``.
        """
        known_labels = {}
        for existing in self.project.labels.all():
            known_labels[existing.text] = existing

        for batch in self.data:
            docs = self.save_doc(batch)
            labels = self.extract_label(batch)

            # Only label names the project does not have yet are created.
            pending = self.extract_unique_labels(labels)
            pending = self.exclude_created_labels(pending, known_labels)
            created = self.save_label(self.to_serializer_format(pending))
            known_labels = self.update_saved_labels(known_labels, created)

            annotations = self.make_annotations(docs, labels, known_labels)
            self.save_annotation(annotations, user)

    def extract_unique_labels(self, labels):
        """Return the set of distinct label names used by any span.

        Example:
            >>> labels = [[[0, 1, "LOC"]], [[3, 4, "ORG"]]]
            >>> self.extract_unique_labels(labels)
            {"LOC", "ORG"}
        """
        every_span = itertools.chain.from_iterable(labels)
        return {name for _, _, name in every_span}

    def make_annotations(self, docs, labels, saved_labels):
        """Build the list of span annotation dicts handed to the serializer.

        ``docs`` and ``labels`` are paired positionally; each span is a
        ``[start_offset, end_offset, label_name]`` triple.

        Example:
            >>> docs = ["<Document: a>", "<Document: b>"]
            >>> labels = [[[0, 1, "LOC"]], [[3, 4, "ORG"]]]
            >>> saved_labels = {"LOC": "<Label: LOC>", 'ORG': "<Label: ORG>"}
            >>> self.make_annotations(docs, labels, saved_labels)
            [
                {"document": 1, "label": 1, "start_offset": 0, "end_offset": 1},
                {"document": 2, "label": 2, "start_offset": 3, "end_offset": 4}
            ]
        """
        annotations = []
        for doc, spans in zip(docs, labels):
            for start_offset, end_offset, name in spans:
                saved = saved_labels[name]
                annotations.append({
                    'document': doc.id,
                    'label': saved.id,
                    'start_offset': start_offset,
                    'end_offset': end_offset,
                })
        return annotations
class Seq2seqStorage(BaseStorage):
    """Store json for seq2seq.

    The format is as follows:
    {"text": "Hello, World!", "labels": ["こんにちは、世界!"]}
    ...
    """

    @transaction.atomic
    def save(self, user):
        """Save documents and their target texts as annotations of ``user``."""
        for batch in self.data:
            docs = self.save_doc(batch)
            targets = self.extract_label(batch)
            self.save_annotation(self.make_annotations(docs, targets), user)

    def make_annotations(self, docs, labels):
        """Build the list of text annotation dicts handed to the serializer.

        ``docs`` and ``labels`` are paired positionally; every text of a
        document produces one annotation.

        Example:
            >>> docs = ["<Document: a>", "<Document: b>"]
            >>> labels = [["Hello!"], ["How are you?", "What's up?"]]
            >>> self.make_annotations(docs, labels)
            [{"document": 1, "text": "Hello!"}, {"document": 2, "text": "How are you?"},
             {"document": 2, "text": "What's up?"}]
        """
        return [
            {'document': doc.id, 'text': text}
            for doc, texts in zip(docs, labels)
            for text in texts
        ]
class FileParser:
    """Base class of the upload parsers; subclasses override ``parse``."""

    def parse(self, file):
        """Parse ``file``. Always raises ``NotImplementedError`` here."""
        raise NotImplementedError()
class CoNLLParser(FileParser):
    """Uploads CoNLL format file.

    The file format is tab-separated values (one ``word<TAB>tag`` pair per
    line). A blank line is required at the end of a sentence.
    For example:
    ```
    EU      B-ORG
    rejects O
    German  B-MISC
    call    O
    to      O
    boycott O
    British B-MISC
    lamb    O
    .       O

    Peter   B-PER
    Blackburn       I-PER
    ...
    ```
    """

    def parse(self, file):
        """Yield batches of sentences parsed from a CoNLL file.

        Every sentence becomes a dict of the form
        ``{"text": "Python is awesome!", "labels": [[0, 6, "Product"]]}``.
        A batch is flushed once it holds ``IMPORT_BATCH_SIZE`` sentences;
        the remainder is yielded at the end. Empty batches are never
        yielded.

        Args:
            file: iterable of ``bytes`` lines, decoded as UTF-8.

        Raises:
            FileParseException: a non-blank line does not split into
                exactly two tab-separated fields.
        """
        words, tags = [], []
        data = []
        for i, line in enumerate(file, start=1):
            if len(data) >= IMPORT_BATCH_SIZE:
                yield data
                data = []
            line = line.decode('utf-8').strip()
            if line:
                try:
                    word, tag = line.split('\t')
                except ValueError:
                    raise FileParseException(line_num=i, line=line)
                words.append(word)
                tags.append(tag)
            elif words:
                # A blank line closes the current sentence. Leading or
                # repeated blank lines must not create empty documents.
                data.append(self.calc_char_offset(words, tags))
                words, tags = [], []
        # The last sentence may not be followed by a blank line.
        if words:
            data.append(self.calc_char_offset(words, tags))
        if data:
            yield data

    def calc_char_offset(self, words, tags):
        """Join ``words`` into a text and convert tag spans to char offsets.

        Character offsets are derived from the word positions rather than
        by searching for the entity text, so an entity whose text also
        appears inside an earlier word is still located correctly.

        Examples:
            >>> words = ['EU', 'rejects', 'German', 'call']
            >>> tags = ['B-ORG', 'O', 'B-MISC', 'O']
            >>> entities = get_entities(tags)
            >>> entities
            [['ORG', 0, 0], ['MISC', 2, 2]]
            >>> self.calc_char_offset(words, tags)
            {
                'text': 'EU rejects German call',
                'labels': [[0, 2, 'ORG'], [11, 17, 'MISC']]
            }
        """
        # Start offset of every word inside ' '.join(words).
        word_starts = []
        cursor = 0
        for word in words:
            word_starts.append(cursor)
            cursor += len(word) + 1  # +1 for the joining space

        labels = []
        # get_entities yields (label, first word index, last word index),
        # with the last index inclusive.
        for label, first, last in get_entities(tags):
            char_left = word_starts[first]
            char_right = word_starts[last] + len(words[last])
            labels.append([char_left, char_right, label])
        return {'text': ' '.join(words), 'labels': labels}
class PlainTextParser(FileParser):
    """Uploads plain text.

    The file format is as follows:
    ```
    EU rejects German call to boycott British lamb.
    President Obama is speaking at the White House.
    ...
    ```
    """

    def parse(self, file):
        """Yield batches of ``{'text': line}`` dicts, one per input line.

        Each batch holds up to ``IMPORT_BATCH_SIZE`` lines; every line is
        stripped of surrounding whitespace.

        Args:
            file: binary file object, decoded as UTF-8.
        """
        file = io.TextIOWrapper(file, encoding='utf-8')
        while True:
            batch = list(itertools.islice(file, IMPORT_BATCH_SIZE))
            if not batch:
                # End the generator with ``return``: since PEP 479 a
                # ``raise StopIteration`` in a generator body is turned
                # into RuntimeError.
                return
            yield [{'text': line.strip()} for line in batch]
class CSVParser(FileParser):
    """Uploads csv file.

    The file format is comma separated values.
    Column names are required at the top of a file.
    The first column is the text and the second the label; any further
    columns are stored as metadata keyed by their column name.
    For example:
    ```
    text,label
    "EU rejects German call to boycott British lamb.",Politics
    "President Obama is speaking at the White House.",Politics
    "He lives in Newark, Ohio.",Other
    ...
    ```
    """

    def parse(self, file):
        """Yield batches of ``{'text', 'labels', 'meta'}`` dicts.

        A batch is flushed once it holds ``IMPORT_BATCH_SIZE`` rows; the
        remainder is yielded at the end. An empty file yields nothing.

        Args:
            file: binary file object, decoded as UTF-8.

        Raises:
            FileParseException: a row has fewer than two fields or a
                different number of fields than the header row.
        """
        file = io.TextIOWrapper(file, encoding='utf-8')
        reader = csv.reader(file)
        # A bare next() on an empty file would raise StopIteration inside
        # this generator, which PEP 479 turns into RuntimeError.
        columns = next(reader, None)
        if columns is None:
            return
        data = []
        # Rows are numbered from 2 because the header occupies row 1.
        for i, row in enumerate(reader, start=2):
            if len(data) >= IMPORT_BATCH_SIZE:
                yield data
                data = []
            if len(row) != len(columns) or len(row) < 2:
                raise FileParseException(line_num=i, line=row)
            text, label = row[:2]
            meta = json.dumps(dict(zip(columns[2:], row[2:])))
            data.append({'text': text, 'labels': [label], 'meta': meta})
        if data:
            yield data
class JSONParser(FileParser):
    """Uploads a JSON Lines file: one JSON object per line."""

    def parse(self, file):
        """Yield batches of the JSON objects read from ``file``.

        The ``meta`` entry of every object (``{}`` when absent) is
        re-serialized to a JSON string. A batch is flushed once it holds
        ``IMPORT_BATCH_SIZE`` objects; the remainder is yielded at the end.

        Args:
            file: iterable of lines, each holding one JSON document.

        Raises:
            FileParseException: a line is not valid JSON, or is valid JSON
                but not an object.
        """
        data = []
        for i, line in enumerate(file, start=1):
            if len(data) >= IMPORT_BATCH_SIZE:
                yield data
                data = []
            try:
                j = json.loads(line)
            except json.decoder.JSONDecodeError:
                raise FileParseException(line_num=i, line=line)
            # Lists, numbers, strings etc. are valid JSON but have no
            # ``meta`` to read; report them as parse errors instead of
            # failing with AttributeError/TypeError below.
            if not isinstance(j, dict):
                raise FileParseException(line_num=i, line=line)
            j['meta'] = json.dumps(j.get('meta', {}))
            data.append(j)
        if data:
            yield data
class JSONLRenderer(JSONRenderer):
    """Renderer that emits one JSON document per line."""

    def render(self, data, accepted_media_type=None, renderer_context=None):
        """
        Render `data` as JSON lines.

        This method is a generator: it yields one ``str`` per item, each
        terminated by a newline. A non-list `data` is treated as a single
        item; `None` produces no lines.
        """
        if data is None:
            # Inside a generator this only ends the iteration; nothing is
            # yielded to the consumer.
            return bytes()
        records = data if isinstance(data, list) else [data]
        for record in records:
            serialized = json.dumps(
                record,
                cls=self.encoder_class,
                ensure_ascii=self.ensure_ascii,
                allow_nan=not self.strict,
            )
            yield serialized + '\n'
class JSONPainter:
    """Prepare serialized documents for JSON export."""

    def paint(self, documents):
        """Serialize ``documents`` and tidy each record for export.

        ``meta`` is decoded from its JSON string, and the ``id``, ``prob``
        and ``document`` keys are removed from every annotation.
        """
        serialized = DocumentSerializer(documents, many=True).data
        painted = []
        for record in serialized:
            record['meta'] = json.loads(record['meta'])
            for annotation in record['annotations']:
                for key in ('id', 'prob', 'document'):
                    annotation.pop(key)
            painted.append(record)
        return painted
class CSVPainter(JSONPainter):
    """Prepare serialized documents for CSV export (one row per annotation)."""

    def paint(self, documents):
        """Flatten each document into one dict per annotation.

        The document fields (without ``annotations``) are merged with the
        fields of each annotation; on a key clash the annotation wins. A
        document without annotations contributes no rows.
        """
        rows = []
        for record in super().paint(documents):
            annotations = record.pop('annotations')
            rows.extend({**record, **annotation} for annotation in annotations)
        return rows