|
|
@ -1,15 +1,18 @@ |
|
|
|
import csv |
|
|
|
import io |
|
|
|
import itertools |
|
|
|
import json |
|
|
|
import re |
|
|
|
from collections import defaultdict |
|
|
|
|
|
|
|
from django.db import transaction |
|
|
|
from django.http import HttpResponse |
|
|
|
from rest_framework.exceptions import ValidationError |
|
|
|
from rest_framework.renderers import JSONRenderer |
|
|
|
from seqeval.metrics.sequence_labeling import get_entities |
|
|
|
|
|
|
|
from app.settings import IMPORT_BATCH_SIZE |
|
|
|
from .exceptions import FileParseException |
|
|
|
from .models import Label |
|
|
|
from .serializers import DocumentSerializer, LabelSerializer |
|
|
|
from .serializers import SequenceAnnotationSerializer, DocumentAnnotationSerializer, Seq2seqAnnotationSerializer |
|
|
|
|
|
|
|
|
|
|
|
def extract_label(tag): |
|
|
@ -21,44 +24,232 @@ def extract_label(tag): |
|
|
|
return tag |
|
|
|
|
|
|
|
|
|
|
|
class FileHandler(object): |
|
|
|
annotation_serializer = None |
|
|
|
class BaseStorage(object): |
|
|
|
|
|
|
|
def __init__(self, project): |
|
|
|
def __init__(self, data, project): |
|
|
|
self.data = data |
|
|
|
self.project = project |
|
|
|
|
|
|
|
@transaction.atomic |
|
|
|
def handle_uploaded_file(self, file, user): |
|
|
|
raise NotImplementedError() |
|
|
|
|
|
|
|
def parse(self, file): |
|
|
|
raise NotImplementedError() |
|
|
|
|
|
|
|
def render(self): |
|
|
|
def save(self, user): |
|
|
|
raise NotImplementedError() |
|
|
|
|
|
|
|
def save_doc(self, data): |
|
|
|
serializer = DocumentSerializer(data=data) |
|
|
|
serializer = DocumentSerializer(data=data, many=True) |
|
|
|
serializer.is_valid(raise_exception=True) |
|
|
|
doc = serializer.save(project=self.project) |
|
|
|
return doc |
|
|
|
|
|
|
|
def save_label(self, data): |
|
|
|
from .models import Label |
|
|
|
label = Label.objects.filter(project=self.project, **data).first() |
|
|
|
serializer = LabelSerializer(label, data=data) |
|
|
|
serializer = LabelSerializer(data=data, many=True) |
|
|
|
serializer.is_valid(raise_exception=True) |
|
|
|
label = serializer.save(project=self.project) |
|
|
|
return label |
|
|
|
|
|
|
|
def save_annotation(self, data, doc, user): |
|
|
|
serializer = self.annotation_serializer(data=data) |
|
|
|
def save_annotation(self, data, user): |
|
|
|
annotation_serializer = self.project.get_annotation_serializer() |
|
|
|
serializer = annotation_serializer(data=data, many=True) |
|
|
|
serializer.is_valid(raise_exception=True) |
|
|
|
annotation = serializer.save(document=doc, user=user) |
|
|
|
annotation = serializer.save(user=user) |
|
|
|
return annotation |
|
|
|
|
|
|
|
def extract_label(self, data): |
|
|
|
"""Extract labels from parsed data. |
|
|
|
|
|
|
|
Example: |
|
|
|
>>> data = [{"labels": ["positive"]}, {"labels": ["negative"]}] |
|
|
|
>>> self.extract_label(data) |
|
|
|
[["positive"], ["negative"]] |
|
|
|
""" |
|
|
|
return [d.get('labels', []) for d in data] |
|
|
|
|
|
|
|
def exclude_created_labels(self, labels, created): |
|
|
|
"""Exclude created labels. |
|
|
|
|
|
|
|
Example: |
|
|
|
>>> labels = ["positive", "negative"] |
|
|
|
>>> created = {"positive": ...} |
|
|
|
>>> self.exclude_created_labels(labels, created) |
|
|
|
["negative"] |
|
|
|
""" |
|
|
|
return [label for label in labels if label not in created] |
|
|
|
|
|
|
|
class CoNLLHandler(FileHandler): |
|
|
|
def to_serializer_format(self, labels): |
|
|
|
"""Exclude created labels. |
|
|
|
|
|
|
|
Example: |
|
|
|
>>> labels = ["positive"] |
|
|
|
>>> self.to_serializer_format(labels) |
|
|
|
[{"text": "negative"}] |
|
|
|
``` |
|
|
|
""" |
|
|
|
return [{'text': label} for label in labels] |
|
|
|
|
|
|
|
def update_saved_labels(self, saved, new): |
|
|
|
"""Update saved labels. |
|
|
|
|
|
|
|
Example: |
|
|
|
>>> saved = {'positive': ...} |
|
|
|
>>> new = [<Label: positive>] |
|
|
|
""" |
|
|
|
for label in new: |
|
|
|
saved[label.text] = label |
|
|
|
return saved |
|
|
|
|
|
|
|
|
|
|
|
class PlainStorage(BaseStorage):
    """Store plain-text documents; no labels or annotations are created."""

    @transaction.atomic
    def save(self, user):
        # self.data yields batches of document payloads produced by the parser.
        for batch in self.data:
            self.save_doc(batch)
|
|
|
|
|
|
|
|
|
|
|
class ClassificationStorage(BaseStorage):
    """Store json for text classification.

    The format is as follows:
    {"text": "Python is awesome!", "labels": ["positive"]}
    ...
    """
    @transaction.atomic
    def save(self, user):
        # Cache the project's labels once, keyed by text.
        saved_labels = {label.text: label for label in self.project.labels.all()}
        for batch in self.data:
            docs = self.save_doc(batch)
            label_lists = self.extract_label(batch)
            # Create only the labels the project does not know about yet.
            missing = self.exclude_created_labels(
                self.extract_unique_labels(label_lists), saved_labels)
            created = self.save_label(self.to_serializer_format(missing))
            saved_labels = self.update_saved_labels(saved_labels, created)
            self.save_annotation(
                self.make_annotations(docs, label_lists, saved_labels), user)

    def extract_unique_labels(self, labels):
        """Collect the distinct label names appearing in *labels*.

        Example:
            >>> labels = [["positive"], ["positive", "negative"], ["negative"]]
            >>> self.extract_unique_labels(labels)
            {"positive", "negative"}
        """
        return {name for sublist in labels for name in sublist}

    def make_annotations(self, docs, labels, saved_labels):
        """Build one annotation payload per (document, label) pair.

        Example:
            >>> docs = ["<Document: a>", "<Document: b>", "<Document: c>"]
            >>> labels = [["positive"], ["positive", "negative"], ["negative"]]
            >>> saved_labels = {"positive": "<Label: positive>", 'negative': "<Label: negative>"}
            >>> self.make_annotations(docs, labels, saved_labels)
            [{"document": 1, "label": 1}, {"document": 2, "label": 1}
             {"document": 2, "label": 2}, {"document": 3, "label": 2}]
        """
        return [
            {'document': doc.id, 'label': saved_labels[name].id}
            for doc, names in zip(docs, labels)
            for name in names
        ]
|
|
|
|
|
|
|
|
|
|
|
class SequenceLabelingStorage(BaseStorage):
    """Upload jsonl for sequence labeling.

    The format is as follows:
    {"text": "Python is awesome!", "labels": [[0, 6, "Product"],]}
    ...
    """
    @transaction.atomic
    def save(self, user):
        # Cache the project's labels once, keyed by text.
        saved_labels = {label.text: label for label in self.project.labels.all()}
        for batch in self.data:
            docs = self.save_doc(batch)
            span_lists = self.extract_label(batch)
            # Create only the labels the project does not know about yet.
            missing = self.exclude_created_labels(
                self.extract_unique_labels(span_lists), saved_labels)
            created = self.save_label(self.to_serializer_format(missing))
            saved_labels = self.update_saved_labels(saved_labels, created)
            self.save_annotation(
                self.make_annotations(docs, span_lists, saved_labels), user)

    def extract_unique_labels(self, labels):
        """Collect the distinct entity names used by the given spans.

        Example:
            >>> labels = [[[0, 1, "LOC"]], [[3, 4, "ORG"]]]
            >>> self.extract_unique_labels(labels)
            {"LOC", "ORG"}
        """
        return {span[2] for spans in labels for span in spans}

    def make_annotations(self, docs, labels, saved_labels):
        """Build one span-annotation payload per entity in each document.

        Example:
            >>> docs = ["<Document: a>", "<Document: b>"]
            >>> labels = [[[0, 1, "LOC"]], [[3, 4, "ORG"]]]
            >>> saved_labels = {"LOC": "<Label: LOC>", 'ORG': "<Label: ORG>"}
            >>> self.make_annotations(docs, labels, saved_labels)
            [
                {"document": 1, "label": 1, "start_offset": 0, "end_offset": 1}
                {"document": 2, "label": 2, "start_offset": 3, "end_offset": 4}
            ]
        """
        return [
            {'document': doc.id,
             'label': saved_labels[name].id,
             'start_offset': start,
             'end_offset': end}
            for doc, spans in zip(docs, labels)
            for start, end, name in spans
        ]
|
|
|
|
|
|
|
|
|
|
|
class Seq2seqStorage(BaseStorage):
    """Store json for seq2seq.

    The format is as follows:
    {"text": "Hello, World!", "labels": ["こんにちは、世界!"]}
    ...
    """
    @transaction.atomic
    def save(self, user):
        for batch in self.data:
            # save_doc returns the created documents for this batch.
            docs = self.save_doc(batch)
            response_lists = self.extract_label(batch)
            self.save_annotation(
                self.make_annotations(docs, response_lists), user)

    def make_annotations(self, docs, labels):
        """Build one text-annotation payload per response string.

        Example:
            >>> docs = ["<Document: a>", "<Document: b>"]
            >>> labels = [["Hello!"], ["How are you?", "What's up?"]]
            >>> self.make_annotations(docs, labels)
            [{"document": 1, "text": "Hello"}, {"document": 2, "text": "How are you?"}
             {"document": 2, "text": "What's up?"}]
        """
        return [
            {'document': doc.id, 'text': text}
            for doc, texts in zip(docs, labels)
            for text in texts
        ]
|
|
|
|
|
|
|
|
|
|
|
class FileParser(object):
    """Base interface for upload parsers.

    Subclasses parse an uploaded file object and yield serializer-ready
    records (the subclasses visible here yield them in batches).
    """

    def parse(self, file):
        """Parse *file*; must be overridden by subclasses.

        Raises:
            NotImplementedError: always, on the base class.
        """
        raise NotImplementedError()
|
|
|
|
|
|
|
|
|
|
|
class CoNLLParser(FileParser): |
|
|
|
"""Uploads CoNLL format file. |
|
|
|
|
|
|
|
The file format is tab-separated values. |
|
|
@ -80,26 +271,19 @@ class CoNLLHandler(FileHandler): |
|
|
|
... |
|
|
|
``` |
|
|
|
""" |
|
|
|
annotation_serializer = SequenceAnnotationSerializer |
|
|
|
|
|
|
|
@transaction.atomic |
|
|
|
def handle_uploaded_file(self, file, user): |
|
|
|
for words, tags in self.parse(file): |
|
|
|
start_offset = 0 |
|
|
|
sent = ' '.join(words) |
|
|
|
doc = self.save_doc({'text': sent}) |
|
|
|
for word, tag in zip(words, tags): |
|
|
|
label = extract_label(tag) |
|
|
|
label = self.save_label({'text': label}) |
|
|
|
data = {'start_offset': start_offset, |
|
|
|
'end_offset': start_offset + len(word), |
|
|
|
'label': label.id} |
|
|
|
start_offset += len(word) + 1 |
|
|
|
self.save_annotation(data, doc, user) |
|
|
|
|
|
|
|
def parse(self, file): |
|
|
|
"""Store json for seq2seq. |
|
|
|
|
|
|
|
Return format: |
|
|
|
{"text": "Python is awesome!", "labels": [[0, 6, "Product"],]} |
|
|
|
... |
|
|
|
""" |
|
|
|
words, tags = [], [] |
|
|
|
data = [] |
|
|
|
for i, line in enumerate(file, start=1): |
|
|
|
if len(data) >= IMPORT_BATCH_SIZE: |
|
|
|
yield data |
|
|
|
data = [] |
|
|
|
line = line.decode('utf-8') |
|
|
|
line = line.strip() |
|
|
|
if line: |
|
|
@ -110,16 +294,42 @@ class CoNLLHandler(FileHandler): |
|
|
|
words.append(word) |
|
|
|
tags.append(tag) |
|
|
|
else: |
|
|
|
yield words, tags |
|
|
|
j = self.calc_char_offset(words, tags) |
|
|
|
data.append(j) |
|
|
|
words, tags = [], [] |
|
|
|
if len(words) > 0: |
|
|
|
yield words, tags |
|
|
|
|
|
|
|
def render(self): |
|
|
|
raise ValidationError("This project type doesn't support CoNLL format.") |
|
|
|
|
|
|
|
|
|
|
|
class PlainTextHandler(FileHandler): |
|
|
|
j = self.calc_char_offset(words, tags) |
|
|
|
data.append(j) |
|
|
|
yield data |
|
|
|
|
|
|
|
def calc_char_offset(self, words, tags): |
|
|
|
""" |
|
|
|
Examples: |
|
|
|
>>> words = ['EU', 'rejects', 'German', 'call'] |
|
|
|
>>> tags = ['B-ORG', 'O', 'B-MISC', 'O'] |
|
|
|
>>> entities = get_entities(tags) |
|
|
|
>>> entities |
|
|
|
[['ORG', 0, 0], ['MISC', 2, 2]] |
|
|
|
>>> self.calc_char_offset(words, tags) |
|
|
|
{ |
|
|
|
'text': 'EU rejects German call', |
|
|
|
'labels': [[0, 2, 'ORG'], [11, 17, 'MISC']] |
|
|
|
} |
|
|
|
""" |
|
|
|
doc = ' '.join(words) |
|
|
|
j = {'text': ' '.join(words), 'labels': []} |
|
|
|
pos = defaultdict(int) |
|
|
|
for label, start_offset, end_offset in get_entities(tags): |
|
|
|
entity = ' '.join(words[start_offset: end_offset + 1]) |
|
|
|
char_left = doc.index(entity, pos[entity]) |
|
|
|
char_right = char_left + len(entity) |
|
|
|
span = [char_left, char_right, label] |
|
|
|
j['labels'].append(span) |
|
|
|
pos[entity] = char_right |
|
|
|
return j |
|
|
|
|
|
|
|
|
|
|
|
class PlainTextParser(FileParser): |
|
|
|
"""Uploads plain text. |
|
|
|
|
|
|
|
The file format is as follows: |
|
|
@ -129,21 +339,16 @@ class PlainTextHandler(FileHandler): |
|
|
|
... |
|
|
|
``` |
|
|
|
""" |
|
|
|
@transaction.atomic |
|
|
|
def handle_uploaded_file(self, file, user): |
|
|
|
for text in self.parse(file): |
|
|
|
self.save_doc({'text': text}) |
|
|
|
|
|
|
|
def parse(self, file): |
|
|
|
file = io.TextIOWrapper(file, encoding='utf-8') |
|
|
|
for i, line in enumerate(file, start=1): |
|
|
|
yield line.strip() |
|
|
|
|
|
|
|
def render(self): |
|
|
|
raise ValidationError("You cannot download plain text. Please specify csv or json.") |
|
|
|
while True: |
|
|
|
batch = list(itertools.islice(file, IMPORT_BATCH_SIZE)) |
|
|
|
if not batch: |
|
|
|
raise StopIteration |
|
|
|
yield [{'text': line.strip()} for line in batch] |
|
|
|
|
|
|
|
|
|
|
|
class CSVHandler(FileHandler): |
|
|
|
class CSVParser(FileParser): |
|
|
|
"""Uploads csv file. |
|
|
|
|
|
|
|
The file format is comma separated values. |
|
|
@ -161,163 +366,81 @@ class CSVHandler(FileHandler): |
|
|
|
file = io.TextIOWrapper(file, encoding='utf-8') |
|
|
|
reader = csv.reader(file) |
|
|
|
columns = next(reader) |
|
|
|
data = [] |
|
|
|
for i, row in enumerate(reader, start=2): |
|
|
|
if len(data) >= IMPORT_BATCH_SIZE: |
|
|
|
yield data |
|
|
|
data = [] |
|
|
|
if len(row) == len(columns) and len(row) >= 2: |
|
|
|
text, label = row[:2] |
|
|
|
meta = json.dumps(dict(zip(columns[2:], row[2:]))) |
|
|
|
data = {'text': text, 'meta': meta} |
|
|
|
yield data, label |
|
|
|
j = {'text': text, 'labels': [label], 'meta': meta} |
|
|
|
data.append(j) |
|
|
|
else: |
|
|
|
raise FileParseException(line_num=i, line=row) |
|
|
|
if data: |
|
|
|
yield data |
|
|
|
|
|
|
|
    def render(self):
        """Export the project's documents and annotations as a CSV attachment.

        Returns an HttpResponse whose body is a CSV with one row per
        annotation. ``make_row`` (a subclass hook) supplies the base
        columns; sorted meta keys are appended as extra columns.
        """
        queryset = self.project.documents.all()
        serializer = DocumentSerializer(queryset, many=True)
        # Filesystem-friendly filename derived from the project name.
        filename = '_'.join(self.project.name.lower().split())
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="{}.csv"'.format(filename)
        writer = csv.writer(response)
        columns = ['id', 'text', 'label', 'user']
        # Meta columns come from the first document (see get_meta_keys);
        # assumes all documents share the same meta schema — TODO confirm.
        meta_keys = self.get_meta_keys(serializer.data)
        columns.extend(meta_keys)
        writer.writerow(columns)
        for d in serializer.data:
            # `meta` is stored as a JSON string on the document.
            meta = json.loads(d['meta'])
            # One CSV row per annotation, not per document.
            for a in d['annotations']:
                row = self.make_row(d, a)
                row.extend([meta[k] for k in meta_keys])
                writer.writerow(row)
        return response
|
|
|
|
|
|
|
def get_meta_keys(self, data): |
|
|
|
if len(data): |
|
|
|
meta = json.loads(data[0]['meta']) |
|
|
|
return sorted(meta.keys()) |
|
|
|
else: |
|
|
|
return [] |
|
|
|
|
|
|
|
    def make_row(self, doc, annotation):
        """Build the base CSV columns for one annotation; subclass hook.

        Raises:
            NotImplementedError: always, on the base class.
        """
        raise NotImplementedError('Please implement in subclass.')
|
|
|
|
|
|
|
|
|
|
|
class CSVClassificationHandler(CSVHandler):
    """CSV upload/download for document classification projects."""

    annotation_serializer = DocumentAnnotationSerializer

    @transaction.atomic
    def handle_uploaded_file(self, file, user):
        # Each parsed row yields the document payload plus its label text.
        for payload, label_text in self.parse(file):
            doc = self.save_doc(payload)
            label = self.save_label({'text': label_text})
            self.save_annotation({'label': label.id}, doc, user)

    def make_row(self, doc, annotation):
        """Return the base columns: id, text, label id and annotator."""
        return [doc['id'], doc['text'], annotation['label'], annotation['user']]
|
|
|
|
|
|
|
|
|
|
|
class CSVSeq2seqHandler(CSVHandler):
    """CSV upload/download for seq2seq projects."""

    annotation_serializer = Seq2seqAnnotationSerializer

    @transaction.atomic
    def handle_uploaded_file(self, file, user):
        # Each parsed row yields the document payload plus its target text.
        for payload, target_text in self.parse(file):
            doc = self.save_doc(payload)
            self.save_annotation({'text': target_text}, doc, user)

    def make_row(self, doc, annotation):
        """Return the base columns: id, text, response text and annotator."""
        return [doc['id'], doc['text'], annotation['text'], annotation['user']]
|
|
|
class JSONParser(FileParser): |
|
|
|
|
|
|
|
|
|
|
|
class JsonHandler(FileHandler): |
|
|
|
"""Uploads jsonl file. |
|
|
|
|
|
|
|
The file format is as follows: |
|
|
|
``` |
|
|
|
{"text": "example1"} |
|
|
|
{"text": "example2"} |
|
|
|
... |
|
|
|
``` |
|
|
|
""" |
|
|
|
def parse(self, file): |
|
|
|
data = [] |
|
|
|
for i, line in enumerate(file, start=1): |
|
|
|
if len(data) >= IMPORT_BATCH_SIZE: |
|
|
|
yield data |
|
|
|
data = [] |
|
|
|
try: |
|
|
|
j = json.loads(line) |
|
|
|
j['meta'] = json.dumps(j.get('meta', {})) |
|
|
|
yield j |
|
|
|
data.append(j) |
|
|
|
except json.decoder.JSONDecodeError: |
|
|
|
raise FileParseException(line_num=i, line=line) |
|
|
|
if data: |
|
|
|
yield data |
|
|
|
|
|
|
|
    def render(self):
        """Export the project's documents as a JSON Lines attachment.

        Each line of the response body is one serialized document, with
        its `meta` field decoded from the stored JSON string.
        """
        queryset = self.project.documents.all()
        serializer = DocumentSerializer(queryset, many=True)
        # Filesystem-friendly filename derived from the project name.
        filename = '_'.join(self.project.name.lower().split())
        response = HttpResponse(content_type='application/json')
        response['Content-Disposition'] = 'attachment; filename="{}.jsonl"'.format(filename)
        for d in serializer.data:
            # `meta` is stored as a JSON string; expose it as an object.
            d['meta'] = json.loads(d['meta'])
            dump = json.dumps(d, ensure_ascii=False)
            response.write(dump + '\n')
        return response
|
|
|
|
|
|
|
class JSONLRenderer(JSONRenderer): |
|
|
|
|
|
|
|
class JsonClassificationHandler(JsonHandler): |
|
|
|
"""Upload jsonl for text classification. |
|
|
|
def render(self, data, accepted_media_type=None, renderer_context=None): |
|
|
|
""" |
|
|
|
Render `data` into JSON, returning a bytestring. |
|
|
|
""" |
|
|
|
if data is None: |
|
|
|
return bytes() |
|
|
|
|
|
|
|
The format is as follows: |
|
|
|
``` |
|
|
|
{"text": "Python is awesome!", "labels": ["positive"]} |
|
|
|
... |
|
|
|
``` |
|
|
|
""" |
|
|
|
annotation_serializer = DocumentAnnotationSerializer |
|
|
|
|
|
|
|
@transaction.atomic |
|
|
|
def handle_uploaded_file(self, file, user): |
|
|
|
for data in self.parse(file): |
|
|
|
doc = self.save_doc(data) |
|
|
|
for label in data.get('labels', []): |
|
|
|
label = self.save_label({'text': label}) |
|
|
|
self.save_annotation({'label': label.id}, doc, user) |
|
|
|
|
|
|
|
|
|
|
|
class JsonLabelingHandler(JsonHandler): |
|
|
|
"""Upload jsonl for sequence labeling. |
|
|
|
|
|
|
|
The format is as follows: |
|
|
|
``` |
|
|
|
{"text": "Python is awesome!", "labels": [[0, 6, "Product"],]} |
|
|
|
... |
|
|
|
``` |
|
|
|
""" |
|
|
|
annotation_serializer = SequenceAnnotationSerializer |
|
|
|
if not isinstance(data, list): |
|
|
|
data = [data] |
|
|
|
|
|
|
|
@transaction.atomic |
|
|
|
def handle_uploaded_file(self, file, user): |
|
|
|
for data in self.parse(file): |
|
|
|
doc = self.save_doc(data) |
|
|
|
for start_offset, end_offset, label in data.get('labels', []): |
|
|
|
label = self.save_label({'text': label}) |
|
|
|
data = {'label': label.id, |
|
|
|
'start_offset': start_offset, |
|
|
|
'end_offset': end_offset} |
|
|
|
self.save_annotation(data, doc, user) |
|
|
|
for d in data: |
|
|
|
yield json.dumps(d, |
|
|
|
cls=self.encoder_class, |
|
|
|
ensure_ascii=self.ensure_ascii, |
|
|
|
allow_nan=not self.strict) + '\n' |
|
|
|
|
|
|
|
|
|
|
|
class JsonSeq2seqHandler(JsonHandler): |
|
|
|
"""Upload jsonl for seq2seq. |
|
|
|
|
|
|
|
The format is as follows: |
|
|
|
``` |
|
|
|
{"text": "Hello, World!", "labels": ["こんにちは、世界!"]} |
|
|
|
... |
|
|
|
``` |
|
|
|
""" |
|
|
|
annotation_serializer = Seq2seqAnnotationSerializer |
|
|
|
class JSONPainter(object): |
|
|
|
|
|
|
|
@transaction.atomic |
|
|
|
def handle_uploaded_file(self, file, user): |
|
|
|
for data in self.parse(file): |
|
|
|
doc = self.save_doc(data) |
|
|
|
for label in data.get('labels', []): |
|
|
|
self.save_annotation({'text': label}, doc, user) |
|
|
|
    def paint(self, documents):
        """Serialize *documents* into export-ready dicts.

        Decodes each document's `meta` JSON string and strips the
        internal `id`, `prob` and `document` fields from every
        annotation before returning the list.
        """
        serializer = DocumentSerializer(documents, many=True)
        data = []
        for d in serializer.data:
            d['meta'] = json.loads(d['meta'])
            for a in d['annotations']:
                # Drop internal bookkeeping fields from the export.
                a.pop('id')
                a.pop('prob')
                a.pop('document')
            data.append(d)
        return data
|
|
|
|
|
|
|
|
|
|
|
class CSVPainter(JSONPainter):
    """Flatten painted documents so each annotation becomes one record."""

    def paint(self, documents):
        """Return one flat dict per (document, annotation) pair.

        Document fields and annotation fields are merged into a single
        dict; the nested 'annotations' list is removed.
        """
        rows = []
        for record in super().paint(documents):
            annotations = record.pop('annotations')
            rows.extend({**record, **a} for a in annotations)
        return rows