# doccano/app/server/utils.py

import csv
import io
import itertools
import json
import re
from collections import defaultdict

from django.db import transaction
from rest_framework.renderers import JSONRenderer
from seqeval.metrics.sequence_labeling import get_entities

from app.settings import IMPORT_BATCH_SIZE
from .exceptions import FileParseException
from .models import Label
from .serializers import DocumentSerializer, LabelSerializer


def extract_label(tag):
    """Strip a leading chunk prefix (``B-``, ``I-``, ``E-`` or ``S-``) from a tag.

    Tags that do not start with one of those prefixes followed by at least
    one character (for example ``"O"``) are returned unchanged.

    Example:
        >>> extract_label('B-ORG')
        'ORG'
        >>> extract_label('O')
        'O'
    """
    matched = re.match(r'(B|I|E|S)-(.+)', tag)
    return matched.group(2) if matched else tag


class BaseStorage(object):
    """Shared persistence helpers for the project-type specific storages.

    An instance holds the parsed upload (``data``) and the target
    ``project``. Subclasses override ``save`` and combine the helper
    methods below.
    """

    def __init__(self, data, project):
        self.project = project
        self.data = data

    @transaction.atomic
    def save(self, user):
        """Persist ``self.data``; must be overridden by subclasses."""
        raise NotImplementedError()

    def save_doc(self, data):
        """Validate ``data`` with ``DocumentSerializer`` and save it.

        The documents are saved with ``project=self.project``. Validation
        errors are raised (``raise_exception=True``). Returns whatever the
        serializer's ``save`` returns.
        """
        doc_serializer = DocumentSerializer(data=data, many=True)
        doc_serializer.is_valid(raise_exception=True)
        return doc_serializer.save(project=self.project)

    def save_label(self, data):
        """Validate ``data`` with ``LabelSerializer`` and save it.

        The labels are saved with ``project=self.project``. Validation
        errors are raised (``raise_exception=True``). Returns whatever the
        serializer's ``save`` returns.
        """
        label_serializer = LabelSerializer(data=data, many=True)
        label_serializer.is_valid(raise_exception=True)
        return label_serializer.save(project=self.project)

    def save_annotation(self, data, user):
        """Validate and save annotations on behalf of ``user``.

        The serializer class is obtained from
        ``self.project.get_annotation_serializer()``.
        """
        serializer_class = self.project.get_annotation_serializer()
        annotation_serializer = serializer_class(data=data, many=True)
        annotation_serializer.is_valid(raise_exception=True)
        return annotation_serializer.save(user=user)

    def extract_label(self, data):
        """Collect the ``labels`` entry of every parsed item.

        Items without a ``labels`` key contribute an empty list.

        Example:
            >>> data = [{"labels": ["positive"]}, {"labels": ["negative"]}]
            >>> self.extract_label(data)
            [["positive"], ["negative"]]
        """
        labels_per_item = []
        for item in data:
            labels_per_item.append(item.get('labels', []))
        return labels_per_item

    def exclude_created_labels(self, labels, created):
        """Return the labels that are not yet present in ``created``.

        Example:
            >>> labels = ["positive", "negative"]
            >>> created = {"positive": ...}
            >>> self.exclude_created_labels(labels, created)
            ["negative"]
        """
        remaining = []
        for name in labels:
            if name in created:
                continue
            remaining.append(name)
        return remaining

    def to_serializer_format(self, labels):
        """Wrap each label name in the dict shape passed to ``save_label``.

        Example:
            >>> labels = ["positive"]
            >>> self.to_serializer_format(labels)
            [{"text": "positive"}]
        """
        payload = []
        for name in labels:
            payload.append({'text': name})
        return payload

    def update_saved_labels(self, saved, new):
        """Register newly saved labels in ``saved``, keyed by their text.

        ``saved`` is mutated in place and also returned.

        Example:
            >>> saved = {'positive': ...}
            >>> new = [<Label: negative>]
            >>> self.update_saved_labels(saved, new)
            {'positive': ..., 'negative': <Label: negative>}
        """
        saved.update((fresh.text, fresh) for fresh in new)
        return saved


class PlainStorage(BaseStorage):
    """Store documents only, without labels or annotations."""

    @transaction.atomic
    def save(self, user):
        """Save every item of ``self.data`` via ``save_doc``.

        ``user`` is accepted for interface compatibility and is not used.
        """
        for batch in self.data:
            self.save_doc(batch)


class ClassificationStorage(BaseStorage):
    """Store json for text classification.

    Each parsed item is expected to look like:
    {"text": "Python is awesome!", "labels": ["positive"]}
    ...
    """
    @transaction.atomic
    def save(self, user):
        """Save documents, any labels not yet in the project, and annotations.

        Labels already attached to the project are looked up once and the
        lookup table is extended as new labels are created, so a label name
        is only created the first time it is seen.
        """
        known = {label.text: label for label in self.project.labels.all()}
        for batch in self.data:
            docs = self.save_doc(batch)
            labels = self.extract_label(batch)
            candidates = self.extract_unique_labels(labels)
            missing = self.exclude_created_labels(candidates, known)
            created = self.save_label(self.to_serializer_format(missing))
            known = self.update_saved_labels(known, created)
            self.save_annotation(self.make_annotations(docs, labels, known), user)

    def extract_unique_labels(self, labels):
        """Flatten the per-document label lists into a set of names.

        Example:
            >>> labels = [["positive"], ["positive", "negative"], ["negative"]]
            >>> self.extract_unique_labels(labels)
            {"positive", "negative"}
        """
        return {name for names in labels for name in names}

    def make_annotations(self, docs, labels, saved_labels):
        """Build the annotation dicts handed to the annotation serializer.

        ``docs`` and ``labels`` are paired positionally; one dict is
        produced per (document, label name) combination.

        Example:
            >>> docs = ["<Document: a>", "<Document: b>", "<Document: c>"]
            >>> labels = [["positive"], ["positive", "negative"], ["negative"]]
            >>> saved_labels = {"positive": "<Label: positive>", 'negative': "<Label: negative>"}
            >>> self.make_annotations(docs, labels, saved_labels)
            [{"document": 1, "label": 1}, {"document": 2, "label": 1},
             {"document": 2, "label": 2}, {"document": 3, "label": 2}]
        """
        annotations = []
        for doc, names in zip(docs, labels):
            for name in names:
                saved = saved_labels[name]
                annotations.append({'document': doc.id, 'label': saved.id})
        return annotations


class SequenceLabelingStorage(BaseStorage):
    """Upload jsonl for sequence labeling.

    Each parsed item is expected to look like:
    {"text": "Python is awesome!", "labels": [[0, 6, "Product"],]}
    ...
    """
    @transaction.atomic
    def save(self, user):
        """Save documents, any labels not yet in the project, and span annotations.

        Labels already attached to the project are looked up once and the
        lookup table is extended as new labels are created.
        """
        known = {label.text: label for label in self.project.labels.all()}
        for batch in self.data:
            docs = self.save_doc(batch)
            labels = self.extract_label(batch)
            candidates = self.extract_unique_labels(labels)
            missing = self.exclude_created_labels(candidates, known)
            created = self.save_label(self.to_serializer_format(missing))
            known = self.update_saved_labels(known, created)
            self.save_annotation(self.make_annotations(docs, labels, known), user)

    def extract_unique_labels(self, labels):
        """Collect the distinct label names from ``[start, end, name]`` spans.

        Example:
            >>> labels = [[[0, 1, "LOC"]], [[3, 4, "ORG"]]]
            >>> self.extract_unique_labels(labels)
            {"LOC", "ORG"}
        """
        return {name for spans in labels for _, _, name in spans}

    def make_annotations(self, docs, labels, saved_labels):
        """Build the span annotation dicts handed to the annotation serializer.

        ``docs`` and ``labels`` are paired positionally; one dict is
        produced per span.

        Example:
            >>> docs = ["<Document: a>", "<Document: b>"]
            >>> labels = [[[0, 1, "LOC"]], [[3, 4, "ORG"]]]
            >>> saved_labels = {"LOC": "<Label: LOC>", 'ORG': "<Label: ORG>"}
            >>> self.make_annotations(docs, labels, saved_labels)
            [
              {"document": 1, "label": 1, "start_offset": 0, "end_offset": 1},
              {"document": 2, "label": 2, "start_offset": 3, "end_offset": 4}
            ]
        """
        annotations = []
        for doc, spans in zip(docs, labels):
            for begin, end, name in spans:
                saved = saved_labels[name]
                annotations.append({
                    'document': doc.id,
                    'label': saved.id,
                    'start_offset': begin,
                    'end_offset': end,
                })
        return annotations


class Seq2seqStorage(BaseStorage):
    """Store json for seq2seq.

    Each parsed item is expected to look like:
    {"text": "Hello, World!", "labels": ["こんにちは、世界!"]}
    ...
    """
    @transaction.atomic
    def save(self, user):
        """Save the documents and one text annotation per target sentence."""
        for batch in self.data:
            docs = self.save_doc(batch)
            targets = self.extract_label(batch)
            self.save_annotation(self.make_annotations(docs, targets), user)

    def make_annotations(self, docs, labels):
        """Build the annotation dicts handed to the annotation serializer.

        ``docs`` and ``labels`` are paired positionally; one dict is
        produced per (document, target text) combination.

        Example:
            >>> docs = ["<Document: a>", "<Document: b>"]
            >>> labels = [["Hello!"], ["How are you?", "What's up?"]]
            >>> self.make_annotations(docs, labels)
            [{"document": 1, "text": "Hello!"}, {"document": 2, "text": "How are you?"},
             {"document": 2, "text": "What's up?"}]
        """
        return [
            {'document': doc.id, 'text': text}
            for doc, texts in zip(docs, labels)
            for text in texts
        ]


class FileParser(object):
    """Interface for upload parsers; subclasses implement ``parse``."""

    def parse(self, file):
        """Parse an uploaded file; not implemented on the base class."""
        raise NotImplementedError


class CoNLLParser(FileParser):
    """Uploads CoNLL format file.

    The file format is tab-separated values.
    A blank line is required at the end of a sentence.
    For example:
    ```
    EU	B-ORG
    rejects	O
    German	B-MISC
    call	O
    to	O
    boycott	O
    British	B-MISC
    lamb	O
    .	O

    Peter	B-PER
    Blackburn	I-PER
    ...
    ```
    """
    def parse(self, file):
        """Parse a CoNLL file into batches of sequence labeling documents.

        ``file`` is iterated line by line and every line is decoded from
        UTF-8 bytes. Each yielded batch is a list of at most
        ``IMPORT_BATCH_SIZE`` dicts of the form:
        {"text": "Python is awesome!", "labels": [[0, 6, "Product"],]}

        Raises:
            FileParseException: if a non-blank line does not consist of
                exactly two tab-separated fields.
        """
        words, tags = [], []
        data = []
        for i, line in enumerate(file, start=1):
            if len(data) >= IMPORT_BATCH_SIZE:
                yield data
                data = []
            line = line.decode('utf-8').strip()
            if line:
                try:
                    word, tag = line.split('\t')
                except ValueError:
                    raise FileParseException(line_num=i, line=line)
                words.append(word)
                tags.append(tag)
            elif words:
                # A blank line closes the current sentence. Blank lines with
                # no pending words (leading or repeated separators) are
                # skipped so they do not produce empty documents.
                data.append(self.calc_char_offset(words, tags))
                words, tags = [], []
        if words:
            # The file did not end with a blank line: flush the last sentence.
            data.append(self.calc_char_offset(words, tags))
        if data:
            # Always emit the final, possibly partial, batch. This must not
            # depend on whether the file ended with a blank line.
            yield data

    def calc_char_offset(self, words, tags):
        """Convert word-level tags into character-level spans.

        The document text is the words joined by single spaces. Character
        offsets are computed from the word positions reported by
        ``get_entities`` rather than by searching the text, so an entity
        whose surface form also occurs inside an earlier word is still
        located correctly.

        Examples:
            >>> words = ['EU', 'rejects', 'German', 'call']
            >>> tags = ['B-ORG', 'O', 'B-MISC', 'O']
            >>> entities = get_entities(tags)
            >>> entities
            [['ORG', 0, 0], ['MISC', 2, 2]]
            >>> self.calc_char_offset(words, tags)
            {
              'text': 'EU rejects German call',
              'labels': [[0, 2, 'ORG'], [11, 17, 'MISC']]
            }
        """
        # Character index at which each word starts in the joined text.
        word_starts = []
        cursor = 0
        for word in words:
            word_starts.append(cursor)
            cursor += len(word) + 1  # +1 for the joining space

        labels = []
        for label, start_offset, end_offset in get_entities(tags):
            # end_offset is the index of the entity's last word (inclusive).
            char_left = word_starts[start_offset]
            char_right = word_starts[end_offset] + len(words[end_offset])
            labels.append([char_left, char_right, label])
        return {'text': ' '.join(words), 'labels': labels}


class PlainTextParser(FileParser):
    """Uploads plain text.

    The file format is as follows:
    ```
    EU rejects German call to boycott British lamb.
    President Obama is speaking at the White House.
    ...
    ```
    """
    def parse(self, file):
        """Yield batches of ``{'text': line}`` dicts, one dict per line.

        ``file`` is a binary stream; it is decoded as UTF-8. Each batch
        holds at most ``IMPORT_BATCH_SIZE`` lines, with surrounding
        whitespace stripped from every line.
        """
        reader = io.TextIOWrapper(file, encoding='utf-8')
        while True:
            batch = list(itertools.islice(reader, IMPORT_BATCH_SIZE))
            if not batch:
                # End the generator with a plain return. Raising
                # StopIteration inside a generator is turned into
                # RuntimeError by PEP 479 (Python 3.7+).
                return
            yield [{'text': line.strip()} for line in batch]


class CSVParser(FileParser):
    """Uploads csv file.

    The file format is comma separated values.
    Column names are required at the top of a file.
    For example:
    ```
    text, label
    "EU rejects German call to boycott British lamb.",Politics
    "President Obama is speaking at the White House.",Politics
    "He lives in Newark, Ohio.",Other
    ...
    ```
    """
    def parse(self, file):
        """Yield batches of classification documents read from a CSV file.

        ``file`` is a binary stream; it is decoded as UTF-8. The first row
        is the header. In every following row the first column is the text
        and the second the label; remaining columns are stored as a JSON
        encoded ``meta`` mapping keyed by their header names. Each batch
        holds at most ``IMPORT_BATCH_SIZE`` items. An empty file yields
        nothing.

        Raises:
            FileParseException: if a row has fewer than two columns or a
                different number of columns than the header.
        """
        file = io.TextIOWrapper(file, encoding='utf-8')
        reader = csv.reader(file)
        # Use a default so an empty file ends the generator cleanly. A bare
        # next() would raise StopIteration, which PEP 479 turns into
        # RuntimeError inside a generator.
        columns = next(reader, None)
        if columns is None:
            return
        data = []
        for i, row in enumerate(reader, start=2):
            if len(data) >= IMPORT_BATCH_SIZE:
                yield data
                data = []
            if len(row) != len(columns) or len(row) < 2:
                raise FileParseException(line_num=i, line=row)
            text, label = row[:2]
            meta = json.dumps(dict(zip(columns[2:], row[2:])))
            data.append({'text': text, 'labels': [label], 'meta': meta})
        if data:
            yield data


class JSONParser(FileParser):
    """Uploads a JSON Lines file: one JSON object per line."""

    def parse(self, file):
        """Yield batches of documents parsed from a JSON Lines file.

        Every line must hold one JSON object. Its ``meta`` entry (an empty
        object when absent) is re-encoded as a JSON string. Each batch
        holds at most ``IMPORT_BATCH_SIZE`` items.

        Raises:
            FileParseException: if a line is not valid JSON, or is valid
                JSON but not an object.
        """
        data = []
        for i, line in enumerate(file, start=1):
            if len(data) >= IMPORT_BATCH_SIZE:
                yield data
                data = []
            try:
                j = json.loads(line)
            except json.decoder.JSONDecodeError:
                raise FileParseException(line_num=i, line=line)
            if not isinstance(j, dict):
                # Arrays, numbers, strings and so on are valid JSON but have
                # no .get(); report them as a parse error instead of letting
                # an AttributeError escape.
                raise FileParseException(line_num=i, line=line)
            j['meta'] = json.dumps(j.get('meta', {}))
            data.append(j)
        if data:
            yield data


class JSONLRenderer(JSONRenderer):
    """Renderer that emits one JSON document per line."""

    def render(self, data, accepted_media_type=None, renderer_context=None):
        """
        Render `data` as JSON Lines, yielding one newline-terminated string
        per item.

        This method is a generator. When `data` is None it finishes
        without yielding anything. A value that is not a list is treated
        as a single-item list.
        """
        if data is None:
            # Inside a generator this only ends iteration; no value is
            # delivered to a consumer that simply iterates.
            return bytes()

        records = data if isinstance(data, list) else [data]
        for record in records:
            serialized = json.dumps(
                record,
                cls=self.encoder_class,
                ensure_ascii=self.ensure_ascii,
                allow_nan=not self.strict,
            )
            yield serialized + '\n'


class JSONPainter(object):
    """Prepare serialized documents for JSON export."""

    def paint(self, documents):
        """Serialize ``documents`` and tidy the result for export.

        For every serialized document the ``meta`` JSON string is decoded,
        and the ``id``, ``prob`` and ``document`` keys are removed from
        each of its annotations. Returns the list of processed documents.
        """
        painted = []
        for doc in DocumentSerializer(documents, many=True).data:
            doc['meta'] = json.loads(doc['meta'])
            for annotation in doc['annotations']:
                for field in ('id', 'prob', 'document'):
                    annotation.pop(field)
            painted.append(doc)
        return painted


class CSVPainter(JSONPainter):
    """Flatten painted documents into one row per annotation."""

    def paint(self, documents):
        """Return one merged dict per (document, annotation) pair.

        Each row combines the document's fields (without ``annotations``)
        with the annotation's fields; annotation keys win on collision.
        A document with no annotations contributes no rows.
        """
        rows = []
        for doc in super().paint(documents):
            annotations = doc.pop('annotations')
            rows.extend({**doc, **annotation} for annotation in annotations)
        return rows