|
|
@ -6,6 +6,7 @@ import re |
|
|
|
from collections import defaultdict |
|
|
|
from random import Random |
|
|
|
|
|
|
|
import conllu |
|
|
|
from django.db import transaction |
|
|
|
from django.conf import settings |
|
|
|
from rest_framework.renderers import JSONRenderer |
|
|
@ -242,45 +243,50 @@ class CoNLLParser(FileParser): |
|
|
|
``` |
|
|
|
""" |
|
|
|
def parse(self, file): |
|
|
|
words, tags = [], [] |
|
|
|
data = [] |
|
|
|
file = io.TextIOWrapper(file, encoding='utf-8') |
|
|
|
for i, line in enumerate(file, start=1): |
|
|
|
if len(data) >= settings.IMPORT_BATCH_SIZE: |
|
|
|
yield data |
|
|
|
data = [] |
|
|
|
line = line.strip() |
|
|
|
if line: |
|
|
|
try: |
|
|
|
word, tag = line.split('\t') |
|
|
|
except ValueError: |
|
|
|
raise FileParseException(line_num=i, line=line) |
|
|
|
words.append(word) |
|
|
|
tags.append(tag) |
|
|
|
elif words and tags: |
|
|
|
j = self.calc_char_offset(words, tags) |
|
|
|
data.append(j) |
|
|
|
words, tags = [], [] |
|
|
|
if len(words) > 0: |
|
|
|
j = self.calc_char_offset(words, tags) |
|
|
|
data.append(j) |
|
|
|
|
|
|
|
# Add check exception |
|
|
|
|
|
|
|
field_parsers = { |
|
|
|
"ne": lambda line, i: conllu.parser.parse_nullable_value(line[i]), |
|
|
|
} |
|
|
|
|
|
|
|
gen_parser = conllu.parse_incr( |
|
|
|
file, |
|
|
|
fields=("form", "ne"), |
|
|
|
field_parsers=field_parsers |
|
|
|
) |
|
|
|
|
|
|
|
try: |
|
|
|
for sentence in gen_parser: |
|
|
|
if not sentence: |
|
|
|
continue |
|
|
|
if len(data) >= settings.IMPORT_BATCH_SIZE: |
|
|
|
yield data |
|
|
|
data = [] |
|
|
|
words, labels = [], [] |
|
|
|
for item in sentence: |
|
|
|
word = item.get("form") |
|
|
|
tag = item.get("ne") |
|
|
|
|
|
|
|
if tag is not None: |
|
|
|
char_left = sum(map(len, words)) + len(words) |
|
|
|
char_right = char_left + len(word) |
|
|
|
span = [char_left, char_right, tag] |
|
|
|
labels.append(span) |
|
|
|
|
|
|
|
words.append(word) |
|
|
|
|
|
|
|
# Create and add JSONL |
|
|
|
data.append({'text': ' '.join(words), 'labels': labels}) |
|
|
|
|
|
|
|
except conllu.parser.ParseException as e: |
|
|
|
raise FileParseException(line_num=-1, line=str(e)) |
|
|
|
|
|
|
|
if data: |
|
|
|
yield data |
|
|
|
|
|
|
|
@classmethod |
|
|
|
def calc_char_offset(cls, words, tags): |
|
|
|
doc = ' '.join(words) |
|
|
|
j = {'text': ' '.join(words), 'labels': []} |
|
|
|
pos = defaultdict(int) |
|
|
|
for label, start_offset, end_offset in get_entities(tags): |
|
|
|
entity = ' '.join(words[start_offset: end_offset + 1]) |
|
|
|
char_left = doc.index(entity, pos[entity]) |
|
|
|
char_right = char_left + len(entity) |
|
|
|
span = [char_left, char_right, label] |
|
|
|
j['labels'].append(span) |
|
|
|
pos[entity] = char_right |
|
|
|
return j |
|
|
|
|
|
|
|
|
|
|
|
class PlainTextParser(FileParser): |
|
|
|
"""Uploads plain text. |
|
|
|