From a07dd1863bbad48a3214ea67bff36f1f4847baaa Mon Sep 17 00:00:00 2001 From: Alexey Matveev <> Date: Fri, 19 Jul 2019 11:29:03 +0300 Subject: [PATCH 1/6] Use library conllu --- app/api/tests/data/labeling.invalid.conll | 2 +- app/api/tests/test_utils.py | 13 ++-- app/api/utils.py | 73 +++++++++++++---------- requirements.txt | 3 +- 4 files changed, 52 insertions(+), 39 deletions(-) diff --git a/app/api/tests/data/labeling.invalid.conll b/app/api/tests/data/labeling.invalid.conll index 57c390cd..3e153a0f 100644 --- a/app/api/tests/data/labeling.invalid.conll +++ b/app/api/tests/data/labeling.invalid.conll @@ -1,4 +1,4 @@ -SOCCERO +SOCCERO SOCCERO SOCCERO - O JAPAN B-LOC GET O diff --git a/app/api/tests/test_utils.py b/app/api/tests/test_utils.py index 5d4874b8..30e00063 100644 --- a/app/api/tests/test_utils.py +++ b/app/api/tests/test_utils.py @@ -143,13 +143,16 @@ class TestSeq2seqStorage(TestCase): class TestCoNLLParser(TestCase): def test_calc_char_offset(self): - words = ['EU', 'rejects', 'German', 'call'] - tags = ['B-ORG', 'O', 'B-MISC', 'O'] + f = io.BytesIO() - entities = get_entities(tags) - actual = CoNLLParser.calc_char_offset(words, tags) + s = [ + ("EU", "ORG"), ("rejects", "_"), ("German", "MISC"), ("call", "_") + ] + for w, t in s: + f.write("{}\t{}\n".format(w, t).encode()) + f.seek(0) - self.assertEqual(entities, [('ORG', 0, 0), ('MISC', 2, 2)]) + actual = next(CoNLLParser().parse(f))[0] self.assertEqual(actual, { 'text': 'EU rejects German call', diff --git a/app/api/utils.py b/app/api/utils.py index 2f9fca92..04a50b38 100644 --- a/app/api/utils.py +++ b/app/api/utils.py @@ -6,6 +6,7 @@ import re from collections import defaultdict from random import Random +import conllu from django.db import transaction from django.conf import settings from rest_framework.renderers import JSONRenderer @@ -242,45 +243,51 @@ class CoNLLParser(FileParser): ``` """ def parse(self, file): - words, tags = [], [] data = [] file = io.TextIOWrapper(file, encoding='utf-8') 
- for i, line in enumerate(file, start=1): - if len(data) >= settings.IMPORT_BATCH_SIZE: - yield data - data = [] - line = line.strip() - if line: - try: - word, tag = line.split('\t') - except ValueError: - raise FileParseException(line_num=i, line=line) + + # Add check exception + + field_parsers = { + "ne": lambda line, i: conllu.parser.parse_nullable_value(line[i]), + } + + try: + sentences = conllu.parse( + file.read(), + fields=("form", "ne"), + field_parsers=field_parsers + ) + except conllu.parser.ParseException as e: + raise FileParseException(line_num=-1, line=str(e)) + + for sentence in sentences: + if not sentence: + continue + # if len(data) >= settings.IMPORT_BATCH_SIZE: + # yield data + # data = [] + words, labels = [], [] + for item in sentence: + word = item.get("form") + tag = item.get("ne", None) + + if tag is not None: + char_left = sum(map(lambda x: len(x), words)) + len(words) + char_right = char_left + len(word) + span = [char_left, char_right, tag] + labels.append(span) + words.append(word) - tags.append(tag) - elif words and tags: - j = self.calc_char_offset(words, tags) - data.append(j) - words, tags = [], [] - if len(words) > 0: - j = self.calc_char_offset(words, tags) + + # Create JSONL + j = {'text': ' '.join(words), 'labels': labels} + data.append(j) + if data: yield data - @classmethod - def calc_char_offset(cls, words, tags): - doc = ' '.join(words) - j = {'text': ' '.join(words), 'labels': []} - pos = defaultdict(int) - for label, start_offset, end_offset in get_entities(tags): - entity = ' '.join(words[start_offset: end_offset + 1]) - char_left = doc.index(entity, pos[entity]) - char_right = char_left + len(entity) - span = [char_left, char_right, label] - j['labels'].append(span) - pos[entity] = char_right - return j - class PlainTextParser(FileParser): """Uploads plain text. 
@@ -373,6 +380,7 @@ class JSONLRenderer(JSONRenderer): ensure_ascii=self.ensure_ascii, allow_nan=not self.strict) + '\n' + class JSONPainter(object): def paint(self, documents): @@ -406,6 +414,7 @@ class JSONPainter(object): data.append(d) return data + class CSVPainter(JSONPainter): def paint(self, documents): diff --git a/requirements.txt b/requirements.txt index af1b6582..841d6b56 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,7 @@ djangorestframework-csv==2.1.0 djangorestframework-filters==0.10.2 environs==4.1.0 djangorestframework-xml==1.4.0 -Faker==0.8.8 +Faker==0.9.1 flake8==3.6.0 furl==2.0.0 gunicorn==19.9.0 @@ -36,3 +36,4 @@ unittest-xml-reporting==2.5.1 vcrpy==2.0.1 vcrpy-unittest==0.1.7 whitenoise[brotli]==4.1.2 +conllu From 4c194289e4fd6907bbaa08812a8ed89e7e5e5b56 Mon Sep 17 00:00:00 2001 From: Alexey Matveev <> Date: Fri, 19 Jul 2019 11:43:37 +0300 Subject: [PATCH 2/6] #FIX Lambda may not be necessary --- app/api/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/api/utils.py b/app/api/utils.py index 04a50b38..eec8d60d 100644 --- a/app/api/utils.py +++ b/app/api/utils.py @@ -273,7 +273,7 @@ class CoNLLParser(FileParser): tag = item.get("ne", None) if tag is not None: - char_left = sum(map(lambda x: len(x), words)) + len(words) + char_left = sum(map(len, words)) + len(words) char_right = char_left + len(word) span = [char_left, char_right, tag] labels.append(span) From 4a07b2e3f2b33775955eb408482ac47744854bc3 Mon Sep 17 00:00:00 2001 From: Alexey Matveev <> Date: Thu, 8 Aug 2019 09:10:46 +0300 Subject: [PATCH 3/6] Add version for conllu --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 841d6b56..8bad2859 100644 --- a/requirements.txt +++ b/requirements.txt @@ -36,4 +36,4 @@ unittest-xml-reporting==2.5.1 vcrpy==2.0.1 vcrpy-unittest==0.1.7 whitenoise[brotli]==4.1.2 -conllu +conllu==1.3.1 From 
43d9234b78a73840c41ab6f0da83960da84c5d14 Mon Sep 17 00:00:00 2001 From: Alexey Matveev <> Date: Thu, 8 Aug 2019 09:42:09 +0300 Subject: [PATCH 4/6] Fix by comment --- app/api/tests/test_utils.py | 14 ++++----- app/api/utils.py | 58 ++++++++++++++++++------------------- 2 files changed, 35 insertions(+), 37 deletions(-) diff --git a/app/api/tests/test_utils.py b/app/api/tests/test_utils.py index 30e00063..34d0e9ac 100644 --- a/app/api/tests/test_utils.py +++ b/app/api/tests/test_utils.py @@ -143,14 +143,12 @@ class TestSeq2seqStorage(TestCase): class TestCoNLLParser(TestCase): def test_calc_char_offset(self): - f = io.BytesIO() - - s = [ - ("EU", "ORG"), ("rejects", "_"), ("German", "MISC"), ("call", "_") - ] - for w, t in s: - f.write("{}\t{}\n".format(w, t).encode()) - f.seek(0) + f = io.BytesIO( + b"EU\tORG\n" + b"rejects\t_\n" + b"German\tMISC\n" + b"call\t_\n" + ) actual = next(CoNLLParser().parse(f))[0] diff --git a/app/api/utils.py b/app/api/utils.py index eec8d60d..e7e42b80 100644 --- a/app/api/utils.py +++ b/app/api/utils.py @@ -252,39 +252,39 @@ class CoNLLParser(FileParser): "ne": lambda line, i: conllu.parser.parse_nullable_value(line[i]), } + gen_parser = conllu.parse_incr( + file, + fields=("form", "ne"), + field_parsers=field_parsers + ) + try: - sentences = conllu.parse( - file.read(), - fields=("form", "ne"), - field_parsers=field_parsers - ) + for sentence in gen_parser: + if not sentence: + continue + if len(data) >= settings.IMPORT_BATCH_SIZE: + yield data + data = [] + words, labels = [], [] + for item in sentence: + word = item.get("form") + tag = item.get("ne") + + if tag is not None: + char_left = sum(map(len, words)) + len(words) + char_right = char_left + len(word) + span = [char_left, char_right, tag] + labels.append(span) + + words.append(word) + + # Create JSONL + j = {'text': ' '.join(words), 'labels': labels} + + data.append(j) except conllu.parser.ParseException as e: raise FileParseException(line_num=-1, line=str(e)) - for sentence 
in sentences: - if not sentence: - continue - # if len(data) >= settings.IMPORT_BATCH_SIZE: - # yield data - # data = [] - words, labels = [], [] - for item in sentence: - word = item.get("form") - tag = item.get("ne", None) - - if tag is not None: - char_left = sum(map(len, words)) + len(words) - char_right = char_left + len(word) - span = [char_left, char_right, tag] - labels.append(span) - - words.append(word) - - # Create JSONL - j = {'text': ' '.join(words), 'labels': labels} - - data.append(j) - if data: yield data From 54623a2e58858e601b63b9b4337f8ccc3007610f Mon Sep 17 00:00:00 2001 From: Alexey Matveev <> Date: Fri, 9 Aug 2019 14:56:18 +0300 Subject: [PATCH 5/6] Fix multi line --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8bad2859..bf19f50e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -36,4 +36,4 @@ unittest-xml-reporting==2.5.1 vcrpy==2.0.1 vcrpy-unittest==0.1.7 whitenoise[brotli]==4.1.2 -conllu==1.3.1 +conllu==1.3.2 From 4be71c6603b39fbb9eeb6a5d2aa32d5860eb6e50 Mon Sep 17 00:00:00 2001 From: Alexey Matveev <> Date: Tue, 13 Aug 2019 19:06:48 +0300 Subject: [PATCH 6/6] Fix inline variable j --- app/api/utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/app/api/utils.py b/app/api/utils.py index e7e42b80..2ecda302 100644 --- a/app/api/utils.py +++ b/app/api/utils.py @@ -278,10 +278,9 @@ class CoNLLParser(FileParser): words.append(word) - # Create JSONL - j = {'text': ' '.join(words), 'labels': labels} + # Create and add JSONL + data.append({'text': ' '.join(words), 'labels': labels}) - data.append(j) except conllu.parser.ParseException as e: raise FileParseException(line_num=-1, line=str(e))