From a07dd1863bbad48a3214ea67bff36f1f4847baaa Mon Sep 17 00:00:00 2001
From: Alexey Matveev <>
Date: Fri, 19 Jul 2019 11:29:03 +0300
Subject: [PATCH 1/6] Use library conllu

---
 app/api/tests/data/labeling.invalid.conll |  2 +-
 app/api/tests/test_utils.py               | 13 ++--
 app/api/utils.py                          | 73 +++++++++++++----------
 requirements.txt                          |  3 +-
 4 files changed, 52 insertions(+), 39 deletions(-)

diff --git a/app/api/tests/data/labeling.invalid.conll b/app/api/tests/data/labeling.invalid.conll
index 57c390cd..3e153a0f 100644
--- a/app/api/tests/data/labeling.invalid.conll
+++ b/app/api/tests/data/labeling.invalid.conll
@@ -1,4 +1,4 @@
-SOCCERO
+SOCCERO SOCCERO SOCCERO
 -	O
 JAPAN	B-LOC
 GET	O
diff --git a/app/api/tests/test_utils.py b/app/api/tests/test_utils.py
index 5d4874b8..30e00063 100644
--- a/app/api/tests/test_utils.py
+++ b/app/api/tests/test_utils.py
@@ -143,13 +143,16 @@ class TestSeq2seqStorage(TestCase):
 
 class TestCoNLLParser(TestCase):
     def test_calc_char_offset(self):
-        words = ['EU', 'rejects', 'German', 'call']
-        tags = ['B-ORG', 'O', 'B-MISC', 'O']
+        f = io.BytesIO()
 
-        entities = get_entities(tags)
-        actual = CoNLLParser.calc_char_offset(words, tags)
+        s = [
+            ("EU", "ORG"), ("rejects", "_"), ("German", "MISC"), ("call", "_")
+        ]
+        for w, t in s:
+            f.write("{}\t{}\n".format(w, t).encode())
+        f.seek(0)
 
-        self.assertEqual(entities, [('ORG', 0, 0), ('MISC', 2, 2)])
+        actual = next(CoNLLParser().parse(f))[0]
 
         self.assertEqual(actual, {
             'text': 'EU rejects German call',
diff --git a/app/api/utils.py b/app/api/utils.py
index 2f9fca92..04a50b38 100644
--- a/app/api/utils.py
+++ b/app/api/utils.py
@@ -6,6 +6,7 @@ import re
 from collections import defaultdict
 from random import Random
 
+import conllu
 from django.db import transaction
 from django.conf import settings
 from rest_framework.renderers import JSONRenderer
@@ -242,45 +243,51 @@ class CoNLLParser(FileParser):
     ```
     """
     def parse(self, file):
-        words, tags = [], []
         data = []
         file = io.TextIOWrapper(file, encoding='utf-8')
-        for i, line in enumerate(file, start=1):
-            if len(data) >= settings.IMPORT_BATCH_SIZE:
-                yield data
-                data = []
-            line = line.strip()
-            if line:
-                try:
-                    word, tag = line.split('\t')
-                except ValueError:
-                    raise FileParseException(line_num=i, line=line)
+
+        # Add check exception
+
+        field_parsers = {
+            "ne": lambda line, i: conllu.parser.parse_nullable_value(line[i]),
+        }
+
+        try:
+            sentences = conllu.parse(
+                file.read(),
+                fields=("form", "ne"),
+                field_parsers=field_parsers
+            )
+        except conllu.parser.ParseException as e:
+            raise FileParseException(line_num=-1, line=str(e))
+
+        for sentence in sentences:
+            if not sentence:
+                continue
+            # if len(data) >= settings.IMPORT_BATCH_SIZE:
+            #     yield data
+            #     data = []
+            words, labels = [], []
+            for item in sentence:
+                word = item.get("form")
+                tag = item.get("ne", None)
+
+                if tag is not None:
+                    char_left = sum(map(lambda x: len(x), words)) + len(words)
+                    char_right = char_left + len(word)
+                    span = [char_left, char_right, tag]
+                    labels.append(span)
+
                 words.append(word)
-                tags.append(tag)
-            elif words and tags:
-                j = self.calc_char_offset(words, tags)
-                data.append(j)
-                words, tags = [], []
-        if len(words) > 0:
-            j = self.calc_char_offset(words, tags)
+
+            # Create JSONL
+            j = {'text': ' '.join(words), 'labels': labels}
+
             data.append(j)
+
         if data:
             yield data
 
-    @classmethod
-    def calc_char_offset(cls, words, tags):
-        doc = ' '.join(words)
-        j = {'text': ' '.join(words), 'labels': []}
-        pos = defaultdict(int)
-        for label, start_offset, end_offset in get_entities(tags):
-            entity = ' '.join(words[start_offset: end_offset + 1])
-            char_left = doc.index(entity, pos[entity])
-            char_right = char_left + len(entity)
-            span = [char_left, char_right, label]
-            j['labels'].append(span)
-            pos[entity] = char_right
-        return j
-
 
 class PlainTextParser(FileParser):
     """Uploads plain text.
@@ -373,6 +380,7 @@ class JSONLRenderer(JSONRenderer):
                              ensure_ascii=self.ensure_ascii,
                              allow_nan=not self.strict) + '\n'
 
+
 class JSONPainter(object):
 
     def paint(self, documents):
@@ -406,6 +414,7 @@ class JSONPainter(object):
             data.append(d)
         return data
 
+
 class CSVPainter(JSONPainter):
 
     def paint(self, documents):
diff --git a/requirements.txt b/requirements.txt
index af1b6582..841d6b56 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,7 +15,7 @@ djangorestframework-csv==2.1.0
 djangorestframework-filters==0.10.2
 environs==4.1.0
 djangorestframework-xml==1.4.0
-Faker==0.8.8
+Faker==0.9.1
 flake8==3.6.0
 furl==2.0.0
 gunicorn==19.9.0
@@ -36,3 +36,4 @@ unittest-xml-reporting==2.5.1
 vcrpy==2.0.1
 vcrpy-unittest==0.1.7
 whitenoise[brotli]==4.1.2
+conllu

From 4c194289e4fd6907bbaa08812a8ed89e7e5e5b56 Mon Sep 17 00:00:00 2001
From: Alexey Matveev <>
Date: Fri, 19 Jul 2019 11:43:37 +0300
Subject: [PATCH 2/6] #FIX Replace unnecessary lambda with built-in len

---
 app/api/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/app/api/utils.py b/app/api/utils.py
index 04a50b38..eec8d60d 100644
--- a/app/api/utils.py
+++ b/app/api/utils.py
@@ -273,7 +273,7 @@ class CoNLLParser(FileParser):
                 tag = item.get("ne", None)
 
                 if tag is not None:
-                    char_left = sum(map(lambda x: len(x), words)) + len(words)
+                    char_left = sum(map(len, words)) + len(words)
                     char_right = char_left + len(word)
                     span = [char_left, char_right, tag]
                     labels.append(span)

From 4a07b2e3f2b33775955eb408482ac47744854bc3 Mon Sep 17 00:00:00 2001
From: Alexey Matveev <>
Date: Thu, 8 Aug 2019 09:10:46 +0300
Subject: [PATCH 3/6] Pin version for conllu

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 841d6b56..8bad2859 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -36,4 +36,4 @@ unittest-xml-reporting==2.5.1
 vcrpy==2.0.1
 vcrpy-unittest==0.1.7
 whitenoise[brotli]==4.1.2
-conllu
+conllu==1.3.1

From 43d9234b78a73840c41ab6f0da83960da84c5d14 Mon Sep 17 00:00:00 2001
From: Alexey Matveev <>
Date: Thu, 8 Aug 2019 09:42:09 +0300
Subject: [PATCH 4/6] Address review comments: parse incrementally and restore batching

---
 app/api/tests/test_utils.py | 14 ++++-----
 app/api/utils.py            | 58 ++++++++++++++++++-------------------
 2 files changed, 35 insertions(+), 37 deletions(-)

diff --git a/app/api/tests/test_utils.py b/app/api/tests/test_utils.py
index 30e00063..34d0e9ac 100644
--- a/app/api/tests/test_utils.py
+++ b/app/api/tests/test_utils.py
@@ -143,14 +143,12 @@ class TestSeq2seqStorage(TestCase):
 
 class TestCoNLLParser(TestCase):
     def test_calc_char_offset(self):
-        f = io.BytesIO()
-
-        s = [
-            ("EU", "ORG"), ("rejects", "_"), ("German", "MISC"), ("call", "_")
-        ]
-        for w, t in s:
-            f.write("{}\t{}\n".format(w, t).encode())
-        f.seek(0)
+        f = io.BytesIO(
+          b"EU\tORG\n"
+          b"rejects\t_\n"
+          b"German\tMISC\n"
+          b"call\t_\n"
+        )
 
         actual = next(CoNLLParser().parse(f))[0]
 
diff --git a/app/api/utils.py b/app/api/utils.py
index eec8d60d..e7e42b80 100644
--- a/app/api/utils.py
+++ b/app/api/utils.py
@@ -252,39 +252,39 @@ class CoNLLParser(FileParser):
             "ne": lambda line, i: conllu.parser.parse_nullable_value(line[i]),
         }
 
+        gen_parser = conllu.parse_incr(
+            file,
+            fields=("form", "ne"),
+            field_parsers=field_parsers
+        )
+
         try:
-            sentences = conllu.parse(
-                file.read(),
-                fields=("form", "ne"),
-                field_parsers=field_parsers
-            )
+            for sentence in gen_parser:
+                if not sentence:
+                    continue
+                if len(data) >= settings.IMPORT_BATCH_SIZE:
+                    yield data
+                    data = []
+                words, labels = [], []
+                for item in sentence:
+                    word = item.get("form")
+                    tag = item.get("ne")
+
+                    if tag is not None:
+                        char_left = sum(map(len, words)) + len(words)
+                        char_right = char_left + len(word)
+                        span = [char_left, char_right, tag]
+                        labels.append(span)
+
+                    words.append(word)
+
+                # Create JSONL
+                j = {'text': ' '.join(words), 'labels': labels}
+
+                data.append(j)
         except conllu.parser.ParseException as e:
             raise FileParseException(line_num=-1, line=str(e))
 
-        for sentence in sentences:
-            if not sentence:
-                continue
-            # if len(data) >= settings.IMPORT_BATCH_SIZE:
-            #     yield data
-            #     data = []
-            words, labels = [], []
-            for item in sentence:
-                word = item.get("form")
-                tag = item.get("ne", None)
-
-                if tag is not None:
-                    char_left = sum(map(len, words)) + len(words)
-                    char_right = char_left + len(word)
-                    span = [char_left, char_right, tag]
-                    labels.append(span)
-
-                words.append(word)
-
-            # Create JSONL
-            j = {'text': ' '.join(words), 'labels': labels}
-
-            data.append(j)
-
         if data:
             yield data
 

From 54623a2e58858e601b63b9b4337f8ccc3007610f Mon Sep 17 00:00:00 2001
From: Alexey Matveev <>
Date: Fri, 9 Aug 2019 14:56:18 +0300
Subject: [PATCH 5/6] Fix multi-line handling by bumping conllu to 1.3.2

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 8bad2859..bf19f50e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -36,4 +36,4 @@ unittest-xml-reporting==2.5.1
 vcrpy==2.0.1
 vcrpy-unittest==0.1.7
 whitenoise[brotli]==4.1.2
-conllu==1.3.1
+conllu==1.3.2

From 4be71c6603b39fbb9eeb6a5d2aa32d5860eb6e50 Mon Sep 17 00:00:00 2001
From: Alexey Matveev <>
Date: Tue, 13 Aug 2019 19:06:48 +0300
Subject: [PATCH 6/6] Inline variable j

---
 app/api/utils.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/app/api/utils.py b/app/api/utils.py
index e7e42b80..2ecda302 100644
--- a/app/api/utils.py
+++ b/app/api/utils.py
@@ -278,10 +278,9 @@ class CoNLLParser(FileParser):
 
                     words.append(word)
 
-                # Create JSONL
-                j = {'text': ' '.join(words), 'labels': labels}
+                # Create and add JSONL
+                data.append({'text': ' '.join(words), 'labels': labels})
 
-                data.append(j)
         except conllu.parser.ParseException as e:
             raise FileParseException(line_num=-1, line=str(e))