From 44de1824af81fe2917840b4193f7d3a4f1118b51 Mon Sep 17 00:00:00 2001 From: Clemens Wolff Date: Fri, 24 May 2019 11:57:49 -0400 Subject: [PATCH 1/4] Make import batch size configurable via env --- app/app/settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/app/settings.py b/app/app/settings.py index e634230d..c9773630 100644 --- a/app/app/settings.py +++ b/app/app/settings.py @@ -257,7 +257,7 @@ SECURE_PROXY_SSL_HEADER = ('HTTP_X_FORWARDED_PROTO', 'https') # Size of the batch for creating documents # on the import phase -IMPORT_BATCH_SIZE = 500 +IMPORT_BATCH_SIZE = env.int('IMPORT_BATCH_SIZE', 500) GOOGLE_TRACKING_ID = env('GOOGLE_TRACKING_ID', 'UA-125643874-2') From c495b141cc769d963a5bec569b82fbac18e97408 Mon Sep 17 00:00:00 2001 From: Clemens Wolff Date: Fri, 24 May 2019 11:58:18 -0400 Subject: [PATCH 2/4] Ensure data pagination is covered in tests --- app/server/tests/test_api.py | 1 + app/server/utils.py | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/app/server/tests/test_api.py b/app/server/tests/test_api.py index f93a88b7..fc10a36f 100644 --- a/app/server/tests/test_api.py +++ b/app/server/tests/test_api.py @@ -872,6 +872,7 @@ class TestFeatures(APITestCase): self.assertFalse(response.json().get('cloud_upload')) +@override_settings(IMPORT_BATCH_SIZE=2) class TestParser(APITestCase): def parser_helper(self, filename, parser, include_label=True): diff --git a/app/server/utils.py b/app/server/utils.py index 08e1b5ab..a4814fa4 100644 --- a/app/server/utils.py +++ b/app/server/utils.py @@ -7,10 +7,10 @@ from collections import defaultdict from random import Random from django.db import transaction +from django.conf import settings from rest_framework.renderers import JSONRenderer from seqeval.metrics.sequence_labeling import get_entities -from app.settings import IMPORT_BATCH_SIZE from .exceptions import FileParseException from .models import Label from .serializers import DocumentSerializer, LabelSerializer @@ -251,7 +251,7 @@ class CoNLLParser(FileParser): words, tags = [], [] data = [] for i, line in enumerate(file, start=1): - if len(data) >= IMPORT_BATCH_SIZE: + if len(data) >= settings.IMPORT_BATCH_SIZE: yield data data = [] line = line.decode('utf-8') @@ -301,7 +301,7 @@ class PlainTextParser(FileParser): def parse(self, file): file = io.TextIOWrapper(file, encoding='utf-8') while True: - batch = list(itertools.islice(file, IMPORT_BATCH_SIZE)) + batch = list(itertools.islice(file, settings.IMPORT_BATCH_SIZE)) if not batch: break yield [{'text': line.strip()} for line in batch] @@ -327,7 +327,7 @@ class CSVParser(FileParser): columns = next(reader) data = [] for i, row in enumerate(reader, start=2): - if len(data) >= IMPORT_BATCH_SIZE: + if len(data) >= settings.IMPORT_BATCH_SIZE: yield data data = [] if len(row) == len(columns) and len(row) >= 2: @@ -347,7 +347,7 @@ class JSONParser(FileParser): file = io.TextIOWrapper(file, encoding='utf-8') data = [] for i, line in enumerate(file, start=1): - if len(data) >= IMPORT_BATCH_SIZE: + if len(data) >= settings.IMPORT_BATCH_SIZE: yield data data = [] try: From 22285277f086c164f55ecbc6bd88c40a5b074f5b Mon Sep 17 00:00:00 2001 From: Clemens Wolff Date: Fri, 24 May 2019 11:58:34 -0400 Subject: [PATCH 3/4] Remove outdated comment --- app/server/utils.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/app/server/utils.py b/app/server/utils.py index a4814fa4..3026dd38 100644 --- a/app/server/utils.py +++ b/app/server/utils.py @@ -242,12 +242,6 @@ class CoNLLParser(FileParser): ``` """ def parse(self, file): - """Store json for seq2seq. - - Return format: - {"text": "Python is awesome!", "labels": [[0, 6, "Product"],]} - ... - """ words, tags = [], [] data = [] for i, line in enumerate(file, start=1): From e1ae68c5dfdc311a2b9e774b7aaf3002604ed7c4 Mon Sep 17 00:00:00 2001 From: Clemens Wolff Date: Fri, 24 May 2019 11:58:45 -0400 Subject: [PATCH 4/4] Transparently decode file --- app/server/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/server/utils.py b/app/server/utils.py index 3026dd38..8f18225a 100644 --- a/app/server/utils.py +++ b/app/server/utils.py @@ -244,11 +244,11 @@ class CoNLLParser(FileParser): def parse(self, file): words, tags = [], [] data = [] + file = io.TextIOWrapper(file, encoding='utf-8') for i, line in enumerate(file, start=1): if len(data) >= settings.IMPORT_BATCH_SIZE: yield data data = [] - line = line.decode('utf-8') line = line.strip() if line: try: