From 7c3116f1508b5fa32589ac3ea2472ba60dadde23 Mon Sep 17 00:00:00 2001 From: Clemens Wolff Date: Sat, 25 May 2019 21:12:33 -0400 Subject: [PATCH] Fix CoNLL parsing with trailing newlines --- app/server/tests/data/labeling.trailing.conll | 14 ++++++++++++++ app/server/tests/test_api.py | 8 +++++++- app/server/utils.py | 3 ++- 3 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 app/server/tests/data/labeling.trailing.conll diff --git a/app/server/tests/data/labeling.trailing.conll b/app/server/tests/data/labeling.trailing.conll new file mode 100644 index 00000000..13a4b33d --- /dev/null +++ b/app/server/tests/data/labeling.trailing.conll @@ -0,0 +1,14 @@ +SOCCER O +- O +JAPAN B-LOC +GET O +LUCKY O +WIN O +, O +CHINA B-PER +IN O +SURPRISE O +DEFEAT O +. O + + diff --git a/app/server/tests/test_api.py b/app/server/tests/test_api.py index e5471cc7..907c1cd1 100644 --- a/app/server/tests/test_api.py +++ b/app/server/tests/test_api.py @@ -808,16 +808,22 @@ class TestParser(APITestCase): def parser_helper(self, filename, parser, include_label=True): with open(os.path.join(DATA_DIR, filename), mode='rb') as f: - result = parser.parse(f) + result = list(parser.parse(f)) for data in result: for r in data: self.assertIn('text', r) if include_label: self.assertIn('labels', r) + return result def test_give_valid_data_to_conll_parser(self): self.parser_helper(filename='labeling.conll', parser=CoNLLParser()) + def test_give_valid_data_to_conll_parser_with_trailing_newlines(self): + result = self.parser_helper(filename='labeling.trailing.conll', parser=CoNLLParser()) + self.assertEqual(len(result), 1) + self.assertEqual(len(result[0]), 1) + def test_plain_parser(self): self.parser_helper(filename='example.txt', parser=PlainTextParser(), include_label=False) diff --git a/app/server/utils.py b/app/server/utils.py index e33bbed5..c334527b 100644 --- a/app/server/utils.py +++ b/app/server/utils.py @@ -263,13 +263,14 @@ class CoNLLParser(FileParser): raise FileParseException(line_num=i, line=line) words.append(word) tags.append(tag) - else: + elif words and tags: j = self.calc_char_offset(words, tags) data.append(j) words, tags = [], [] if len(words) > 0: j = self.calc_char_offset(words, tags) data.append(j) + if data: yield data @classmethod