Browse Source

Merge pull request #225 from CatalystCode/bugfix/conll-with-trailing-newlines

Bugfix/Fix CoNLL parsing with trailing newlines
pull/281/head
Hiroki Nakayama 5 years ago
committed by GitHub
parent
commit
9c9d223f39
No known key found for this signature in database. GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 23 additions and 2 deletions
  1. 14
      app/server/tests/data/labeling.trailing.conll
  2. 8
      app/server/tests/test_api.py
  3. 3
      app/server/utils.py

14
app/server/tests/data/labeling.trailing.conll

@@ -0,0 +1,14 @@
SOCCER O
- O
JAPAN B-LOC
GET O
LUCKY O
WIN O
, O
CHINA B-PER
IN O
SURPRISE O
DEFEAT O
. O

8
app/server/tests/test_api.py

@@ -808,16 +808,22 @@ class TestParser(APITestCase):
def parser_helper(self, filename, parser, include_label=True): def parser_helper(self, filename, parser, include_label=True):
with open(os.path.join(DATA_DIR, filename), mode='rb') as f: with open(os.path.join(DATA_DIR, filename), mode='rb') as f:
result = parser.parse(f)
result = list(parser.parse(f))
for data in result: for data in result:
for r in data: for r in data:
self.assertIn('text', r) self.assertIn('text', r)
if include_label: if include_label:
self.assertIn('labels', r) self.assertIn('labels', r)
return result
def test_give_valid_data_to_conll_parser(self): def test_give_valid_data_to_conll_parser(self):
self.parser_helper(filename='labeling.conll', parser=CoNLLParser()) self.parser_helper(filename='labeling.conll', parser=CoNLLParser())
def test_give_valid_data_to_conll_parser_with_trailing_newlines(self):
result = self.parser_helper(filename='labeling.trailing.conll', parser=CoNLLParser())
self.assertEqual(len(result), 1)
self.assertEqual(len(result[0]), 1)
def test_plain_parser(self): def test_plain_parser(self):
self.parser_helper(filename='example.txt', parser=PlainTextParser(), include_label=False) self.parser_helper(filename='example.txt', parser=PlainTextParser(), include_label=False)

3
app/server/utils.py

@@ -263,13 +263,14 @@ class CoNLLParser(FileParser):
raise FileParseException(line_num=i, line=line) raise FileParseException(line_num=i, line=line)
words.append(word) words.append(word)
tags.append(tag) tags.append(tag)
else:
elif words and tags:
j = self.calc_char_offset(words, tags) j = self.calc_char_offset(words, tags)
data.append(j) data.append(j)
words, tags = [], [] words, tags = [], []
if len(words) > 0: if len(words) > 0:
j = self.calc_char_offset(words, tags) j = self.calc_char_offset(words, tags)
data.append(j) data.append(j)
if data:
yield data yield data
@classmethod @classmethod

Loading…
Cancel
Save