You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

128 lines
3.8 KiB

2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
  1. import json
  2. import os
  3. import shutil
  4. import tempfile
  5. import unittest
  6. from data_import.pipeline import parsers
  7. from data_import.pipeline.readers import LINE_NUMBER_COLUMN
  8. class TestParser(unittest.TestCase):
  9. def setUp(self):
  10. self.test_dir = tempfile.mkdtemp()
  11. self.test_file = os.path.join(self.test_dir, "test_file.csv")
  12. def tearDown(self):
  13. shutil.rmtree(self.test_dir)
  14. def create_file(self, content):
  15. with open(self.test_file, "w") as f:
  16. f.write(content)
  17. def assert_record(self, content, parser, expected):
  18. self.create_file(content)
  19. it = parser.parse(self.test_file)
  20. for expect in expected:
  21. row = next(it)
  22. row.pop(LINE_NUMBER_COLUMN, None)
  23. self.assertEqual(row, expect)
  24. with self.assertRaises(StopIteration):
  25. next(it)
  26. class TestPlainParser(TestParser):
  27. def test_read(self):
  28. content = "example"
  29. parser = parsers.PlainParser()
  30. expected = [{}]
  31. self.assert_record(content, parser, expected)
  32. class TestLineParser(TestParser):
  33. def test_read(self):
  34. content = "Hello, World!\nこんにちは"
  35. parser = parsers.LineParser()
  36. expected = [{"text": "Hello, World!"}, {"text": "こんにちは"}]
  37. self.assert_record(content, parser, expected)
  38. class TestTextFileParser(TestParser):
  39. def test_read(self):
  40. content = "Hello, World!\nこんにちは"
  41. parser = parsers.TextFileParser()
  42. expected = [{"text": content}]
  43. self.assert_record(content, parser, expected)
  44. class TestCsvParser(TestParser):
  45. def test_read(self):
  46. content = "label,text\nLabel,Text"
  47. parser = parsers.CSVParser(delimiter=",")
  48. expected = [{"label": "Label", "text": "Text"}]
  49. self.assert_record(content, parser, expected)
  50. def test_can_change_delimiter(self):
  51. content = "label\ttext\nLabel\tText"
  52. parser = parsers.CSVParser(delimiter="\t")
  53. expected = [{"label": "Label", "text": "Text"}]
  54. self.assert_record(content, parser, expected)
  55. def test_can_read_null_value(self):
  56. content = "text,label\nText"
  57. parser = parsers.CSVParser(delimiter=",")
  58. expected = [{"text": "Text", "label": None}]
  59. self.assert_record(content, parser, expected)
  60. class TestJSONParser(TestParser):
  61. def test_read(self):
  62. content = json.dumps([{"text": "line1", "labels": "Label1"}, {"text": "line2", "labels": "Label2"}])
  63. parser = parsers.JSONParser()
  64. expected = json.loads(content)
  65. self.assert_record(content, parser, expected)
  66. class TestJSONLParser(TestParser):
  67. def test_read(self):
  68. line1 = json.dumps({"text": "line1", "labels": "Label1"})
  69. line2 = json.dumps({"text": "line2", "labels": "Label2"})
  70. content = f"{line1}\n{line2}"
  71. parser = parsers.JSONLParser()
  72. expected = [json.loads(line1), json.loads(line2)]
  73. self.assert_record(content, parser, expected)
  74. class TestFastTextParser(TestParser):
  75. def test_read(self):
  76. content = "__label__sauce __label__cheese Text"
  77. parser = parsers.FastTextParser()
  78. expected = [{"text": "Text", "label": ["sauce", "cheese"]}]
  79. self.assert_record(content, parser, expected)
  80. class TestCoNLLParser(TestParser):
  81. def test_can_read(self):
  82. content = """EU\tB-ORG
  83. rejects\tO
  84. German\tB-MISC
  85. call\tO
  86. to\tO
  87. boycott\tO
  88. British\tB-MISC
  89. lamb\tO
  90. .\tO
  91. Peter\tB-PER
  92. Blackburn\tI-PER
  93. """
  94. parser = parsers.CoNLLParser()
  95. expected = [
  96. {
  97. "text": "EU rejects German call to boycott British lamb .",
  98. "label": [(0, 2, "ORG"), (11, 17, "MISC"), (34, 41, "MISC")],
  99. },
  100. {"text": "Peter Blackburn", "label": [(0, 15, "PER")]},
  101. ]
  102. self.assert_record(content, parser, expected)