import json import os import shutil import tempfile import unittest from data_import.pipeline import parsers class TestParser(unittest.TestCase): def setUp(self): self.test_dir = tempfile.mkdtemp() self.test_file = os.path.join(self.test_dir, "test_file.csv") def tearDown(self): shutil.rmtree(self.test_dir) def create_file(self, content): with open(self.test_file, "w") as f: f.write(content) def assert_record(self, content, parser, expected): self.create_file(content) it = parser.parse(self.test_file) for expect in expected: row = next(it) self.assertEqual(row, expect) with self.assertRaises(StopIteration): next(it) class TestPlainParser(TestParser): def test_read(self): content = "example" parser = parsers.PlainParser() expected = [{}] self.assert_record(content, parser, expected) class TestLineParser(TestParser): def test_read(self): content = "Hello, World!\nこんにちは" parser = parsers.LineParser() expected = [{"text": "Hello, World!"}, {"text": "こんにちは"}] self.assert_record(content, parser, expected) class TestTextFileParser(TestParser): def test_read(self): content = "Hello, World!\nこんにちは" parser = parsers.TextFileParser() expected = [{"text": content}] self.assert_record(content, parser, expected) class TestCsvParser(TestParser): def test_read(self): content = "label,text\nLabel,Text" parser = parsers.CSVParser(delimiter=",") expected = [{"label": "Label", "text": "Text"}] self.assert_record(content, parser, expected) def test_can_change_delimiter(self): content = "label\ttext\nLabel\tText" parser = parsers.CSVParser(delimiter="\t") expected = [{"label": "Label", "text": "Text"}] self.assert_record(content, parser, expected) def test_can_read_null_value(self): content = "text,label\nText" parser = parsers.CSVParser(delimiter=",") expected = [{"text": "Text", "label": None}] self.assert_record(content, parser, expected) class TestJSONParser(TestParser): def test_read(self): content = json.dumps([{"text": "line1", "labels": "Label1"}, {"text": "line2", "labels": "Label2"}]) parser = parsers.JSONParser() expected = json.loads(content) self.assert_record(content, parser, expected) class TestJSONLParser(TestParser): def test_read(self): line1 = json.dumps({"text": "line1", "labels": "Label1"}) line2 = json.dumps({"text": "line2", "labels": "Label2"}) content = f"{line1}\n{line2}" parser = parsers.JSONLParser() expected = [json.loads(line1), json.loads(line2)] self.assert_record(content, parser, expected) class TestFastTextParser(TestParser): def test_read(self): content = "__label__sauce __label__cheese Text" parser = parsers.FastTextParser() expected = [{"text": "Text", "label": ["sauce", "cheese"]}] self.assert_record(content, parser, expected) class TestCoNLLParser(TestParser): def test_can_read(self): content = """EU\tB-ORG rejects\tO German\tB-MISC call\tO to\tO boycott\tO British\tB-MISC lamb\tO .\tO Peter\tB-PER Blackburn\tI-PER """ parser = parsers.CoNLLParser() expected = [ { "text": "EU rejects German call to boycott British lamb .", "label": [(0, 2, "ORG"), (11, 17, "MISC"), (34, 41, "MISC")], }, {"text": "Peter Blackburn", "label": [(0, 15, "PER")]}, ] self.assert_record(content, parser, expected)