mirror of https://github.com/doccano/doccano.git
python, datasets, active-learning, text-annotation, dataset, natural-language-processing, data-labeling, machine-learning, annotation-tool
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
140 lines
3.8 KiB
140 lines
3.8 KiB
import json
|
|
import os
|
|
import shutil
|
|
import tempfile
|
|
import unittest
|
|
|
|
from data_import.pipeline import parsers
|
|
|
|
|
|
class TestParser(unittest.TestCase):
    """Shared fixture for the parser tests.

    Provides a scratch file inside a per-test temporary directory and a
    helper that feeds that file to a parser and checks the records it yields.
    """

    def setUp(self):
        # A fresh directory per test keeps the tests independent of each other.
        self.test_dir = tempfile.mkdtemp()
        self.test_file = os.path.join(self.test_dir, 'test_file.csv')

    def tearDown(self):
        # Remove the directory together with the scratch file written into it.
        shutil.rmtree(self.test_dir)

    def create_file(self, content):
        """Write ``content`` to the scratch file, replacing earlier content.

        The file is written as UTF-8 explicitly: the tests in this module
        use non-ASCII text, and the platform default encoding would make the
        bytes on disk (or even the success of the write) locale-dependent.
        NOTE(review): assumes the parsers decode UTF-8 or auto-detect the
        encoding -- confirm against ``data_import.pipeline.parsers``.
        """
        with open(self.test_file, 'w', encoding='utf-8') as f:
            f.write(content)

    def assert_record(self, content, parser, expected):
        """Assert that ``parser`` yields exactly the records in ``expected``.

        Args:
            content: Text written to the scratch file before parsing.
            parser: Object whose ``parse(filename)`` returns an iterator of
                records.
            expected: Records the iterator must produce, in order.
        """
        self.create_file(content)
        it = parser.parse(self.test_file)
        for expect in expected:
            row = next(it)
            self.assertEqual(row, expect)
        # The iterator must be exhausted: surplus records are a failure too.
        with self.assertRaises(StopIteration):
            next(it)
|
|
|
|
|
|
class TestPlainParser(TestParser):
    """Checks the records produced by ``parsers.PlainParser``."""

    def test_read(self):
        # For the content 'example' the test expects one empty record.
        self.assert_record(
            content='example',
            parser=parsers.PlainParser(),
            expected=[{}],
        )
|
|
|
|
|
|
class TestLineParser(TestParser):
    """Checks the records produced by ``parsers.LineParser``."""

    def test_read(self):
        # Two lines of input are expected to come back as two 'text' records.
        first_line, second_line = 'Hello, World!', 'こんにちは'
        self.assert_record(
            content=first_line + '\n' + second_line,
            parser=parsers.LineParser(),
            expected=[{'text': first_line}, {'text': second_line}],
        )
|
|
|
|
|
|
class TestTextFileParser(TestParser):
    """Checks the records produced by ``parsers.TextFileParser``."""

    def test_read(self):
        # The whole file, newline included, is expected as one 'text' record.
        whole_text = 'Hello, World!\nこんにちは'
        self.assert_record(
            content=whole_text,
            parser=parsers.TextFileParser(),
            expected=[{'text': whole_text}],
        )
|
|
|
|
|
|
class TestCsvParser(TestParser):
    """Checks the records produced by ``parsers.CSVParser``."""

    def test_read(self):
        # Header row followed by one comma-separated data row.
        comma_parser = parsers.CSVParser(delimiter=',')
        self.assert_record(
            'label,text\nLabel,Text',
            comma_parser,
            [dict(label='Label', text='Text')],
        )

    def test_can_change_delimiter(self):
        # Same data as test_read, but tab-separated.
        tab_parser = parsers.CSVParser(delimiter='\t')
        self.assert_record(
            'label\ttext\nLabel\tText',
            tab_parser,
            [dict(label='Label', text='Text')],
        )

    def test_can_read_null_value(self):
        # The data row omits the 'label' column; None is expected in its place.
        comma_parser = parsers.CSVParser(delimiter=',')
        self.assert_record(
            'text,label\nText',
            comma_parser,
            [dict(text='Text', label=None)],
        )
|
|
|
|
|
|
class TestJSONParser(TestParser):
    """Checks the records produced by ``parsers.JSONParser``."""

    def test_read(self):
        # The file holds one JSON array; each element is an expected record.
        records = [
            {'text': 'line1', 'labels': 'Label1'},
            {'text': 'line2', 'labels': 'Label2'},
        ]
        serialized = json.dumps(records)
        self.assert_record(
            content=serialized,
            parser=parsers.JSONParser(),
            expected=json.loads(serialized),
        )
|
|
|
|
|
|
class TestJSONLParser(TestParser):
    """Checks the records produced by ``parsers.JSONLParser``."""

    def test_read(self):
        # One JSON object per line; each line is an expected record.
        json_lines = [
            json.dumps({'text': 'line1', 'labels': 'Label1'}),
            json.dumps({'text': 'line2', 'labels': 'Label2'}),
        ]
        self.assert_record(
            content='\n'.join(json_lines),
            parser=parsers.JSONLParser(),
            expected=[json.loads(json_line) for json_line in json_lines],
        )
|
|
|
|
|
|
class TestFastTextParser(TestParser):
    """Checks the records produced by ``parsers.FastTextParser``."""

    def test_read(self):
        # Two '__label__' tokens followed by the text are expected to become
        # a label list plus the remaining text.
        self.assert_record(
            content='__label__sauce __label__cheese Text',
            parser=parsers.FastTextParser(),
            expected=[{'text': 'Text', 'label': ['sauce', 'cheese']}],
        )
|
|
|
|
|
|
class TestCoNLLParser(TestParser):
    """Checks the records produced by ``parsers.CoNLLParser``."""

    def test_can_read(self):
        # Two sentences of tab-separated token/tag lines, each sentence
        # terminated by an empty line.
        first_sentence = (
            'EU\tB-ORG\n'
            'rejects\tO\n'
            'German\tB-MISC\n'
            'call\tO\n'
            'to\tO\n'
            'boycott\tO\n'
            'British\tB-MISC\n'
            'lamb\tO\n'
            '.\tO\n'
            '\n'
        )
        second_sentence = (
            'Peter\tB-PER\n'
            'Blackburn\tI-PER\n'
            '\n'
        )
        # Expected labels are (start, end, tag) tuples over the joined text.
        first_record = {
            'text': 'EU rejects German call to boycott British lamb .',
            'label': [(0, 2, 'ORG'), (11, 17, 'MISC'), (34, 41, 'MISC')],
        }
        second_record = {
            'text': 'Peter Blackburn',
            'label': [(0, 15, 'PER')],
        }
        self.assert_record(
            content=first_sentence + second_sentence,
            parser=parsers.CoNLLParser(),
            expected=[first_record, second_record],
        )
|