Browse Source

Merge pull request #861 from cgill95/feature/import_unlabeled_dataset

Feature/import unlabeled dataset
pull/937/head
Hiroki Nakayama 4 years ago
committed by GitHub
parent
commit
84b794aadb
No known key found for this signature in database. GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 19 additions and 15 deletions
  1. 9
      app/api/tests/data/example.csv
  2. 0
      app/api/tests/data/example_column_and_row_not_matching.csv
  3. 0
      app/api/tests/data/example_column_and_row_not_matching.xlsx
  4. 16
      app/api/tests/test_api.py
  5. 9
      app/api/utils.py

9
app/api/tests/data/example.csv

@ -1,4 +1,5 @@
text,label
AAA,Positive
BBB,Positive
CCC,Negative
text,label,meta
AAA
BBB,Positive,The following is meta data
CCC,Negative
DDD,,This is meta data

app/api/tests/data/example.invalid.1.csv → app/api/tests/data/example_column_and_row_not_matching.csv

app/api/tests/data/example.invalid.1.xlsx → app/api/tests/data/example_column_and_row_not_matching.xlsx

16
app/api/tests/test_api.py

@ -1212,11 +1212,11 @@ class TestUploader(APITestCase):
file_format='csv',
expected_status=status.HTTP_201_CREATED)
def test_cannot_upload_csv_file_does_not_match_column_and_row(self):
def test_can_upload_csv_file_does_not_match_column_and_row(self):
self.upload_test_helper(project_id=self.classification_project.id,
filename='example.invalid.1.csv',
filename='example_column_and_row_not_matching.csv',
file_format='csv',
expected_status=status.HTTP_400_BAD_REQUEST)
expected_status=status.HTTP_201_CREATED)
def test_cannot_upload_csv_file_has_too_many_columns(self):
self.upload_test_helper(project_id=self.classification_project.id,
@ -1242,11 +1242,11 @@ class TestUploader(APITestCase):
file_format='excel',
expected_status=status.HTTP_201_CREATED)
def test_cannot_upload_excel_file_does_not_match_column_and_row(self):
def test_can_upload_excel_file_does_not_match_column_and_row(self):
self.upload_test_helper(project_id=self.classification_project.id,
filename='example.invalid.1.xlsx',
filename='example_column_and_row_not_matching.xlsx',
file_format='excel',
expected_status=status.HTTP_400_BAD_REQUEST)
expected_status=status.HTTP_201_CREATED)
def test_cannot_upload_excel_file_has_too_many_columns(self):
self.upload_test_helper(project_id=self.classification_project.id,
@ -1419,10 +1419,10 @@ class TestParser(APITestCase):
parser=CoNLLParser())
def test_give_classification_data_to_csv_parser(self):
self.parser_helper(filename='example.csv', parser=CSVParser())
self.parser_helper(filename='example.csv', parser=CSVParser(), include_label=False)
def test_give_seq2seq_data_to_csv_parser(self):
self.parser_helper(filename='example.csv', parser=CSVParser())
self.parser_helper(filename='example.csv', parser=CSVParser(), include_label=False)
def test_give_classification_data_to_json_parser(self):
self.parser_helper(filename='classification.jsonl', parser=JSONParser())

9
app/api/utils.py

@ -385,14 +385,17 @@ class ExcelParser(FileParser):
yield data
data = []
# Only text column
if len(row) == len(columns) and len(row) == 1:
if len(row) <= len(columns) and len(row) == 1:
data.append({'text': row[0]})
# Text, labels and metadata columns
elif len(row) == len(columns) and len(row) >= 2:
elif 2 <= len(row) <= len(columns):
datum = dict(zip(columns, row))
text, label = datum.pop('text'), datum.pop('label')
meta = FileParser.encode_metadata(datum)
j = {'text': text, 'labels': [label], 'meta': meta}
if label != '':
j = {'text': text, 'labels': [label], 'meta': meta}
else:
j = {'text': text, 'meta': meta}
data.append(j)
else:
raise FileParseException(line_num=i, line=row)

Loading…
Cancel
Save