From 9ad5818939ab8bdcaa69812aae0ce57c3ed7b332 Mon Sep 17 00:00:00 2001 From: Dhiraj Suvarna Date: Mon, 28 Mar 2022 20:09:18 +0530 Subject: [PATCH 01/15] added encoding format while opening of file --- backend/data_export/pipeline/writers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/data_export/pipeline/writers.py b/backend/data_export/pipeline/writers.py index aacf8904..61441d1a 100644 --- a/backend/data_export/pipeline/writers.py +++ b/backend/data_export/pipeline/writers.py @@ -35,7 +35,7 @@ class LineWriter(BaseWriter): for record in records: filename = os.path.join(self.tmpdir, f"{record.user}.{self.extension}") if filename not in files: - f = open(filename, mode="a") + f = open(filename, mode="a", encoding="utf-8") files[filename] = f f = files[filename] line = self.create_line(record) From 70bc6d0de4723d1044e6d96dee8ff7808127d1c4 Mon Sep 17 00:00:00 2001 From: Pisanu Federico Date: Sun, 3 Apr 2022 12:09:18 +0200 Subject: [PATCH 02/15] iss1765: added confirmed_by field to ExampleStateSerializer --- backend/examples/serializers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/examples/serializers.py b/backend/examples/serializers.py index 3a50c08b..22c51bf6 100644 --- a/backend/examples/serializers.py +++ b/backend/examples/serializers.py @@ -43,5 +43,5 @@ class ExampleSerializer(serializers.ModelSerializer): class ExampleStateSerializer(serializers.ModelSerializer): class Meta: model = ExampleState - fields = ("id", "example", "confirmed_by") - read_only_fields = ("id", "example", "confirmed_by") + fields = ("id", "example", "confirmed_by", "confirmed_at") + read_only_fields = ("id", "example", "confirmed_by", "confirmed_at") From 81dcf4197d38f47739c5068f9618311fa15bb954 Mon Sep 17 00:00:00 2001 From: Alexander Kurakin Date: Thu, 31 Mar 2022 19:28:26 +0300 Subject: [PATCH 03/15] Set nounset on Bash scripts (fixes #860) --- tools/azure.sh | 2 ++ tools/create-admin.sh | 5 +++-- tools/create-package.sh | 4 ++++ tools/dev-celery.sh | 2 ++ tools/dev-django.sh | 9 ++++++++- tools/dev-nuxt.sh | 1 + tools/heroku.sh | 6 ++++++ tools/install-mssql.sh | 3 +++ tools/prod-celery.sh | 2 ++ tools/prod-django.sh | 8 +++++++- tools/run.sh | 9 +++++++++ 11 files changed, 47 insertions(+), 4 deletions(-) diff --git a/tools/azure.sh b/tools/azure.sh index d08a19e9..b661b590 100755 --- a/tools/azure.sh +++ b/tools/azure.sh @@ -11,6 +11,8 @@ if [[ -z "${DOCCANO_ADMIN_CONTACT_EMAIL}" ]]; then echo "Missing DOCCANO_ADMIN_C if [[ -z "${DOCCANO_ADMIN_PASSWORD}" ]]; then echo "Missing DOCCANO_ADMIN_PASSWORD environment variable" >&2; exit 1; fi if ! az account show >/dev/null; then echo "Must be logged into Azure" >&2; exit 2; fi +set -o nounset + az group create \ --location "${DOCCANO_LOCATION}" \ --name "${DOCCANO_RESOURCE_GROUP}" diff --git a/tools/create-admin.sh b/tools/create-admin.sh index d17e4ddd..72d76622 100755 --- a/tools/create-admin.sh +++ b/tools/create-admin.sh @@ -1,8 +1,9 @@ #!/usr/bin/env bash -if [[ "$#" -ne 3 ]]; then echo "Usage: $0 " >&2; exit 1; fi - set -o errexit +set -o nounset + +if [[ "$#" -ne 3 ]]; then echo "Usage: $0 " >&2; exit 1; fi python app/manage.py wait_for_db python app/manage.py migrate diff --git a/tools/create-package.sh b/tools/create-package.sh index e9896c8a..99b5e96f 100755 --- a/tools/create-package.sh +++ b/tools/create-package.sh @@ -1,4 +1,8 @@ #!/usr/bin/env bash + +set -o errexit +set -o nounset + mkdir -p backend/client cd frontend diff --git a/tools/dev-celery.sh b/tools/dev-celery.sh index ec4e61e5..0c60a90c 100755 --- a/tools/dev-celery.sh +++ b/tools/dev-celery.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash + set -o errexit +set -o nounset cd "/src/backend" diff --git a/tools/dev-django.sh b/tools/dev-django.sh index de8d16c2..bbb5b9ea 100755 --- a/tools/dev-django.sh +++ b/tools/dev-django.sh @@ -1,6 +1,13 @@ #!/usr/bin/env bash + set -o errexit +if [[ -z "${ADMIN_USERNAME}" ]]; then echo "Missing ADMIN_USERNAME environment variable" >&2; exit 1; fi +if [[ -z "${ADMIN_PASSWORD}" ]]; then echo "Missing ADMIN_PASSWORD environment variable" >&2; exit 1; fi +if [[ -z "${ADMIN_EMAIL}" ]]; then echo "Missing ADMIN_EMAIL environment variable" >&2; exit 1; fi + +set -o nounset + app="/src/backend" echo "Initializing database" @@ -18,4 +25,4 @@ if [[ -n "${ADMIN_USERNAME}" ]] && [[ -n "${ADMIN_PASSWORD}" ]] && [[ -n "${ADMI fi echo "Starting django" -python -u "${app}/manage.py" runserver 0.0.0.0:8000 +python -u "${app}/manage.py" runserver ${HOST:-0.0.0.0}:${PORT:-8000} diff --git a/tools/dev-nuxt.sh b/tools/dev-nuxt.sh index 772bb16b..36c38d62 100755 --- a/tools/dev-nuxt.sh +++ b/tools/dev-nuxt.sh @@ -1,6 +1,7 @@ #!/usr/bin/env bash set -o errexit +set -o nounset root="$(dirname "$0")/.." app="${root}/frontend" diff --git a/tools/heroku.sh b/tools/heroku.sh index 07443663..f6252026 100644 --- a/tools/heroku.sh +++ b/tools/heroku.sh @@ -2,6 +2,12 @@ set -o errexit +if [[ -z "${ADMIN_USER_NAME}" ]]; then echo "Missing ADMIN_USER_NAME environment variable" >&2; exit 1; fi +if [[ -z "${ADMIN_PASSWORD}" ]]; then echo "Missing ADMIN_PASSWORD environment variable" >&2; exit 1; fi +if [[ -z "${ADMIN_CONTACT_EMAIL}" ]]; then echo "Missing ADMIN_CONTACT_EMAIL environment variable" >&2; exit 1; fi + +set -o nounset + python /doccano/backend/manage.py migrate if [ -n "$ADMIN_USER_NAME" ]; then python /doccano/backend/manage.py create_admin --noinput --username="$ADMIN_USER_NAME" --email="$ADMIN_CONTACT_EMAIL" --password="$ADMIN_PASSWORD" diff --git a/tools/install-mssql.sh b/tools/install-mssql.sh index 358ad44b..c7b9d794 100755 --- a/tools/install-mssql.sh +++ b/tools/install-mssql.sh @@ -1,5 +1,8 @@ #!/usr/bin/env bash +set -o errexit +set -o nounset + # parse arguments mode="prod" for opt in "$@"; do diff --git a/tools/prod-celery.sh b/tools/prod-celery.sh index 73c4971f..0ffba164 100755 --- a/tools/prod-celery.sh +++ b/tools/prod-celery.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash + set -o errexit +set -o nounset cd /backend diff --git a/tools/prod-django.sh b/tools/prod-django.sh index 7c38b719..976075ea 100755 --- a/tools/prod-django.sh +++ b/tools/prod-django.sh @@ -2,6 +2,12 @@ set -o errexit +if [[ -z "${ADMIN_USERNAME}" ]]; then echo "Missing ADMIN_USERNAME environment variable" >&2; exit 1; fi +if [[ -z "${ADMIN_PASSWORD}" ]]; then echo "Missing ADMIN_PASSWORD environment variable" >&2; exit 1; fi +if [[ -z "${ADMIN_EMAIL}" ]]; then echo "Missing ADMIN_EMAIL environment variable" >&2; exit 1; fi + +set -o nounset + echo "Making staticfiles" static_dir=staticfiles if [[ ! -d $static_dir ]] || [[ -z $(ls -A $static_dir) ]]; then @@ -25,4 +31,4 @@ if [[ -n "${ADMIN_USERNAME}" ]] && [[ -n "${ADMIN_PASSWORD}" ]] && [[ -n "${ADMI fi echo "Starting django" -gunicorn --bind="0.0.0.0:${PORT:-8000}" --workers="${WORKERS:-4}" config.wsgi --timeout 300 +gunicorn --bind="${HOST:-0.0.0.0}:${PORT:-8000}" --workers="${WORKERS:-4}" config.wsgi --timeout 300 diff --git a/tools/run.sh b/tools/run.sh index 84eb4cbe..5a008a02 100755 --- a/tools/run.sh +++ b/tools/run.sh @@ -2,6 +2,15 @@ set -o errexit +if [[ -z "${ADMIN_USERNAME}" ]]; then echo "Missing ADMIN_USERNAME environment variable" >&2; exit 1; fi +if [[ -z "${ADMIN_PASSWORD}" ]]; then echo "Missing ADMIN_PASSWORD environment variable" >&2; exit 1; fi +if [[ -z "${ADMIN_EMAIL}" ]]; then echo "Missing ADMIN_EMAIL environment variable" >&2; exit 1; fi +if [[ -z "${PORT}" ]]; then echo "Missing PORT environment variable" >&2; exit 1; fi +if [[ -z "${WORKERS}" ]]; then echo "Missing WORKERS environment variable" >&2; exit 1; fi +if [[ -z "${CELERY_WORKERS}" ]]; then echo "Missing CELERY_WORKERS environment variable" >&2; exit 1; fi + +set -o nounset + echo "Making staticfiles" static_dir=staticfiles mkdir -p client/dist/static From 306dd51e7a884d4191ddf8fe940317dfac183863 Mon Sep 17 00:00:00 2001 From: mkmark Date: Wed, 6 Apr 2022 03:10:52 +0800 Subject: [PATCH 04/15] fix empty export in entity-relationship-labeling --- backend/data_export/pipeline/repositories.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/backend/data_export/pipeline/repositories.py b/backend/data_export/pipeline/repositories.py index a45fc5b5..2d7f8db7 100644 --- a/backend/data_export/pipeline/repositories.py +++ b/backend/data_export/pipeline/repositories.py @@ -80,7 +80,11 @@ class TextRepository(BaseRepository): for doc in docs: label_per_user = self.label_per_user(doc) if self.project.collaborative_annotation: - label_per_user = self.reduce_user(label_per_user) + if getattr(self.project, "use_relation", False): + value_type = "dict" + else: + value_type = "list" + label_per_user = self.reduce_user(label_per_user, value_type) for user, label in label_per_user.items(): yield Record(data_id=doc.id, data=doc.text, label=label, user=user, metadata=doc.meta) # todo: @@ -96,9 +100,17 @@ class TextRepository(BaseRepository): def label_per_user(self, doc) -> Dict: raise NotImplementedError() - def reduce_user(self, label_per_user: Dict[str, List]): - value = list(itertools.chain(*label_per_user.values())) - return {"all": value} + def reduce_user(self, label_per_user: Dict, value_type): + if value_type == "list": + value_list = list(itertools.chain(*label_per_user)) + return {"all": value_list} + if value_type == "dict": + value_dict = dict( + (label_type, label_per_user[user][label_type]) + for user in label_per_user + for label_type in label_per_user[user] + ) + return {"all": value_dict} class TextClassificationRepository(TextRepository): From fdce902f9947cc7b5c94b88c21857676a0db5ba6 Mon Sep 17 00:00:00 2001 From: mkmark Date: Wed, 6 Apr 2022 03:40:37 +0800 Subject: [PATCH 05/15] show total progress if collaborative_annotation --- backend/metrics/views.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/backend/metrics/views.py b/backend/metrics/views.py index ac39fd1a..bca82de1 100644 --- a/backend/metrics/views.py +++ b/backend/metrics/views.py @@ -1,5 +1,6 @@ import abc +from django.shortcuts import get_object_or_404 from rest_framework import status from rest_framework.permissions import IsAuthenticated from rest_framework.response import Response @@ -8,7 +9,7 @@ from rest_framework.views import APIView from examples.models import Example, ExampleState from label_types.models import CategoryType, LabelType, RelationType, SpanType from labels.models import Category, Label, Relation, Span -from projects.models import Member +from projects.models import Member, Project from projects.permissions import IsProjectAdmin, IsProjectStaffAndReadOnly @@ -18,7 +19,11 @@ class ProgressAPI(APIView): def get(self, request, *args, **kwargs): examples = Example.objects.filter(project=self.kwargs["project_id"]).values("id") total = examples.count() - complete = ExampleState.objects.count_done(examples, user=self.request.user) + project = get_object_or_404(Project, pk=self.kwargs["project_id"]) + if project.collaborative_annotation: + complete = ExampleState.objects.count_done(examples) + else: + complete = ExampleState.objects.count_done(examples, user=self.request.user) data = {"total": total, "remaining": total - complete, "complete": complete} return Response(data=data, status=status.HTTP_200_OK) From a361f17d845f56f7e8bf1655b6524374b1983c36 Mon Sep 17 00:00:00 2001 From: Hironsan Date: Thu, 7 Apr 2022 09:40:14 +0900 Subject: [PATCH 06/15] Update factory for export repository --- backend/data_export/celery_tasks.py | 2 +- backend/data_export/pipeline/factories.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/data_export/celery_tasks.py b/backend/data_export/celery_tasks.py index 167bf93b..63a40283 100644 --- a/backend/data_export/celery_tasks.py +++ b/backend/data_export/celery_tasks.py @@ -13,7 +13,7 @@ logger = get_task_logger(__name__) @shared_task def export_dataset(project_id, file_format: str, export_approved=False): project = get_object_or_404(Project, pk=project_id) - repository = create_repository(project) + repository = create_repository(project, file_format) writer = create_writer(file_format)(settings.MEDIA_ROOT) service = ExportApplicationService(repository, writer) filepath = service.export(export_approved) diff --git a/backend/data_export/pipeline/factories.py b/backend/data_export/pipeline/factories.py index 44a38235..9096a442 100644 --- a/backend/data_export/pipeline/factories.py +++ b/backend/data_export/pipeline/factories.py @@ -11,8 +11,8 @@ from projects.models import ( ) -def create_repository(project): - if getattr(project, "use_relation", False): +def create_repository(project, file_format: str): + if getattr(project, "use_relation", False) and file_format == catalog.JSONLRelation.name: return repositories.RelationExtractionRepository(project) mapping = { DOCUMENT_CLASSIFICATION: repositories.TextClassificationRepository, From 5d39a7004e47ce1dd3ebde1c4e5ff66f32e50e23 Mon Sep 17 00:00:00 2001 From: Hironsan Date: Thu, 7 Apr 2022 09:41:30 +0900 Subject: [PATCH 07/15] Add reduce_user to RelationExtractionRepository --- backend/data_export/pipeline/repositories.py | 32 ++- .../data_export/tests/test_repositories.py | 204 +++++++++++++----- 2 files changed, 169 insertions(+), 67 deletions(-) diff --git a/backend/data_export/pipeline/repositories.py b/backend/data_export/pipeline/repositories.py index 2d7f8db7..e50eafe3 100644 --- a/backend/data_export/pipeline/repositories.py +++ b/backend/data_export/pipeline/repositories.py @@ -1,7 +1,7 @@ import abc import itertools from collections import defaultdict -from typing import Dict, Iterator, List, Tuple, Union +from typing import Any, Dict, Iterator, List, Tuple, Union from .data import Record from examples.models import Example @@ -54,7 +54,7 @@ class FileRepository(BaseRepository): label_per_user[a.user.username].append(a.label.text) return label_per_user - def reduce_user(self, label_per_user: Dict[str, List]): + def reduce_user(self, label_per_user: Dict[str, Any]): value = list(itertools.chain(*label_per_user.values())) return {"all": value} @@ -80,11 +80,7 @@ class TextRepository(BaseRepository): for doc in docs: label_per_user = self.label_per_user(doc) if self.project.collaborative_annotation: - if getattr(self.project, "use_relation", False): - value_type = "dict" - else: - value_type = "list" - label_per_user = self.reduce_user(label_per_user, value_type) + label_per_user = self.reduce_user(label_per_user) for user, label in label_per_user.items(): yield Record(data_id=doc.id, data=doc.text, label=label, user=user, metadata=doc.meta) # todo: @@ -100,17 +96,9 @@ class TextRepository(BaseRepository): def label_per_user(self, doc) -> Dict: raise NotImplementedError() - def reduce_user(self, label_per_user: Dict, value_type): - if value_type == "list": - value_list = list(itertools.chain(*label_per_user)) - return {"all": value_list} - if value_type == "dict": - value_dict = dict( - (label_type, label_per_user[user][label_type]) - for user in label_per_user - for label_type in label_per_user[user] - ) - return {"all": value_dict} + def reduce_user(self, label_per_user: Dict[str, Any]): + value = list(itertools.chain(*label_per_user.values())) + return {"all": value} class TextClassificationRepository(TextRepository): @@ -173,6 +161,14 @@ class RelationExtractionRepository(TextRepository): label_per_user[user]["entities"] = span return label_per_user + def reduce_user(self, label_per_user: Dict[str, Any]): + entities = [] + relations = [] + for user, label in label_per_user.items(): + entities.extend(label.get("entities", [])) + relations.extend(label.get("relations", [])) + return {"all": {"entities": entities, "relations": relations}} + class Seq2seqRepository(TextRepository): @property diff --git a/backend/data_export/tests/test_repositories.py b/backend/data_export/tests/test_repositories.py index d7654ecc..0ff25287 100644 --- a/backend/data_export/tests/test_repositories.py +++ b/backend/data_export/tests/test_repositories.py @@ -5,70 +5,176 @@ from model_mommy import mommy from ..pipeline.repositories import ( IntentDetectionSlotFillingRepository, RelationExtractionRepository, + SequenceLabelingRepository, ) from projects.models import INTENT_DETECTION_AND_SLOT_FILLING, SEQUENCE_LABELING from projects.tests.utils import prepare_project -class TestCSVWriter(unittest.TestCase): - def setUp(self): - self.project = prepare_project(INTENT_DETECTION_AND_SLOT_FILLING) +class TestRepository(unittest.TestCase): + def assert_records(self, repository, expected): + records = list(repository.list()) + self.assertEqual(len(records), len(expected)) + for record, expect in zip(records, expected): + self.assertEqual(record.data, expect["data"]) + self.assertEqual(record.label, expect["label"]) + self.assertEqual(record.user, expect["user"]) + + +class TestIntentDetectionSlotFillingRepository(TestRepository): + def prepare_data(self, project): + self.example = mommy.make("Example", project=project.item, text="example") + self.category1 = mommy.make("Category", example=self.example, user=project.admin) + self.category2 = mommy.make("Category", example=self.example, user=project.annotator) + self.span = mommy.make("Span", example=self.example, user=project.admin, start_offset=0, end_offset=1) def test_list(self): - example = mommy.make("Example", project=self.project.item, text="example") - category = mommy.make("Category", example=example, user=self.project.admin) - span = mommy.make("Span", example=example, user=self.project.admin, start_offset=0, end_offset=1) - repository = IntentDetectionSlotFillingRepository(self.project.item) + project = prepare_project(INTENT_DETECTION_AND_SLOT_FILLING) + repository = IntentDetectionSlotFillingRepository(project.item) + self.prepare_data(project) expected = [ { - "data": example.text, + "data": self.example.text, + "label": { + "cats": [self.category1.label.text], + "entities": [(self.span.start_offset, self.span.end_offset, self.span.label.text)], + }, + "user": project.admin.username, + }, + { + "data": self.example.text, + "label": { + "cats": [self.category2.label.text], + "entities": [], + }, + "user": project.annotator.username, + }, + ] + self.assert_records(repository, expected) + + def test_list_on_collaborative_annotation(self): + project = prepare_project(INTENT_DETECTION_AND_SLOT_FILLING, collaborative_annotation=True) + repository = IntentDetectionSlotFillingRepository(project.item) + self.prepare_data(project) + expected = [ + { + "data": self.example.text, "label": { - "cats": [category.label.text], - "entities": [(span.start_offset, span.end_offset, span.label.text)], + "cats": [self.category1.label.text, self.category2.label.text], + "entities": [(self.span.start_offset, self.span.end_offset, self.span.label.text)], }, + "user": "all", } ] - records = list(repository.list()) - self.assertEqual(len(records), len(expected)) - for record, expect in zip(records, expected): - self.assertEqual(record.data, expect["data"]) - self.assertEqual(record.label["cats"], expect["label"]["cats"]) - self.assertEqual(record.label["entities"], expect["label"]["entities"]) + self.assert_records(repository, expected) -class TestRelationExtractionRepository(unittest.TestCase): - def setUp(self): - self.project = prepare_project(SEQUENCE_LABELING, use_relation=True) +class TestSequenceLabelingRepository(TestRepository): + def prepare_data(self, project): + self.example = mommy.make("Example", project=project.item, text="example") + self.span1 = mommy.make("Span", example=self.example, user=project.admin, start_offset=0, end_offset=1) + self.span2 = mommy.make("Span", example=self.example, user=project.annotator, start_offset=1, end_offset=2) - def test_label_per_user(self): - from_entity = mommy.make("Span", start_offset=0, end_offset=1, user=self.project.admin) - to_entity = mommy.make( - "Span", start_offset=1, end_offset=2, example=from_entity.example, user=self.project.admin - ) - relation = mommy.make( - "Relation", from_id=from_entity, to_id=to_entity, example=from_entity.example, user=self.project.admin - ) - repository = RelationExtractionRepository(self.project.item) - expected = { - "admin": { - "entities": [ - { - "id": from_entity.id, - "start_offset": from_entity.start_offset, - "end_offset": from_entity.end_offset, - "label": from_entity.label.text, - }, - { - "id": to_entity.id, - "start_offset": to_entity.start_offset, - "end_offset": to_entity.end_offset, - "label": to_entity.label.text, - }, - ], - "relations": [ - {"id": relation.id, "from_id": from_entity.id, "to_id": to_entity.id, "type": relation.type.text} + def test_list(self): + project = prepare_project(SEQUENCE_LABELING) + repository = SequenceLabelingRepository(project) + self.prepare_data(project) + expected = [ + { + "data": self.example.text, + "label": [(self.span1.start_offset, self.span1.end_offset, self.span1.label.text)], + "user": project.admin.username, + }, + { + "data": self.example.text, + "label": [(self.span2.start_offset, self.span2.end_offset, self.span2.label.text)], + "user": project.annotator.username, + }, + ] + self.assert_records(repository, expected) + + def test_list_on_collaborative_annotation(self): + project = prepare_project(SEQUENCE_LABELING, collaborative_annotation=True) + repository = SequenceLabelingRepository(project) + self.prepare_data(project) + expected = [ + { + "data": self.example.text, + "label": [ + (self.span1.start_offset, self.span1.end_offset, self.span1.label.text), + (self.span2.start_offset, self.span2.end_offset, self.span2.label.text), ], + "user": "all", + } + ] + self.assert_records(repository, expected) + + +class TestRelationExtractionRepository(TestRepository): + def test_list(self): + project = prepare_project(SEQUENCE_LABELING, use_relation=True) + example = mommy.make("Example", project=project.item, text="example") + span1 = mommy.make("Span", example=example, user=project.admin, start_offset=0, end_offset=1) + span2 = mommy.make("Span", example=example, user=project.admin, start_offset=1, end_offset=2) + relation = mommy.make("Relation", from_id=span1, to_id=span2, example=example, user=project.admin) + repository = RelationExtractionRepository(project.item) + expected = [ + { + "data": example.text, + "label": { + "entities": [ + { + "id": span1.id, + "start_offset": span1.start_offset, + "end_offset": span1.end_offset, + "label": span1.label.text, + }, + { + "id": span2.id, + "start_offset": span2.start_offset, + "end_offset": span2.end_offset, + "label": span2.label.text, + }, + ], + "relations": [ + {"id": relation.id, "from_id": span1.id, "to_id": span2.id, "type": relation.type.text} + ], + }, + "user": project.admin.username, + } + ] + self.assert_records(repository, expected) + + def test_list_on_collaborative_annotation(self): + project = prepare_project(SEQUENCE_LABELING, collaborative_annotation=True, use_relation=True) + example = mommy.make("Example", project=project.item, text="example") + span1 = mommy.make("Span", example=example, user=project.admin, start_offset=0, end_offset=1) + span2 = mommy.make("Span", example=example, user=project.annotator, start_offset=1, end_offset=2) + relation = mommy.make("Relation", from_id=span1, to_id=span2, example=example, user=project.admin) + repository = RelationExtractionRepository(project.item) + expected = [ + { + "data": example.text, + "label": { + "entities": [ + { + "id": span1.id, + "start_offset": span1.start_offset, + "end_offset": span1.end_offset, + "label": span1.label.text, + }, + { + "id": span2.id, + "start_offset": span2.start_offset, + "end_offset": span2.end_offset, + "label": span2.label.text, + }, + ], + "relations": [ + {"id": relation.id, "from_id": span1.id, "to_id": span2.id, "type": relation.type.text} + ], + }, + "user": "all", } - } - actual = repository.label_per_user(from_entity.example) - self.assertDictEqual(actual, expected) + ] + self.assert_records(repository, expected) From a57f38d9de635a83b90ad9a1530c5300f64a1129 Mon Sep 17 00:00:00 2001 From: Hironsan Date: Thu, 7 Apr 2022 09:45:26 +0900 Subject: [PATCH 08/15] Set default value to IntentDetectionSlotFillingRepository --- backend/data_export/pipeline/repositories.py | 3 +++ backend/data_export/tests/test_repositories.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/backend/data_export/pipeline/repositories.py b/backend/data_export/pipeline/repositories.py index e50eafe3..71420b7c 100644 --- a/backend/data_export/pipeline/repositories.py +++ b/backend/data_export/pipeline/repositories.py @@ -201,4 +201,7 @@ class IntentDetectionSlotFillingRepository(TextRepository): label_per_user[user]["cats"] = cats for user, span in span_per_user.items(): label_per_user[user]["entities"] = span + for label in label_per_user.values(): + label.setdefault("cats", []) + label.setdefault("entities", []) return label_per_user diff --git a/backend/data_export/tests/test_repositories.py b/backend/data_export/tests/test_repositories.py index 0ff25287..b0d29689 100644 --- a/backend/data_export/tests/test_repositories.py +++ b/backend/data_export/tests/test_repositories.py @@ -77,7 +77,7 @@ class TestSequenceLabelingRepository(TestRepository): def test_list(self): project = prepare_project(SEQUENCE_LABELING) - repository = SequenceLabelingRepository(project) + repository = SequenceLabelingRepository(project.item) self.prepare_data(project) expected = [ { @@ -95,7 +95,7 @@ class TestSequenceLabelingRepository(TestRepository): def test_list_on_collaborative_annotation(self): project = prepare_project(SEQUENCE_LABELING, collaborative_annotation=True) - repository = SequenceLabelingRepository(project) + repository = SequenceLabelingRepository(project.item) self.prepare_data(project) expected = [ { From 03198dfd90f6656809bcfa0f1db55e97cafbb518 Mon Sep 17 00:00:00 2001 From: Hironsan Date: Thu, 7 Apr 2022 09:46:52 +0900 Subject: [PATCH 09/15] Add reduce_user to IntentDetectionSlotFillingRepository --- backend/data_export/pipeline/repositories.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/backend/data_export/pipeline/repositories.py b/backend/data_export/pipeline/repositories.py index 71420b7c..1423e235 100644 --- a/backend/data_export/pipeline/repositories.py +++ b/backend/data_export/pipeline/repositories.py @@ -205,3 +205,11 @@ class IntentDetectionSlotFillingRepository(TextRepository): label.setdefault("cats", []) label.setdefault("entities", []) return label_per_user + + def reduce_user(self, label_per_user: Dict[str, Any]): + cats = [] + entities = [] + for user, label in label_per_user.items(): + cats.extend(label.get("cats", [])) + entities.extend(label.get("entities", [])) + return {"all": {"entities": entities, "cats": cats}} From bcc07baef5b1e1263f18719957f2ad1c8f601a77 Mon Sep 17 00:00:00 2001 From: Hironsan Date: Thu, 7 Apr 2022 09:53:26 +0900 Subject: [PATCH 10/15] Add test cases for TestTextClassificationRepository --- .../data_export/tests/test_repositories.py | 45 ++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/backend/data_export/tests/test_repositories.py b/backend/data_export/tests/test_repositories.py index b0d29689..9202b047 100644 --- a/backend/data_export/tests/test_repositories.py +++ b/backend/data_export/tests/test_repositories.py @@ -6,8 +6,13 @@ from ..pipeline.repositories import ( IntentDetectionSlotFillingRepository, RelationExtractionRepository, SequenceLabelingRepository, + TextClassificationRepository, +) +from projects.models import ( + DOCUMENT_CLASSIFICATION, + INTENT_DETECTION_AND_SLOT_FILLING, + SEQUENCE_LABELING, ) -from projects.models import INTENT_DETECTION_AND_SLOT_FILLING, SEQUENCE_LABELING from projects.tests.utils import prepare_project @@ -21,6 +26,44 @@ class TestRepository(unittest.TestCase): self.assertEqual(record.user, expect["user"]) +class TestTextClassificationRepository(TestRepository): + def prepare_data(self, project): + self.example = mommy.make("Example", project=project.item, text="example") + self.category1 = mommy.make("Category", example=self.example, user=project.admin) + self.category2 = mommy.make("Category", example=self.example, user=project.annotator) + + def test_list(self): + project = prepare_project(DOCUMENT_CLASSIFICATION) + repository = TextClassificationRepository(project.item) + self.prepare_data(project) + expected = [ + { + "data": self.example.text, + "label": [self.category1.label.text], + "user": project.admin.username, + }, + { + "data": self.example.text, + "label": [self.category2.label.text], + "user": project.annotator.username, + }, + ] + self.assert_records(repository, expected) + + def test_list_on_collaborative_annotation(self): + project = prepare_project(DOCUMENT_CLASSIFICATION, collaborative_annotation=True) + repository = TextClassificationRepository(project.item) + self.prepare_data(project) + expected = [ + { + "data": self.example.text, + "label": [self.category1.label.text, self.category2.label.text], + "user": "all", + } + ] + self.assert_records(repository, expected) + + class TestIntentDetectionSlotFillingRepository(TestRepository): def prepare_data(self, project): self.example = mommy.make("Example", project=project.item, text="example") From 63f8d747be82072d3260b3eacacb1a5aad5ad475 Mon Sep 17 00:00:00 2001 From: Hironsan Date: Thu, 7 Apr 2022 09:56:46 +0900 Subject: [PATCH 11/15] Add test cases for Seq2seqRepository --- .../data_export/tests/test_repositories.py | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/backend/data_export/tests/test_repositories.py b/backend/data_export/tests/test_repositories.py index 9202b047..a4780de4 100644 --- a/backend/data_export/tests/test_repositories.py +++ b/backend/data_export/tests/test_repositories.py @@ -5,12 +5,14 @@ from model_mommy import mommy from ..pipeline.repositories import ( IntentDetectionSlotFillingRepository, RelationExtractionRepository, + Seq2seqRepository, SequenceLabelingRepository, TextClassificationRepository, ) from projects.models import ( DOCUMENT_CLASSIFICATION, INTENT_DETECTION_AND_SLOT_FILLING, + SEQ2SEQ, SEQUENCE_LABELING, ) from projects.tests.utils import prepare_project @@ -64,6 +66,44 @@ class TestTextClassificationRepository(TestRepository): self.assert_records(repository, expected) +class TestSeq2seqRepository(TestRepository): + def prepare_data(self, project): + self.example = mommy.make("Example", project=project.item, text="example") + self.text1 = mommy.make("TextLabel", example=self.example, user=project.admin) + self.text2 = mommy.make("TextLabel", example=self.example, user=project.annotator) + + def test_list(self): + project = prepare_project(SEQ2SEQ) + repository = Seq2seqRepository(project.item) + self.prepare_data(project) + expected = [ + { + "data": self.example.text, + "label": [self.text1.text], + "user": project.admin.username, + }, + { + "data": self.example.text, + "label": [self.text2.text], + "user": project.annotator.username, + }, + ] + self.assert_records(repository, expected) + + def test_list_on_collaborative_annotation(self): + project = prepare_project(SEQ2SEQ, collaborative_annotation=True) + repository = Seq2seqRepository(project.item) + self.prepare_data(project) + expected = [ + { + "data": self.example.text, + "label": [self.text1.text, self.text2.text], + "user": "all", + } + ] + self.assert_records(repository, expected) + + class TestIntentDetectionSlotFillingRepository(TestRepository): def prepare_data(self, project): self.example = mommy.make("Example", project=project.item, text="example") From c637a2de1f33ac282e4f67c82e0aaafaeb11750b Mon Sep 17 00:00:00 2001 From: Hironsan Date: Thu, 7 Apr 2022 10:02:32 +0900 Subject: [PATCH 12/15] Add test cases for Speech2textRepository --- .../data_export/tests/test_repositories.py | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/backend/data_export/tests/test_repositories.py b/backend/data_export/tests/test_repositories.py index a4780de4..cec3b296 100644 --- a/backend/data_export/tests/test_repositories.py +++ b/backend/data_export/tests/test_repositories.py @@ -7,6 +7,7 @@ from ..pipeline.repositories import ( RelationExtractionRepository, Seq2seqRepository, SequenceLabelingRepository, + Speech2TextRepository, TextClassificationRepository, ) from projects.models import ( @@ -14,6 +15,7 @@ from projects.models import ( INTENT_DETECTION_AND_SLOT_FILLING, SEQ2SEQ, SEQUENCE_LABELING, + SPEECH2TEXT, ) from projects.tests.utils import prepare_project @@ -261,3 +263,41 @@ class TestRelationExtractionRepository(TestRepository): } ] self.assert_records(repository, expected) + + +class TestSpeech2TextRepository(TestRepository): + def prepare_data(self, project): + self.example = mommy.make("Example", project=project.item, text="example") + self.text1 = mommy.make("TextLabel", example=self.example, user=project.admin) + self.text2 = mommy.make("TextLabel", example=self.example, user=project.annotator) + + def test_list(self): + project = prepare_project(SPEECH2TEXT) + repository = Speech2TextRepository(project.item) + self.prepare_data(project) + expected = [ + { + "data": self.example.filename, + "label": [self.text1.text], + "user": project.admin.username, + }, + { + "data": self.example.filename, + "label": [self.text2.text], + "user": project.annotator.username, + }, + ] + self.assert_records(repository, expected) + + def test_list_on_collaborative_annotation(self): + project = prepare_project(SPEECH2TEXT, collaborative_annotation=True) + repository = Speech2TextRepository(project.item) + self.prepare_data(project) + expected = [ + { + "data": self.example.filename, + "label": [self.text1.text, self.text2.text], + "user": "all", + } + ] + self.assert_records(repository, expected) From d2dec899da7492146907a29ac4ad99729c632e45 Mon Sep 17 00:00:00 2001 From: Hironsan Date: Thu, 7 Apr 2022 10:05:57 +0900 Subject: [PATCH 13/15] Add test cases for FileRepository --- .../data_export/tests/test_repositories.py | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/backend/data_export/tests/test_repositories.py b/backend/data_export/tests/test_repositories.py index cec3b296..410089af 100644 --- a/backend/data_export/tests/test_repositories.py +++ b/backend/data_export/tests/test_repositories.py @@ -3,6 +3,7 @@ import unittest from model_mommy import mommy from ..pipeline.repositories import ( + FileRepository, IntentDetectionSlotFillingRepository, RelationExtractionRepository, Seq2seqRepository, @@ -12,6 +13,7 @@ from ..pipeline.repositories import ( ) from projects.models import ( DOCUMENT_CLASSIFICATION, + IMAGE_CLASSIFICATION, INTENT_DETECTION_AND_SLOT_FILLING, SEQ2SEQ, SEQUENCE_LABELING, @@ -301,3 +303,41 @@ class TestSpeech2TextRepository(TestRepository): } ] self.assert_records(repository, expected) + + +class TestFileRepository(TestRepository): + def prepare_data(self, project): + self.example = mommy.make("Example", project=project.item, text="example") + self.category1 = mommy.make("Category", example=self.example, user=project.admin) + self.category2 = mommy.make("Category", example=self.example, user=project.annotator) + + def test_list(self): + project = prepare_project(IMAGE_CLASSIFICATION) + repository = FileRepository(project.item) + self.prepare_data(project) + expected = [ + { + "data": self.example.filename, + "label": [self.category1.label.text], + "user": project.admin.username, + }, + { + "data": self.example.filename, + "label": [self.category2.label.text], + "user": project.annotator.username, + }, + ] + self.assert_records(repository, expected) + + def test_list_on_collaborative_annotation(self): + project = prepare_project(IMAGE_CLASSIFICATION, collaborative_annotation=True) + repository = FileRepository(project.item) + self.prepare_data(project) + expected = [ + { + "data": self.example.filename, + "label": [self.category1.label.text, self.category2.label.text], + "user": "all", + } + ] + self.assert_records(repository, expected) From 5ad4bb777c88c214ddc3eb796c2cde80ec54fcd8 Mon Sep 17 00:00:00 2001 From: Hironsan Date: Thu, 7 Apr 2022 10:59:58 +0900 Subject: [PATCH 14/15] Enable to pass mypy --- backend/data_export/pipeline/repositories.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/backend/data_export/pipeline/repositories.py b/backend/data_export/pipeline/repositories.py index 1423e235..9734b193 100644 --- a/backend/data_export/pipeline/repositories.py +++ b/backend/data_export/pipeline/repositories.py @@ -1,7 +1,7 @@ import abc import itertools from collections import defaultdict -from typing import Any, Dict, Iterator, List, Tuple, Union +from typing import Any, Dict, Iterator, List, Tuple from .data import Record from examples.models import Example @@ -10,13 +10,12 @@ from projects.models import Project SpanType = Tuple[int, int, str] -class BaseRepository(abc.ABC): +class BaseRepository: def __init__(self, project: Project): self.project = project - @abc.abstractmethod def list(self, export_approved=False) -> Iterator[Record]: - pass + raise NotImplementedError() class FileRepository(BaseRepository): @@ -192,7 +191,7 @@ class IntentDetectionSlotFillingRepository(TextRepository): def label_per_user(self, doc) -> Dict: category_per_user: Dict[str, List[str]] = defaultdict(list) span_per_user: Dict[str, List[SpanType]] = defaultdict(list) - label_per_user: Dict[str, Dict[str, Union[List[str], List[SpanType]]]] = defaultdict(dict) + label_per_user: Dict[str, Dict[str, List]] = defaultdict(dict) for a in doc.categories.all(): category_per_user[a.user.username].append(a.label.text) for a in doc.spans.all(): From bec56fe432fe3872b8b7d74cfa51f064794b5a8e Mon Sep 17 00:00:00 2001 From: Hironsan Date: Fri, 8 Apr 2022 11:07:22 +0900 Subject: [PATCH 15/15] Add test cases for Progress API --- backend/metrics/tests.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/backend/metrics/tests.py b/backend/metrics/tests.py index 82fdd166..eadad24b 100644 --- a/backend/metrics/tests.py +++ b/backend/metrics/tests.py @@ -28,6 +28,41 @@ class TestMemberProgress(CRUDMixin): self.assertEqual(response.data, {"total": 1, "progress": expected_progress}) +class TestProgressHelper(CRUDMixin): + collaborative_annotation = False + + def setUp(self): + self.project = prepare_project(DOCUMENT_CLASSIFICATION, collaborative_annotation=self.collaborative_annotation) + self.example = make_doc(self.project.item) + mommy.make("ExampleState", example=self.example, confirmed_by=self.project.admin) + self.url = reverse(viewname="progress", args=[self.project.item.id]) + + +class TestProgress(TestProgressHelper): + collaborative_annotation = False + + def test_fetch_progress(self): + response = self.assert_fetch(self.project.admin, status.HTTP_200_OK) + expected = {"total": 1, "remaining": 0, "complete": 1} + self.assertEqual(response.data, expected) + + def test_cannot_affect_others_progress(self): + for member in self.project.staffs: + response = self.assert_fetch(member, status.HTTP_200_OK) + expected = {"total": 1, "remaining": 1, "complete": 0} + self.assertEqual(response.data, expected) + + +class TestProgressOnCollaborativeAnnotation(TestProgressHelper): + collaborative_annotation = True + + def test_fetch_progress(self): + for member in self.project.members: + response = self.assert_fetch(member, status.HTTP_200_OK) + expected = {"total": 1, "remaining": 0, "complete": 1} + self.assertEqual(response.data, expected) + + class TestCategoryDistribution(CRUDMixin): def setUp(self): self.project = prepare_project(DOCUMENT_CLASSIFICATION)