You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

191 lines
6.7 KiB

6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
  1. import csv
  2. import json
  3. from io import TextIOWrapper
  4. import itertools as it
  5. import logging
  6. from django.urls import reverse
  7. from django.http import HttpResponse, HttpResponseRedirect
  8. from django.shortcuts import get_object_or_404
  9. from django.views import View
  10. from django.views.generic import TemplateView, CreateView
  11. from django.views.generic.list import ListView
  12. from django.contrib.auth.mixins import LoginRequiredMixin
  13. from django.contrib import messages
  14. from .permissions import SuperUserMixin
  15. from .forms import ProjectForm
  16. from .models import Document, Project
  17. from app import settings
  18. logger = logging.getLogger(__name__)
  19. class IndexView(TemplateView):
  20. template_name = 'index.html'
  21. class ProjectView(LoginRequiredMixin, TemplateView):
  22. def get_template_names(self):
  23. project = get_object_or_404(Project, pk=self.kwargs['project_id'])
  24. return [project.get_template_name()]
  25. class ProjectsView(LoginRequiredMixin, CreateView):
  26. form_class = ProjectForm
  27. template_name = 'projects.html'
  28. class DatasetView(SuperUserMixin, LoginRequiredMixin, ListView):
  29. template_name = 'admin/dataset.html'
  30. paginate_by = 5
  31. def get_queryset(self):
  32. project = get_object_or_404(Project, pk=self.kwargs['project_id'])
  33. return project.documents.all()
  34. class LabelView(SuperUserMixin, LoginRequiredMixin, TemplateView):
  35. template_name = 'admin/label.html'
  36. class StatsView(SuperUserMixin, LoginRequiredMixin, TemplateView):
  37. template_name = 'admin/stats.html'
  38. class GuidelineView(SuperUserMixin, LoginRequiredMixin, TemplateView):
  39. template_name = 'admin/guideline.html'
  40. class DataUpload(SuperUserMixin, LoginRequiredMixin, TemplateView):
  41. template_name = 'admin/dataset_upload.html'
  42. class ImportFileError(Exception):
  43. def __init__(self, message):
  44. self.message = message
  45. def extract_metadata_csv(self, row, text_col, header_without_text):
  46. vals_without_text = [val for i,val in enumerate(row) if i != text_col]
  47. return json.dumps(dict(zip(header_without_text, vals_without_text)))
  48. def csv_to_documents(self, project, file, text_key='text'):
  49. form_data = TextIOWrapper(file, encoding='utf-8')
  50. reader = csv.reader(form_data)
  51. maybe_header = next(reader)
  52. if maybe_header:
  53. if text_key in maybe_header:
  54. text_col = maybe_header.index(text_key)
  55. elif len(maybe_header) == 1:
  56. reader = it.chain([maybe_header], reader)
  57. text_col = 0
  58. else:
  59. raise DataUpload.ImportFileError("CSV file must have either a title with \"text\" column or have only one column ")
  60. header_without_text = [title for i,title in enumerate(maybe_header)
  61. if i != text_col]
  62. return (
  63. Document(
  64. text=row[text_col],
  65. metadata=self.extract_metadata_csv(row, text_col, header_without_text),
  66. project=project
  67. )
  68. for row in reader
  69. )
  70. else:
  71. return []
  72. def extract_metadata_json(self, entry, text_key):
  73. copy = entry.copy()
  74. del copy[text_key]
  75. return json.dumps(copy)
  76. def json_to_documents(self, project, file, text_key='text'):
  77. parsed_entries = (json.loads(line) for line in file)
  78. return (
  79. Document(text=entry[text_key], metadata=self.extract_metadata_json(entry, text_key), project=project)
  80. for entry in parsed_entries
  81. )
  82. def post(self, request, *args, **kwargs):
  83. project = get_object_or_404(Project, pk=kwargs.get('project_id'))
  84. import_format = request.POST['format']
  85. try:
  86. file = request.FILES['file'].file
  87. documents = []
  88. if import_format == 'csv':
  89. documents = self.csv_to_documents(project, file)
  90. elif import_format == 'json':
  91. documents = self.json_to_documents(project, file)
  92. batch_size = settings.IMPORT_BATCH_SIZE
  93. while True:
  94. batch = list(it.islice(documents, batch_size))
  95. if not batch:
  96. break
  97. Document.objects.bulk_create(batch, batch_size=batch_size)
  98. return HttpResponseRedirect(reverse('dataset', args=[project.id]))
  99. except DataUpload.ImportFileError as e:
  100. messages.add_message(request, messages.ERROR, e.message)
  101. return HttpResponseRedirect(reverse('upload', args=[project.id]))
  102. except Exception as e:
  103. logger.exception(e)
  104. messages.add_message(request, messages.ERROR, 'Something went wrong')
  105. return HttpResponseRedirect(reverse('upload', args=[project.id]))
  106. class DataDownload(SuperUserMixin, LoginRequiredMixin, TemplateView):
  107. template_name = 'admin/dataset_download.html'
  108. class DataDownloadFile(SuperUserMixin, LoginRequiredMixin, View):
  109. def get(self, request, *args, **kwargs):
  110. project_id = self.kwargs['project_id']
  111. project = get_object_or_404(Project, pk=project_id)
  112. docs = project.get_documents(is_null=False).distinct()
  113. export_format = request.GET.get('format')
  114. filename = '_'.join(project.name.lower().split())
  115. try:
  116. if export_format == 'csv':
  117. response = self.get_csv(filename, docs)
  118. elif export_format == 'json':
  119. response = self.get_json(filename, docs)
  120. return response
  121. except Exception as e:
  122. logger.exception(e)
  123. messages.add_message(request, messages.ERROR, "Something went wrong")
  124. return HttpResponseRedirect(reverse('download', args=[project.id]))
  125. def get_csv(self, filename, docs):
  126. response = HttpResponse(content_type='text/csv')
  127. response['Content-Disposition'] = 'attachment; filename="{}.csv"'.format(filename)
  128. writer = csv.writer(response)
  129. for d in docs:
  130. writer.writerows(d.to_csv())
  131. return response
  132. def get_json(self, filename, docs):
  133. response = HttpResponse(content_type='text/json')
  134. response['Content-Disposition'] = 'attachment; filename="{}.json"'.format(filename)
  135. for d in docs:
  136. dump = json.dumps(d.to_json(), ensure_ascii=False)
  137. response.write(dump + '\n') # write each json object end with a newline
  138. return response
  139. class DemoTextClassification(TemplateView):
  140. template_name = 'demo/demo_text_classification.html'
  141. class DemoNamedEntityRecognition(TemplateView):
  142. template_name = 'demo/demo_named_entity.html'
  143. class DemoTranslation(TemplateView):
  144. template_name = 'demo/demo_translation.html'