diff --git a/docs/advanced/offline_deployment.md b/docs/advanced/offline_deployment.md
new file mode 100644
index 00000000..906161e2
--- /dev/null
+++ b/docs/advanced/offline_deployment.md
@@ -0,0 +1,29 @@
+# Doccano Offline Deployment
+
+## Use Case
+These offline deployment scripts are intended for deploying Doccano on an air-gapped Ubuntu 18.04/20.04 virtual machine (VM 2) with no internet connectivity (such as in clinical environments).
+
+The preparation requires a second machine (VM 1) that has internet access, runs the same Ubuntu release as VM 2, and has `docker`/`docker-compose` preinstalled (with `$USER` in the `docker` group).
+
+The focus is primarily on the `docker-compose`-based production deployment.
+The files mentioned in this document are located in the `tools/offline_deployment/` directory.
+
+## Setup Steps
+
+Run the following steps on VM 1:
+
+1. Clone this repository.
+2. Run the scripts `offline_01_*.sh` in ascending order.
+   The script `offline_01_1-optional_use_https.sh` is optional: skip it, or modify and run it if HTTPS is required.
+   Do NOT run these scripts as `sudo`! They will ask for sudo permissions when needed.
+
+Now, move over to VM 2:
+
+3. Copy the repository folder from VM 1 to VM 2.
+4. Run the scripts `offline_02_*.sh` in ascending order.
+   Do NOT run these scripts as `sudo`! They will ask for sudo permissions when needed.
+5. Edit `docker-compose.prod.yml` to change the admin credentials.
+6. Run `docker-compose -f docker-compose.prod.yml up` in the repository root directory, or use the script `offline_03_*.sh` (see the example sequence below).
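+
+For reference, a minimal sketch of the whole procedure (the repository URL and paths are examples; adjust them to your environment):
+
+```bash
+# VM 1 (online): prepare Docker images and APT packages
+git clone https://github.com/doccano/doccano.git
+cd doccano/tools/offline_deployment
+./offline_01_1-optional_use_https.sh              # optional, HTTPS only
+./offline_01_2-patch_and_extract_Docker_images.sh
+./offline_01_3-download_APT_packages.sh
+
+# VM 2 (air-gapped): after copying the whole repository folder from VM 1
+cd doccano/tools/offline_deployment
+./offline_02_1-install_APT_packages.sh
+# reboot, then continue:
+./offline_02_2-import_Docker_images.sh
+# edit ../../docker-compose.prod.yml to set the admin credentials, then:
+./offline_03_1-runDoccano.sh
+```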
+
+## Remarks
+
+The setup was tested on Ubuntu 18.04 machines.
\ No newline at end of file
diff --git a/tools/offline_deployment/offline_01_1-optional_use_https.sh b/tools/offline_deployment/offline_01_1-optional_use_https.sh
new file mode 100755
index 00000000..468b0edf
--- /dev/null
+++ b/tools/offline_deployment/offline_01_1-optional_use_https.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+cd $DIR
+cd ../..
+unset DIR
+
+# create certificate pair
+sudo apt-get install -y openssl
+openssl req -new -newkey rsa:4096 -sha256 -nodes -x509 -keyout ./nginx/cert.key -out ./nginx/cert.crt \
+    -subj "/C=US/ST=StateCode/L=LocationName/O=OrganizationName/OU=OrganizationUnit/CN=doccano.herokuapp.com"
+
+# define cert paths inside container
+ssl_cert="/certs/cert.crt"
+ssl_cert_key="/certs/cert.key"
+
+# edit nginx.conf
+sed -i "s|listen 80;|listen 443 ssl;\n    ssl_certificate $ssl_cert;\n    ssl_certificate_key $ssl_cert_key;|g" nginx/nginx.conf
+
+# edit nginx Dockerfile
+echo "RUN mkdir -p /certs/" >> nginx/Dockerfile
+echo "COPY nginx/cert.key /certs/cert.key" >> nginx/Dockerfile
+echo "COPY nginx/cert.crt /certs/cert.crt" >> nginx/Dockerfile
+
+# edit published port
+sed -i "s|- 80:80|- 443:443|g" docker-compose.prod.yml
+
+echo "Switched to HTTPS"
diff --git a/tools/offline_deployment/offline_01_2-patch_and_extract_Docker_images.sh b/tools/offline_deployment/offline_01_2-patch_and_extract_Docker_images.sh
new file mode 100755
index 00000000..1024af24
--- /dev/null
+++ b/tools/offline_deployment/offline_01_2-patch_and_extract_Docker_images.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+cd $DIR
+unset DIR
+
+# WORKAROUND: Downgrade the Compose file format version to match the docker-compose package shipped with Ubuntu 18.04
+echo "Patching docker-compose.prod.yml to match the Ubuntu 18.04 docker-compose package"
+sed -i 's|version: "3.7"|version: "3.3"|g' ../../docker-compose.prod.yml
+
+# Set explicit image tags so the built images can be saved and re-imported by name
+sed -i 's^dockerfile: backend/Dockerfile.prod^dockerfile: backend/Dockerfile.prod\n    image: doccano-backend:custom^g' ../../docker-compose.prod.yml
+sed -i 's^dockerfile: nginx/Dockerfile^dockerfile: nginx/Dockerfile\n    image: doccano-nginx:custom^g' ../../docker-compose.prod.yml
+
+# Modify Dockerfile for nginx to add python3 and offline patch
+sed -i 's|FROM nginx|COPY tools/offline_deployment/offline_patcher.py /patch.py\
+RUN apk add -U --no-cache py3-requests \\\
+ \&\& mkdir -p /app/dist/offline \&\& python3 /patch.py /app/dist /app/dist/offline /offline\
+\
+FROM nginx|' ../../nginx/Dockerfile
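+# For reference, the sed expression above inserts lines equivalent to the following
+# immediately before the `FROM nginx` line (sketch of the resulting Dockerfile snippet):
+#   COPY tools/offline_deployment/offline_patcher.py /patch.py
+#   RUN apk add -U --no-cache py3-requests \
+#    && mkdir -p /app/dist/offline && python3 /patch.py /app/dist /app/dist/offline /offline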
+
+# Modify Dockerfile for backend to add python3 and offline patch
+# TODO: Remark: Not needed due to SPA frontend
+#sed -i 's|COPY ./Pipfile\* /backend/|COPY ./Pipfile\* /backend/\
+#COPY tools/offline_deployment/offline_patcher.py /patch.py\
+#RUN apt-get update \
+# \&\& apt-get install -y --no-install-recommends \
+# python3 python3-requests \
+# \&\& apt-get clean \\\
+# \&\& rm -rf /var/lib/apt/lists/\*\
+# \&\& mkdir -p /backend/server/static/offline \&\& python3 /patch.py /backend/server /server/static/offline\
+#\
+#|' ../../backend/Dockerfile.prod
+
+docker-compose -f ../../docker-compose.prod.yml pull
+docker-compose -f ../../docker-compose.prod.yml build
+
+docker image save -o doccano-backend.tar doccano-backend:custom
+docker image save -o doccano-nginx.tar doccano-nginx:custom
+docker image save -o postgres.tar postgres:13.1-alpine
+docker image save -o rabbitmq.tar rabbitmq:3.8
diff --git a/tools/offline_deployment/offline_01_3-download_APT_packages.sh b/tools/offline_deployment/offline_01_3-download_APT_packages.sh
new file mode 100755
index 00000000..95342f59
--- /dev/null
+++ b/tools/offline_deployment/offline_01_3-download_APT_packages.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+cd $DIR
+unset DIR
+
+# Prepare and download packages
+pdir="/offline_packages"
+mkdir -p "$(pwd)${pdir}"
+cd "$(pwd)${pdir}"
+
+SELECTED_PACKAGES="wget unzip curl tar docker.io docker-compose"
+
+apt-get download $(apt-cache depends --recurse --no-recommends --no-suggests \
+    --no-conflicts --no-breaks --no-replaces --no-enhances \
+    --no-pre-depends ${SELECTED_PACKAGES} | grep "^\w")
+
+# Build package index
+sudo apt-get install -y dpkg-dev
+dpkg-scanpackages "." /dev/null | gzip -9c > Packages.gz
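+
+# The directory now holds the downloaded .deb files plus a Packages.gz index, i.e. a minimal
+# local APT repository. On VM 2 it is consumed through a sources.list entry of the form below
+# (path illustrative; offline_02_1-install_APT_packages.sh generates the real entry automatically):
+#   deb [trusted=yes] file:///path/to/repository/tools/offline_deployment/offline_packages ./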
+
+echo "Packages extracted to: $(pwd)"
\ No newline at end of file
diff --git a/tools/offline_deployment/offline_02_1-install_APT_packages.sh b/tools/offline_deployment/offline_02_1-install_APT_packages.sh
new file mode 100755
index 00000000..6f23f226
--- /dev/null
+++ b/tools/offline_deployment/offline_02_1-install_APT_packages.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+cd $DIR
+unset DIR
+
+# Import APT packages: temporarily point APT at the local offline repository
+pdir="/offline_packages"
+abs_pdir="$(pwd)${pdir}"
+sudo mv /etc/apt/sources.list /etc/apt/sources.list.bak
+cat <<EOF > sources.list
+deb [trusted=yes] file://${abs_pdir} ./
+EOF
+sudo mv sources.list /etc/apt/sources.list
+
+# Install APT packages
+sudo apt-get update
+SELECTED_PACKAGES="wget unzip curl tar docker.io docker-compose"
+sudo apt-get install -y $SELECTED_PACKAGES
+
+# Cleanup
+sudo apt-get clean
+sudo mv /etc/apt/sources.list.bak /etc/apt/sources.list
+
+# Setup Docker
+sudo usermod -aG docker $(whoami)
+sudo systemctl enable docker.service
+
+echo "Packages were installed. A reboot is required!"
+
diff --git a/tools/offline_deployment/offline_02_2-import_Docker_images.sh b/tools/offline_deployment/offline_02_2-import_Docker_images.sh
new file mode 100755
index 00000000..e2509e7f
--- /dev/null
+++ b/tools/offline_deployment/offline_02_2-import_Docker_images.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+cd $DIR
+unset DIR
+
+# Info: the Docker image names were already set in docker-compose.prod.yml by the previous scripts
+## Set image tag in Compose to avoid image build
+#sed -i 's^dockerfile: backend/Dockerfile.prod^dockerfile: backend/Dockerfile.prod\n    image: doccano-backend:custom^g' ../../docker-compose.prod.yml
+#sed -i 's^dockerfile: nginx/Dockerfile^dockerfile: nginx/Dockerfile\n    image: doccano-nginx:custom^g' ../../docker-compose.prod.yml
+
+# Load docker images
+docker image load -i doccano-backend.tar
+docker image load -i doccano-nginx.tar
+docker image load -i postgres.tar
+docker image load -i rabbitmq.tar
diff --git a/tools/offline_deployment/offline_03_1-runDoccano.sh b/tools/offline_deployment/offline_03_1-runDoccano.sh
new file mode 100755
index 00000000..95ac7ec2
--- /dev/null
+++ b/tools/offline_deployment/offline_03_1-runDoccano.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+cd $DIR
+unset DIR
+
+docker-compose -f ../../docker-compose.prod.yml up -d
diff --git a/tools/offline_deployment/offline_patcher.py b/tools/offline_deployment/offline_patcher.py
new file mode 100644
index 00000000..720dddd7
--- /dev/null
+++ b/tools/offline_deployment/offline_patcher.py
@@ -0,0 +1,203 @@
+#!/usr/bin/env python3
+import sys, os, re
+import uuid
+from urllib.parse import urljoin
+import requests
+
+"""Script Information
+
+This script scans all files of a given directory [1] for URL addresses and
+hyperlink references.
+All found URLs are requested for their Content-Type.
+For certain Content-Types (like js, css, or fonts), the file is downloaded and
+stored locally in a given directory [2], and the existing URLs are rewritten
+to a local URL location (with a given URL prefix [3]).
+
+Downloaded files are scanned for URLs recursively.
+Relative references in CSS files are an edge case that is
+handled separately by a specific regex pattern.
+
+Arguments:
+    1. <root folder to scan for URLs>
+    2. <folder to store the downloaded static files>
+    3. <URL prefix under which the static folder is served>
+
+Example:
+    - Given:
+        - File ./webspace/index.html, containing URL: https://example.com/library.js
+        - Directory ./webspace/static, containing static files,
+          serving content on HTTP location: /staticfiles
+
+    - Call:
+        $> python3 offline_patcher.py webspace/ webspace/static /staticfiles
+
+    - Result:
+        - Library from https://example.com/library.js is stored as file:
+          webspace/static/offline_<uuid>.js
+        - Link in file webspace/index.html is replaced to:
+          /staticfiles/offline_<uuid>.js
+        - File webspace/static/offline_<uuid>.js is scanned recursively for URLs
+
+Author: Johann Frei
+"""
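+
+# Example invocation, as wired into nginx/Dockerfile by
+# offline_01_2-patch_and_extract_Docker_images.sh (paths are the ones used in that image build):
+#
+#   python3 /patch.py /app/dist /app/dist/offline /offline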
+
+def main():
+    # root folder to scan for URLs
+    root_folder = sys.argv[1]
+    # offline folder to store static offline files
+    offline_folder = sys.argv[2]
+    # offline link prefix
+    offline_prefix = sys.argv[3]
+
+    offline_file = os.path.join(offline_folder, "offline_{}.{}")
+    offline_link = offline_prefix + "/offline_{}.{}"
+
+    mime_ptn = re.compile(r"(?P<mime>(?P<t1>[\w^\/]+)\/(?P<t2>[\S\.^\;]+))(\;|$)", re.IGNORECASE)
+
+    # regex to find matches like: "https://<host>[:<port>]/a/link/location.html"
+    link_ptn = re.compile(r"[\(\'\"\ ](?P<link>https?:\/\/(?P<host>(?P<hostname>((?=[^\(\)\'\"\ \:\/])(?=[\S]).)+))(?P<port>\:[0-9]+)?\/[^\(\)\'\"\ ]+)(?P<endchar>[\(\)\'\"\ ])")
+    # regex to find matches like: url(../relative/parent_directory/links/without/quotes/are/hard)
+    link_ptn_url = re.compile(r"url\([\"\']?(?P<link>((?=[^\)\"\'])(?=[\S]).)+)[\"\']?\)")
+
+    # block special hosts
+    forbidden_hosts = [
+        re.compile(r"^.*registry\.npmjs\.org$"),  # No yarnpkg repository
+        re.compile(r"^.*yarnpkg\.com$"),  # No yarnpkg repository
+        re.compile(r"^[0-9\.]+$"),  # avoid IP addresses
+        re.compile(r"^[^\.]+$"),  # needs a dot in host
+    ]
+
+    # only support certain content types
+    supported_mime_types = [
+        # (filter function -> bool, file extension -> str)
+        (lambda m: m["t2"] == "javascript", lambda m: "js"),
+        (lambda m: m["t2"] == "css", lambda m: "css"),
+        (lambda m: m["t1"] == "font", lambda m: m["t2"]),
+    ]
+
+    # load all initial files
+    files_to_check = []
+    for cur_dir, n_dir, n_files in os.walk(root_folder):
+        files_to_check += [ os.path.join(cur_dir, f) for f in n_files ]
+
+    cached_urls = {}
+    valid_urls = {}
+    file_origins = {}
+
+    i = 0
+    while i < len(files_to_check):
+        file_i = files_to_check[i]
+        try:
+            print("Inspect", file_i)
+            with open(file_i, "r", encoding="utf-8") as f:
+                t = f.read()
+
+            link_findings_default = [ {
+                "abs": match.group("link"),
+                "found": match.group("link"),
+                "host": match.group("host")
+            } for match in link_ptn.finditer(t) ]
+
+            # extract relative urls and convert them to absolute http urls
+            link_findings_url_prefix = []
+            for match in link_ptn_url.finditer(t):
+                if os.path.abspath(file_i) in file_origins and not match.group("link").startswith("http"):
+                    link_abs = urljoin(file_origins[os.path.abspath(file_i)], match.group("link"))
+                    item = {
+                        "abs": link_abs,
+                        "found": match.group("link"),
+                        "host": link_ptn.match("\"" + link_abs + "\"").group("host")
+                    }
+                    link_findings_url_prefix.append(item)
+
+            for spot in link_findings_default + link_findings_url_prefix:
+                absolute_link = spot["abs"]
+                found_link = spot["found"]
+                found_host = spot["host"]
+
+                if absolute_link not in valid_urls:
+                    # check link
+                    if True in [ True for fh in forbidden_hosts if fh.match(found_host) is not None ]:
+                        # host is forbidden
+                        valid_urls[absolute_link] = False
+                    else:
+                        # host is not forbidden
+                        # check mime type
+                        response = requests.head(absolute_link, allow_redirects=True)
+                        mime = response.headers.get("Content-Type", None)
+                        if mime is None:
+                            valid_urls[absolute_link] = False
+                        else:
+                            mime_match = mime_ptn.match(mime)
+                            if mime_match is None:
+                                valid_urls[absolute_link] = False
+                            else:
+                                final_fext = None
+                                # try supported content types
+                                for smt, get_fext in supported_mime_types:
+                                    if smt(mime_match):
+                                        final_fext = get_fext(mime_match)
+                                        break
+                                if final_fext is None:
+                                    # mime not supported
+                                    valid_urls[absolute_link] = False
+                                else:
+                                    # mime is supported -> store and remember file
+                                    valid_urls[absolute_link] = True
+                                    file_unique = uuid.uuid4()
+                                    target_link = offline_link.format(file_unique, final_fext)
+                                    target_file = offline_file.format(file_unique, final_fext)
+
+                                    # download file
+                                    try:
+                                        file_response = requests.get(absolute_link, allow_redirects=True)
+                                        file_response.raise_for_status()
+                                        with open(target_file, 'wb') as download_file:
+                                            for chunk in file_response.iter_content(100000):
+                                                download_file.write(chunk)
+                                        # also check downloaded file for links later
+                                        files_to_check.append(target_file)
+
+                                        print("Downloaded file:", absolute_link)
+                                    except Exception:
+                                        print("Link could not be downloaded:", absolute_link)
+
+                                    # register downloaded file
+                                    cached_urls[absolute_link] = {
+                                        "input_link": absolute_link,
+                                        "target_link": target_link,
+                                        "file": target_file,
+                                        "fext": final_fext,
+                                        "found": [ {"file": file_i, "found_link": found_link} ]
+                                    }
+                                    # store reverse lookup for recursive url("../rel/link") patterns
+                                    file_origins[os.path.abspath(target_file)] = absolute_link
+
+                if valid_urls[absolute_link]:
+                    # add to cached urls entries
+                    cached_urls[absolute_link]["found"].append({"file": file_i, "found_link": found_link})
+
+            print("Checked file:", file_i)
+        except UnicodeDecodeError:
+            print("Skip file (not UTF-8):", file_i)
+        except Exception:
+            print("Unknown error... Skip file:", file_i)
+
+        # look at next file
+        i += 1
+
+    # replace links in files with their offline counterparts
+    for _, cached in cached_urls.items():
+        for edit_file in cached["found"]:
+            with open(edit_file["file"], "r", encoding="utf-8") as f:
+                file_content = f.read()
+            with open(edit_file["file"], "w", encoding="utf-8") as f:
+                f.write(file_content.replace(edit_file["found_link"], cached["target_link"]))
+        print("Patched", len(cached["found"]), "file(s) with link:", cached["target_link"])
+
+    print("Done")
+
+
+if __name__ == "__main__":
+    main()