From 8d32cad2529e8d34367fb157f1e7d1a02d1b51bf Mon Sep 17 00:00:00 2001
From: Johann Frei
Date: Wed, 14 Apr 2021 19:47:58 +0200
Subject: [PATCH] Add offline patcher script

---
 .../offline_02_2-import_Docker_images.sh |   7 +
 offline_deployment/offline_patcher.py     | 135 ++++++++++++++++++
 2 files changed, 142 insertions(+)
 create mode 100644 offline_deployment/offline_patcher.py

diff --git a/offline_deployment/offline_02_2-import_Docker_images.sh b/offline_deployment/offline_02_2-import_Docker_images.sh
index 58c0fb9c..fdbcda64 100755
--- a/offline_deployment/offline_02_2-import_Docker_images.sh
+++ b/offline_deployment/offline_02_2-import_Docker_images.sh
@@ -8,6 +8,13 @@ unset DIR
 sed -i 's^dockerfile: app/Dockerfile.prod^dockerfile: app/Dockerfile.prod\n    image: doccano-app:custom^g' ../docker-compose.prod.yml
 sed -i 's^dockerfile: nginx/Dockerfile^dockerfile: nginx/Dockerfile\n    image: doccano-nginx:custom^g' ../docker-compose.prod.yml
 
+# Modify Dockerfile for nginx to add python3 and offline patch
+sed -i 's|FROM nginx|COPY offline_deployment/offline_patcher.py /patch.py\
+RUN apk add -U --no-cache py3-requests \\\
+  \&\& mkdir -p /app/dist/static/offline \&\& python3 /patch.py /app/dist /app/dist/static/offline\
+\
+FROM nginx|' ../nginx/Dockerfile
+
 # Load docker images
 docker image load -i doccano-app.tar
 docker image load -i doccano-nginx.tar
diff --git a/offline_deployment/offline_patcher.py b/offline_deployment/offline_patcher.py
new file mode 100644
index 00000000..cf49b127
--- /dev/null
+++ b/offline_deployment/offline_patcher.py
@@ -0,0 +1,135 @@
+import sys, os, re
+from glob import glob
+import uuid
+import requests
+
+def main():
+    # root folder to scan for URLs
+    root_folder = sys.argv[1]
+    # offline folder to store static offline files
+    offline_folder = sys.argv[2]
+
+
+    offline_file = os.path.join(offline_folder, "offline_{}.{}")
+    offline_link = "offline/offline_{}.{}"
+
+    mime_ptn = re.compile(r"(?P<mime>(?P<t1>[\w^\/]+)\/(?P<t2>[\S\.^\;]+))(\;|$)", re.IGNORECASE)
+    #link_ptn = re.compile(r"(?P<encl>[\S\"\'])(?P<link>https?:\/\/(?P<host>[\S^:\/)]+)(?P<port>\:[0-9]+)?\/((?!(?P=encl)).)+)(?P=encl)", re.IGNORECASE)
+    link_ptn = re.compile(r"[\(\'\"\ ](?P<link>https?:\/\/(?P<host>[\S^:\/)]+)(?P<port>\:[0-9]+)?\/[^\(\)\'\"\ ]+)(?P<encl>[\(\)\'\"\ ])")
+
+    # Block special hosts
+    forbidden_hosts = [
+        re.compile(r"^.*registry\.npmjs\.org$"), # No yarnpkg repository
+        re.compile(r"^.*yarnpkg\.com$"), # No yarnpkg repository
+        re.compile(r"^[0-9\.]+$"), # avoid IP addresses
+        re.compile(r"^[^\.]+$"), # needs a dot in host
+    ]
+
+    # only support certain content types
+    supported_mime_types = [
+        # (filter function -> bool, file extension -> str)
+        (lambda m: m["t2"] == "javascript", lambda m: "js"),
+        (lambda m: m["t2"] == "css", lambda m: "css"),
+        (lambda m: m["t1"] == "font", lambda m: m["t2"]),
+    ]
+
+
+    # load all initial files
+    files_to_check = []
+    for cur_dir, n_dir, n_files in os.walk(root_folder):
+        files_to_check += [ os.path.join(cur_dir, f) for f in n_files ]
+
+    cached_urls = {}
+    valid_urls = {}
+
+    i = 0
+    while i < len(files_to_check):
+        file_i = files_to_check[i]
+        try:
+            print("Inspect", file_i)
+            with open(file_i, "r", encoding="utf-8") as f:
+                t = f.read()
+
+            for match in link_ptn.finditer(t):
+                found_link = match.group("link")
+                found_host = match.group("host")
+
+                if found_link not in valid_urls:
+                    # check link
+                    if True in [ True for fh in forbidden_hosts if fh.match(found_host) is not None ]:
+                        # host is forbidden
+                        valid_urls[found_link] = False
+                    else:
+                        # host is not forbidden
+                        # check mime type
+                        response = requests.head(found_link, allow_redirects=True)
+                        mime = response.headers.get("Content-Type", None)
+                        if mime is None:
+                            valid_urls[found_link] = False
+                        else:
+                            mime_match = mime_ptn.match(mime)
+                            if mime_match is None:
+                                valid_urls[found_link] = False
+                            else:
+                                final_fext = None
+                                for smt, get_fext in supported_mime_types:
+                                    if smt(mime_match):
+                                        final_fext = get_fext(mime_match)
+                                        break
+                                if final_fext is None:
+                                    # mime not supported
+                                    valid_urls[found_link] = False
+                                else:
+                                    # mime is supported -> store and remember file
+                                    valid_urls[found_link] = True
+                                    file_unique = uuid.uuid4()
+                                    target_link = offline_link.format(file_unique, final_fext)
+                                    target_file = offline_file.format(file_unique, final_fext)
+
+                                    # download file
+                                    try:
+                                        file_response = requests.get(found_link, allow_redirects=True)
+                                        file_response.raise_for_status()
+                                        with open(target_file, 'wb') as download_file:
+                                            for chunk in file_response.iter_content(100000):
+                                                download_file.write(chunk)
+                                        # also check downloaded file
+                                        files_to_check.append(target_file)
+
+                                        print("Downloaded file:", found_link)
+                                    except Exception:
+                                        print("Link could not be downloaded:", found_link)
+
+                                    # register downloaded file
+                                    cached_urls[found_link] = {
+                                        "input_link": found_link,
+                                        "target_link": target_link,
+                                        "file": target_file,
+                                        "fext": final_fext,
+                                        "found": []
+                                    }
+
+                if valid_urls[found_link]:
+                    # add to cached urls entries
+                    cached_urls[found_link]["found"].append(file_i)
+
+            print("Checked file:", file_i)
+        except Exception:
+            print("Skip file:", file_i)
+
+        # look at next file
+        i += 1
+
+    # replace files with offline link
+    for _, cached in cached_urls.items():
+        for edit_file in cached["found"]:
+            with open(edit_file, "r", encoding="utf-8") as f:
+                file_content = f.read()
+            with open(edit_file, "w", encoding="utf-8") as f:
+                f.write(file_content.replace(cached["input_link"], cached["target_link"]))
+            print("Patched", len(cached["found"]), "file(s) with link:", cached["target_link"])
+
+    print("Done")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
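
For reference, the last sed command in offline_02_2-import_Docker_images.sh rewrites ../nginx/Dockerfile instead of patching it by hand. Assuming that file is a multi-stage build whose final stage begins with a line starting with "FROM nginx", and that the preceding build stage is Alpine-based (so apk is available and /app/dist holds the built frontend), the end of the build stage would look roughly like this after the substitution:

    COPY offline_deployment/offline_patcher.py /patch.py
    RUN apk add -U --no-cache py3-requests \
      && mkdir -p /app/dist/static/offline && python3 /patch.py /app/dist /app/dist/static/offline

    FROM nginx

Whatever followed "FROM nginx" on the original line (image tag, stage name) stays in place, since sed only substitutes that literal text.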
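The patcher script itself is generic: it needs Python 3 with the requests package, takes the folder to scan as its first argument and an already existing offline target folder as its second, and it issues HEAD/GET requests for every external URL it finds, so it must run with network access. A hypothetical standalone run against a local static build, with placeholder paths, would look like:

    mkdir -p ./dist/static/offline
    python3 offline_deployment/offline_patcher.py ./dist ./dist/static/offline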