Browse Source

Add offline patcher script

pull/1322/head
Johann Frei 3 years ago
parent
commit
8d32cad252
2 changed files with 142 additions and 0 deletions
  1. 7
      offline_deployment/offline_02_2-import_Docker_images.sh
  2. 135
      offline_deployment/offline_patcher.py

7
offline_deployment/offline_02_2-import_Docker_images.sh

@ -8,6 +8,13 @@ unset DIR
sed -i 's^dockerfile: app/Dockerfile.prod^dockerfile: app/Dockerfile.prod\n image: doccano-app:custom^g' ../docker-compose.prod.yml
sed -i 's^dockerfile: nginx/Dockerfile^dockerfile: nginx/Dockerfile\n image: doccano-nginx:custom^g' ../docker-compose.prod.yml
# Modify Dockerfile for nginx to add python3 and offline patch
sed -i 's|FROM nginx|COPY offline_deployment/offline_patcher.py /patch.py\
RUN apk add -U --no-cache py3-requests \\\
\&\& mkdir -p /app/dist/static/offline \&\& python3 /patch.py /app/dist /app/dist/static/offline\
\
FROM nginx|' ../nginx/Dockerfile
# Load docker images
docker image load -i doccano-app.tar
docker image load -i doccano-nginx.tar

135
offline_deployment/offline_patcher.py

@ -0,0 +1,135 @@
import sys, os, re
from glob import glob
import uuid
import requests
def main():
# root folder to scan for URLs
root_folder = sys.argv[1]
# offline folder to store static offline files
offline_folder = sys.argv[2]
offline_file = os.path.join(offline_folder, "offline_{}.{}")
offline_link = "offline/offline_{}.{}"
mime_ptn = re.compile(r"(?P<mime>(?P<t1>[\w^\/]+)\/(?P<t2>[\S\.^\;]+))(\;|$)", re.IGNORECASE)
#link_ptn = re.compile(r"(?P<encl>[\S\"\'])(?P<link>https?:\/\/(?P<host>[\S^:\/)]+)(?P<port>\:[0-9]+)?\/((?!(?P=encl)).)+)(?P=encl)", re.IGNORECASE)
link_ptn = re.compile(r"[\(\'\"\ ](?P<link>https?:\/\/(?P<host>[\S^:\/)]+)(?P<port>\:[0-9]+)?\/[^\(\)\'\"\ ]+)(?P<encl_stop>[\(\)\'\"\ ])")
# Block special hosts
forbidden_hosts = [
re.compile(r"^.*registry\.npmjs\.org$"), # No yarnpkg repository
re.compile(r"^.*yarnpkg\.com$"), # No yarnpkg repository
re.compile(r"^[0-9\.]+$"), # avoid IP addresses
re.compile(r"^[^\.]+$"), # needs a dot in host
]
# only support certain content types
supported_mime_types = [
# (filter function -> bool, file extension -> str)
(lambda m: m["t2"] == "javascript", lambda m: "js"),
(lambda m: m["t2"] == "css", lambda m: "css"),
(lambda m: m["t1"] == "font", lambda m: m["t2"]),
]
# load all initial files
files_to_check = []
for cur_dir, n_dir, n_files in os.walk(root_folder):
files_to_check += [ os.path.join(cur_dir, f) for f in n_files ]
cached_urls = {}
valid_urls = {}
i = 0
while i < len(files_to_check):
file_i = files_to_check[i]
try:
print("Inspect", file_i)
with open(file_i, "r", encoding="utf-8") as f:
t = f.read()
for match in link_ptn.finditer(t):
found_link = match.group("link")
found_host = match.group("host")
if found_link not in valid_urls:
# check link
if True in [ True for fh in forbidden_hosts if fh.match(found_link) is not None ]:
# host is forbidden
valid_urls[found_link] = False
else:
# host is not forbidden
# check mime type
response = requests.head(found_link, allow_redirects=True)
mime = response.headers.get("Content-Type", None)
if mime is None:
valid_urls[found_link] = False
else:
mime_match = mime_ptn.match(mime)
if mime_match is None:
valid_urls[found_link] = False
else:
final_fext = None
for smt, get_fext in supported_mime_types:
if smt(mime_match):
final_fext = get_fext(mime_match)
break
if final_fext is None:
# mime not supported
valid_urls[found_link] = False
else:
# mime is supported -> store and remember file
valid_urls[found_link] = True
file_unique = uuid.uuid4()
target_link = offline_link.format(file_unique, final_fext)
target_file = offline_file.format(file_unique, final_fext)
# download file
try:
file_response = requests.get(found_link, allow_redirects=True)
file_response.raise_for_status()
with open(target_file, 'wb') as download_file:
for chunk in file_response.iter_content(100000):
download_file.write(chunk)
# also check downloaded file
files_to_check.append(target_file)
print("Downloaded file:", found_link)
except:
print("Link could not been downloaded:", found_link)
# register downloaded file
cached_urls[found_link] = {
"input_link": found_link,
"target_link": target_link,
"file": target_file,
"fext": final_fext,
"found": [ file_i ]
}
if valid_urls[found_link]:
# add to cached urls entries
cached_urls[found_link]["found"].append(file_i)
print("Checked file:", file_i)
except:
print("Skip file:", file_i)
# look at next file
i+= 1
# replace files with offline link
for _, cached in cached_urls.items():
for edit_file in cached["found"]:
with open(edit_file, "r", encoding="utf-8") as f:
file_content = f.read()
with open(edit_file, "w", encoding="utf-8") as f:
f.write(file_content.replace(cached["input_link"], cached["target_link"]))
print("Patched to", len(cached["found"]), "file with link:", cached["target_link"])
print("Done")
if __name__ == "__main__":
main()
Loading…
Cancel
Save