Browse Source

Add info to offline patcher script

pull/1322/head
Johann Frei 3 years ago
parent
commit
19248a9741
1 changed files with 42 additions and 6 deletions
  1. 48
      offline_deployment/offline_patcher.py

48
offline_deployment/offline_patcher.py

@ -1,9 +1,47 @@
#!/usr/bin/env python3
import sys, os, re
from glob import glob
import uuid
from urllib.parse import urljoin
import requests
"""Script Information
This script scans all files of a given directory [1] for URL addresses and
hyperlink references.
All found URLs are requested for Content-Type.
For certain Content-Types (like js, css, or fonts), the file is downloaded and
stored locally into a given directory [2] and the existing URLs are altered
to a local URL location (with a given URL prefix [3]).
Downloaded files are scanned for URLs recursively.
Relative references in CSS files are an edge case that is
handled separately by a specific regex pattern.
Arguments:
1. <root directory [1]>
2. <local offline storage directory [2]>
3. <HTTP URL location prefix [3]>
Example:
- Given:
- File ./webspace/index.html, containing URL: https://example.com/library.js
- Directory ./webspace/static, containing static files,
serving content on HTTP location: /staticfiles
- Call:
$> python3 offline_patcher.py webspace/ webspace/static /staticfiles
- Result:
- Library from https://example.com/library.js is stored as file:
webspace/static/offline_<uuid>.js
- Link in file webspace/index.html is replaced to:
/staticfiles/offline_<uuid>.js
- File webspace/static/offline_<uuid>.js is scanned recursively for URLs
Author: Johann Frei
"""
def main():
# root folder to scan for URLs
root_folder = sys.argv[1]
@ -12,19 +50,17 @@ def main():
# offline link prefix
offline_prefix = sys.argv[3]
offline_file = os.path.join(offline_folder, "offline_{}.{}")
offline_link = offline_prefix + "/offline_{}.{}"
mime_ptn = re.compile(r"(?P<mime>(?P<t1>[\w^\/]+)\/(?P<t2>[\S\.^\;]+))(\;|$)", re.IGNORECASE)
#link_ptn = re.compile(r"(?P<encl>[\S\"\'])(?P<link>https?:\/\/(?P<host>[\S^:\/)]+)(?P<port>\:[0-9]+)?\/((?!(?P=encl)).)+)(?P=encl)", re.IGNORECASE)
# Regex to find matches like: "https://<host>[:<port>]/a/link/location.html"
# regex to find matches like: "https://<host>[:<port>]/a/link/location.html"
link_ptn = re.compile(r"[\(\'\"\ ](?P<link>https?:\/\/(?P<host>(?P<h_host>((?=[^\(\)\'\"\ \:\/])(?=[\S]).)+))(?P<port>\:[0-9]+)?\/[^\(\)\'\"\ ]+)(?P<encl_stop>[\(\)\'\"\ ])")
# Regex to find matches like: url(../relative/parent_directory/links/without/quotes/are/hard)
# regex to find matches like: url(../relative/parent_directory/links/without/quotes/are/hard)
link_ptn_url = re.compile(r"url\([\"\']?(?P<link>((?=[^\)\"\'])(?=[\S]).)+)[\"\']?\)")
# Block special hosts
# block special hosts
forbidden_hosts = [
re.compile(r"^.*registry\.npmjs\.org$"), # No yarnpkg repository
re.compile(r"^.*yarnpkg\.com$"), # No yarnpkg repository

Loading…
Cancel
Save