#!/usr/bin/env python3 import sys, os, re import uuid from urllib.parse import urljoin import requests """Script Information This script scans all files of a given directory [1] for URL addresses and hyperlink references. All found URLs are requested for Content-Type. For certain Content-Types (like js, css, or fonts), the file is downloaded and stored locally into a given directory [2] and the existing URLs are altered to a local URL location (with a given URL prefix [3]). Downloaded files are scanned for URLs recursively. Relative references in CSS files are an edge case that is handled separately by a specific regex pattern. Arguments: 1. 2. 3. Example: - Given: - File ./webspace/index.html, containing URL: https://example.com/library.js - Directory ./webspace/static, containing static files, serving content on HTTP location: /staticfiles - Call: $> python3 offline_patcher.py webspace/ webspace/static /staticfiles - Result: - Library from https://example.com/library.js is stored as file: webspace/static/offline_.js - Link in file webspace/index.html is replaced to: /staticfiles/offline_.js - File webspace/static/offline_.js is scanned recursively for URLs Author: Johann Frei """ def main(): # root folder to scan for URLs root_folder = sys.argv[1] # offline folder to store static offline files offline_folder = sys.argv[2] # offline link prefix offline_prefix = sys.argv[3] offline_file = os.path.join(offline_folder, "offline_{}.{}") offline_link = offline_prefix + "/offline_{}.{}" mime_ptn = re.compile(r"(?P(?P[\w^\/]+)\/(?P[\S\.^\;]+))(\;|$)", re.IGNORECASE) # regex to find matches like: "https://[:]/a/link/location.html" link_ptn = re.compile(r"[$\'\"\ ](?Phttps?:\/\/(?P(?P((?=[^\($\'\"\ \:\/])(?=[\S]).)+))(?P\:[0-9]+)?\/[^\'\"\ ]+)(?P[\'\"\ ])") # regex to find matches like: url(../relative/parent_directory/links/without/quotes/are/hard) link_ptn_url = re.compile(r"url$[\"\']?(?P((?=[^$\"\'])(?=[\S]).)+)[\"\']?\)") # block special hosts forbidden_hosts = [ re.compile(r"^.*registry\.npmjs\.org$"), # No yarnpkg repository re.compile(r"^.*yarnpkg\.com$"), # No yarnpkg repository re.compile(r"^[0-9\.]+$"), # avoid IP addresses re.compile(r"^[^\.]+$"), # needs a dot in host ] # only support certain content types supported_mime_types = [ # (filter function -> bool, file extension -> str) (lambda m: m["t2"] == "javascript", lambda m: "js"), (lambda m: m["t2"] == "css", lambda m: "css"), (lambda m: m["t1"] == "font", lambda m: m["t2"]), ] # load all initial files files_to_check = [] for cur_dir, n_dir, n_files in os.walk(root_folder): files_to_check += [ os.path.join(cur_dir, f) for f in n_files ] cached_urls = {} valid_urls = {} file_origins = {} i = 0 while i < len(files_to_check): file_i = files_to_check[i] try: print("Inspect", file_i) with open(file_i, "r", encoding="utf-8") as f: t = f.read() link_findings_default = [ { "abs": match.group("link"), "found": match.group("link"), "host": match.group("host") } for match in link_ptn.finditer(t) ] # extract relative urls and convert them to absolute http urls link_findings_url_prefix = [] for match in link_ptn_url.finditer(t): if os.path.abspath(file_i) in file_origins and not match.group("link").startswith("http"): link_abs = urljoin(file_origins[os.path.abspath(file_i)], match.group("link")) item = { "abs": link_abs, "found": match.group("link"), "host": link_ptn.match( "\"" + link_abs + "\"").group("host") } link_findings_url_prefix.append(item) for spot in link_findings_default + link_findings_url_prefix: absolute_link = spot["abs"] found_link = spot["found"] found_host = spot["host"] if absolute_link not in valid_urls: # check link if True in [ True for fh in forbidden_hosts if fh.match(absolute_link) is not None ]: # host is forbidden valid_urls[absolute_link] = False else: # host is not forbidden # check mime type response = requests.head(absolute_link, allow_redirects=True) mime = response.headers.get("Content-Type", None) if mime is None: valid_urls[absolute_link] = False else: mime_match = mime_ptn.match(mime) if mime_match is None: valid_urls[absolute_link] = False else: final_fext = None # try supported content types for smt, get_fext in supported_mime_types: if smt(mime_match): final_fext = get_fext(mime_match) break if final_fext is None: # mime not supported valid_urls[absolute_link] = False else: # mime is supported -> store and remember file valid_urls[absolute_link] = True file_unique = uuid.uuid4() target_link = offline_link.format(file_unique, final_fext) target_file = offline_file.format(file_unique, final_fext) # download file try: file_response = requests.get(absolute_link, allow_redirects=True) file_response.raise_for_status() with open(target_file, 'wb') as download_file: for chunk in file_response.iter_content(100000): download_file.write(chunk) # also check downloaded file for links later files_to_check.append(target_file) print("Downloaded file:", absolute_link) except: print("Link could not been downloaded:", absolute_link) # register downloaded file cached_urls[absolute_link] = { "input_link": absolute_link, "target_link": target_link, "file": target_file, "fext": final_fext, "found": [ {"file": file_i, "found_link": found_link} ] } # store reverse lookup for recursive url("../rel/link") patterns file_origins[os.path.abspath(target_file)] = absolute_link if valid_urls[absolute_link]: # add to cached urls entries cached_urls[absolute_link]["found"].append({"file": file_i, "found_link": found_link}) print("Checked file:", file_i) except UnicodeDecodeError: print("Skip file (No unicode):", file_i) except: print("Unknown error... Skip file:", file_i) # look at next file i+= 1 # replace files with offline link for _, cached in cached_urls.items(): for edit_file in cached["found"]: with open(edit_file["file"], "r", encoding="utf-8") as f: file_content = f.read() with open(edit_file["file"], "w", encoding="utf-8") as f: f.write(file_content.replace(edit_file["found_link"], cached["target_link"])) print("Patched to", len(cached["found"]), "file with link:", cached["target_link"]) print("Done") if __name__ == "__main__": main()