#!/usr/bin/env python3
import sys, os, re
import uuid
from urllib.parse import urljoin
import requests

"""Script Information

This script scans all files of a given directory [1] for URL addresses and
hyperlink references.
Each URL found is requested to determine its Content-Type.
For certain Content-Types (like js, css, or fonts), the file is downloaded and
stored locally in a given directory [2], and the existing URLs are rewritten
to a local URL location (with a given URL prefix [3]).
Downloaded files are scanned for URLs recursively.
Relative references in CSS files are an edge case that is
handled separately by a specific regex pattern.

Arguments:
1. <root directory [1]>
2. <local offline storage directory [2]>
3. <HTTP URL location prefix [3]>

Example:
- Given:
  - File ./webspace/index.html, containing URL: https://example.com/library.js
  - Directory ./webspace/static, containing static files,
    serving content on HTTP location: /staticfiles
- Call:
  $> python3 offline_patcher.py webspace/ webspace/static /staticfiles
- Result:
  - Library from https://example.com/library.js is stored as file:
    webspace/static/offline_<uuid>.js
  - Link in file webspace/index.html is replaced with:
    /staticfiles/offline_<uuid>.js
  - File webspace/static/offline_<uuid>.js is scanned recursively for URLs

Author: Johann Frei
"""


def main():
    # root folder to scan for URLs
    root_folder = sys.argv[1]
    # offline folder to store static offline files
    offline_folder = sys.argv[2]
    # offline link prefix
    offline_prefix = sys.argv[3]

    offline_file = os.path.join(offline_folder, "offline_{}.{}")
    offline_link = offline_prefix + "/offline_{}.{}"
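
    # splits a Content-Type header value like "text/css; charset=utf-8"
    # into its type ("t1") and subtype ("t2")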
    mime_ptn = re.compile(r"(?P<mime>(?P<t1>[\w^\/]+)\/(?P<t2>[\S\.^\;]+))(\;|$)", re.IGNORECASE)
    # regex to find matches like: "https://<host>[:<port>]/a/link/location.html"
    link_ptn = re.compile(r"[\(\'\"\ ](?P<link>https?:\/\/(?P<host>(?P<h_host>((?=[^\(\)\'\"\ \:\/])(?=[\S]).)+))(?P<port>\:[0-9]+)?\/[^\(\)\'\"\ ]+)(?P<encl_stop>[\(\)\'\"\ ])")
    # regex to find matches like: url(../relative/parent_directory/links/without/quotes/are/hard)
    link_ptn_url = re.compile(r"url\([\"\']?(?P<link>((?=[^\)\"\'])(?=[\S]).)+)[\"\']?\)")

    # block special hosts
    forbidden_hosts = [
        re.compile(r"^.*registry\.npmjs\.org$"),  # No yarnpkg repository
        re.compile(r"^.*yarnpkg\.com$"),  # No yarnpkg repository
        re.compile(r"^[0-9\.]+$"),  # avoid IP addresses
        re.compile(r"^[^\.]+$"),  # needs a dot in host
    ]

    # only support certain content types
    supported_mime_types = [
        # (filter function -> bool, file extension -> str)
        (lambda m: m["t2"] == "javascript", lambda m: "js"),
        (lambda m: m["t2"] == "css", lambda m: "css"),
        (lambda m: m["t1"] == "font", lambda m: m["t2"]),
    ]

    # load all initial files
    files_to_check = []
    for cur_dir, n_dir, n_files in os.walk(root_folder):
        files_to_check += [ os.path.join(cur_dir, f) for f in n_files ]

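    # cached_urls: URL -> metadata of a downloaded file (target path/link, where the URL was found)
    # valid_urls: URL -> bool, remembers whether a URL was accepted for mirroring
    # file_origins: absolute path of a downloaded file -> its source URL,
    #               used to resolve relative url(...) references inside that file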
    cached_urls = {}
    valid_urls = {}
    file_origins = {}

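    # files_to_check grows while it is processed (downloaded files are appended and
    # scanned as well), so iterate by index instead of a plain for loop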
    i = 0
    while i < len(files_to_check):
        file_i = files_to_check[i]
        try:
            print("Inspect", file_i)
            with open(file_i, "r", encoding="utf-8") as f:
                t = f.read()

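            # collect all absolute http(s) links found in the file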
            link_findings_default = [ {
                "abs": match.group("link"),
                "found": match.group("link"),
                "host": match.group("host")
            } for match in link_ptn.finditer(t) ]

            # extract relative urls and convert them to absolute http urls
            link_findings_url_prefix = []
            for match in link_ptn_url.finditer(t):
                if os.path.abspath(file_i) in file_origins and not match.group("link").startswith("http"):
                    link_abs = urljoin(file_origins[os.path.abspath(file_i)], match.group("link"))
                    item = {
                        "abs": link_abs,
                        "found": match.group("link"),
                        "host": link_ptn.match("\"" + link_abs + "\"").group("host")
                    }
                    link_findings_url_prefix.append(item)

            for spot in link_findings_default + link_findings_url_prefix:
                absolute_link = spot["abs"]
                found_link = spot["found"]
                found_host = spot["host"]
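                # classify (and possibly download) each distinct URL only once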
                if absolute_link not in valid_urls:
                    # check host against the blocklist
                    if True in [ True for fh in forbidden_hosts if fh.match(found_host) is not None ]:
                        # host is forbidden
                        valid_urls[absolute_link] = False
                    else:
                        # host is not forbidden
                        # check mime type
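                        # a HEAD request only retrieves the headers, so nothing is downloaded yet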
                        response = requests.head(absolute_link, allow_redirects=True)
                        mime = response.headers.get("Content-Type", None)
                        if mime is None:
                            valid_urls[absolute_link] = False
                        else:
                            mime_match = mime_ptn.match(mime)
                            if mime_match is None:
                                valid_urls[absolute_link] = False
                            else:
                                final_fext = None
                                # try supported content types
                                for smt, get_fext in supported_mime_types:
                                    if smt(mime_match):
                                        final_fext = get_fext(mime_match)
                                        break
                                if final_fext is None:
                                    # mime not supported
                                    valid_urls[absolute_link] = False
                                else:
                                    # mime is supported -> store and remember file
                                    valid_urls[absolute_link] = True
                                    file_unique = uuid.uuid4()
                                    target_link = offline_link.format(file_unique, final_fext)
                                    target_file = offline_file.format(file_unique, final_fext)
                                    # download file
                                    try:
                                        file_response = requests.get(absolute_link, allow_redirects=True)
                                        file_response.raise_for_status()
                                        with open(target_file, 'wb') as download_file:
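                                            # write the response body to disk in 100 kB chunks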
                                            for chunk in file_response.iter_content(100000):
                                                download_file.write(chunk)
                                        # also check the downloaded file for links later
                                        files_to_check.append(target_file)
                                        print("Downloaded file:", absolute_link)
                                    except Exception:
                                        print("Link could not be downloaded:", absolute_link)

                                    # register downloaded file
                                    cached_urls[absolute_link] = {
                                        "input_link": absolute_link,
                                        "target_link": target_link,
                                        "file": target_file,
                                        "fext": final_fext,
                                        "found": [ {"file": file_i, "found_link": found_link} ]
                                    }
                                    # store reverse lookup for recursive url("../rel/link") patterns
                                    file_origins[os.path.abspath(target_file)] = absolute_link

                if valid_urls[absolute_link]:
                    # add to cached urls entries
                    cached_urls[absolute_link]["found"].append({"file": file_i, "found_link": found_link})

            print("Checked file:", file_i)
        except UnicodeDecodeError:
            print("Skip file (not valid UTF-8):", file_i)
        except Exception:
            print("Unknown error... Skip file:", file_i)

        # look at next file
        i += 1

    # replace found links with the offline link
    for _, cached in cached_urls.items():
        for edit_file in cached["found"]:
            with open(edit_file["file"], "r", encoding="utf-8") as f:
                file_content = f.read()
            with open(edit_file["file"], "w", encoding="utf-8") as f:
                f.write(file_content.replace(edit_file["found_link"], cached["target_link"]))
        print("Patched", len(cached["found"]), "file(s) with link:", cached["target_link"])

    print("Done")


if __name__ == "__main__":
    main()