import sys, os, re
import uuid
from urllib.parse import urljoin

import requests
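
# Make a web app's external static assets (JS, CSS, fonts) available offline:
# scan all files below a root folder for http(s) links, download supported
# assets into an offline folder and patch the scanned files to use a local
# link prefix instead.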
def main():
    # root folder to scan for URLs
    root_folder = sys.argv[1]
    # offline folder to store static offline files
    offline_folder = sys.argv[2]
    # offline link prefix
    offline_prefix = sys.argv[3]

    # templates for the local file path and the public link,
    # filled below with a random UUID and a file extension
    offline_file = os.path.join(offline_folder, "offline_{}.{}")
    offline_link = offline_prefix + "/offline_{}.{}"

    mime_ptn = re.compile(r"(?P<mime>(?P<t1>[\w^\/]+)\/(?P<t2>[\S\.^\;]+))(\;|$)", re.IGNORECASE)
    #link_ptn = re.compile(r"(?P<encl>[\S\"\'])(?P<link>https?:\/\/(?P<host>[\S^:\/)]+)(?P<port>\:[0-9]+)?\/((?!(?P=encl)).)+)(?P=encl)", re.IGNORECASE)
    # Regex to find matches like: "https://<host>[:<port>]/a/link/location.html"
    link_ptn = re.compile(r"[\(\'\"\ ](?P<link>https?:\/\/(?P<host>(?P<h_host>((?=[^\(\)\'\"\ \:\/])(?=[\S]).)+))(?P<port>\:[0-9]+)?\/[^\(\)\'\"\ ]+)(?P<encl_stop>[\(\)\'\"\ ])")
    # Regex to find matches like: url(../relative/parent_directory/links/without/quotes/are/hard)
    link_ptn_url = re.compile(r"url\([\"\']?(?P<link>((?=[^\)\"\'])(?=[\S]).)+)[\"\']?\)")
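    # e.g. mime_ptn.match("text/css; charset=utf-8") yields t1 == "text" and
    # t2 == "css"; link_ptn only matches urls that are enclosed by quotes,
    # parentheses or spaces on both sides
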
    # Block special hosts
    forbidden_hosts = [
        re.compile(r"^.*registry\.npmjs\.org$"),  # no npm registry
        re.compile(r"^.*yarnpkg\.com$"),          # no yarnpkg repository
        re.compile(r"^[0-9\.]+$"),                # avoid plain IP addresses
        re.compile(r"^[^\.]+$"),                  # host needs at least one dot
    ]

    # only support certain content types
    supported_mime_types = [
        # (filter function: match -> bool, file extension: match -> str)
        (lambda m: m["t2"] == "javascript", lambda m: "js"),
        (lambda m: m["t2"] == "css", lambda m: "css"),
        (lambda m: m["t1"] == "font", lambda m: m["t2"]),
    ]
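    # e.g. "application/javascript" is stored as offline_<uuid>.js, "text/css"
    # as .css and "font/woff2" as .woff2 (fonts keep their mime subtype)
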
    # load all initial files
    files_to_check = []
    for cur_dir, n_dir, n_files in os.walk(root_folder):
        files_to_check += [ os.path.join(cur_dir, f) for f in n_files ]

    cached_urls = {}    # absolute link -> record of the downloaded offline copy
    valid_urls = {}     # absolute link -> bool, memoized validation verdict
    file_origins = {}   # absolute local path -> remote url it was fetched from
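
    # files_to_check acts as a work queue: downloaded files are appended while
    # iterating, so an index-based while loop is used instead of a for loop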
    i = 0
    while i < len(files_to_check):
        file_i = files_to_check[i]
        try:
            print("Inspect", file_i)
            with open(file_i, "r", encoding="utf-8") as f:
                t = f.read()
            link_findings_default = [ {
                "abs": match.group("link"),
                "found": match.group("link"),
                "host": match.group("host"),
            } for match in link_ptn.finditer(t) ]
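            # "abs" is the absolute url used for the requests, "found" the
            # literal text found in the file (needed for the replacement later)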
            # extract relative urls and convert them to absolute http urls
            link_findings_url_prefix = []
            for match in link_ptn_url.finditer(t):
                if os.path.abspath(file_i) in file_origins and not match.group("link").startswith("http"):
                    link_abs = urljoin(file_origins[os.path.abspath(file_i)], match.group("link"))
                    # wrap in quotes so link_ptn sees the enclosing characters
                    # it expects, then pull out the host group
                    host_match = link_ptn.match("\"" + link_abs + "\"")
                    if host_match is None:
                        continue
                    item = {
                        "abs": link_abs,
                        "found": match.group("link"),
                        "host": host_match.group("host"),
                    }
                    link_findings_url_prefix.append(item)
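
            # validate and download each link at most once; the verdict is
            # memoized in valid_urls and shared across all scanned files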
            for spot in link_findings_default + link_findings_url_prefix:
                absolute_link = spot["abs"]
                found_link = spot["found"]
                found_host = spot["host"]
                if absolute_link not in valid_urls:
                    # check the extracted host (not the full link) against the blocklist
                    if any(fh.match(found_host) for fh in forbidden_hosts):
                        # host is forbidden
                        valid_urls[absolute_link] = False
                    else:
                        # host is not forbidden
                        # check mime type via a HEAD request
                        try:
                            response = requests.head(absolute_link, allow_redirects=True)
                        except requests.RequestException:
                            valid_urls[absolute_link] = False
                            continue
                        mime = response.headers.get("Content-Type", None)
                        if mime is None:
                            valid_urls[absolute_link] = False
                        else:
                            mime_match = mime_ptn.match(mime)
                            if mime_match is None:
                                valid_urls[absolute_link] = False
                            else:
                                final_fext = None
                                # try supported content types
                                for smt, get_fext in supported_mime_types:
                                    if smt(mime_match):
                                        final_fext = get_fext(mime_match)
                                        break
                                if final_fext is None:
                                    # mime not supported
                                    valid_urls[absolute_link] = False
                                else:
                                    # mime is supported -> download and remember the file
                                    file_unique = uuid.uuid4()
                                    target_link = offline_link.format(file_unique, final_fext)
                                    target_file = offline_file.format(file_unique, final_fext)
                                    # download file
                                    try:
                                        file_response = requests.get(absolute_link, allow_redirects=True)
                                        file_response.raise_for_status()
                                        with open(target_file, "wb") as download_file:
                                            for chunk in file_response.iter_content(100000):
                                                download_file.write(chunk)
                                        # also check the downloaded file for links later
                                        files_to_check.append(target_file)
                                        print("Downloaded file:", absolute_link)
                                    except (requests.RequestException, OSError):
                                        # a failed download must not be registered, otherwise
                                        # links would be patched to a missing offline file
                                        print("Link could not be downloaded:", absolute_link)
                                        valid_urls[absolute_link] = False
                                        continue
                                    valid_urls[absolute_link] = True
                                    # register downloaded file
                                    cached_urls[absolute_link] = {
                                        "input_link": absolute_link,
                                        "target_link": target_link,
                                        "file": target_file,
                                        "fext": final_fext,
                                        "found": [ {"file": file_i, "found_link": found_link} ],
                                    }
                                    # store reverse lookup for recursive url("../rel/link") patterns
                                    file_origins[os.path.abspath(target_file)] = absolute_link
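                # the link was validated earlier: only record the additional
                # occurrence so this file gets patched as well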
                elif valid_urls[absolute_link]:
                    # add to cached urls entries
                    cached_urls[absolute_link]["found"].append({"file": file_i, "found_link": found_link})
            print("Checked file:", file_i)
        except UnicodeDecodeError:
            print("Skip file (not valid unicode):", file_i)
        except Exception as e:
            print("Unknown error... Skip file:", file_i, "-", e)
        # look at next file
        i += 1

    # replace the found links with their offline counterparts in all files
    for _, cached in cached_urls.items():
        for edit_file in cached["found"]:
            with open(edit_file["file"], "r", encoding="utf-8") as f:
                file_content = f.read()
            with open(edit_file["file"], "w", encoding="utf-8") as f:
                f.write(file_content.replace(edit_file["found_link"], cached["target_link"]))
        print("Patched", len(cached["found"]), "file(s) with link:", cached["target_link"])
    print("Done")


if __name__ == "__main__":
    main()
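
# example invocation (script name, paths and prefix are illustrative):
#   python3 offline_patcher.py ./site_dump ./site_dump/offline /offline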