#!/usr/bin/env python3
import os
import re
import sys
import uuid
from urllib.parse import urljoin

import requests
"""Script Information
|
|
|
|
This script scans all files of a given directory [1] for URL addresses and
|
|
hyperlink references.
|
|
All found URLs are requested for Content-Type.
|
|
For certain Content-Types (like js, css, or fonts), the file is downloaded and
|
|
stored locally into a given directory [2] and the existing URLs are altered
|
|
to a local URL location (with a given URL prefix [3]).
|
|
|
|
Downloaded files are scanned for URLs recursively.
|
|
Relative references in CSS files are an edge case that is
|
|
handled separately by a specific regex pattern.
|
|
|
|
Arguments:
|
|
1. <root directory [1]>
|
|
2. <local offline storage directory [2]>
|
|
3. <HTTP URL location prefix [3]>
|
|
|
|
Example:
|
|
- Given:
|
|
- File ./webspace/index.html, containing URL: https://example.com/library.js
|
|
- Directory ./webspace/static, containing static files,
|
|
serving content on HTTP location: /staticfiles
|
|
|
|
- Call:
|
|
$> python3 offline_patcher.py webspace/ webspace/static /staticfiles
|
|
|
|
- Result:
|
|
- Library from https://example.com/library.js is stored as file:
|
|
webspace/static/offline_<uuid>.js
|
|
- Link in file webspace/index.html is replaced to:
|
|
/staticfiles/offline_<uuid>.js
|
|
- File webspace/static/offline_<uuid>.js is scanned recursively for URLs
|
|
|
|
Author: Johann Frei
|
|
"""


def main():
    # root folder to scan for URLs
    root_folder = sys.argv[1]
    # offline folder to store static offline files
    offline_folder = sys.argv[2]
    # offline link prefix
    offline_prefix = sys.argv[3]

    offline_file = os.path.join(offline_folder, "offline_{}.{}")
    offline_link = offline_prefix + "/offline_{}.{}"
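    # For illustration (values assumed, matching the docstring example): with
    # offline_folder "webspace/static" and offline_prefix "/staticfiles", a CSS
    # download would be stored as "webspace/static/offline_<uuid>.css" and
    # linked as "/staticfiles/offline_<uuid>.css".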

    mime_ptn = re.compile(r"(?P<mime>(?P<t1>[\w^\/]+)\/(?P<t2>[\S\.^\;]+))(\;|$)", re.IGNORECASE)
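    # For illustration (assumed header value): a Content-Type such as
    # "text/css; charset=UTF-8" should parse to t1="text" and t2="css".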

    # regex to find matches like: "https://<host>[:<port>]/a/link/location.html"
    link_ptn = re.compile(r"[\(\'\"\ ](?P<link>https?:\/\/(?P<host>(?P<h_host>((?=[^\(\)\'\"\ \:\/])(?=[\S]).)+))(?P<port>\:[0-9]+)?\/[^\(\)\'\"\ ]+)(?P<encl_stop>[\(\)\'\"\ ])")
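    # For illustration (assumed input): in the text
    #   <script src="https://example.com/assets/app.js"></script>
    # the pattern captures link="https://example.com/assets/app.js" and
    # host="example.com"; the surrounding quotes act as delimiters.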
    # regex to find matches like: url(../relative/parent_directory/links/without/quotes/are/hard)
    link_ptn_url = re.compile(r"url\([\"\']?(?P<link>((?=[^\)\"\'])(?=[\S]).)+)[\"\']?\)")
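    # For illustration (assumed CSS input): in
    #   src: url(../fonts/icons.woff2);
    # the pattern captures link="../fonts/icons.woff2"; such relative links are
    # resolved later against the URL the containing file was downloaded from.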

    # block special hosts
    forbidden_hosts = [
        re.compile(r"^.*registry\.npmjs\.org$"),  # no npm registry
        re.compile(r"^.*yarnpkg\.com$"),  # no yarnpkg repository
        re.compile(r"^[0-9\.]+$"),  # avoid IP addresses
        re.compile(r"^[^\.]+$"),  # needs a dot in host
    ]
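    # For illustration (assumed hosts): "registry.npmjs.org" and any host ending
    # in "yarnpkg.com" are rejected, as are "127.0.0.1" (IP address) and
    # "localhost" (no dot in the host name).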

    # only support certain content types
    supported_mime_types = [
        # (filter function -> bool, file extension -> str)
        (lambda m: m["t2"] == "javascript", lambda m: "js"),
        (lambda m: m["t2"] == "css", lambda m: "css"),
        (lambda m: m["t1"] == "font", lambda m: m["t2"]),
    ]
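    # For illustration (example types assumed, not exhaustive):
    # "application/javascript" maps to the extension "js", "text/css" to "css",
    # and a font type such as "font/woff2" keeps its subtype as the extension.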


    # load all initial files
    files_to_check = []
    for cur_dir, n_dir, n_files in os.walk(root_folder):
        files_to_check += [ os.path.join(cur_dir, f) for f in n_files ]

    cached_urls = {}    # absolute URL -> info about the locally stored copy
    valid_urls = {}     # absolute URL -> True/False (allowed host and supported type)
    file_origins = {}   # absolute path of a downloaded file -> URL it came from

    i = 0
    while i < len(files_to_check):
        file_i = files_to_check[i]
        try:
            print("Inspect", file_i)
            with open(file_i, "r", encoding="utf-8") as f:
                t = f.read()

            link_findings_default = [ {
                "abs": match.group("link"),
                "found": match.group("link"),
                "host": match.group("host")
            } for match in link_ptn.finditer(t) ]
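            # For illustration (assumed finding, reusing the docstring example):
            # a match on "https://example.com/library.js" yields
            #   {"abs": "https://example.com/library.js",
            #    "found": "https://example.com/library.js",
            #    "host": "example.com"}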

            # extract relative urls and convert them to absolute http urls
            link_findings_url_prefix = []
            for match in link_ptn_url.finditer(t):
                if os.path.abspath(file_i) in file_origins and not match.group("link").startswith("http"):
                    link_abs = urljoin(file_origins[os.path.abspath(file_i)], match.group("link"))
                    item = {
                        "abs": link_abs,
                        "found": match.group("link"),
                        "host": link_ptn.match("\"" + link_abs + "\"").group("host")
                    }
                    link_findings_url_prefix.append(item)
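            # For illustration (assumed values): if this file was downloaded from
            # "https://example.com/css/site.css", a reference url(../fonts/icons.woff2)
            # resolves via urljoin to "https://example.com/fonts/icons.woff2".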

            for spot in link_findings_default + link_findings_url_prefix:
                absolute_link = spot["abs"]
                found_link = spot["found"]
                found_host = spot["host"]

                if absolute_link not in valid_urls:
                    # check host against the blocklist
                    if any(fh.match(found_host) is not None for fh in forbidden_hosts):
                        # host is forbidden
                        valid_urls[absolute_link] = False
                    else:
                        # host is not forbidden
                        # check mime type
                        response = requests.head(absolute_link, allow_redirects=True)
                        mime = response.headers.get("Content-Type", None)
                        if mime is None:
                            valid_urls[absolute_link] = False
                        else:
                            mime_match = mime_ptn.match(mime)
                            if mime_match is None:
                                valid_urls[absolute_link] = False
                            else:
                                final_fext = None
                                # try supported content types
                                for smt, get_fext in supported_mime_types:
                                    if smt(mime_match):
                                        final_fext = get_fext(mime_match)
                                        break
                                if final_fext is None:
                                    # mime not supported
                                    valid_urls[absolute_link] = False
                                else:
                                    # mime is supported -> store and remember file
                                    valid_urls[absolute_link] = True
                                    file_unique = uuid.uuid4()
                                    target_link = offline_link.format(file_unique, final_fext)
                                    target_file = offline_file.format(file_unique, final_fext)

                                    # download file
                                    try:
                                        file_response = requests.get(absolute_link, allow_redirects=True)
                                        file_response.raise_for_status()
                                        with open(target_file, 'wb') as download_file:
                                            for chunk in file_response.iter_content(100000):
                                                download_file.write(chunk)
                                        # also check downloaded file for links later
                                        files_to_check.append(target_file)

                                        print("Downloaded file:", absolute_link)
                                    except Exception:
                                        print("Link could not be downloaded:", absolute_link)

                                    # register downloaded file; the places where the
                                    # link was found are appended below
                                    cached_urls[absolute_link] = {
                                        "input_link": absolute_link,
                                        "target_link": target_link,
                                        "file": target_file,
                                        "fext": final_fext,
                                        "found": []
                                    }
                                    # store reverse lookup for recursive url("../rel/link") patterns
                                    file_origins[os.path.abspath(target_file)] = absolute_link
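                                    # For illustration (assumed paths): a stylesheet fetched from
                                    # "https://example.com/css/site.css" is remembered as
                                    #   file_origins["<abs path>/offline_<uuid>.css"] = "https://example.com/css/site.css"
                                    # so relative url(...) references inside it can be resolved later.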

                if valid_urls[absolute_link]:
                    # add to cached urls entries
                    cached_urls[absolute_link]["found"].append({"file": file_i, "found_link": found_link})

            print("Checked file:", file_i)
        except UnicodeDecodeError:
            print("Skip file (not UTF-8 text):", file_i)
        except Exception:
            print("Unknown error... Skip file:", file_i)

        # look at next file
        i += 1

    # replace found links with the offline link
    for _, cached in cached_urls.items():
        for edit_file in cached["found"]:
            with open(edit_file["file"], "r", encoding="utf-8") as f:
                file_content = f.read()
            with open(edit_file["file"], "w", encoding="utf-8") as f:
                f.write(file_content.replace(edit_file["found_link"], cached["target_link"]))
        print("Patched", len(cached["found"]), "file(s) with link:", cached["target_link"])

    print("Done")


if __name__ == "__main__":
    main()