diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-08-22 23:21:26 +0200 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-08-22 23:21:26 +0200 |
commit | fe5c210f23632a99f29555e9f6a2f10a08a32c65 (patch) | |
tree | 81fe9872bf11f51e95ea674eeb67bfe21ff9685e | |
parent | 29af9d23d65a86b6037d307ad86f6d1681ff110c (diff) | |
download | fatcat-fe5c210f23632a99f29555e9f6a2f10a08a32c65.tar.gz fatcat-fe5c210f23632a99f29555e9f6a2f10a08a32c65.zip |
improvements to wayback_static importer
-rwxr-xr-x | python/fatcat_tools/importers/wayback_static.py | 35 |
1 files changed, 29 insertions, 6 deletions
```diff
diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py
index 72f4d658..2125017a 100755
--- a/python/fatcat_tools/importers/wayback_static.py
+++ b/python/fatcat_tools/importers/wayback_static.py
@@ -81,6 +81,7 @@ def fetch_wbm(url):
     return resp.content
 
 def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None):
+    sys.stderr.write(embed_url + "\n")
     assert embed_url.startswith('/web/')
     embed_url = embed_url.split('/')
     timestamp = embed_url[2]
@@ -125,22 +126,44 @@ def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None):
     else:
         return None
 
+def wayback_url_to_relative(url):
+    """
+    Wayback URLs can be relative or absolute in rewritten documents. This
+    function converts any form of rewritten URL to a relative (to
+    web.archive.org) one, or returns None if it isn't a rewritten URL at all.
+    """
+    if url.startswith('https://web.archive.org/'):
+        url = url[23:]
+    elif url.startswith('http://web.archive.org/'):
+        url = url[22:]
+
+    if url.startswith('/web/'):
+        return url
+    else:
+        return None
+
 def extract_embeds(soup):
 
     embeds = set()
 
     # <link href="">
     for tag in soup.find_all('link', href=True):
-        if tag['href'].startswith('/web/'):
-            embeds.add(tag['href'])
+        if tag['rel'] not in ('stylesheet',):
+            continue
+        url = wayback_url_to_relative(tag['href'])
+        if url:
+            embeds.add(url)
     # <img src="">
     for tag in soup.find_all('img', src=True):
-        if tag['src'].startswith('/web/'):
-            embeds.add(tag['src'])
+        url = wayback_url_to_relative(tag['src'])
+        if url:
+            embeds.add(url)
+
     # <script src="">
     for tag in soup.find_all('script', src=True):
-        if tag['src'].startswith('/web/'):
-            embeds.add(tag['src'])
+        url = wayback_url_to_relative(tag['src'])
+        if url:
+            embeds.add(url)
 
     return list(embeds)
 
```