author     Bryan Newbold <bnewbold@robocracy.org>    2019-08-22 23:21:26 +0200
committer  Bryan Newbold <bnewbold@robocracy.org>    2019-08-22 23:21:26 +0200
commit     fe5c210f23632a99f29555e9f6a2f10a08a32c65 (patch)
tree       81fe9872bf11f51e95ea674eeb67bfe21ff9685e
parent     29af9d23d65a86b6037d307ad86f6d1681ff110c (diff)
download   fatcat-fe5c210f23632a99f29555e9f6a2f10a08a32c65.tar.gz
           fatcat-fe5c210f23632a99f29555e9f6a2f10a08a32c65.zip
improvements to wayback_static importer
-rwxr-xr-x  python/fatcat_tools/importers/wayback_static.py  35
1 file changed, 29 insertions, 6 deletions
diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py
index 72f4d658..2125017a 100755
--- a/python/fatcat_tools/importers/wayback_static.py
+++ b/python/fatcat_tools/importers/wayback_static.py
@@ -81,6 +81,7 @@ def fetch_wbm(url):
     return resp.content

 def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None):
+    sys.stderr.write(embed_url + "\n")
     assert embed_url.startswith('/web/')
     embed_url = embed_url.split('/')
     timestamp = embed_url[2]
@@ -125,22 +126,44 @@ def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None):
     else:
         return None

+def wayback_url_to_relative(url):
+    """
+    Wayback URLs can be relative or absolute in rewritten documents. This
+    function converts any form of rewritten URL to a relative (to
+    web.archive.org) one, or returns None if it isn't a rewritten URL at all.
+    """
+    if url.startswith('https://web.archive.org/'):
+        url = url[23:]
+    elif url.startswith('http://web.archive.org/'):
+        url = url[22:]
+
+    if url.startswith('/web/'):
+        return url
+    else:
+        return None
+
 def extract_embeds(soup):
     embeds = set()
     # <link href="">
     for tag in soup.find_all('link', href=True):
-        if tag['href'].startswith('/web/'):
-            embeds.add(tag['href'])
+        if tag['rel'] not in ('stylesheet',):
+            continue
+        url = wayback_url_to_relative(tag['href'])
+        if url:
+            embeds.add(url)
     # <img src="">
     for tag in soup.find_all('img', src=True):
-        if tag['src'].startswith('/web/'):
-            embeds.add(tag['src'])
+        url = wayback_url_to_relative(tag['src'])
+        if url:
+            embeds.add(url)
+
     # <script src="">
     for tag in soup.find_all('script', src=True):
-        if tag['src'].startswith('/web/'):
-            embeds.add(tag['src'])
+        url = wayback_url_to_relative(tag['src'])
+        if url:
+            embeds.add(url)
     return list(embeds)
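
Note (not part of the commit): a minimal sketch of what the new wayback_url_to_relative() helper produces for the URL forms found in rewritten Wayback documents, and how lookup_cdx() then takes the timestamp from the third path segment. The helper body is copied from the patch above; the example URLs are made up for illustration.

    # Sketch only; helper copied from the patch, inputs are hypothetical.
    def wayback_url_to_relative(url):
        if url.startswith('https://web.archive.org/'):
            url = url[23:]
        elif url.startswith('http://web.archive.org/'):
            url = url[22:]
        if url.startswith('/web/'):
            return url
        else:
            return None

    examples = [
        "/web/20190822120000/http://example.com/style.css",
        "https://web.archive.org/web/20190822120000/http://example.com/logo.png",
        "http://example.com/not-rewritten.js",
    ]

    for url in examples:
        rel = wayback_url_to_relative(url)
        print(rel)
        if rel:
            # lookup_cdx() does the same split; index 2 is the 14-digit timestamp
            print("  timestamp:", rel.split('/')[2])

    # The first two normalize to relative /web/... paths with timestamp
    # 20190822120000; the third is not a rewritten URL, so the helper returns None.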
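
Note (not part of the commit): a sketch of driving the updated extract_embeds() over a rewritten page, assuming BeautifulSoup 4 is installed and the fatcat_tools package (with its dependencies) is importable; the HTML snippet is made up for illustration.

    from bs4 import BeautifulSoup

    # Assumes the module path shown in the diff header is importable as a package.
    from fatcat_tools.importers.wayback_static import extract_embeds

    html = """
    <html><head>
      <link rel="canonical" href="http://example.com/">
      <script src="https://web.archive.org/web/20190822120000js_/http://example.com/app.js"></script>
    </head><body>
      <img src="/web/20190822120000im_/http://example.com/logo.png">
    </body></html>
    """

    soup = BeautifulSoup(html, "html.parser")
    for embed in extract_embeds(soup):
        print(embed)

    # The rel="canonical" <link> is skipped by the new filter; the <script> and
    # <img> URLs come back normalized to the relative /web/... form.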