diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2019-08-22 23:21:26 +0200 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-08-22 23:21:26 +0200 | 
| commit | fe5c210f23632a99f29555e9f6a2f10a08a32c65 (patch) | |
| tree | 81fe9872bf11f51e95ea674eeb67bfe21ff9685e | |
| parent | 29af9d23d65a86b6037d307ad86f6d1681ff110c (diff) | |
| download | fatcat-fe5c210f23632a99f29555e9f6a2f10a08a32c65.tar.gz fatcat-fe5c210f23632a99f29555e9f6a2f10a08a32c65.zip | |
improvements to wayback_static importer
| -rwxr-xr-x | python/fatcat_tools/importers/wayback_static.py | 35 | 
1 files changed, 29 insertions, 6 deletions
```diff
diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py
index 72f4d658..2125017a 100755
--- a/python/fatcat_tools/importers/wayback_static.py
+++ b/python/fatcat_tools/importers/wayback_static.py
@@ -81,6 +81,7 @@ def fetch_wbm(url):
     return resp.content
 
 def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None):
+    sys.stderr.write(embed_url + "\n")
     assert embed_url.startswith('/web/')
     embed_url = embed_url.split('/')
     timestamp = embed_url[2]
@@ -125,22 +126,44 @@ def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None):
     else:
         return None
 
+def wayback_url_to_relative(url):
+    """
+    Wayback URLs can be relative or absolute in rewritten documents. This
+    function converts any form of rewritten URL to a relative (to
+    web.archive.org) one, or returns None if it isn't a rewritten URL at all.
+    """
+    if url.startswith('https://web.archive.org/'):
+        url = url[23:]
+    elif url.startswith('http://web.archive.org/'):
+        url = url[22:]
+
+    if url.startswith('/web/'):
+        return url
+    else:
+        return None
+
 def extract_embeds(soup):
 
     embeds = set()
 
     # <link href="">
     for tag in soup.find_all('link', href=True):
-        if tag['href'].startswith('/web/'):
-            embeds.add(tag['href'])
+        if tag['rel'] not in ('stylesheet',):
+            continue
+        url = wayback_url_to_relative(tag['href'])
+        if url:
+            embeds.add(url)
     # <img src="">
     for tag in soup.find_all('img', src=True):
-        if tag['src'].startswith('/web/'):
-            embeds.add(tag['src'])
+        url = wayback_url_to_relative(tag['src'])
+        if url:
+            embeds.add(url)
+
     # <script src="">
     for tag in soup.find_all('script', src=True):
-        if tag['src'].startswith('/web/'):
-            embeds.add(tag['src'])
+        url = wayback_url_to_relative(tag['src'])
+        if url:
+            embeds.add(url)
 
     return list(embeds)
```
