Diffstat (limited to 'python/fatcat_tools/importers/wayback_static.py')
-rwxr-xr-x | python/fatcat_tools/importers/wayback_static.py | 166
1 file changed, 94 insertions(+), 72 deletions(-)
diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py
index 196f86ff..22fefad3 100755
--- a/python/fatcat_tools/importers/wayback_static.py
+++ b/python/fatcat_tools/importers/wayback_static.py
@@ -33,22 +33,23 @@ REQ_SESSION = requests.Session()
 def parse_wbm_url(url):
     """Takes a wayback machine URL, and returns a tuple:
 
-        (timestamp, datetime, original_url)
+    (timestamp, datetime, original_url)
     """
-    chunks = url.split('/')
+    chunks = url.split("/")
     assert len(chunks) >= 6
-    assert chunks[2] == 'web.archive.org'
-    assert chunks[3] == 'web'
-    return (chunks[4],
-            parse_wbm_timestamp(chunks[4]),
-            '/'.join(chunks[5:]))
+    assert chunks[2] == "web.archive.org"
+    assert chunks[3] == "web"
+    return (chunks[4], parse_wbm_timestamp(chunks[4]), "/".join(chunks[5:]))
+
 
 def test_parse_wbm_url():
     u = "http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html"
     assert parse_wbm_url(u) == (
         "20010712114837",
         datetime.datetime(2001, 7, 12, 11, 48, 37),
-        "http://www.dlib.org/dlib/june01/reich/06reich.html")
+        "http://www.dlib.org/dlib/june01/reich/06reich.html",
+    )
+
 
 def parse_wbm_timestamp(timestamp):
     """
@@ -56,7 +57,7 @@ def parse_wbm_timestamp(timestamp):
     python datetime object (UTC)
     """
     # strip any "im_" or "id_" suffix
-    if timestamp.endswith('_'):
+    if timestamp.endswith("_"):
         timestamp = timestamp[:-3]
     # inflexible; require the full second-precision timestamp
     assert len(timestamp) == 14
@@ -66,11 +67,13 @@
         day=int(timestamp[6:8]),
         hour=int(timestamp[8:10]),
         minute=int(timestamp[10:12]),
-        second=int(timestamp[12:14]))
+        second=int(timestamp[12:14]),
+    )
+
 
 def test_parse_wbm_timestamp():
-    assert parse_wbm_timestamp("20010712114837") == \
-        datetime.datetime(2001, 7, 12, 11, 48, 37)
+    assert parse_wbm_timestamp("20010712114837") == datetime.datetime(2001, 7, 12, 11, 48, 37)
+
 
 def fetch_wbm(url):
     resp = REQ_SESSION.get(url)
@@ -78,31 +81,35 @@
     assert resp.content
     return resp.content
 
+
 def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None):
     sys.stderr.write(embed_url + "\n")
-    assert embed_url.startswith('/web/')
-    embed_url = embed_url.split('/')
+    assert embed_url.startswith("/web/")
+    embed_url = embed_url.split("/")
     timestamp = embed_url[2]
-    if timestamp.endswith('_'):
+    if timestamp.endswith("_"):
         timestamp = timestamp[:-3]
-    url = '/'.join(embed_url[3:])
-    #print((timestamp, url))
-    resp = REQ_SESSION.get(CDX_API_BASE, params=dict(
-        url=url,
-        closest=timestamp,
-        sort="closest",
-        resolveRevisits="true",
-        matchType="exact",
-        limit=1,
-    ))
+    url = "/".join(embed_url[3:])
+    # print((timestamp, url))
+    resp = REQ_SESSION.get(
+        CDX_API_BASE,
+        params=dict(
+            url=url,
+            closest=timestamp,
+            sort="closest",
+            resolveRevisits="true",
+            matchType="exact",
+            limit=1,
+        ),
+    )
     resp.raise_for_status()
-    #print(resp.url)
+    # print(resp.url)
     if resp.content:
-        hit = resp.content.decode('utf-8').split('\n')[0]
+        hit = resp.content.decode("utf-8").split("\n")[0]
         if cdx_output:
             cdx_output.write(hit + "\n")
-        cdx = hit.split(' ')
-        cdx = [x if (x and x != '-') else None for x in cdx]
+        cdx = hit.split(" ")
+        cdx = [x if (x and x != "-") else None for x in cdx]
         webcapture_cdx = WebcaptureCdxLine(
             surt=cdx[0],
             timestamp=parse_wbm_timestamp(cdx[1]).isoformat() + "Z",
@@ -113,9 +120,9 @@
             sha256=None,
         )
         if verify_hashes:
-            resp = REQ_SESSION.get(GWB_URL_BASE + "/{}id_/{}".format(
-                cdx[1], # raw timestamp
-                webcapture_cdx.url))
+            resp = REQ_SESSION.get(
+                GWB_URL_BASE + "/{}id_/{}".format(cdx[1], webcapture_cdx.url)  # raw timestamp
+            )
             resp.raise_for_status()
             assert webcapture_cdx.sha1 == hashlib.sha1(resp.content).digest().hex()
             webcapture_cdx.sha256 = hashlib.sha256(resp.content).digest().hex()
@@ -124,47 +131,50 @@ def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None):
     else:
         return None
 
+
 def wayback_url_to_relative(url):
     """
     Wayback URLs can be relative or absolute in rewritten documents. This
     function converts any form of rewritten URL to a relative (to
     web.archive.org) one, or returns None if it isn't a rewritten URL at all.
     """
-    if url.startswith('https://web.archive.org/'):
+    if url.startswith("https://web.archive.org/"):
         url = url[23:]
-    elif url.startswith('http://web.archive.org/'):
+    elif url.startswith("http://web.archive.org/"):
         url = url[22:]
 
-    if url.startswith('/web/'):
+    if url.startswith("/web/"):
         return url
     else:
         return None
 
+
 def extract_embeds(soup):
 
     embeds = set()
 
     # <link href="">
-    for tag in soup.find_all('link', href=True):
-        if tag['rel'] not in ('stylesheet',):
+    for tag in soup.find_all("link", href=True):
+        if tag["rel"] not in ("stylesheet",):
             continue
-        url = wayback_url_to_relative(tag['href'])
+        url = wayback_url_to_relative(tag["href"])
         if url:
             embeds.add(url)
     # <img src="">
-    for tag in soup.find_all('img', src=True):
-        url = wayback_url_to_relative(tag['src'])
+    for tag in soup.find_all("img", src=True):
+        url = wayback_url_to_relative(tag["src"])
         if url:
             embeds.add(url)
     # <script src="">
-    for tag in soup.find_all('script', src=True):
-        url = wayback_url_to_relative(tag['src'])
+    for tag in soup.find_all("script", src=True):
+        url = wayback_url_to_relative(tag["src"])
         if url:
             embeds.add(url)
 
     return list(embeds)
 
+
 def static_wayback_webcapture(wayback_url, cdx_output=None):
     """
     Given a complete wayback machine capture URL, like:
@@ -177,36 +187,40 @@
     wbm_html = fetch_wbm(wayback_url)
     raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url)
 
-    #with open(rewritten_path, 'r') as fp:
+    # with open(rewritten_path, 'r') as fp:
     #    soup = BeautifulSoup(fp, "lxml")
     soup = BeautifulSoup(wbm_html, "lxml")
     embeds = extract_embeds(soup)
-    cdx_obj = lookup_cdx("/web/{}/{}".format(raw_timestamp, original_url),
-        cdx_output=cdx_output)
+    cdx_obj = lookup_cdx(
+        "/web/{}/{}".format(raw_timestamp, original_url), cdx_output=cdx_output
+    )
     cdx_list = [cdx_obj]
     for url in embeds:
         cdx_obj = lookup_cdx(url, cdx_output=cdx_output)
         cdx_list.append(cdx_obj)
-    archive_urls = [WebcaptureUrl(
-        rel="wayback",
-        url="https://web.archive.org/web/",
-    )]
+    archive_urls = [
+        WebcaptureUrl(
+            rel="wayback",
+            url="https://web.archive.org/web/",
+        )
+    ]
     wc = WebcaptureEntity(
         cdx=cdx_list,
         timestamp=timestamp.isoformat() + "Z",
         original_url=original_url,
         archive_urls=archive_urls,
-        release_ids=None)
+        release_ids=None,
+    )
     return wc
 
+
 def auto_wayback_static(api, release_id, wayback_url, editgroup_id=None):
     """
     Returns a tuple: (editgroup_id, edit). If failed, both are None
     """
     raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url)
-    git_rev = subprocess.check_output(
-        ["git", "describe", "--always"]).strip().decode('utf-8')
+    git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
 
     release = api.get_release(release_id, expand="webcaptures")
@@ -214,37 +228,44 @@
     for wc in release.webcaptures:
         if wc.original_url == original_url and wc.timestamp.date() == timestamp.date():
             # skipping: already existed
-            print("release {} already had webcapture {} {}".format(
-                release_id, raw_timestamp, original_url))
+            print(
+                "release {} already had webcapture {} {}".format(
+                    release_id, raw_timestamp, original_url
+                )
+            )
             return (None, None)
 
     wc = static_wayback_webcapture(wayback_url)
     assert len(wc.cdx) >= 1
     wc.release_ids = [release_id]
     if not editgroup_id:
-        eg = api.create_editgroup(Editgroup(
-            description="One-off import of static web content from wayback machine",
-            extra=dict(
-                git_rev=git_rev,
-                agent="fatcat_tools.auto_wayback_static")))
+        eg = api.create_editgroup(
+            Editgroup(
+                description="One-off import of static web content from wayback machine",
+                extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_wayback_static"),
+            )
+        )
         editgroup_id = eg.editgroup_id
     edit = api.create_webcapture(eg.editgroup_id, wc)
     return (editgroup_id, edit)
 
+
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument('--verbose',
-        action='store_true',
-        help="verbose output")
-    parser.add_argument('wayback_url',
-        type=str,
-        help="URL of wayback capture to extract from")
-    parser.add_argument('--json-output',
-        type=argparse.FileType('w'), default=sys.stdout,
-        help="where to write out webcapture entity (as JSON)")
-    parser.add_argument('--cdx-output',
-        type=argparse.FileType('w'), default=None,
-        help="(optional) file to write out CDX stub")
+    parser.add_argument("--verbose", action="store_true", help="verbose output")
+    parser.add_argument("wayback_url", type=str, help="URL of wayback capture to extract from")
+    parser.add_argument(
+        "--json-output",
+        type=argparse.FileType("w"),
+        default=sys.stdout,
+        help="where to write out webcapture entity (as JSON)",
+    )
+    parser.add_argument(
+        "--cdx-output",
+        type=argparse.FileType("w"),
+        default=None,
+        help="(optional) file to write out CDX stub",
+    )
 
     args = parser.parse_args()
@@ -254,5 +275,6 @@
     wc_dict = api_client.sanitize_for_serialization(wc)
     print(json.dumps(wc_dict))
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     main()
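For orientation, a minimal usage sketch of the reformatted helpers follows. It is not part of the diff: the import path (taken from the diffstat path, assuming the python/ directory is on sys.path), network access to web.archive.org, and the availability of the fatcat client models are all assumptions; the capture URL is the one used by the file's own test_parse_wbm_url().

# Hypothetical sketch, not established by this diff: import path and runtime
# environment are assumptions.
from fatcat_tools.importers.wayback_static import parse_wbm_url, static_wayback_webcapture

wayback_url = (
    "http://web.archive.org/web/20010712114837/"
    "http://www.dlib.org/dlib/june01/reich/06reich.html"
)

# Split the capture URL into (raw timestamp, parsed datetime, original URL)
raw_ts, ts, original_url = parse_wbm_url(wayback_url)
print(raw_ts, ts.isoformat(), original_url)

# Fetch the rewritten page, extract embedded resources, and resolve each one
# against the CDX API; returns a WebcaptureEntity (requires network access)
wc = static_wayback_webcapture(wayback_url)
print(len(wc.cdx), "CDX rows for", wc.original_url)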