diff options
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- | python/fatcat_tools/importers/__init__.py | 1 | ||||
-rwxr-xr-x | python/fatcat_tools/importers/wayback_static.py | 236 |
2 files changed, 237 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index 70f38f5b..fe3db59d 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -18,5 +18,6 @@ from .grobid_metadata import GrobidMetadataImporter from .journal_metadata import JournalMetadataImporter from .matched import MatchedImporter from .orcid import OrcidImporter +from .wayback_static import auto_wayback_static #from .kafka_source import KafkaSource #from .file_source import FileSource diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py new file mode 100755 index 00000000..114920f7 --- /dev/null +++ b/python/fatcat_tools/importers/wayback_static.py @@ -0,0 +1,236 @@ +#!/usr/bin/env python3 + +""" +Helpers to create Web Capture entities from extracted wayback content. + +Works as a stand-alone script (for debugging) or as library routines. +""" + +import sys +import json +import base64 +import hashlib +import requests +import datetime +import argparse +import subprocess +from bs4 import BeautifulSoup + +from fatcat_client import * + +CDX_API_BASE = "https://web.archive.org/cdx/search/cdx" +GWB_URL_BASE = "https://web.archive.org/web" +REQ_SESSION = requests.Session() + + +def b32_hex(s): + """copy/pasta from elsewhere""" + s = s.strip().split()[0].lower() + if s.startswith("sha1:"): + s = s[5:] + if len(s) != 32: + return s + return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') + +def parse_wbm_url(url): + """Takes a wayback machine URL, and returns a tuple: + + (timestamp, datetime, original_url) + """ + chunks = url.split('/') + assert len(chunks) >= 6 + assert chunks[2] == 'web.archive.org' + assert chunks[3] == 'web' + return (chunks[4], + parse_wbm_timestamp(chunks[4]), + '/'.join(chunks[5:])) + +def test_parse_wbm_url(): + u = "http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html" + assert parse_wbm_url(u) == ( + "20010712114837", + datetime.datetime(2001, 7, 12, 11, 48, 37), + "http://www.dlib.org/dlib/june01/reich/06reich.html") + +def parse_wbm_timestamp(timestamp): + """ + Takes a complete WBM timestamp string (like "20020327115625") and returns a + python datetime object (UTC) + """ + # strip any "im_" or "id_" suffix + if timestamp.endswith('_'): + timestamp = timestamp[:-3] + # inflexible; require the full second-precision timestamp + assert len(timestamp) == 14 + return datetime.datetime( + year=int(timestamp[0:4]), + month=int(timestamp[4:6]), + day=int(timestamp[6:8]), + hour=int(timestamp[8:10]), + minute=int(timestamp[10:12]), + second=int(timestamp[12:14])) + +def test_parse_wbm_timestamp(): + assert parse_wbm_timestamp("20010712114837") == \ + datetime.datetime(2001, 7, 12, 11, 48, 37) + +def fetch_wbm(url): + resp = REQ_SESSION.get(url) + resp.raise_for_status() + assert resp.content + return resp.content + +def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None): + assert embed_url.startswith('/web/') + embed_url = embed_url.split('/') + timestamp = embed_url[2] + if timestamp.endswith('_'): + timestamp = timestamp[:-3] + url = '/'.join(embed_url[3:]) + #print((timestamp, url)) + resp = REQ_SESSION.get(CDX_API_BASE, params=dict( + url=url, + closest=timestamp, + sort="closest", + resolveRevisits="true", + matchType="exact", + limit=1, + )) + resp.raise_for_status() + #print(resp.url) + if resp.content: + hit = resp.content.decode('utf-8').split('\n')[0] + if cdx_output: + cdx_output.write(hit + "\n") + cdx = hit.split(' ') + cdx = [x if (x and x != '-') else None for x in cdx] + webcapture_cdx = WebcaptureEntityCdx( + surt=cdx[0], + timestamp=parse_wbm_timestamp(cdx[1]).isoformat() + "Z", + url=cdx[2], + mimetype=cdx[3], + status_code=(cdx[4] and int(cdx[4])) or None, + sha1=b32_hex(cdx[5]), + sha256=None, + ) + if verify_hashes: + resp = REQ_SESSION.get(GWB_URL_BASE + "/{}id_/{}".format( + cdx[1], # raw timestamp + webcapture_cdx.url)) + resp.raise_for_status() + assert webcapture_cdx.sha1 == hashlib.sha1(resp.content).digest().hex() + webcapture_cdx.sha256 = hashlib.sha256(resp.content).digest().hex() + return webcapture_cdx + else: + return None + +def extract_embeds(soup): + + embeds = set() + + # <link href=""> + for tag in soup.find_all('link', href=True): + if tag['href'].startswith('/web/'): + embeds.add(tag['href']) + # <img src=""> + for tag in soup.find_all('img', src=True): + if tag['src'].startswith('/web/'): + embeds.add(tag['src']) + # <script src=""> + for tag in soup.find_all('script', src=True): + if tag['src'].startswith('/web/'): + embeds.add(tag['src']) + + return list(embeds) + +def static_wayback_webcapture(wayback_url, cdx_output=None): + """ + Given a complete wayback machine capture URL, like: + + http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html + + Will return a new ("bare") fatcat webcapture entity python object, with all + the CDX entries filled in. + """ + + wbm_html = fetch_wbm(wayback_url) + raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url) + #with open(rewritten_path, 'r') as fp: + # soup = BeautifulSoup(fp, "lxml") + soup = BeautifulSoup(wbm_html, "lxml") + embeds = extract_embeds(soup) + cdx_obj = lookup_cdx("/web/{}/{}".format(raw_timestamp, original_url), + cdx_output=cdx_output) + cdx_list = [cdx_obj] + for url in embeds: + cdx_obj = lookup_cdx(url, cdx_output=cdx_output) + cdx_list.append(cdx_obj) + archive_urls = [WebcaptureEntityArchiveUrls( + rel="wayback", + url="https://web.archive.org/web/", + )] + wc = WebcaptureEntity( + cdx=cdx_list, + timestamp=timestamp.isoformat() + "Z", + original_url=original_url, + archive_urls=archive_urls, + release_ids=None) + return wc + +def auto_wayback_static(api, release_id, wayback_url, editgroup_id=None): + """ + Returns a tuple: (editgroup_id, edit). If failed, both are None + """ + + raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url) + git_rev = subprocess.check_output( + ["git", "describe", "--always"]).strip().decode('utf-8') + + release = api.get_release(release_id, expand="webcaptures") + + # check for existing webcapture with same parameters + for wc in release.webcaptures: + if wc.original_url == original_url and wc.timestamp.date() == timestamp.date(): + # skipping: already existed + print("release {} already had webcapture {} {}".format( + release_id, raw_timestamp, original_url)) + return (None, None) + + wc = static_wayback_webcapture(wayback_url) + assert len(wc.cdx) >= 1 + wc.release_ids = [release_id] + if not editgroup_id: + eg = api.create_editgroup(Editgroup( + description="One-off import of static web content from wayback machine", + extra=dict( + git_rev=git_rev, + agent="fatcat_tools.auto_wayback_static"))) + editgroup_id = eg.editgroup_id + edit = api.create_webcapture(wc, editgroup_id=editgroup_id) + return (editgroup_id, edit) + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--verbose', + action='store_true', + help="verbose output") + parser.add_argument('wayback_url', + type=str, + help="URL of wayback capture to extract from") + parser.add_argument('--json-output', + type=argparse.FileType('w'), default=sys.stdout, + help="where to write out webcapture entity (as JSON)") + parser.add_argument('--cdx-output', + type=argparse.FileType('w'), default=None, + help="(optional) file to write out CDX stub") + + args = parser.parse_args() + + # entity-to-JSON code; duplicate of entity_to_dict() + api_client = ApiClient() + wc = static_wayback_webcapture(args.wayback_url, cdx_output=args.cdx_output) + wc_dict = api_client.sanitize_for_serialization(wc) + print(json.dumps(wc_dict)) + +if __name__ == '__main__': + main() |