From c038294850e836c5dd24fd3dc89e77065a9d2f85 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 19 Mar 2019 19:07:35 -0700 Subject: new importer: wayback_static --- python/fatcat_tools/importers/__init__.py | 1 + python/fatcat_tools/importers/wayback_static.py | 236 ++++++++++++++++++++++++ 2 files changed, 237 insertions(+) create mode 100755 python/fatcat_tools/importers/wayback_static.py (limited to 'python/fatcat_tools/importers') diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index 70f38f5b..fe3db59d 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -18,5 +18,6 @@ from .grobid_metadata import GrobidMetadataImporter from .journal_metadata import JournalMetadataImporter from .matched import MatchedImporter from .orcid import OrcidImporter +from .wayback_static import auto_wayback_static #from .kafka_source import KafkaSource #from .file_source import FileSource diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py new file mode 100755 index 00000000..114920f7 --- /dev/null +++ b/python/fatcat_tools/importers/wayback_static.py @@ -0,0 +1,236 @@ +#!/usr/bin/env python3 + +""" +Helpers to create Web Capture entities from extracted wayback content. + +Works as a stand-alone script (for debugging) or as library routines. 
import sys
import json
import base64
import hashlib
import requests
import datetime
import argparse
import subprocess
from bs4 import BeautifulSoup

from fatcat_client import *

CDX_API_BASE = "https://web.archive.org/cdx/search/cdx"
GWB_URL_BASE = "https://web.archive.org/web"
# One shared session, used for every HTTP request issued by this module.
REQ_SESSION = requests.Session()


def b32_hex(s):
    """Normalize a SHA-1 digest string to lowercase hexadecimal.

    Only the first whitespace-separated token of ``s`` is considered; it is
    lowercased and a leading ``sha1:`` prefix is dropped. If what remains is
    not exactly 32 characters long it is returned as-is (no decoding is
    attempted); otherwise it is base32-decoded and re-encoded as lowercase
    hex.
    """
    s = s.strip().split()[0].lower()
    if s.startswith("sha1:"):
        s = s[5:]
    if len(s) != 32:
        return s
    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')


def parse_wbm_url(url):
    """Takes a wayback machine URL, and returns a tuple:

        (timestamp, datetime, original_url)

    ``timestamp`` is the raw path segment after ``/web/``, ``datetime`` is
    that segment run through ``parse_wbm_timestamp()``, and ``original_url``
    is everything after the timestamp segment, re-joined with '/'.

    Raises AssertionError if the URL has fewer than six '/'-separated chunks
    or its host/path prefix is not ``web.archive.org/web``.
    """
    chunks = url.split('/')
    assert len(chunks) >= 6
    assert chunks[2] == 'web.archive.org'
    assert chunks[3] == 'web'
    return (chunks[4],
            parse_wbm_timestamp(chunks[4]),
            '/'.join(chunks[5:]))


def test_parse_wbm_url():
    u = "http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html"
    assert parse_wbm_url(u) == (
        "20010712114837",
        datetime.datetime(2001, 7, 12, 11, 48, 37),
        "http://www.dlib.org/dlib/june01/reich/06reich.html")


def parse_wbm_timestamp(timestamp):
    """
    Takes a complete WBM timestamp string (like "20020327115625") and returns a
    python datetime object (UTC)

    The returned datetime carries no tzinfo. If the string ends with '_'
    (as with an "im_" or "id_" suffix), its last three characters are
    dropped before parsing.

    Raises AssertionError if the remaining string is not exactly 14
    characters long, and ValueError if a field is not a valid integer or is
    out of range for a date/time.
    """
    # strip any "im_" or "id_" suffix
    if timestamp.endswith('_'):
        timestamp = timestamp[:-3]
    # inflexible; require the full second-precision timestamp
    assert len(timestamp) == 14
    return datetime.datetime(
        year=int(timestamp[0:4]),
        month=int(timestamp[4:6]),
        day=int(timestamp[6:8]),
        hour=int(timestamp[8:10]),
        minute=int(timestamp[10:12]),
        second=int(timestamp[12:14]))


def test_parse_wbm_timestamp():
    assert parse_wbm_timestamp("20010712114837") == \
        datetime.datetime(2001, 7, 12, 11, 48, 37)


def fetch_wbm(url):
    """Fetch ``url`` with the shared session and return the body as bytes.

    Error HTTP statuses are raised by ``raise_for_status()``; an empty body
    raises AssertionError.
    """
    resp = REQ_SESSION.get(url)
    resp.raise_for_status()
    assert resp.content
    return resp.content


def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None):
    """Look up the CDX record closest to an embedded wayback resource.

    Args:
        embed_url: path of the form ``/web/TIMESTAMP/URL``. If TIMESTAMP
            ends with '_', its last three characters are dropped before it
            is sent as the ``closest`` query parameter.
        verify_hashes: when true, the capture is fetched again through its
            ``id_`` URL, its SHA-1 is compared with the digest from the CDX
            record, and ``sha256`` is filled in from the fetched bytes.
        cdx_output: optional object with a ``write()`` method; when truthy,
            the raw CDX line plus a newline is written to it.

    Returns:
        A ``WebcaptureEntityCdx`` built from the first CDX line, or ``None``
        when the CDX API response body is empty or blank.

    Raises:
        AssertionError: if ``embed_url`` does not start with ``/web/``, or
            if the fetched capture's SHA-1 does not match the CDX digest.
        Error HTTP statuses are raised by ``raise_for_status()``.
    """
    assert embed_url.startswith('/web/')
    parts = embed_url.split('/')
    timestamp = parts[2]
    if timestamp.endswith('_'):
        timestamp = timestamp[:-3]
    url = '/'.join(parts[3:])
    resp = REQ_SESSION.get(CDX_API_BASE, params=dict(
        url=url,
        closest=timestamp,
        sort="closest",
        resolveRevisits="true",
        matchType="exact",
        limit=1,
    ))
    resp.raise_for_status()
    if not resp.content:
        return None
    hit = resp.content.decode('utf-8').split('\n')[0]
    if not hit.strip():
        # A blank first line carries no record; treat it like an empty body
        # instead of failing on the field indexing below.
        return None
    if cdx_output:
        cdx_output.write(hit + "\n")
    # Empty and '-' fields are treated as missing.
    cdx = [x if (x and x != '-') else None for x in hit.split(' ')]
    webcapture_cdx = WebcaptureEntityCdx(
        surt=cdx[0],
        timestamp=parse_wbm_timestamp(cdx[1]).isoformat() + "Z",
        url=cdx[2],
        mimetype=cdx[3],
        status_code=int(cdx[4]) if cdx[4] else None,
        sha1=b32_hex(cdx[5]),
        sha256=None,
    )
    if verify_hashes:
        resp = REQ_SESSION.get(GWB_URL_BASE + "/{}id_/{}".format(
            cdx[1],  # raw timestamp
            webcapture_cdx.url))
        resp.raise_for_status()
        actual_sha1 = hashlib.sha1(resp.content).hexdigest()
        # Raised explicitly (not via `assert`) so the integrity check is not
        # compiled away when Python runs with -O.
        if webcapture_cdx.sha1 != actual_sha1:
            raise AssertionError(
                "SHA-1 mismatch for {}: CDX has {}, fetched body has {}".format(
                    webcapture_cdx.url, webcapture_cdx.sha1, actual_sha1))
        webcapture_cdx.sha256 = hashlib.sha256(resp.content).hexdigest()
    return webcapture_cdx


def extract_embeds(soup):

    embeds = set()

    # `link` tags: keep href values that start with /web/
    for tag in soup.find_all('link', href=True):
        if tag['href'].startswith('/web/'):
            embeds.add(tag['href'])
    # `img` tags: keep src values that start with /web/
    for tag in soup.find_all('img', src=True):
        if tag['src'].startswith('/web/'):
            embeds.add(tag['src'])
    #