#!/usr/bin/env python3
"""
Helpers to create Web Capture entities from extracted wayback content.
Works as a stand-alone script (for debugging) or as library routines.
"""
import argparse
import datetime
import hashlib
import json
import subprocess
import sys
import requests
from bs4 import BeautifulSoup
from fatcat_openapi_client import (
    ApiClient,
    Editgroup,
    WebcaptureCdxLine,
    WebcaptureEntity,
    WebcaptureUrl,
)
from .common import b32_hex

CDX_API_BASE = "https://web.archive.org/cdx/search/cdx"
GWB_URL_BASE = "https://web.archive.org/web"

REQ_SESSION = requests.Session()
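
# Typical library flow, as a sketch (the "lxml" parser choice and the
# placeholder URL below are assumptions, not part of this module):
#   html = fetch_wbm("http://web.archive.org/web/<timestamp>/<original-url>")
#   soup = BeautifulSoup(html, "lxml")
#   cdx_lines = [lookup_cdx(u) for u in extract_embeds(soup)]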


def parse_wbm_url(url):
    """Takes a wayback machine URL and returns a tuple:

        (timestamp, datetime, original_url)
    """
    chunks = url.split("/")
    assert len(chunks) >= 6
    assert chunks[2] == "web.archive.org"
    assert chunks[3] == "web"
    return (chunks[4], parse_wbm_timestamp(chunks[4]), "/".join(chunks[5:]))


def test_parse_wbm_url():
    u = "http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html"
    assert parse_wbm_url(u) == (
        "20010712114837",
        datetime.datetime(2001, 7, 12, 11, 48, 37),
        "http://www.dlib.org/dlib/june01/reich/06reich.html",
    )


def parse_wbm_timestamp(timestamp):
    """
    Takes a complete WBM timestamp string (like "20020327115625") and returns a
    python datetime object (UTC)
    """
    # strip any "im_"/"id_"-style suffix (three characters ending in "_")
    if timestamp.endswith("_"):
        timestamp = timestamp[:-3]
    # inflexible; require the full second-precision timestamp
    assert len(timestamp) == 14
    return datetime.datetime(
        year=int(timestamp[0:4]),
        month=int(timestamp[4:6]),
        day=int(timestamp[6:8]),
        hour=int(timestamp[8:10]),
        minute=int(timestamp[10:12]),
        second=int(timestamp[12:14]),
    )


def test_parse_wbm_timestamp():
    assert parse_wbm_timestamp("20010712114837") == datetime.datetime(2001, 7, 12, 11, 48, 37)
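    # added illustrative check: "im_"/"id_"-style suffixes are stripped before parsing
    assert parse_wbm_timestamp("20010712114837im_") == datetime.datetime(
        2001, 7, 12, 11, 48, 37
    )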


def fetch_wbm(url):
    """Fetches raw bytes from a (full) wayback machine URL; raises on HTTP errors."""
    resp = REQ_SESSION.get(url)
    resp.raise_for_status()
    assert resp.content
    return resp.content


def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None):
    """
    Takes a relative wayback URL ("/web/<timestamp>/<url>"), queries the CDX
    API for the closest capture, and returns a WebcaptureCdxLine (or None if
    there was no CDX hit). If verify_hashes is set, re-fetches the raw content
    to check the CDX SHA-1 and fill in sha256 and size. If cdx_output is a
    file-like object, the raw CDX hit line is also written to it.
    """
    # log progress to stderr
    sys.stderr.write(embed_url + "\n")
    assert embed_url.startswith("/web/")
    embed_url = embed_url.split("/")
    timestamp = embed_url[2]
    # strip any "im_"/"id_"-style suffix (three characters ending in "_")
    if timestamp.endswith("_"):
        timestamp = timestamp[:-3]
    url = "/".join(embed_url[3:])
    resp = REQ_SESSION.get(
        CDX_API_BASE,
        params=dict(
            url=url,
            closest=timestamp,
            sort="closest",
            resolveRevisits="true",
            matchType="exact",
            limit=1,
        ),
    )
    resp.raise_for_status()
    if resp.content:
        hit = resp.content.decode("utf-8").split("\n")[0]
        if cdx_output:
            cdx_output.write(hit + "\n")
        cdx = hit.split(" ")
        # normalize empty ("-") CDX fields to None
        cdx = [x if (x and x != "-") else None for x in cdx]
        webcapture_cdx = WebcaptureCdxLine(
            surt=cdx[0],
            timestamp=parse_wbm_timestamp(cdx[1]).isoformat() + "Z",
            url=cdx[2],
            mimetype=cdx[3],
            status_code=int(cdx[4]) if cdx[4] else None,
            sha1=b32_hex(cdx[5]),
            sha256=None,
        )
        if verify_hashes:
            # the "id_" suffix on the raw timestamp requests original,
            # un-rewritten content
            resp = REQ_SESSION.get(
                GWB_URL_BASE + "/{}id_/{}".format(cdx[1], webcapture_cdx.url)
            )
            resp.raise_for_status()
            assert webcapture_cdx.sha1 == hashlib.sha1(resp.content).hexdigest()
            webcapture_cdx.sha256 = hashlib.sha256(resp.content).hexdigest()
            webcapture_cdx.size = len(resp.content)
        return webcapture_cdx
    else:
        return None
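
# Illustrative lookup_cdx() call (requires network access; reuses the example
# capture from test_parse_wbm_url, assuming it is still indexed in the CDX API):
#   cdx_line = lookup_cdx(
#       "/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html"
#   )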


def wayback_url_to_relative(url):
    """
    Wayback URLs can be relative or absolute in rewritten documents. This
    function converts any form of rewritten URL to a relative (to
    web.archive.org) one, or returns None if it isn't a rewritten URL at all.
    """
    # strip scheme and host, keeping the leading "/" of the path
    if url.startswith("https://web.archive.org/"):
        url = url[len("https://web.archive.org"):]
    elif url.startswith("http://web.archive.org/"):
        url = url[len("http://web.archive.org"):]

    if url.startswith("/web/"):
        return url
    else:
        return None
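

# Added illustrative test, following the file's existing test conventions:
def test_wayback_url_to_relative():
    u = "https://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html"
    assert wayback_url_to_relative(u) == (
        "/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html"
    )
    # already-relative rewritten URLs pass through unchanged
    assert wayback_url_to_relative("/web/20010712114837/http://example.com/") == (
        "/web/20010712114837/http://example.com/"
    )
    # non-wayback URLs are rejected
    assert wayback_url_to_relative("http://www.dlib.org/") is None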


def extract_embeds(soup):
    """
    Takes a BeautifulSoup-parsed document and returns a set of relative
    wayback URLs for embedded resources (stylesheets, images, etc).
    """
    embeds = set()
    # <link href=""> (stylesheets only)
    for tag in soup.find_all("link", href=True):
        # BeautifulSoup parses "rel" as a multi-valued attribute (a list), so
        # test membership instead of comparing the whole value to a string
        if "stylesheet" not in (tag.get("rel") or []):
            continue
        url = wayback_url_to_relative(tag["href"])
        if url:
            embeds.add(url)
    # <img src="">
    for tag in soup.find_all("img", src=True):
        url = wayback_url_to_relative(tag["src"])
        if url:
            embeds.add(url)
    #