summary refs log tree commit diff stats
path: root/python/fatcat_tools/importers/wayback_static.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/importers/wayback_static.py')
-rwxr-xr-x  python/fatcat_tools/importers/wayback_static.py  166
1 files changed, 94 insertions, 72 deletions
diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py
index 196f86ff..22fefad3 100755
--- a/python/fatcat_tools/importers/wayback_static.py
+++ b/python/fatcat_tools/importers/wayback_static.py
@@ -33,22 +33,23 @@ REQ_SESSION = requests.Session()
def parse_wbm_url(url):
"""Takes a wayback machine URL, and returns a tuple:
- (timestamp, datetime, original_url)
+ (timestamp, datetime, original_url)
"""
- chunks = url.split('/')
+ chunks = url.split("/")
assert len(chunks) >= 6
- assert chunks[2] == 'web.archive.org'
- assert chunks[3] == 'web'
- return (chunks[4],
- parse_wbm_timestamp(chunks[4]),
- '/'.join(chunks[5:]))
+ assert chunks[2] == "web.archive.org"
+ assert chunks[3] == "web"
+ return (chunks[4], parse_wbm_timestamp(chunks[4]), "/".join(chunks[5:]))
+
def test_parse_wbm_url():
u = "http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html"
assert parse_wbm_url(u) == (
"20010712114837",
datetime.datetime(2001, 7, 12, 11, 48, 37),
- "http://www.dlib.org/dlib/june01/reich/06reich.html")
+ "http://www.dlib.org/dlib/june01/reich/06reich.html",
+ )
+
def parse_wbm_timestamp(timestamp):
"""
@@ -56,7 +57,7 @@ def parse_wbm_timestamp(timestamp):
python datetime object (UTC)
"""
# strip any "im_" or "id_" suffix
- if timestamp.endswith('_'):
+ if timestamp.endswith("_"):
timestamp = timestamp[:-3]
# inflexible; require the full second-precision timestamp
assert len(timestamp) == 14
@@ -66,11 +67,13 @@ def parse_wbm_timestamp(timestamp):
day=int(timestamp[6:8]),
hour=int(timestamp[8:10]),
minute=int(timestamp[10:12]),
- second=int(timestamp[12:14]))
+ second=int(timestamp[12:14]),
+ )
+
def test_parse_wbm_timestamp():
- assert parse_wbm_timestamp("20010712114837") == \
- datetime.datetime(2001, 7, 12, 11, 48, 37)
+ assert parse_wbm_timestamp("20010712114837") == datetime.datetime(2001, 7, 12, 11, 48, 37)
+
def fetch_wbm(url):
resp = REQ_SESSION.get(url)
@@ -78,31 +81,35 @@ def fetch_wbm(url):
assert resp.content
return resp.content
+
def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None):
sys.stderr.write(embed_url + "\n")
- assert embed_url.startswith('/web/')
- embed_url = embed_url.split('/')
+ assert embed_url.startswith("/web/")
+ embed_url = embed_url.split("/")
timestamp = embed_url[2]
- if timestamp.endswith('_'):
+ if timestamp.endswith("_"):
timestamp = timestamp[:-3]
- url = '/'.join(embed_url[3:])
- #print((timestamp, url))
- resp = REQ_SESSION.get(CDX_API_BASE, params=dict(
- url=url,
- closest=timestamp,
- sort="closest",
- resolveRevisits="true",
- matchType="exact",
- limit=1,
- ))
+ url = "/".join(embed_url[3:])
+ # print((timestamp, url))
+ resp = REQ_SESSION.get(
+ CDX_API_BASE,
+ params=dict(
+ url=url,
+ closest=timestamp,
+ sort="closest",
+ resolveRevisits="true",
+ matchType="exact",
+ limit=1,
+ ),
+ )
resp.raise_for_status()
- #print(resp.url)
+ # print(resp.url)
if resp.content:
- hit = resp.content.decode('utf-8').split('\n')[0]
+ hit = resp.content.decode("utf-8").split("\n")[0]
if cdx_output:
cdx_output.write(hit + "\n")
- cdx = hit.split(' ')
- cdx = [x if (x and x != '-') else None for x in cdx]
+ cdx = hit.split(" ")
+ cdx = [x if (x and x != "-") else None for x in cdx]
webcapture_cdx = WebcaptureCdxLine(
surt=cdx[0],
timestamp=parse_wbm_timestamp(cdx[1]).isoformat() + "Z",
@@ -113,9 +120,9 @@ def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None):
sha256=None,
)
if verify_hashes:
- resp = REQ_SESSION.get(GWB_URL_BASE + "/{}id_/{}".format(
- cdx[1], # raw timestamp
- webcapture_cdx.url))
+ resp = REQ_SESSION.get(
+ GWB_URL_BASE + "/{}id_/{}".format(cdx[1], webcapture_cdx.url) # raw timestamp
+ )
resp.raise_for_status()
assert webcapture_cdx.sha1 == hashlib.sha1(resp.content).digest().hex()
webcapture_cdx.sha256 = hashlib.sha256(resp.content).digest().hex()
@@ -124,47 +131,50 @@ def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None):
else:
return None
+
def wayback_url_to_relative(url):
"""
Wayback URLs can be relative or absolute in rewritten documents. This
function converts any form of rewritten URL to a relative (to
web.archive.org) one, or returns None if it isn't a rewritten URL at all.
"""
- if url.startswith('https://web.archive.org/'):
+ if url.startswith("https://web.archive.org/"):
url = url[23:]
- elif url.startswith('http://web.archive.org/'):
+ elif url.startswith("http://web.archive.org/"):
url = url[22:]
- if url.startswith('/web/'):
+ if url.startswith("/web/"):
return url
else:
return None
+
def extract_embeds(soup):
embeds = set()
# <link href="">
- for tag in soup.find_all('link', href=True):
- if tag['rel'] not in ('stylesheet',):
+ for tag in soup.find_all("link", href=True):
+ if tag["rel"] not in ("stylesheet",):
continue
- url = wayback_url_to_relative(tag['href'])
+ url = wayback_url_to_relative(tag["href"])
if url:
embeds.add(url)
# <img src="">
- for tag in soup.find_all('img', src=True):
- url = wayback_url_to_relative(tag['src'])
+ for tag in soup.find_all("img", src=True):
+ url = wayback_url_to_relative(tag["src"])
if url:
embeds.add(url)
# <script src="">
- for tag in soup.find_all('script', src=True):
- url = wayback_url_to_relative(tag['src'])
+ for tag in soup.find_all("script", src=True):
+ url = wayback_url_to_relative(tag["src"])
if url:
embeds.add(url)
return list(embeds)
+
def static_wayback_webcapture(wayback_url, cdx_output=None):
"""
Given a complete wayback machine capture URL, like:
@@ -177,36 +187,40 @@ def static_wayback_webcapture(wayback_url, cdx_output=None):
wbm_html = fetch_wbm(wayback_url)
raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url)
- #with open(rewritten_path, 'r') as fp:
+ # with open(rewritten_path, 'r') as fp:
# soup = BeautifulSoup(fp, "lxml")
soup = BeautifulSoup(wbm_html, "lxml")
embeds = extract_embeds(soup)
- cdx_obj = lookup_cdx("/web/{}/{}".format(raw_timestamp, original_url),
- cdx_output=cdx_output)
+ cdx_obj = lookup_cdx(
+ "/web/{}/{}".format(raw_timestamp, original_url), cdx_output=cdx_output
+ )
cdx_list = [cdx_obj]
for url in embeds:
cdx_obj = lookup_cdx(url, cdx_output=cdx_output)
cdx_list.append(cdx_obj)
- archive_urls = [WebcaptureUrl(
- rel="wayback",
- url="https://web.archive.org/web/",
- )]
+ archive_urls = [
+ WebcaptureUrl(
+ rel="wayback",
+ url="https://web.archive.org/web/",
+ )
+ ]
wc = WebcaptureEntity(
cdx=cdx_list,
timestamp=timestamp.isoformat() + "Z",
original_url=original_url,
archive_urls=archive_urls,
- release_ids=None)
+ release_ids=None,
+ )
return wc
+
def auto_wayback_static(api, release_id, wayback_url, editgroup_id=None):
"""
Returns a tuple: (editgroup_id, edit). If failed, both are None
"""
raw_timestamp, timestamp, original_url = parse_wbm_url(wayback_url)
- git_rev = subprocess.check_output(
- ["git", "describe", "--always"]).strip().decode('utf-8')
+ git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
release = api.get_release(release_id, expand="webcaptures")
@@ -214,37 +228,44 @@ def auto_wayback_static(api, release_id, wayback_url, editgroup_id=None):
for wc in release.webcaptures:
if wc.original_url == original_url and wc.timestamp.date() == timestamp.date():
# skipping: already existed
- print("release {} already had webcapture {} {}".format(
- release_id, raw_timestamp, original_url))
+ print(
+ "release {} already had webcapture {} {}".format(
+ release_id, raw_timestamp, original_url
+ )
+ )
return (None, None)
wc = static_wayback_webcapture(wayback_url)
assert len(wc.cdx) >= 1
wc.release_ids = [release_id]
if not editgroup_id:
- eg = api.create_editgroup(Editgroup(
- description="One-off import of static web content from wayback machine",
- extra=dict(
- git_rev=git_rev,
- agent="fatcat_tools.auto_wayback_static")))
+ eg = api.create_editgroup(
+ Editgroup(
+ description="One-off import of static web content from wayback machine",
+ extra=dict(git_rev=git_rev, agent="fatcat_tools.auto_wayback_static"),
+ )
+ )
editgroup_id = eg.editgroup_id
edit = api.create_webcapture(eg.editgroup_id, wc)
return (editgroup_id, edit)
+
def main():
parser = argparse.ArgumentParser()
- parser.add_argument('--verbose',
- action='store_true',
- help="verbose output")
- parser.add_argument('wayback_url',
- type=str,
- help="URL of wayback capture to extract from")
- parser.add_argument('--json-output',
- type=argparse.FileType('w'), default=sys.stdout,
- help="where to write out webcapture entity (as JSON)")
- parser.add_argument('--cdx-output',
- type=argparse.FileType('w'), default=None,
- help="(optional) file to write out CDX stub")
+ parser.add_argument("--verbose", action="store_true", help="verbose output")
+ parser.add_argument("wayback_url", type=str, help="URL of wayback capture to extract from")
+ parser.add_argument(
+ "--json-output",
+ type=argparse.FileType("w"),
+ default=sys.stdout,
+ help="where to write out webcapture entity (as JSON)",
+ )
+ parser.add_argument(
+ "--cdx-output",
+ type=argparse.FileType("w"),
+ default=None,
+ help="(optional) file to write out CDX stub",
+ )
args = parser.parse_args()
@@ -254,5 +275,6 @@ def main():
wc_dict = api_client.sanitize_for_serialization(wc)
print(json.dumps(wc_dict))
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()