From 40b6702116fccd86f9cd3ddeb572c03d3a5977ea Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 5 Feb 2020 20:59:08 -0800 Subject: handle alternative dt format in WARC headers If there is a UTC timestamp, with trailing 'Z' indicating timezone, that is valid but increases string length by one. --- python/sandcrawler/ia.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 2a334cc..e71f1e8 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -341,9 +341,11 @@ class WaybackClient: raise WaybackError( "found revisit record, but won't resolve (loop?)") revisit_uri, revisit_dt = gwb_record.refers_to # convert revisit_dt - assert len(revisit_dt) == 19 # len("2018-07-24T11:56:49") + # len("2018-07-24T11:56:49"), or with "Z" + assert len(revisit_dt) in (19, 20) revisit_uri = revisit_uri.decode('utf-8') - revisit_dt = revisit_dt.decode('utf-8').replace('-', '').replace(':', '').replace('T', '') + revisit_dt = revisit_dt.decode('utf-8').replace('-', '').replace(':', '').replace('T', '').replace('Z', '') + assert len(revisit_dt) == 14 revisit_cdx = self.cdx_client.fetch(revisit_uri, revisit_dt) body = self.fetch_petabox_body( csize=revisit_cdx.warc_csize, -- cgit v1.2.3