diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-02-05 20:59:08 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-02-05 20:59:12 -0800 |
commit | 40b6702116fccd86f9cd3ddeb572c03d3a5977ea (patch) | |
tree | 683595378a9442aa41df2d723d142edaed28328e /python | |
parent | 8b24a512de1fca5937a48fd6b5e2f101eee4f418 (diff) | |
download | sandcrawler-40b6702116fccd86f9cd3ddeb572c03d3a5977ea.tar.gz sandcrawler-40b6702116fccd86f9cd3ddeb572c03d3a5977ea.zip |
Handle alternative datetime (dt) format in WARC headers
A UTC timestamp may carry a trailing 'Z' indicating the timezone. That form
is valid, but it makes the string one character longer (20 instead of 19).
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ia.py | 6 |
1 file changed, 4 insertions, 2 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 2a334cc..e71f1e8 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -341,9 +341,11 @@ class WaybackClient: raise WaybackError( "found revisit record, but won't resolve (loop?)") revisit_uri, revisit_dt = gwb_record.refers_to # convert revisit_dt - assert len(revisit_dt) == 19 # len("2018-07-24T11:56:49") + # len("2018-07-24T11:56:49"), or with "Z" + assert len(revisit_dt) in (19, 20) revisit_uri = revisit_uri.decode('utf-8') - revisit_dt = revisit_dt.decode('utf-8').replace('-', '').replace(':', '').replace('T', '') + revisit_dt = revisit_dt.decode('utf-8').replace('-', '').replace(':', '').replace('T', '').replace('Z', '') + assert len(revisit_dt) == 14 revisit_cdx = self.cdx_client.fetch(revisit_uri, revisit_dt) body = self.fetch_petabox_body( csize=revisit_cdx.warc_csize, |