about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-02-05 20:59:08 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-02-05 20:59:12 -0800
commit: 40b6702116fccd86f9cd3ddeb572c03d3a5977ea (patch)
tree: 683595378a9442aa41df2d723d142edaed28328e
parent: 8b24a512de1fca5937a48fd6b5e2f101eee4f418 (diff)
download: sandcrawler-40b6702116fccd86f9cd3ddeb572c03d3a5977ea.tar.gz
download: sandcrawler-40b6702116fccd86f9cd3ddeb572c03d3a5977ea.zip
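handle alternative datetime format in WARC headers
A revisit record's datetime may be a UTC timestamp with a trailing 'Z' timezone designator (e.g. "2018-07-24T11:56:49Z"). That form is valid, but it is one character longer (20 instead of 19) than the form previously asserted on, so accept both lengths and strip the 'Z' when normalizing to the 14-digit timestamp.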
-rw-r--r-- python/sandcrawler/ia.py 6
1 file changed, 4 insertions, 2 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 2a334cc..e71f1e8 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -341,9 +341,11 @@ class WaybackClient:
raise WaybackError( "found revisit record, but won't resolve (loop?)")
revisit_uri, revisit_dt = gwb_record.refers_to
# convert revisit_dt
- assert len(revisit_dt) == 19 # len("2018-07-24T11:56:49")
+ # len("2018-07-24T11:56:49"), or with "Z"
+ assert len(revisit_dt) in (19, 20)
revisit_uri = revisit_uri.decode('utf-8')
- revisit_dt = revisit_dt.decode('utf-8').replace('-', '').replace(':', '').replace('T', '')
+ revisit_dt = revisit_dt.decode('utf-8').replace('-', '').replace(':', '').replace('T', '').replace('Z', '')
+ assert len(revisit_dt) == 14
revisit_cdx = self.cdx_client.fetch(revisit_uri, revisit_dt)
body = self.fetch_petabox_body(
csize=revisit_cdx.warc_csize,