aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-02-24 23:29:59 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-02-24 23:29:59 -0800
commit f6e4cd94ad89d7c6c186b556ccc07534c8fe5919 (patch)
tree 04f72af1b0254b5d6f7716e06d620d549a5005c0
parent 1b74e8a4dee21bd260040dad8072e4fb48456b3c (diff)
download sandcrawler-f6e4cd94ad89d7c6c186b556ccc07534c8fe5919.tar.gz
sandcrawler-f6e4cd94ad89d7c6c186b556ccc07534c8fe5919.zip
ingest: handle broken revisit records
-rw-r--r-- python/sandcrawler/ia.py 5
1 files changed, 4 insertions, 1 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 7230ee0..07e46c3 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -373,8 +373,11 @@ class WaybackClient:
revisit_cdx = None
if gwb_record.is_revisit():
if not resolve_revisit:
- raise WaybackError( "found revisit record, but won't resolve (loop?)")
+ raise WaybackError("found revisit record, but won't resolve (loop?)")
revisit_uri, revisit_dt = gwb_record.refers_to
+ if not (revisit_uri and revisit_dt):
+ raise WaybackError("revisit record missing URI and/or DT: warc:{} offset:{}".format(
+ warc_path, warc_offset))
# convert revisit_dt
# len("2018-07-24T11:56:49"), or with "Z"
assert len(revisit_dt) in (19, 20)