From f6e4cd94ad89d7c6c186b556ccc07534c8fe5919 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 24 Feb 2020 23:29:59 -0800
Subject: ingest: handle broken revisit records

---
 python/sandcrawler/ia.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 7230ee0..07e46c3 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -373,8 +373,11 @@ class WaybackClient:
         revisit_cdx = None
         if gwb_record.is_revisit():
             if not resolve_revisit:
-                raise WaybackError( "found revisit record, but won't resolve (loop?)")
+                raise WaybackError("found revisit record, but won't resolve (loop?)")
             revisit_uri, revisit_dt = gwb_record.refers_to
+            if not (revisit_uri and revisit_dt):
+                raise WaybackError("revisit record missing URI and/or DT: warc:{} offset:{}".format(
+                    warc_path, warc_offset))
             # convert revisit_dt
             # len("2018-07-24T11:56:49"), or with "Z"
             assert len(revisit_dt) in (19, 20)
-- 
cgit v1.2.3