diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-02-24 23:29:59 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-02-24 23:29:59 -0800 |
commit | f6e4cd94ad89d7c6c186b556ccc07534c8fe5919 (patch) | |
tree | 04f72af1b0254b5d6f7716e06d620d549a5005c0 | |
parent | 1b74e8a4dee21bd260040dad8072e4fb48456b3c (diff) | |
download | sandcrawler-f6e4cd94ad89d7c6c186b556ccc07534c8fe5919.tar.gz sandcrawler-f6e4cd94ad89d7c6c186b556ccc07534c8fe5919.zip |
ingest: handle broken revisit records
-rw-r--r-- | python/sandcrawler/ia.py | 5 |
1 files changed, 4 insertions, 1 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 7230ee0..07e46c3 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -373,8 +373,11 @@ class WaybackClient:
         revisit_cdx = None
         if gwb_record.is_revisit():
             if not resolve_revisit:
-                raise WaybackError( "found revisit record, but won't resolve (loop?)")
+                raise WaybackError("found revisit record, but won't resolve (loop?)")
             revisit_uri, revisit_dt = gwb_record.refers_to
+            if not (revisit_uri and revisit_dt):
+                raise WaybackError("revisit record missing URI and/or DT: warc:{} offset:{}".format(
+                    warc_path, warc_offset))
             # convert revisit_dt
             # len("2018-07-24T11:56:49"), or with "Z"
             assert len(revisit_dt) in (19, 20)