diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-01-14 16:12:29 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-14 16:12:29 -0800 |
commit | 2bf0095335203d200370e23922a6ff38ac98201c (patch) | |
tree | 312240cbb069f681a7544775f0d49d903f31239f | |
parent | 29d53a3b8cd27cb7a40ca9588a85ccb49dd98352 (diff) | |
download | sandcrawler-2bf0095335203d200370e23922a6ff38ac98201c.tar.gz sandcrawler-2bf0095335203d200370e23922a6ff38ac98201c.zip |
filter out archive.org and web.archive.org (until implemented)
-rw-r--r-- | python/sandcrawler/ingest.py | 13 |
1 files changed, 12 insertions, 1 deletions
```diff
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 11b8a4c..bcb6608 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -70,7 +70,7 @@ class IngestFileWorker(SandcrawlerWorker):
         """
         if not self.try_existing_ingest:
             return None
-        raise NotImplementedError
+        raise NotImplementedError("can't pre-check ingests yet")
         # this "return True" is just here to make pylint happy
         return True
 
@@ -82,6 +82,13 @@ class IngestFileWorker(SandcrawlerWorker):
         """
         via = "none"
         resource = None
+
+        if url.startswith("http://web.archive.org/web/") or url.startswith("https://web.archive.org/web/"):
+            raise NotImplementedError("handling direct wayback links not supported yet")
+
+        if url.startswith("http://archive.org/") or url.startswith("https://archive.org/"):
+            raise NotImplementedError("fetching from archive.org not implemented yet")
+
         if self.try_wayback:
             via = "wayback"
             resource = self.wayback_client.lookup_resource(url, best_mimetype)
@@ -192,6 +199,10 @@ class IngestFileWorker(SandcrawlerWorker):
                 result['status'] = 'wayback-error'
                 result['error_message'] = str(e)[:1600]
                 return result
+            except NotImplementedError as e:
+                result['status'] = 'not-implemented'
+                result['error_message'] = str(e)[:1600]
+                return result
 
             if not resource.hit:
                 result['status'] = resource.status
```