aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-01-14 16:12:29 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-01-14 16:12:29 -0800
commit 2bf0095335203d200370e23922a6ff38ac98201c (patch)
tree 312240cbb069f681a7544775f0d49d903f31239f
parent 29d53a3b8cd27cb7a40ca9588a85ccb49dd98352 (diff)
download sandcrawler-2bf0095335203d200370e23922a6ff38ac98201c.tar.gz
sandcrawler-2bf0095335203d200370e23922a6ff38ac98201c.zip
filter out archive.org and web.archive.org (until implemented)
-rw-r--r-- python/sandcrawler/ingest.py 13
1 files changed, 12 insertions, 1 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 11b8a4c..bcb6608 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -70,7 +70,7 @@ class IngestFileWorker(SandcrawlerWorker):
"""
if not self.try_existing_ingest:
return None
- raise NotImplementedError
+ raise NotImplementedError("can't pre-check ingests yet")
# this "return True" is just here to make pylint happy
return True
@@ -82,6 +82,13 @@ class IngestFileWorker(SandcrawlerWorker):
"""
via = "none"
resource = None
+
+ if url.startswith("http://web.archive.org/web/") or url.startswith("https://web.archive.org/web/"):
+ raise NotImplementedError("handling direct wayback links not supported yet")
+
+ if url.startswith("http://archive.org/") or url.startswith("https://archive.org/"):
+ raise NotImplementedError("fetching from archive.org not implemented yet")
+
if self.try_wayback:
via = "wayback"
resource = self.wayback_client.lookup_resource(url, best_mimetype)
@@ -192,6 +199,10 @@ class IngestFileWorker(SandcrawlerWorker):
result['status'] = 'wayback-error'
result['error_message'] = str(e)[:1600]
return result
+ except NotImplementedError as e:
+ result['status'] = 'not-implemented'
+ result['error_message'] = str(e)[:1600]
+ return result
if not resource.hit:
result['status'] = resource.status