about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-08-08 17:10:41 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-08-08 17:10:41 -0700
commit: 6b392a1a48f03c3bd2dc991756c28abef2e24a08 (patch)
tree: 9d3b6fc51cce82256d51c8b5688c501bf0bb9b20 /python
parent: 92754a7a12ec56dd958d879ececbc4f19e9623b0 (diff)
download: sandcrawler-6b392a1a48f03c3bd2dc991756c28abef2e24a08.tar.gz
sandcrawler-6b392a1a48f03c3bd2dc991756c28abef2e24a08.zip
add hkvalidate.perfdrive.com to domain blocklist
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/ingest.py | 3
1 files changed, 3 insertions, 0 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index d63baff..58f3783 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -70,6 +70,9 @@ class IngestFileWorker(SandcrawlerWorker):
self.try_spn2 = kwargs.get('try_spn2', True)
self.base_url_blocklist = [
+ # robot blocking
+ "://hkvalidate.perfdrive.com/",
+
# temporary, until we implement specific fetch and 'petabox' output
"://archive.org/",
"://web.archive.org/web/",