From 6b392a1a48f03c3bd2dc991756c28abef2e24a08 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Sat, 8 Aug 2020 17:10:41 -0700
Subject: add hkvalidate.perfdrive.com to domain blocklist

---
 python/sandcrawler/ingest.py | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'python')

diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index d63baff..58f3783 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -70,6 +70,9 @@ class IngestFileWorker(SandcrawlerWorker):
         self.try_spn2 = kwargs.get('try_spn2', True)
 
         self.base_url_blocklist = [
+            # robot blocking
+            "://hkvalidate.perfdrive.com/",
+
             # temporary, until we implement specific fetch and 'petabox' output
             "://archive.org/",
             "://web.archive.org/web/",
-- 
cgit v1.2.3