diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-08-08 17:10:41 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-08-08 17:10:41 -0700 |
commit | 6b392a1a48f03c3bd2dc991756c28abef2e24a08 (patch) | |
tree | 9d3b6fc51cce82256d51c8b5688c501bf0bb9b20 | |
parent | 92754a7a12ec56dd958d879ececbc4f19e9623b0 (diff) | |
download | sandcrawler-6b392a1a48f03c3bd2dc991756c28abef2e24a08.tar.gz sandcrawler-6b392a1a48f03c3bd2dc991756c28abef2e24a08.zip |
add hkvalidate.perfdrive.com to domain blocklist
-rw-r--r-- | python/sandcrawler/ingest.py | 3 |
1 file changed, 3 insertions, 0 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index d63baff..58f3783 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -70,6 +70,9 @@ class IngestFileWorker(SandcrawlerWorker):
         self.try_spn2 = kwargs.get('try_spn2', True)
 
         self.base_url_blocklist = [
+            # robot blocking
+            "://hkvalidate.perfdrive.com/",
+
             # temporary, until we implement specific fetch and 'petabox' output
             "://archive.org/",
             "://web.archive.org/web/",