From 695a80a64f02f4c23bb938ecfffeef146344841f Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 7 Jul 2022 13:17:49 -0700 Subject: ingest: IEEE domain is blocking us --- python/sandcrawler/ingest_file.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'python/sandcrawler') diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py index eca8bf8..c79973f 100644 --- a/python/sandcrawler/ingest_file.py +++ b/python/sandcrawler/ingest_file.py @@ -116,8 +116,9 @@ class IngestFileWorker(SandcrawlerWorker): self.max_html_resources = 200 self.base_url_blocklist = [ - # robot blocking + # robot blocking / rate-limited "://hkvalidate.perfdrive.com/", + "://ieeexplore.ieee.org/", # temporary, until we implement specific fetch and 'petabox' output "://archive.org/", "://www.archive.org/", -- cgit v1.2.3