aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2022-07-07 13:17:49 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2022-07-07 13:17:49 -0700
commit: 695a80a64f02f4c23bb938ecfffeef146344841f (patch)
tree: 8a8aef3b3e9880bb6404039f0423b22bcdb0b7aa
parent: fcc5a1648d2e49e7002ca569ed668d3318a75584 (diff)
download: sandcrawler-695a80a64f02f4c23bb938ecfffeef146344841f.tar.gz
download: sandcrawler-695a80a64f02f4c23bb938ecfffeef146344841f.zip
ingest: IEEE domain is blocking us
-rw-r--r-- python/sandcrawler/ingest_file.py | 3
1 files changed, 2 insertions, 1 deletions
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index eca8bf8..c79973f 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -116,8 +116,9 @@ class IngestFileWorker(SandcrawlerWorker):
self.max_html_resources = 200
self.base_url_blocklist = [
- # robot blocking
+ # robot blocking / rate-limited
"://hkvalidate.perfdrive.com/",
+ "://ieeexplore.ieee.org/",
# temporary, until we implement specific fetch and 'petabox' output
"://archive.org/",
"://www.archive.org/",