aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/ingest_file.py 3
1 files changed, 2 insertions, 1 deletions
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index eca8bf8..c79973f 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -116,8 +116,9 @@ class IngestFileWorker(SandcrawlerWorker):
self.max_html_resources = 200
self.base_url_blocklist = [
- # robot blocking
+ # robot blocking / rate-limited
"://hkvalidate.perfdrive.com/",
+ "://ieeexplore.ieee.org/",
# temporary, until we implement specific fetch and 'petabox' output
"://archive.org/",
"://www.archive.org/",