aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2022-04-20 16:04:42 -0700
committer Bryan Newbold <bnewbold@archive.org> 2022-04-20 16:04:42 -0700
commit b64f740e36c2a2bd78cba913c1a3aa9bd807d2d7 (patch)
tree b747705ad40079319c4aedd2a5a0f5cfdede8395
parent 85c32687c45d076aa9cc90673f92d682c73a28d5 (diff)
download sandcrawler-b64f740e36c2a2bd78cba913c1a3aa9bd807d2d7.tar.gz
sandcrawler-b64f740e36c2a2bd78cba913c1a3aa9bd807d2d7.zip
block isiarticles.com from future PDF crawls
-rw-r--r-- python/sandcrawler/ingest_file.py 2
1 files changed, 2 insertions, 0 deletions
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index 4ec37c1..11f2df9 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -153,6 +153,8 @@ class IngestFileWorker(SandcrawlerWorker):
"doi.org/10.15468/", # gbif.org: database entry
# deprecated domain (doesn't redirect correctly)
"://edoc.mpg.de/",
+ # bogus/spam PDFs
+ "://isiarticles.com/",
]
self.wall_blocklist = [