From b64f740e36c2a2bd78cba913c1a3aa9bd807d2d7 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 20 Apr 2022 16:04:42 -0700
Subject: block isiarticles.com from future PDF crawls

---
 python/sandcrawler/ingest_file.py | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'python/sandcrawler')

diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index 4ec37c1..11f2df9 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -153,6 +153,8 @@ class IngestFileWorker(SandcrawlerWorker):
             "doi.org/10.15468/", # gbif.org: database entry
             # deprecated domain (doesn't redirect correctly)
             "://edoc.mpg.de/",
+            # bogus/spam PDFs
+            "://isiarticles.com/",
         ]
 
         self.wall_blocklist = [
-- 
cgit v1.2.3