diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-04-20 16:04:42 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-04-20 16:04:42 -0700 |
commit | b64f740e36c2a2bd78cba913c1a3aa9bd807d2d7 (patch) | |
tree | b747705ad40079319c4aedd2a5a0f5cfdede8395 | |
parent | 85c32687c45d076aa9cc90673f92d682c73a28d5 (diff) | |
download | sandcrawler-b64f740e36c2a2bd78cba913c1a3aa9bd807d2d7.tar.gz sandcrawler-b64f740e36c2a2bd78cba913c1a3aa9bd807d2d7.zip |
block isiarticles.com from future PDF crawls
-rw-r--r-- | python/sandcrawler/ingest_file.py | 2 |
1 file changed, 2 insertions, 0 deletions
```diff
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index 4ec37c1..11f2df9 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -153,6 +153,8 @@ class IngestFileWorker(SandcrawlerWorker):
             "doi.org/10.15468/",  # gbif.org: database entry
             # deprecated domain (doesn't redirect correctly)
             "://edoc.mpg.de/",
+            # bogus/spam PDFs
+            "://isiarticles.com/",
         ]
 
         self.wall_blocklist = [
```