about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-10-23 13:17:59 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-10-23 13:17:59 -0700
commit: 12a51fd28ca64338fca040ab7c470a70bf7a2a1b (patch)
tree: 69e29992b332da3edc9513dd416fe54b69f9696f
parent: 2b6f6c98598442fff04e76c658e8eb331fba4c9f (diff)
download: sandcrawler-12a51fd28ca64338fca040ab7c470a70bf7a2a1b.tar.gz
download: sandcrawler-12a51fd28ca64338fca040ab7c470a70bf7a2a1b.zip
ingest: skip JSTOR DOI prefixes
-rw-r--r-- python/sandcrawler/ingest.py 3
1 files changed, 3 insertions, 0 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index d2a9980..6d8b162 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -101,6 +101,9 @@ class IngestFileWorker(SandcrawlerWorker):
# Historical non-paper content:
"dhz.uni-passau.de/", # newspapers
"digital.ucd.ie/", # ireland national historical
+
+ # DOI prefixes
+ "://doi.org/10.2307/", # JSTOR; slow and many redirects
]
self.wall_blocklist = [