diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-10-23 13:17:59 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-10-23 13:17:59 -0700 |
commit | 12a51fd28ca64338fca040ab7c470a70bf7a2a1b (patch) | |
tree | 69e29992b332da3edc9513dd416fe54b69f9696f | |
parent | 2b6f6c98598442fff04e76c658e8eb331fba4c9f (diff) | |
download | sandcrawler-12a51fd28ca64338fca040ab7c470a70bf7a2a1b.tar.gz sandcrawler-12a51fd28ca64338fca040ab7c470a70bf7a2a1b.zip |
ingest: skip JSTOR DOI prefixes
-rw-r--r-- | python/sandcrawler/ingest.py | 3 |
1 file changed, 3 insertions, 0 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index d2a9980..6d8b162 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -101,6 +101,9 @@ class IngestFileWorker(SandcrawlerWorker):
             # Historical non-paper content:
             "dhz.uni-passau.de/", # newspapers
             "digital.ucd.ie/", # ireland national historical
+
+            # DOI prefixes
+            "://doi.org/10.2307/", # JSTOR; slow and many redirects
         ]
 
         self.wall_blocklist = [