From 644e412c38c8897e171e3aa1244f1aa6955d8e65 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 11 Aug 2020 17:37:03 -0700 Subject: ingest: actually use force_get flag with SPN The code path was there, but the domain list wasn't actually populated with our most popular daily domains yet. Hopefully this will make a big difference in SPN throughput. --- python/sandcrawler/ingest.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'python') diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 918a832..d910665 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -113,6 +113,19 @@ class IngestFileWorker(SandcrawlerWorker): # future possibly to increase download efficiency (wget/fetch being # faster than browser fetch) self.spn2_simple_get_domains = [ + # direct PDF links + "://arxiv.org/pdf/", + "://europepmc.org/backend/ptpmcrender.fcgi", + "://pdfs.semanticscholar.org/", + "://res.mdpi.com/", + + # platform sites + "://zenodo.org/", + "://figshare.org/", + "://springernature.figshare.com/", + + # popular simple cloud storage or direct links + "://s3-eu-west-1.amazonaws.com/", ] -- cgit v1.2.3