diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-15 13:30:37 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-15 18:15:29 -0700 |
commit | ba324ae5a6051c47d4cf7524c28caeda7abd6fc5 (patch) | |
tree | 8dfd640ddec952b37b9ad0438ee54cb7594d6b85 /python/sandcrawler/ingest_file.py | |
parent | 6cccac03451f46cb59897871e6631debca558771 (diff) | |
download | sandcrawler-ba324ae5a6051c47d4cf7524c28caeda7abd6fc5.tar.gz sandcrawler-ba324ae5a6051c47d4cf7524c28caeda7abd6fc5.zip |
move SPNv2 'simple_get' logic to SPN client
Diffstat (limited to 'python/sandcrawler/ingest_file.py')
-rw-r--r-- | python/sandcrawler/ingest_file.py | 28 |
1 file changed, 1 insertion, 27 deletions
```diff
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index ce38e13..afaa329 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -152,27 +152,6 @@ class IngestFileWorker(SandcrawlerWorker):
             "error=cookies_not_supported",
         ]
 
-        # these are special-case web domains for which we want SPN2 to not run
-        # a headless browser (brozzler), but instead simply run wget.
-        # the motivation could be to work around browser issues, or in the
-        # future possibly to increase download efficiency (wget/fetch being
-        # faster than browser fetch)
-        self.spn2_simple_get_domains = [
-            # direct PDF links
-            "://arxiv.org/pdf/",
-            "://europepmc.org/backend/ptpmcrender.fcgi",
-            "://pdfs.semanticscholar.org/",
-            "://res.mdpi.com/",
-
-            # platform sites
-            "://zenodo.org/",
-            "://figshare.org/",
-            "://springernature.figshare.com/",
-
-            # popular simple cloud storage or direct links
-            "://s3-eu-west-1.amazonaws.com/",
-        ]
-
         self.src_valid_mimetypes = [
             "text/x-tex",
             "application/gzip",
@@ -266,12 +245,7 @@ class IngestFileWorker(SandcrawlerWorker):
         if self.try_spn2 and (resource == None or (resource and resource.status == 'no-capture')
                 or soft404 or old_failure):
             via = "spn2"
-            force_simple_get = 0
-            for domain in self.spn2_simple_get_domains:
-                if domain in url:
-                    force_simple_get = 1
-                    break
-            resource = self.spn_client.crawl_resource(url, self.wayback_client, force_simple_get=force_simple_get)
+            resource = self.spn_client.crawl_resource(url, self.wayback_client)
             print("[FETCH {:>6}] {} {}".format(
                 via,
                 (resource and resource.status),
```