diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/fileset_strategies.py | 24 | ||||
-rw-r--r-- | python/sandcrawler/ia.py | 31 | ||||
-rw-r--r-- | python/sandcrawler/ingest_file.py | 28 |
3 files changed, 31 insertions, 52 deletions
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py index 2577d2b..d1193ee 100644 --- a/python/sandcrawler/fileset_strategies.py +++ b/python/sandcrawler/fileset_strategies.py @@ -197,23 +197,6 @@ class WebFilesetStrategy(FilesetIngestStrategy): self.try_spn2 = True self.spn_client = SavePageNowClient(spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0)) - # XXX: this is copypasta, and also should be part of SPN client, not here - self.spn2_simple_get_domains = [ - # direct PDF links - "://arxiv.org/pdf/", - "://europepmc.org/backend/ptpmcrender.fcgi", - "://pdfs.semanticscholar.org/", - "://res.mdpi.com/", - - # platform sites - "://zenodo.org/", - "://figshare.org/", - "://springernature.figshare.com/", - - # popular simple cloud storage or direct links - "://s3-eu-west-1.amazonaws.com/", - ] - def process(self, item: FilesetPlatformItem) -> ArchiveStrategyResult: """ For each manifest item individually, run 'fetch_resource' and record stats, terminal_url, terminal_dt @@ -234,12 +217,7 @@ class WebFilesetStrategy(FilesetIngestStrategy): if self.try_spn2 and (resource == None or (resource and resource.status == 'no-capture')): via = "spn2" - force_simple_get = 0 - for domain in self.spn2_simple_get_domains: - if domain in fetch_url: - force_simple_get = 1 - break - resource = self.spn_client.crawl_resource(fetch_url, self.wayback_client, force_simple_get=force_simple_get) + resource = self.spn_client.crawl_resource(fetch_url, self.wayback_client) print("[FETCH {:>6}] {} {}".format( via, diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 0c3f621..a2ca346 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -808,7 +808,28 @@ class SavePageNowClient: self.spn_cdx_retry_sec = kwargs.get('spn_cdx_retry_sec', 9.0) - def save_url_now_v2(self, request_url, force_simple_get=0, capture_outlinks=0): + # these are special-case web domains for which we want SPN2 to not run + # a headless browser (brozzler), but instead simply run wget. + # the motivation could be to work around browser issues, or in the + # future possibly to increase download efficiency (wget/fetch being + # faster than browser fetch) + self.simple_get_domains = [ + # direct PDF links + "://arxiv.org/pdf/", + "://europepmc.org/backend/ptpmcrender.fcgi", + "://pdfs.semanticscholar.org/", + "://res.mdpi.com/", + + # platform sites + "://zenodo.org/", + "://figshare.org/", + "://springernature.figshare.com/", + + # popular simple cloud storage or direct links + "://s3-eu-west-1.amazonaws.com/", + ] + + def save_url_now_v2(self, request_url, force_simple_get=None, capture_outlinks=0): """ Returns a "SavePageNowResult" (namedtuple) if SPN request was processed at all, or raises an exception if there was an error with SPN itself. @@ -842,6 +863,12 @@ class SavePageNowClient: None, None, ) + if force_simple_get is None: + force_simple_get = 0 + for domain in self.simple_get_domains: + if domain in request_url: + force_simple_get = 1 + break resp = self.v2_session.post( self.v2endpoint, data={ @@ -929,7 +956,7 @@ class SavePageNowClient: None, ) - def crawl_resource(self, start_url, wayback_client, force_simple_get=0): + def crawl_resource(self, start_url, wayback_client, force_simple_get=None): """ Runs a SPN2 crawl, then fetches body. diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py index ce38e13..afaa329 100644 --- a/python/sandcrawler/ingest_file.py +++ b/python/sandcrawler/ingest_file.py @@ -152,27 +152,6 @@ class IngestFileWorker(SandcrawlerWorker): "error=cookies_not_supported", ] - # these are special-case web domains for which we want SPN2 to not run - # a headless browser (brozzler), but instead simply run wget. - # the motivation could be to work around browser issues, or in the - # future possibly to increase download efficiency (wget/fetch being - # faster than browser fetch) - self.spn2_simple_get_domains = [ - # direct PDF links - "://arxiv.org/pdf/", - "://europepmc.org/backend/ptpmcrender.fcgi", - "://pdfs.semanticscholar.org/", - "://res.mdpi.com/", - - # platform sites - "://zenodo.org/", - "://figshare.org/", - "://springernature.figshare.com/", - - # popular simple cloud storage or direct links - "://s3-eu-west-1.amazonaws.com/", - ] - self.src_valid_mimetypes = [ "text/x-tex", "application/gzip", @@ -266,12 +245,7 @@ class IngestFileWorker(SandcrawlerWorker): if self.try_spn2 and (resource == None or (resource and resource.status == 'no-capture') or soft404 or old_failure): via = "spn2" - force_simple_get = 0 - for domain in self.spn2_simple_get_domains: - if domain in url: - force_simple_get = 1 - break - resource = self.spn_client.crawl_resource(url, self.wayback_client, force_simple_get=force_simple_get) + resource = self.spn_client.crawl_resource(url, self.wayback_client) print("[FETCH {:>6}] {} {}".format( via, (resource and resource.status), |