about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-10-15 13:30:37 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-10-15 18:15:29 -0700
commit: ba324ae5a6051c47d4cf7524c28caeda7abd6fc5 (patch)
tree: 8dfd640ddec952b37b9ad0438ee54cb7594d6b85
parent: 6cccac03451f46cb59897871e6631debca558771 (diff)
download: sandcrawler-ba324ae5a6051c47d4cf7524c28caeda7abd6fc5.tar.gz
download: sandcrawler-ba324ae5a6051c47d4cf7524c28caeda7abd6fc5.zip
move SPNv2 'simple_get' logic to SPN client
-rw-r--r-- python/sandcrawler/fileset_strategies.py 24
-rw-r--r-- python/sandcrawler/ia.py 31
-rw-r--r-- python/sandcrawler/ingest_file.py 28
3 files changed, 31 insertions, 52 deletions
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index 2577d2b..d1193ee 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -197,23 +197,6 @@ class WebFilesetStrategy(FilesetIngestStrategy):
self.try_spn2 = True
self.spn_client = SavePageNowClient(spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0))
- # XXX: this is copypasta, and also should be part of SPN client, not here
- self.spn2_simple_get_domains = [
- # direct PDF links
- "://arxiv.org/pdf/",
- "://europepmc.org/backend/ptpmcrender.fcgi",
- "://pdfs.semanticscholar.org/",
- "://res.mdpi.com/",
-
- # platform sites
- "://zenodo.org/",
- "://figshare.org/",
- "://springernature.figshare.com/",
-
- # popular simple cloud storage or direct links
- "://s3-eu-west-1.amazonaws.com/",
- ]
-
def process(self, item: FilesetPlatformItem) -> ArchiveStrategyResult:
"""
For each manifest item individually, run 'fetch_resource' and record stats, terminal_url, terminal_dt
@@ -234,12 +217,7 @@ class WebFilesetStrategy(FilesetIngestStrategy):
if self.try_spn2 and (resource == None or (resource and resource.status == 'no-capture')):
via = "spn2"
- force_simple_get = 0
- for domain in self.spn2_simple_get_domains:
- if domain in fetch_url:
- force_simple_get = 1
- break
- resource = self.spn_client.crawl_resource(fetch_url, self.wayback_client, force_simple_get=force_simple_get)
+ resource = self.spn_client.crawl_resource(fetch_url, self.wayback_client)
print("[FETCH {:>6}] {} {}".format(
via,
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 0c3f621..a2ca346 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -808,7 +808,28 @@ class SavePageNowClient:
self.spn_cdx_retry_sec = kwargs.get('spn_cdx_retry_sec', 9.0)
- def save_url_now_v2(self, request_url, force_simple_get=0, capture_outlinks=0):
+ # these are special-case web domains for which we want SPN2 to not run
+ # a headless browser (brozzler), but instead simply run wget.
+ # the motivation could be to work around browser issues, or in the
+ # future possibly to increase download efficiency (wget/fetch being
+ # faster than browser fetch)
+ self.simple_get_domains = [
+ # direct PDF links
+ "://arxiv.org/pdf/",
+ "://europepmc.org/backend/ptpmcrender.fcgi",
+ "://pdfs.semanticscholar.org/",
+ "://res.mdpi.com/",
+
+ # platform sites
+ "://zenodo.org/",
+ "://figshare.org/",
+ "://springernature.figshare.com/",
+
+ # popular simple cloud storage or direct links
+ "://s3-eu-west-1.amazonaws.com/",
+ ]
+
+ def save_url_now_v2(self, request_url, force_simple_get=None, capture_outlinks=0):
"""
Returns a "SavePageNowResult" (namedtuple) if SPN request was processed
at all, or raises an exception if there was an error with SPN itself.
@@ -842,6 +863,12 @@ class SavePageNowClient:
None,
None,
)
+ if force_simple_get is None:
+ force_simple_get = 0
+ for domain in self.simple_get_domains:
+ if domain in request_url:
+ force_simple_get = 1
+ break
resp = self.v2_session.post(
self.v2endpoint,
data={
@@ -929,7 +956,7 @@ class SavePageNowClient:
None,
)
- def crawl_resource(self, start_url, wayback_client, force_simple_get=0):
+ def crawl_resource(self, start_url, wayback_client, force_simple_get=None):
"""
Runs a SPN2 crawl, then fetches body.
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index ce38e13..afaa329 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -152,27 +152,6 @@ class IngestFileWorker(SandcrawlerWorker):
"error=cookies_not_supported",
]
- # these are special-case web domains for which we want SPN2 to not run
- # a headless browser (brozzler), but instead simply run wget.
- # the motivation could be to work around browser issues, or in the
- # future possibly to increase download efficiency (wget/fetch being
- # faster than browser fetch)
- self.spn2_simple_get_domains = [
- # direct PDF links
- "://arxiv.org/pdf/",
- "://europepmc.org/backend/ptpmcrender.fcgi",
- "://pdfs.semanticscholar.org/",
- "://res.mdpi.com/",
-
- # platform sites
- "://zenodo.org/",
- "://figshare.org/",
- "://springernature.figshare.com/",
-
- # popular simple cloud storage or direct links
- "://s3-eu-west-1.amazonaws.com/",
- ]
-
self.src_valid_mimetypes = [
"text/x-tex",
"application/gzip",
@@ -266,12 +245,7 @@ class IngestFileWorker(SandcrawlerWorker):
if self.try_spn2 and (resource == None or (resource and resource.status == 'no-capture') or soft404 or old_failure):
via = "spn2"
- force_simple_get = 0
- for domain in self.spn2_simple_get_domains:
- if domain in url:
- force_simple_get = 1
- break
- resource = self.spn_client.crawl_resource(url, self.wayback_client, force_simple_get=force_simple_get)
+ resource = self.spn_client.crawl_resource(url, self.wayback_client)
print("[FETCH {:>6}] {} {}".format(
via,
(resource and resource.status),