From ba324ae5a6051c47d4cf7524c28caeda7abd6fc5 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Fri, 15 Oct 2021 13:30:37 -0700
Subject: move SPNv2 'simple_get' logic to SPN client

---
 python/sandcrawler/fileset_strategies.py | 24 +-----------------------
 python/sandcrawler/ia.py                 | 31 +++++++++++++++++++++++++++++--
 python/sandcrawler/ingest_file.py        | 28 +---------------------------
 3 files changed, 31 insertions(+), 52 deletions(-)

(limited to 'python')

diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index 2577d2b..d1193ee 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -197,23 +197,6 @@ class WebFilesetStrategy(FilesetIngestStrategy):
         self.try_spn2 = True
         self.spn_client = SavePageNowClient(spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0))
 
-        # XXX: this is copypasta, and also should be part of SPN client, not here
-        self.spn2_simple_get_domains = [
-            # direct PDF links
-            "://arxiv.org/pdf/",
-            "://europepmc.org/backend/ptpmcrender.fcgi",
-            "://pdfs.semanticscholar.org/",
-            "://res.mdpi.com/",
-
-            # platform sites
-            "://zenodo.org/",
-            "://figshare.org/",
-            "://springernature.figshare.com/",
-
-            # popular simple cloud storage or direct links
-            "://s3-eu-west-1.amazonaws.com/",
-        ]
-
     def process(self, item: FilesetPlatformItem) -> ArchiveStrategyResult:
         """
         For each manifest item individually, run 'fetch_resource' and record stats, terminal_url, terminal_dt
@@ -234,12 +217,7 @@ class WebFilesetStrategy(FilesetIngestStrategy):
 
             if self.try_spn2 and (resource == None or (resource and resource.status == 'no-capture')):
                 via = "spn2"
-                force_simple_get = 0
-                for domain in self.spn2_simple_get_domains:
-                    if domain in fetch_url:
-                        force_simple_get = 1
-                        break
-                resource = self.spn_client.crawl_resource(fetch_url, self.wayback_client, force_simple_get=force_simple_get)
+                resource = self.spn_client.crawl_resource(fetch_url, self.wayback_client)
 
             print("[FETCH {:>6}] {}  {}".format(
                     via,
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 0c3f621..a2ca346 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -808,7 +808,28 @@ class SavePageNowClient:
 
         self.spn_cdx_retry_sec = kwargs.get('spn_cdx_retry_sec', 9.0)
 
-    def save_url_now_v2(self, request_url, force_simple_get=0, capture_outlinks=0):
+        # Special-case URL patterns (matched as substrings of the request URL)
+        # for which we want SPN2 to simply run wget instead of a headless
+        # browser (brozzler). The motivation could be to work around browser
+        # issues, or in the future possibly to increase download efficiency
+        # (wget/fetch being faster than browser fetch).
+        self.simple_get_domains = [
+            # direct PDF links
+            "://arxiv.org/pdf/",
+            "://europepmc.org/backend/ptpmcrender.fcgi",
+            "://pdfs.semanticscholar.org/",
+            "://res.mdpi.com/",
+
+            # platform sites
+            "://zenodo.org/",
+            "://figshare.org/",
+            "://springernature.figshare.com/",
+
+            # popular simple cloud storage or direct links
+            "://s3-eu-west-1.amazonaws.com/",
+        ]
+
+    def save_url_now_v2(self, request_url, force_simple_get=None, capture_outlinks=0):
         """
         Returns a "SavePageNowResult" (namedtuple) if SPN request was processed
         at all, or raises an exception if there was an error with SPN itself.
@@ -842,6 +863,12 @@ class SavePageNowClient:
                 None,
                 None,
             )
+        if force_simple_get is None:
+            force_simple_get = 0
+            for domain in self.simple_get_domains:
+                if domain in request_url:
+                    force_simple_get = 1
+                    break
         resp = self.v2_session.post(
             self.v2endpoint,
             data={
@@ -929,7 +956,7 @@ class SavePageNowClient:
                 None,
             )
 
-    def crawl_resource(self, start_url, wayback_client, force_simple_get=0):
+    def crawl_resource(self, start_url, wayback_client, force_simple_get=None):
         """
         Runs a SPN2 crawl, then fetches body.
 
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index ce38e13..afaa329 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -152,27 +152,6 @@ class IngestFileWorker(SandcrawlerWorker):
             "error=cookies_not_supported",
         ]
 
-        # these are special-case web domains for which we want SPN2 to not run
-        # a headless browser (brozzler), but instead simply run wget.
-        # the motivation could be to work around browser issues, or in the
-        # future possibly to increase download efficiency (wget/fetch being
-        # faster than browser fetch)
-        self.spn2_simple_get_domains = [
-            # direct PDF links
-            "://arxiv.org/pdf/",
-            "://europepmc.org/backend/ptpmcrender.fcgi",
-            "://pdfs.semanticscholar.org/",
-            "://res.mdpi.com/",
-
-            # platform sites
-            "://zenodo.org/",
-            "://figshare.org/",
-            "://springernature.figshare.com/",
-
-            # popular simple cloud storage or direct links
-            "://s3-eu-west-1.amazonaws.com/",
-        ]
-
         self.src_valid_mimetypes = [
             "text/x-tex",
             "application/gzip",
@@ -266,12 +245,7 @@ class IngestFileWorker(SandcrawlerWorker):
 
         if self.try_spn2 and (resource == None or (resource and resource.status == 'no-capture') or soft404 or old_failure):
             via = "spn2"
-            force_simple_get = 0
-            for domain in self.spn2_simple_get_domains:
-                if domain in url:
-                    force_simple_get = 1
-                    break
-            resource = self.spn_client.crawl_resource(url, self.wayback_client, force_simple_get=force_simple_get)
+            resource = self.spn_client.crawl_resource(url, self.wayback_client)
         print("[FETCH {:>6}] {}  {}".format(
                 via,
                 (resource and resource.status),
-- 
cgit v1.2.3