author    Bryan Newbold <bnewbold@archive.org>  2020-03-18 18:49:05 -0700
committer Bryan Newbold <bnewbold@archive.org>  2020-03-18 18:49:09 -0700
commit    cb16d18137c936a634b75bf0eb6acb43c77d9290 (patch)
tree      4b8b72aa7cd1d5a9da81c6233ea10b6cdc837d2a /python
parent    e1b3edd7af59fe0fd4272a4696387ea09a22a6c0 (diff)
download  sandcrawler-cb16d18137c936a634b75bf0eb6acb43c77d9290.tar.gz
          sandcrawler-cb16d18137c936a634b75bf0eb6acb43c77d9290.zip
implement (unused) force_get flag for SPN2
I hoped this feature would make it possible to crawl journals.lww.com PDFs, because the token URLs work with `wget`, but it still doesn't seem to work. Maybe because of the user agent? Anyway, this feature might be useful for crawling efficiency, so adding it to master.
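For context, the flag maps onto a parameter of the SPN2 HTTP API. A minimal sketch of the kind of request involved, assuming the public https://web.archive.org/save endpoint and an IA S3-style API key (the credentials and target URL below are placeholders, not part of this commit):

    import requests

    # placeholder credentials; SPN2 expects an IA S3-style key pair
    SPN2_ENDPOINT = "https://web.archive.org/save"
    API_KEY = "ACCESSKEY:SECRET"

    resp = requests.post(
        SPN2_ENDPOINT,
        headers={
            "Accept": "application/json",
            "Authorization": "LOW {}".format(API_KEY),
        },
        data={
            "url": "https://journals.lww.com/example.pdf",  # placeholder
            "capture_all": 1,
            "capture_screenshot": 0,
            "if_not_archived_within": "1d",
            # force_get=1 asks SPN2 to fetch with a plain HTTP GET
            # instead of driving a headless browser (brozzler)
            "force_get": 1,
        },
    )
    resp.raise_for_status()
    print(resp.json())
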
Diffstat (limited to 'python')
-rw-r--r--  python/sandcrawler/ia.py      7
-rw-r--r--  python/sandcrawler/ingest.py  16
2 files changed, 19 insertions(+), 4 deletions(-)
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index e31ff30..0a0e0ae 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -758,7 +758,7 @@ class SavePageNowClient:
         self.poll_count = 60
         self.poll_seconds = 3.0
 
-    def save_url_now_v2(self, request_url):
+    def save_url_now_v2(self, request_url, force_get=0):
         """
         Returns a "SavePageNowResult" (namedtuple) if SPN request was processed
         at all, or raises an exception if there was an error with SPN itself.
@@ -797,6 +797,7 @@ class SavePageNowClient:
                 'capture_all': 1,
                 'capture_screenshot': 0,
                 'if_not_archived_within': '1d',
+                'force_get': force_get,
             },
         )
         if resp.status_code == 429:
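The hunk above ends at SPN2's rate-limit check (HTTP 429, "Too Many Requests"). How the surrounding code reacts to 429 is not shown in this diff, but a caller-side retry sketch might look like this (the helper name and backoff policy are hypothetical; session would be e.g. a requests.Session):

    import time

    def post_with_backoff(session, endpoint, data, tries=5, base_delay=30.0):
        # hypothetical helper: retry an SPN2 request on HTTP 429,
        # sleeping progressively longer between attempts
        resp = None
        for attempt in range(tries):
            resp = session.post(endpoint, data=data)
            if resp.status_code != 429:
                break
            time.sleep(base_delay * (attempt + 1))
        return resp
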
@@ -866,14 +867,14 @@
             None,
         )
 
-    def crawl_resource(self, start_url, wayback_client):
+    def crawl_resource(self, start_url, wayback_client, force_get=0):
         """
         Runs a SPN2 crawl, then fetches body from wayback.
 
         TODO: possible to fetch from petabox?
         """
-        spn_result = self.save_url_now_v2(start_url)
+        spn_result = self.save_url_now_v2(start_url, force_get=force_get)
         if not spn_result.success:
             status = spn_result.status
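Caller-side, the new keyword argument just threads through crawl_resource into save_url_now_v2. A usage sketch (client construction is simplified; only the .hit and .status result fields appear in the surrounding code):

    # usage sketch, assuming configured SPN and wayback clients
    spn_client = SavePageNowClient()
    resource = spn_client.crawl_resource(
        "https://journals.lww.com/example.pdf",  # placeholder URL
        wayback_client,
        force_get=1,  # skip brozzler; capture with a plain GET
    )
    if resource.hit:
        print(resource.status)
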
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 5dc5b55..c9a697c 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -93,6 +93,15 @@ class IngestFileWorker(SandcrawlerWorker):
             "digital.ucd.ie/", # ireland national historical
         ]
 
+        # these are special-case web domains for which we want SPN2 to not run
+        # a headless browser (brozzler), but instead simply run wget.
+        # the motivation could be to work around browser issues, or in the
+        # future possibly to increase download efficiency (wget/fetch being
+        # faster than browser fetch)
+        self.spn2_simple_get_domains = [
+        ]
+
+
     def check_existing_ingest(self, base_url):
         """
         Check in sandcrawler-db (postgres) to see if we have already ingested
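The spn2_simple_get_domains list lands empty in this commit. Had the journals.lww.com experiment worked, it would presumably be populated along these lines (hypothetical entry, not part of the commit):

    self.spn2_simple_get_domains = [
        # hypothetical: token URLs reportedly work with wget
        "journals.lww.com",
    ]
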
@@ -138,7 +147,12 @@ class IngestFileWorker(SandcrawlerWorker):
 
         if self.try_spn2 and (not resource or not resource.hit or soft404):
             via = "spn2"
-            resource = self.spn_client.crawl_resource(url, self.wayback_client)
+            force_get = 0
+            for domain in self.spn2_simple_get_domains:
+                if domain in url:
+                    force_get = 1
+                    break
+            resource = self.spn_client.crawl_resource(url, self.wayback_client, force_get=force_get)
         print("[FETCH {}\t] {}\t{}".format(
             via,
             resource.status,
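One thing worth noting about the loop above: `domain in url` is a plain substring test over the full URL, so a domain string appearing in a path or query string would also trigger force_get. A stricter variant (illustrative only; not what this commit does) would compare the parsed hostname:

    from urllib.parse import urlparse

    def wants_simple_get(url, simple_get_domains):
        # compare the parsed hostname rather than substring-matching
        # the whole URL
        host = urlparse(url).netloc.lower()
        return any(host == d or host.endswith("." + d) for d in simple_get_domains)

    assert wants_simple_get("http://journals.lww.com/a.pdf", ["journals.lww.com"])
    assert not wants_simple_get("http://example.com/?u=journals.lww.com", ["journals.lww.com"])
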