| author | Bryan Newbold <bnewbold@archive.org> | 2020-03-18 18:49:05 -0700 |
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-03-18 18:49:09 -0700 |
| commit | cb16d18137c936a634b75bf0eb6acb43c77d9290 | |
| tree | 4b8b72aa7cd1d5a9da81c6233ea10b6cdc837d2a | |
| parent | e1b3edd7af59fe0fd4272a4696387ea09a22a6c0 | |
implement (unused) force_get flag for SPN2
I hoped this feature would make it possible to crawl journals.lww.com
PDFs, because the token URLs work with `wget`, but it still doesn't seem
to work. Maybe because of the user agent?

Anyway, this feature might be useful for crawling efficiency, so adding
it to master.
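For context, a raw capture request with the new flag set would look roughly like the sketch below. This is a minimal illustration, assuming the public SPN2 HTTP API endpoint at https://web.archive.org/save and IA-S3 style `LOW key:secret` authorization; the credentials and target URL are placeholders, and the request fields mirror those in the diff.

```python
# Hedged sketch of an SPN2 capture request with force_get enabled.
# API_KEY/API_SECRET and the target URL are placeholders, not real values.
import requests

API_KEY, API_SECRET = "EXAMPLE_KEY", "EXAMPLE_SECRET"

resp = requests.post(
    "https://web.archive.org/save",
    headers={
        "Accept": "application/json",
        "Authorization": "LOW {}:{}".format(API_KEY, API_SECRET),
    },
    data={
        "url": "https://journals.lww.com/example.pdf",  # hypothetical target
        "capture_all": 1,
        "capture_screenshot": 0,
        "if_not_archived_within": "1d",
        "force_get": 1,  # ask SPN2 for a plain HTTP GET, no headless browser
    },
)
resp.raise_for_status()
print(resp.json())  # response includes a job id to poll for capture status
```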
 python/sandcrawler/ia.py     |  7 ++++---
 python/sandcrawler/ingest.py | 16 +++++++++++++++-
 2 files changed, 19 insertions(+), 4 deletions(-)
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index e31ff30..0a0e0ae 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -758,7 +758,7 @@ class SavePageNowClient:
         self.poll_count = 60
         self.poll_seconds = 3.0
 
-    def save_url_now_v2(self, request_url):
+    def save_url_now_v2(self, request_url, force_get=0):
         """
         Returns a "SavePageNowResult" (namedtuple) if SPN request was
         processed at all, or raises an exception if there was an error with
         SPN itself.
@@ -797,6 +797,7 @@ class SavePageNowClient:
                 'capture_all': 1,
                 'capture_screenshot': 0,
                 'if_not_archived_within': '1d',
+                'force_get': force_get,
             },
         )
         if resp.status_code == 429:
@@ -866,14 +867,14 @@ class SavePageNowClient:
                 None,
             )
 
-    def crawl_resource(self, start_url, wayback_client):
+    def crawl_resource(self, start_url, wayback_client, force_get=0):
        """
        Runs a SPN2 crawl, then fetches body from wayback.
 
        TODO: possible to fetch from petabox?
        """
-        spn_result = self.save_url_now_v2(start_url)
+        spn_result = self.save_url_now_v2(start_url, force_get=force_get)
 
        if not spn_result.success:
            status = spn_result.status
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 5dc5b55..c9a697c 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -93,6 +93,15 @@ class IngestFileWorker(SandcrawlerWorker):
             "digital.ucd.ie/", # ireland national historical
         ]
 
+        # these are special-case web domains for which we want SPN2 to not run
+        # a headless browser (brozzler), but instead simply run wget.
+        # the motivation could be to work around browser issues, or in the
+        # future possibly to increase download efficiency (wget/fetch being
+        # faster than browser fetch)
+        self.spn2_simple_get_domains = [
+        ]
+
+
     def check_existing_ingest(self, base_url):
         """
         Check in sandcrawler-db (postgres) to see if we have already ingested
@@ -138,7 +147,12 @@ class IngestFileWorker(SandcrawlerWorker):
 
         if self.try_spn2 and (not resource or not resource.hit or soft404):
             via = "spn2"
-            resource = self.spn_client.crawl_resource(url, self.wayback_client)
+            force_get = 0
+            for domain in self.spn2_simple_get_domains:
+                if domain in url:
+                    force_get = 1
+                    break
+            resource = self.spn_client.crawl_resource(url, self.wayback_client, force_get=force_get)
         print("[FETCH {}\t] {}\t{}".format(
             via,
             resource.status,
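To illustrate the new code path in ingest.py, here is a minimal standalone sketch of the domain check. The journals.lww.com entry and the URL are hypothetical: in this commit the `spn2_simple_get_domains` list ships empty, which is why the flag is "(unused)".

```python
# Standalone sketch of the domain check added to IngestFileWorker.
# NOTE: the journals.lww.com entry is hypothetical; the list is empty
# in this commit, so force_get always stays 0 in practice.
spn2_simple_get_domains = ["journals.lww.com"]

url = "https://journals.lww.com/example/token.pdf"  # hypothetical URL
force_get = 0
for domain in spn2_simple_get_domains:
    if domain in url:  # plain substring match, not a parsed-hostname check
        force_get = 1
        break

# crawl_resource() forwards force_get to save_url_now_v2(), which sets the
# 'force_get' field on the SPN2 capture request.
assert force_get == 1
```

Note that the match is a plain substring test against the full URL, so a listed domain would also match if it appeared anywhere in another URL's path or query string.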