diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-04-13 19:58:57 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-04-13 19:58:57 -0700 |
commit | 21492d84fa5a00b7b3e008aefb0799c37bc0c4f2 (patch) | |
tree | 8d9d8742a8dcf7e77ad93588b1774bcebfc42022 | |
parent | ff36a4372e3f24efa531bfe6156e4ee08d458e08 (diff) | |
download | sandcrawler-21492d84fa5a00b7b3e008aefb0799c37bc0c4f2.tar.gz sandcrawler-21492d84fa5a00b7b3e008aefb0799c37bc0c4f2.zip |
ingest: quick hack to capture CNKI outlinks
-rw-r--r-- | python/sandcrawler/ia.py | 11 |
1 file changed, 9 insertions, 2 deletions
```diff
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index d6580e6..24ff619 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -769,7 +769,7 @@ class SavePageNowClient:
         self.poll_count = 60
         self.poll_seconds = 3.0
 
-    def save_url_now_v2(self, request_url, force_get=0):
+    def save_url_now_v2(self, request_url, force_get=0, capture_outlinks=0):
         """
         Returns a "SavePageNowResult" (namedtuple) if SPN request was processed
         at all, or raises an exception if there was an error with SPN itself.
@@ -789,6 +789,8 @@ class SavePageNowClient:
         TODO: parse SPN error codes (status string) and handle better. Eg,
         non-200 remote statuses, invalid hosts/URLs, timeouts, backoff, etc.
         """
+        if capture_outlinks:
+            print(" capturing outlinks!", file=sys.stdout)
         if not (self.ia_access_key and self.ia_secret_key):
             raise Exception("SPN2 requires authentication (IA_ACCESS_KEY/IA_SECRET_KEY)")
         if request_url.startswith("ftp://"):
@@ -806,6 +808,7 @@ class SavePageNowClient:
             data={
                 'url': request_url,
                 'capture_all': 1,
+                'capture_outlinks': capture_outlinks,
                 'capture_screenshot': 0,
                 'if_not_archived_within': '1d',
                 'force_get': force_get,
@@ -887,7 +890,11 @@ class SavePageNowClient:
         TODO: possible to fetch from petabox?
         """
 
-        spn_result = self.save_url_now_v2(start_url, force_get=force_get)
+        # HACK: capture CNKI domains with outlinks (for COVID-19 crawling)
+        if 'gzbd.cnki.net/' in start_url:
+            spn_result = self.save_url_now_v2(start_url, force_get=force_get, capture_outlinks=1)
+        else:
+            spn_result = self.save_url_now_v2(start_url, force_get=force_get)
 
         if not spn_result.success:
             status = spn_result.status
```