about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-04-13 19:58:57 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-04-13 19:58:57 -0700
commit: 21492d84fa5a00b7b3e008aefb0799c37bc0c4f2 (patch)
tree: 8d9d8742a8dcf7e77ad93588b1774bcebfc42022
parent: ff36a4372e3f24efa531bfe6156e4ee08d458e08 (diff)
download: sandcrawler-21492d84fa5a00b7b3e008aefb0799c37bc0c4f2.tar.gz
download: sandcrawler-21492d84fa5a00b7b3e008aefb0799c37bc0c4f2.zip
ingest: quick hack to capture CNKI outlinks
-rw-r--r-- python/sandcrawler/ia.py | 11
1 files changed, 9 insertions, 2 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index d6580e6..24ff619 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -769,7 +769,7 @@ class SavePageNowClient:
self.poll_count = 60
self.poll_seconds = 3.0
- def save_url_now_v2(self, request_url, force_get=0):
+ def save_url_now_v2(self, request_url, force_get=0, capture_outlinks=0):
"""
Returns a "SavePageNowResult" (namedtuple) if SPN request was processed
at all, or raises an exception if there was an error with SPN itself.
@@ -789,6 +789,8 @@ class SavePageNowClient:
TODO: parse SPN error codes (status string) and handle better. Eg,
non-200 remote statuses, invalid hosts/URLs, timeouts, backoff, etc.
"""
+ if capture_outlinks:
+ print(" capturing outlinks!", file=sys.stdout)
if not (self.ia_access_key and self.ia_secret_key):
raise Exception("SPN2 requires authentication (IA_ACCESS_KEY/IA_SECRET_KEY)")
if request_url.startswith("ftp://"):
@@ -806,6 +808,7 @@ class SavePageNowClient:
data={
'url': request_url,
'capture_all': 1,
+ 'capture_outlinks': capture_outlinks,
'capture_screenshot': 0,
'if_not_archived_within': '1d',
'force_get': force_get,
@@ -887,7 +890,11 @@ class SavePageNowClient:
TODO: possible to fetch from petabox?
"""
- spn_result = self.save_url_now_v2(start_url, force_get=force_get)
+ # HACK: capture CNKI domains with outlinks (for COVID-19 crawling)
+ if 'gzbd.cnki.net/' in start_url:
+ spn_result = self.save_url_now_v2(start_url, force_get=force_get, capture_outlinks=1)
+ else:
+ spn_result = self.save_url_now_v2(start_url, force_get=force_get)
if not spn_result.success:
status = spn_result.status