From 55ef20102eaf8123dfc41e1d7ae80c50607c99f4 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 9 Jan 2020 12:19:44 -0800
Subject: fix http/https issue with GlobalWayback library

---
 python/sandcrawler/ia.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 74cd978..198c8aa 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -206,7 +206,8 @@ class WaybackClient:
         else:
             self.cdx_client = CdxApiClient()
         # /serve/ instead of /download/ doesn't record view count
-        self.petabox_base_url = kwargs.get('petabox_base_url', 'https://archive.org/serve/')
+        # this *does* want to be http://, not https://
+        self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
         # gwb library will fall back to reading from /opt/.petabox/webdata.secret
         self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET'))
         self.warc_uri_prefix = kwargs.get('warc_uri_prefix', 'https://archive.org/serve/')
-- 
cgit v1.2.3