From 2a5f6a25123d6b725327de844da38df735b04d3f Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sun, 2 Feb 2020 21:59:14 -0800 Subject: wayback: try to resolve HTTPException due to many HTTP headers This is within GWB wayback code. Trying two things: - bump default max headers from 100 to 1000 in the (global?) http.client module itself. I didn't think through whether we would expect this to actually work - catch the exception, record it, move on --- python/sandcrawler/ia.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index af92035..de5654c 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -8,6 +8,11 @@ import requests import datetime from collections import namedtuple +import http.client + +# not sure this will really work. Should go before wayback imports. +http.client._MAXHEADERS = 1000 + import wayback.exception from http.client import IncompleteRead from wayback.resourcestore import ResourceStore @@ -310,7 +315,10 @@ class WaybackClient: # many petabox errors. Do want jobs to fail loud and clear when the # whole cluster is down though. - status_code = gwb_record.get_status()[0] + try: + status_code = gwb_record.get_status()[0] + except http.client.HTTPException: + raise WaybackError("too many HTTP headers (in wayback fetch)") location = gwb_record.get_location() or None if status_code is None and gwb_record.target_uri.startswith(b"ftp://"): -- cgit v1.2.3