From 2a5f6a25123d6b725327de844da38df735b04d3f Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Sun, 2 Feb 2020 21:59:14 -0800
Subject: wayback: try to resolve HTTPException due to many HTTP headers

This is within GWB wayback code. Trying two things:

- bump default max headers from 100 to 1000 in the (global?) http.client
module itself. I didn't think through whether we would expect this to
actually work
- catch the exception, record it, move on
---
 python/sandcrawler/ia.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index af92035..de5654c 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -8,6 +8,11 @@ import requests
 import datetime
 from collections import namedtuple
 
+import http.client
+
+# not sure this will really work. Should go before wayback imports.
+http.client._MAXHEADERS = 1000
+
 import wayback.exception
 from http.client import IncompleteRead
 from wayback.resourcestore import ResourceStore
@@ -310,7 +315,10 @@ class WaybackClient:
         # many petabox errors. Do want jobs to fail loud and clear when the
         # whole cluster is down though.
 
-        status_code = gwb_record.get_status()[0]
+        try:
+            status_code = gwb_record.get_status()[0]
+        except http.client.HTTPException:
+            raise WaybackError("too many HTTP headers (in wayback fetch)")
         location = gwb_record.get_location() or None
 
         if status_code is None and gwb_record.target_uri.startswith(b"ftp://"):
-- 
cgit v1.2.3