wayback: try to resolve HTTPException due to many HTTP headers

This is within the GWB wayback code. Trying two things: (1) bump the default max headers from 100 to 1000 in the (global?) http.client module itself (I didn't think through whether we would expect this to actually work); (2) catch the exception, record it, and move on.
author: Bryan Newbold <bnewbold@archive.org> 2020-02-02 21:59:14 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-02-02 21:59:16 -0800
commit: 2a5f6a25123d6b725327de844da38df735b04d3f (patch)
tree: 27c2869ff522f74eff47f189c97bfcbd97eb9652
parent: 6a29e396da2289e6524097ef906b63358eb3cfec (diff)
download: sandcrawler-2a5f6a25123d6b725327de844da38df735b04d3f.tar.gz
sandcrawler-2a5f6a25123d6b725327de844da38df735b04d3f.zip
1 files changed, 9 insertions, 1 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index af92035..de5654c 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -8,6 +8,11 @@ import requests
 import datetime
 from collections import namedtuple
 
+import http.client
+
+# not sure this will really work. Should go before wayback imports.
+http.client._MAXHEADERS = 1000
+
 import wayback.exception
 from http.client import IncompleteRead
 from wayback.resourcestore import ResourceStore
@@ -310,7 +315,10 @@ class WaybackClient:
         # many petabox errors. Do want jobs to fail loud and clear when the
         # whole cluster is down though.
 
-        status_code = gwb_record.get_status()[0]
+        try:
+            status_code = gwb_record.get_status()[0]
+        except http.client.HTTPException:
+            raise WaybackError("too many HTTP headers (in wayback fetch)")
         location = gwb_record.get_location() or None
 
         if status_code is None and gwb_record.target_uri.startswith(b"ftp://"):
author	Bryan Newbold <bnewbold@archive.org>	2020-02-02 21:59:14 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2020-02-02 21:59:16 -0800
commit	2a5f6a25123d6b725327de844da38df735b04d3f (patch)
tree	27c2869ff522f74eff47f189c97bfcbd97eb9652
parent	6a29e396da2289e6524097ef906b63358eb3cfec (diff)
download	sandcrawler-2a5f6a25123d6b725327de844da38df735b04d3f.tar.gz sandcrawler-2a5f6a25123d6b725327de844da38df735b04d3f.zip