aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-02-02 21:59:14 -0800
committerBryan Newbold <bnewbold@archive.org>2020-02-02 21:59:16 -0800
commit2a5f6a25123d6b725327de844da38df735b04d3f (patch)
tree27c2869ff522f74eff47f189c97bfcbd97eb9652
parent6a29e396da2289e6524097ef906b63358eb3cfec (diff)
downloadsandcrawler-2a5f6a25123d6b725327de844da38df735b04d3f.tar.gz
sandcrawler-2a5f6a25123d6b725327de844da38df735b04d3f.zip
wayback: try to resolve HTTPException due to many HTTP headers
This is within GWB wayback code. Trying two things: - bump default max headers from 100 to 1000 in the (global?) http.client module itself. I didn't think through whether we would expect this to actually work - catch the exception, record it, move on
-rw-r--r--python/sandcrawler/ia.py10
1 files changed, 9 insertions, 1 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index af92035..de5654c 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -8,6 +8,11 @@ import requests
import datetime
from collections import namedtuple
+import http.client
+
+# not sure this will really work. Should go before wayback imports.
+http.client._MAXHEADERS = 1000
+
import wayback.exception
from http.client import IncompleteRead
from wayback.resourcestore import ResourceStore
@@ -310,7 +315,10 @@ class WaybackClient:
# many petabox errors. Do want jobs to fail loud and clear when the
# whole cluster is down though.
- status_code = gwb_record.get_status()[0]
+ try:
+ status_code = gwb_record.get_status()[0]
+ except http.client.HTTPException:
+ raise WaybackError("too many HTTP headers (in wayback fetch)")
location = gwb_record.get_location() or None
if status_code is None and gwb_record.target_uri.startswith(b"ftp://"):