diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-02-02 21:59:14 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-02-02 21:59:16 -0800 |
commit | 2a5f6a25123d6b725327de844da38df735b04d3f (patch) | |
tree | 27c2869ff522f74eff47f189c97bfcbd97eb9652 | |
parent | 6a29e396da2289e6524097ef906b63358eb3cfec (diff) | |
download | sandcrawler-2a5f6a25123d6b725327de844da38df735b04d3f.tar.gz sandcrawler-2a5f6a25123d6b725327de844da38df735b04d3f.zip |
wayback: try to resolve HTTPException due to many HTTP headers
This is within GWB wayback code. Trying two things:
- bump default max headers from 100 to 1000 in the (global?) http.client
module itself. I didn't think through whether we would expect this to
actually work
- catch the exception, record it, move on
-rw-r--r-- | python/sandcrawler/ia.py | 10 |
1 files changed, 9 insertions, 1 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index af92035..de5654c 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -8,6 +8,11 @@ import requests import datetime from collections import namedtuple +import http.client + +# not sure this will really work. Should go before wayback imports. +http.client._MAXHEADERS = 1000 + import wayback.exception from http.client import IncompleteRead from wayback.resourcestore import ResourceStore @@ -310,7 +315,10 @@ class WaybackClient: # many petabox errors. Do want jobs to fail loud and clear when the # whole cluster is down though. - status_code = gwb_record.get_status()[0] + try: + status_code = gwb_record.get_status()[0] + except http.client.HTTPException: + raise WaybackError("too many HTTP headers (in wayback fetch)") location = gwb_record.get_location() or None if status_code is None and gwb_record.target_uri.startswith(b"ftp://"): |