diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-07-31 14:01:31 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-07-31 14:01:31 -0700 |
commit | fa657c6e1046aa8ffc8f6c59eebf9797d914c31a (patch) | |
tree | 981d8f76023bebe25255021e726ce323c3850771 /check_issn_urls.py | |
parent | ed452c219ca29c06426b03691a9369724aa7e251 (diff) | |
download | chocula-fa657c6e1046aa8ffc8f6c59eebf9797d914c31a.tar.gz chocula-fa657c6e1046aa8ffc8f6c59eebf9797d914c31a.zip |
more issn URL checker fixes
Diffstat (limited to 'check_issn_urls.py')
-rwxr-xr-x | check_issn_urls.py | 31 |
1 files changed, 23 insertions, 8 deletions
diff --git a/check_issn_urls.py b/check_issn_urls.py
index 8c99997..399fd93 100755
--- a/check_issn_urls.py
+++ b/check_issn_urls.py
@@ -2,10 +2,12 @@
 """
 Check journal homepage status (live web and wayback)
 
 Takes a tsv filepath or lines on stdin and dumps to stdout. The stdin thing
 means you can:
 
-    parallel -j100 --bar --pipepart -a urls_to_crawl.tsv ./check_issn_urls.py > url_status.json
-
+    # be sure to randomize order if you are going to use high parallelism so no
+    # individual domain gets swamped. also remember this hits CDX API multiple
+    # times.
+    parallel -j10 --bar --pipepart -a urls_to_crawl.shuf.tsv ./check_issn_urls.py > url_status.json
 
 Input columns (no header):
@@ -31,6 +33,7 @@ HTTP status will be -1 if domain does not even resolve.
 import os
 import sys
 import json
+import time
 
 import requests
 
@@ -79,12 +82,20 @@ def sniff_blocked(resp):
 def check_gwb(url, match_type='exact'):
     if '//web.archive.org/' in url:
         return None
-    resp = requests.get('https://web.archive.org/cdx/search/cdx', params={
-        'url': url,
-        'matchType': match_type,
-        'limit': -1,
-        'filter': 'statuscode:200'
-    })
+    # crude/bad retry loop to work around CDX API throttling
+    for i in range(5):
+        resp = requests.get('https://web.archive.org/cdx/search/cdx', params={
+            'url': url,
+            'matchType': match_type,
+            'limit': -1,
+            'filter': 'statuscode:200'
+        })
+        if resp.status_code == 200:
+            break
+        time.sleep(5)
+    if not resp.status_code == 200:
+        sys.stderr.write("CDX ERR {}: {}".format(resp.status_code, url))
+        return 'error'
     line = resp.text.strip().split('\n')[0]
     if line:
         dt = line.split()[1]
@@ -119,6 +130,10 @@ def check_url(issnl, url):
         info['error'] = 'ChunkedEncodingError'
         info['terminal_status_code'] = info['status_code'] = -1
         return info
+    except requests.exceptions.ContentDecodingError:
+        info['error'] = 'ContentDecodingError'
+        info['terminal_status_code'] = info['status_code'] = -1
+        return info
 
     if resp.history:
         info['status_code'] = resp.history[0].status_code