From 13906e817033583757ab3a08eee7b5cebf327da8 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 31 Jul 2019 14:01:31 -0700 Subject: more issn URL checker fixes --- extra/journal_metadata/check_issn_urls.py | 31 +++++++++++++++++++++++-------- extra/journal_metadata/chocula_schema.sql | 7 ++++--- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/extra/journal_metadata/check_issn_urls.py b/extra/journal_metadata/check_issn_urls.py index 8c99997e..399fd93f 100755 --- a/extra/journal_metadata/check_issn_urls.py +++ b/extra/journal_metadata/check_issn_urls.py @@ -2,10 +2,12 @@ """ Check journal homepage status (live web and wayback) - Takes a tsv filepath or lines on stdin and dumps to stdout. The stdin thing means you can: - parallel -j100 --bar --pipepart -a urls_to_crawl.tsv ./check_issn_urls.py > url_status.json + # be sure to randomize order if you are going to use high parallelism so no + # individual domain gets swamped. also remember this hits CDX API multiple + # times. + parallel -j10 --bar --pipepart -a urls_to_crawl.shuf.tsv ./check_issn_urls.py > url_status.json Input columns (no header): @@ -31,6 +33,7 @@ HTTP status will be -1 if domain does not even resolve. import os import sys import json +import time import requests @@ -79,12 +82,20 @@ def sniff_blocked(resp): def check_gwb(url, match_type='exact'): if '//web.archive.org/' in url: return None - resp = requests.get('https://web.archive.org/cdx/search/cdx', params={ - 'url': url, - 'matchType': match_type, - 'limit': -1, - 'filter': 'statuscode:200' - }) + # crude/bad retry loop to work around CDX API throttling + for i in range(5): + resp = requests.get('https://web.archive.org/cdx/search/cdx', params={ + 'url': url, + 'matchType': match_type, + 'limit': -1, + 'filter': 'statuscode:200' + }) + if resp.status_code == 200: + break + time.sleep(5) + if not resp.status_code == 200: + sys.stderr.write("CDX ERR {}: {}".format(resp.status_code, url)) + return 'error' line = resp.text.strip().split('\n')[0] if line: dt = line.split()[1] @@ -119,6 +130,10 @@ def check_url(issnl, url): info['error'] = 'ChunkedEncodingError' info['terminal_status_code'] = info['status_code'] = -1 return info + except requests.exceptions.ContentDecodingError: + info['error'] = 'ContentDecodingError' + info['terminal_status_code'] = info['status_code'] = -1 + return info if resp.history: info['status_code'] = resp.history[0].status_code diff --git a/extra/journal_metadata/chocula_schema.sql b/extra/journal_metadata/chocula_schema.sql index 99462794..46b282d0 100644 --- a/extra/journal_metadata/chocula_schema.sql +++ b/extra/journal_metadata/chocula_schema.sql @@ -74,13 +74,14 @@ CREATE TABLE IF NOT EXISTS homepage domain TEXT NOT NULL, suffix TEXT NOT NULL, status_code INTEGER, + crawl_error TEXT, terminal_url TEXT, terminal_status_code INTEGER, platform_software TEXT, - scope TEXT, - has_issn BOOLEAN, + issnl_in_body BOOLEAN, blocked BOOLEAN, - latest_gwb_success TEXT, + gwb_url_success_dt TEXT, + gwb_terminal_url_success_dt TEXT, UNIQUE(issnl, surt) ); CREATE INDEX IF NOT EXISTS homepage_url_idx ON homepage(url); -- cgit v1.2.3