aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2019-07-31 14:01:31 -0700
committerBryan Newbold <bnewbold@robocracy.org>2019-07-31 14:01:31 -0700
commit13906e817033583757ab3a08eee7b5cebf327da8 (patch)
tree77027aabb689099dc0880014fd08747c12fb54ba
parentc3727c86008dc9aa64ab4e61037d27a9be9d3ea6 (diff)
downloadfatcat-13906e817033583757ab3a08eee7b5cebf327da8.tar.gz
fatcat-13906e817033583757ab3a08eee7b5cebf327da8.zip
more issn URL checker fixes
-rwxr-xr-xextra/journal_metadata/check_issn_urls.py31
-rw-r--r--extra/journal_metadata/chocula_schema.sql7
2 files changed, 27 insertions, 11 deletions
diff --git a/extra/journal_metadata/check_issn_urls.py b/extra/journal_metadata/check_issn_urls.py
index 8c99997e..399fd93f 100755
--- a/extra/journal_metadata/check_issn_urls.py
+++ b/extra/journal_metadata/check_issn_urls.py
@@ -2,10 +2,12 @@
"""
Check journal homepage status (live web and wayback)
-
Takes a tsv filepath or lines on stdin and dumps to stdout. The stdin thing means you can:
- parallel -j100 --bar --pipepart -a urls_to_crawl.tsv ./check_issn_urls.py > url_status.json
+ # Be sure to randomize the input order if you are going to use high
+ # parallelism, so that no individual domain gets swamped. Also remember
+ # that this script hits the CDX API multiple times.
+ parallel -j10 --bar --pipepart -a urls_to_crawl.shuf.tsv ./check_issn_urls.py > url_status.json
Input columns (no header):
@@ -31,6 +33,7 @@ HTTP status will be -1 if domain does not even resolve.
import os
import sys
import json
+import time
import requests
@@ -79,12 +82,20 @@ def sniff_blocked(resp):
def check_gwb(url, match_type='exact'):
if '//web.archive.org/' in url:
return None
- resp = requests.get('https://web.archive.org/cdx/search/cdx', params={
- 'url': url,
- 'matchType': match_type,
- 'limit': -1,
- 'filter': 'statuscode:200'
- })
+ # crude/bad retry loop to work around CDX API throttling
+ for i in range(5):
+ resp = requests.get('https://web.archive.org/cdx/search/cdx', params={
+ 'url': url,
+ 'matchType': match_type,
+ 'limit': -1,
+ 'filter': 'statuscode:200'
+ })
+ if resp.status_code == 200:
+ break
+ time.sleep(5)
+ if not resp.status_code == 200:
+ sys.stderr.write("CDX ERR {}: {}".format(resp.status_code, url))
+ return 'error'
line = resp.text.strip().split('\n')[0]
if line:
dt = line.split()[1]
@@ -119,6 +130,10 @@ def check_url(issnl, url):
info['error'] = 'ChunkedEncodingError'
info['terminal_status_code'] = info['status_code'] = -1
return info
+ except requests.exceptions.ContentDecodingError:
+ info['error'] = 'ContentDecodingError'
+ info['terminal_status_code'] = info['status_code'] = -1
+ return info
if resp.history:
info['status_code'] = resp.history[0].status_code
diff --git a/extra/journal_metadata/chocula_schema.sql b/extra/journal_metadata/chocula_schema.sql
index 99462794..46b282d0 100644
--- a/extra/journal_metadata/chocula_schema.sql
+++ b/extra/journal_metadata/chocula_schema.sql
@@ -74,13 +74,14 @@ CREATE TABLE IF NOT EXISTS homepage
domain TEXT NOT NULL,
suffix TEXT NOT NULL,
status_code INTEGER,
+ crawl_error TEXT,
terminal_url TEXT,
terminal_status_code INTEGER,
platform_software TEXT,
- scope TEXT,
- has_issn BOOLEAN,
+ issnl_in_body BOOLEAN,
blocked BOOLEAN,
- latest_gwb_success TEXT,
+ gwb_url_success_dt TEXT,
+ gwb_terminal_url_success_dt TEXT,
UNIQUE(issnl, surt)
);
CREATE INDEX IF NOT EXISTS homepage_url_idx ON homepage(url);