more ISSN URL checker fixes

author: Bryan Newbold <bnewbold@robocracy.org> 2019-07-31 14:01:31 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-07-31 14:01:31 -0700
commit: 13906e817033583757ab3a08eee7b5cebf327da8 (patch)
tree: 77027aabb689099dc0880014fd08747c12fb54ba
parent: c3727c86008dc9aa64ab4e61037d27a9be9d3ea6 (diff)
download: fatcat-13906e817033583757ab3a08eee7b5cebf327da8.tar.gz
fatcat-13906e817033583757ab3a08eee7b5cebf327da8.zip
2 files changed, 27 insertions, 11 deletions
diff --git a/extra/journal_metadata/check_issn_urls.py b/extra/journal_metadata/check_issn_urls.py
index 8c99997e..399fd93f 100755
--- a/extra/journal_metadata/check_issn_urls.py
+++ b/extra/journal_metadata/check_issn_urls.py
@@ -2,10 +2,12 @@
 """
 Check journal homepage status (live web and wayback)
 
-
 Takes a tsv filepath or lines on stdin and dumps to stdout. The stdin thing means you can:
 
-    parallel -j100 --bar --pipepart -a urls_to_crawl.tsv ./check_issn_urls.py > url_status.json
+    # Be sure to randomize the input order if you are going to use high
+    # parallelism, so that no individual domain gets swamped. Also remember
+    # that this hits the CDX API multiple times.
+    parallel -j10 --bar --pipepart -a urls_to_crawl.shuf.tsv ./check_issn_urls.py > url_status.json
 
 Input columns (no header):
 
@@ -31,6 +33,7 @@ HTTP status will be -1 if domain does not even resolve.
 import os
 import sys
 import json
+import time
 import requests
 
 
@@ -79,12 +82,20 @@ def sniff_blocked(resp):
 def check_gwb(url, match_type='exact'):
     if '//web.archive.org/' in url:
         return None
-    resp = requests.get('https://web.archive.org/cdx/search/cdx', params={
-        'url': url,
-        'matchType': match_type,
-        'limit': -1,
-        'filter': 'statuscode:200'
-    })
+    # crude/bad retry loop to work around CDX API throttling
+    for i in range(5):
+        resp = requests.get('https://web.archive.org/cdx/search/cdx', params={
+            'url': url,
+            'matchType': match_type,
+            'limit': -1,
+            'filter': 'statuscode:200'
+        })
+        if resp.status_code == 200:
+            break
+        time.sleep(5)
+    if not resp.status_code == 200:
+        sys.stderr.write("CDX ERR {}: {}".format(resp.status_code, url))
+        return 'error'
     line = resp.text.strip().split('\n')[0]
     if line:
         dt = line.split()[1]
@@ -119,6 +130,10 @@ def check_url(issnl, url):
         info['error'] = 'ChunkedEncodingError'
         info['terminal_status_code'] = info['status_code'] = -1
         return info
+    except requests.exceptions.ContentDecodingError:
+        info['error'] = 'ContentDecodingError'
+        info['terminal_status_code'] = info['status_code'] = -1
+        return info
 
     if resp.history:
         info['status_code'] = resp.history[0].status_code
diff --git a/extra/journal_metadata/chocula_schema.sql b/extra/journal_metadata/chocula_schema.sql
index 99462794..46b282d0 100644
--- a/extra/journal_metadata/chocula_schema.sql
+++ b/extra/journal_metadata/chocula_schema.sql
@@ -74,13 +74,14 @@ CREATE TABLE IF NOT EXISTS homepage
      domain TEXT NOT NULL,
      suffix TEXT NOT NULL,
      status_code INTEGER,
+     crawl_error TEXT,
      terminal_url TEXT,
      terminal_status_code INTEGER,
      platform_software TEXT,
-     scope TEXT,
-     has_issn BOOLEAN,
+     issnl_in_body BOOLEAN,
      blocked BOOLEAN,
-     latest_gwb_success TEXT,
+     gwb_url_success_dt TEXT,
+     gwb_terminal_url_success_dt TEXT,
      UNIQUE(issnl, surt)
     );
 CREATE INDEX IF NOT EXISTS homepage_url_idx ON homepage(url);
author	Bryan Newbold <bnewbold@robocracy.org>	2019-07-31 14:01:31 -0700
committer	Bryan Newbold <bnewbold@robocracy.org>	2019-07-31 14:01:31 -0700
commit	13906e817033583757ab3a08eee7b5cebf327da8 (patch)
tree	77027aabb689099dc0880014fd08747c12fb54ba
parent	c3727c86008dc9aa64ab4e61037d27a9be9d3ea6 (diff)
download	fatcat-13906e817033583757ab3a08eee7b5cebf327da8.tar.gz fatcat-13906e817033583757ab3a08eee7b5cebf327da8.zip