author     Bryan Newbold <bnewbold@robocracy.org>   2019-07-30 23:08:35 -0700
committer  Bryan Newbold <bnewbold@robocracy.org>   2019-07-30 23:08:35 -0700
commit     c3727c86008dc9aa64ab4e61037d27a9be9d3ea6 (patch)
tree       76b330d577350b5eb070c5d47f84c1680e2b807e
parent     8e9dd0046fa5d8a117568d15463ace7363323964 (diff)
download   fatcat-c3727c86008dc9aa64ab4e61037d27a9be9d3ea6.tar.gz
           fatcat-c3727c86008dc9aa64ab4e61037d27a9be9d3ea6.zip
major improvements to ISSN URL checker
-rwxr-xr-x  extra/journal_metadata/check_issn_urls.py  141
1 file changed, 121 insertions(+), 20 deletions(-)
diff --git a/extra/journal_metadata/check_issn_urls.py b/extra/journal_metadata/check_issn_urls.py
index 009e18b6..8c99997e 100755
--- a/extra/journal_metadata/check_issn_urls.py
+++ b/extra/journal_metadata/check_issn_urls.py
@@ -1,48 +1,149 @@
 #!/usr/bin/env python3
 """
-Takes a tsv filepath (see extract_issn_urls.py) or lines on stdin and dumps to
-stdout.
+Check journal homepage status (live web and wayback)
 
-The stdin thing means you can:
-    parallel --bar --pipepart -a road_oa_issn_urls.tsv ./check_issn_urls.py > url_status.tsv
+Takes a tsv filepath or lines on stdin and dumps to stdout. The stdin thing means you can:
 
-For each URL, do a request and record:
+    parallel -j100 --bar --pipepart -a urls_to_crawl.tsv ./check_issn_urls.py > url_status.json
 
-    ISSN, URL (or SURT?), HTTP first status, HTTP final status, final URL
+Input columns (no header):
 
-HTTP status will be -1 if domain does not even resolve.
+    ISSN-L, URL
+
+For each URL, do a request and record, as JSON:
 
-"local HTTP status" is the HTTP status code modulo same-domain redirects. This
-is intended to accomodate HTTPS upgrades, changes in web app URL schemes, etc.
-Will be the same as HTTP status if redirect is non-local.
+    issnl: passthrough
+    url: passthrough
+    status_code: initial HTTP crawl status
+    terminal_url: final URL (or original if no redirects)
+    terminal_status_code: final HTTP status (or initial status if no redirects)
+    terminal_content_type: content type (mimetype)
+    software_platform: slug of hosting platform, if detected
+    issnl_in_body: whether raw issnl appears in body text
+    blocked: whether we think crawler was "blocked"
+    gwb_url_success_dt: latest wayback datetime that an HTTP 200 exists for the URL
+    gwb_terminal_url_success_dt: latest wayback datetime that an HTTP 200 exists for the terminal URL
 
-TODO: detect domain squating/spam?
+HTTP status will be -1 if domain does not even resolve.
 """
 
 import os
 import sys
+import json
 import requests
 
-def check_url(url):
+
+def sniff_platform(resp):
+    """
+    This function would try to figure out what software platform (eg, OJS) the
+    site is running.
+    TODO: unimplemented
+    """
+    # these are mostly here to filter out huge platforms and stop sniffing
+    domain_map = {
+        'jstor.org/': 'jstor',
+        'springer.com/': 'springer',
+        'springerlink.com/': 'springer',
+        'tandfonline.com/': 't_and_f',
+        'elsevier.com/': 'elsevier',
+        'wiley.com/': 'wiley',
+        'sciencedirect.com/': 'elsevier',
+        'sagepub.com/': 'sage',
+        'hypotheses.org/': 'hypothesis',
+        'tandf.co.uk/': 't_and_f',
+        'scielo': 'scielo',
+    }
+    for domain, platform in domain_map.items():
+        if domain in resp.url:
+            return platform
+    if '<meta name="generator" content="Open Journal Systems' in resp.text:
+        return "ojs"
+    return None
+
+def sniff_blocked(resp):
+    """
+    This function would try to figure out if we got blocked: soft-block, hard
+    block, etc.
+    TODO: unimplemented
+    """
+    if resp.status_code in (403, 420):
+        return True
+    # JSTOR does this
+    if 'Our systems have detected unusual traffic activity from your network. Please complete this reCAPTCHA' in resp.text:
+        return True
+    if resp.status_code == 416 and 'something about your browser made us think you were a bot' in resp.text:
+        return True
+    return None
+
+def check_gwb(url, match_type='exact'):
+    if '//web.archive.org/' in url:
+        return None
+    resp = requests.get('https://web.archive.org/cdx/search/cdx', params={
+        'url': url,
+        'matchType': match_type,
+        'limit': -1,
+        'filter': 'statuscode:200'
+    })
+    line = resp.text.strip().split('\n')[0]
+    if line:
+        dt = line.split()[1]
+        int(dt)
+        return dt
+    else:
+        return None
+
+
+def check_url(issnl, url):
     #print("Fetching: %s" % url)
+    info = dict(issnl=issnl, url=url)
     try:
-        resp = requests.get(url)
-    except:
-        return (url, "-1", "-1", '-')
+        resp = requests.get(url, timeout=30., headers={'User-Agent': 'ia_bot/0.0 (python requests) journal-live-check; contact:info@archive.org'})
+    except requests.exceptions.TooManyRedirects:
+        info['error'] = 'TooManyRedirects'
+        info['terminal_status_code'] = info['status_code'] = -1
+        return info
+    except requests.exceptions.SSLError:
+        info['error'] = 'SSLError'
+        info['terminal_status_code'] = info['status_code'] = -1
+        return info
+    except requests.exceptions.ReadTimeout:
+        info['error'] = 'ReadTimeout'
+        info['terminal_status_code'] = info['status_code'] = -1
+        return info
+    except requests.exceptions.ConnectionError:
+        info['error'] = 'ConnectionError'
+        info['terminal_status_code'] = info['status_code'] = -1
+        return info
+    except requests.exceptions.ChunkedEncodingError:
+        info['error'] = 'ChunkedEncodingError'
+        info['terminal_status_code'] = info['status_code'] = -1
+        return info
 
-    if len(resp.history) > 0:
-        first_status = resp.history[0].status_code
+    if resp.history:
+        info['status_code'] = resp.history[0].status_code
     else:
-        first_status = resp.status_code
-    return map(str, (url, first_status, resp.status_code, resp.url))
+        info['status_code'] = resp.status_code
+
+    info['terminal_status_code'] = resp.status_code
+    info['terminal_url'] = resp.url
+    content_type = resp.headers.get('Content-Type')
+    if content_type:
+        info['terminal_content_type'] = content_type.split(';')[0]
+    info['issnl_in_body'] = bool(issnl in resp.text)
+    info['gwb_url_success_dt'] = check_gwb(url, match_type='exact')
+    info['gwb_terminal_url_success_dt'] = check_gwb(info['terminal_url'], match_type='exact')
+    info['blocked'] = sniff_blocked(resp)
+    info['software_platform'] = sniff_platform(resp)
+    #info['gwb_host_success_dt'] = check_gwb(url, match_type='host')
+    return info
 
 def run(tsvfile):
     for line in tsvfile:
         records = line.split('\t')
         issnl = records[0]
         url = records[1].strip()
-        print(json.dumps(check_url(issnl, url)))
 
 if __name__=="__main__":
     if len(sys.argv) != 2:
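Since each output line is a standalone JSON object, the results are straightforward to post-process. The following is a rough sketch, not part of this commit: it assumes a url_status.json file produced by the parallel invocation shown in the new docstring, and uses only the field names documented there.

#!/usr/bin/env python3
# Sketch: tally the JSON-lines output of check_issn_urls.py.
# Assumes url_status.json exists (the docstring's example output file);
# field names (status_code, blocked, gwb_url_success_dt) are from the patch.
import json
from collections import Counter

status_counts = Counter()
blocked_count = 0
archived_but_dead = 0

with open('url_status.json') as f:
    for line in f:
        record = json.loads(line)
        status_counts[record.get('status_code')] += 1
        if record.get('blocked'):
            blocked_count += 1
        # live fetch failed, but an HTTP 200 wayback capture exists
        if record.get('status_code') != 200 and record.get('gwb_url_success_dt'):
            archived_but_dead += 1

print('status code counts:', dict(status_counts))
print('blocked (guess):', blocked_count)
print('dead on live web but archived:', archived_but_dead)

For reference on the wayback half of the check: check_gwb() relies on the CDX server's default space-separated row format (urlkey, timestamp, original, mimetype, statuscode, digest, length), which is why line.split()[1] yields the 14-digit capture datetime; per the CDX API, a negative limit (-1) requests the most recent matching capture rather than the oldest.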