diff options
| -rwxr-xr-x | extra/journal_metadata/check_issn_urls.py | 52 | 
1 files changed, 52 insertions, 0 deletions
#!/usr/bin/env python3
"""
Check the HTTP status of URLs associated with ISSNs.

Takes a tsv filepath (see extract_issn_urls.py) or lines on stdin and dumps to
stdout.

The stdin thing means you can:

    parallel --bar --pipepart -a road_oa_issn_urls.tsv ./check_issn_urls.py > url_status.tsv

For each URL, do a request and record (tab-separated):

    ISSN, URL (or SURT?), HTTP first status, HTTP final status, final URL

Both HTTP status columns will be -1 (and the final URL '-') if the request
fails entirely, e.g. the domain does not even resolve or the server does not
answer within the timeout.

"local HTTP status" is the HTTP status code modulo same-domain redirects. This
is intended to accommodate HTTPS upgrades, changes in web app URL schemes, etc.
Will be the same as HTTP status if redirect is non-local.
NOTE: this script does not compute a "local HTTP status" column yet.

TODO: detect domain squatting/spam?
"""

import os
import sys

import requests

# Seconds handed to requests.get() as its timeout. Without a timeout a single
# unresponsive server would block the whole batch indefinitely.
DEFAULT_TIMEOUT = 30.0


def check_url(url, timeout=DEFAULT_TIMEOUT):
    """Fetch *url* and summarize the outcome as a tuple of four strings.

    Args:
        url: URL to request with HTTP GET (redirects are followed by requests).
        timeout: value passed through to ``requests.get(timeout=...)``.

    Returns:
        ``(url, first_status, final_status, final_url)``, all ``str``.
        ``first_status`` is the status of the first response in the redirect
        chain, or the final status when there was no redirect. If the request
        fails for any reason the result is ``(url, "-1", "-1", "-")``.
    """
    try:
        resp = requests.get(url, timeout=timeout)
    except Exception:
        # Deliberately best-effort: any failure (DNS, TLS, timeout, malformed
        # URL, ...) is recorded as "-1" so one bad URL never aborts the batch.
        # Unlike a bare ``except:``, this lets KeyboardInterrupt/SystemExit
        # through so the crawl can still be stopped.
        return (url, "-1", "-1", "-")

    # resp.history holds the intermediate (redirect) responses, oldest first.
    if resp.history:
        first_status = resp.history[0].status_code
    else:
        first_status = resp.status_code
    # Always hand back a real tuple, matching the failure path above.
    return tuple(
        str(field)
        for field in (url, first_status, resp.status_code, resp.url)
    )


def run(tsvfile):
    """Check every URL listed in *tsvfile* and print one TSV row per line.

    Args:
        tsvfile: iterable of text lines; the first tab-separated column is the
            ISSN-L, the second is the URL. Any further columns are ignored.

    Blank lines are skipped silently; lines without a second column are
    reported on stderr and skipped instead of raising IndexError.
    """
    for line in tsvfile:
        if not line.strip():
            continue
        records = line.split('\t')
        if len(records) < 2:
            print("skipping malformed line: %r" % line, file=sys.stderr)
            continue
        issnl = records[0]
        url = records[1].strip()
        print(issnl + '\t' + '\t'.join(check_url(url)))


def main():
    """Read from the file named by the sole CLI argument, else from stdin."""
    if len(sys.argv) != 2:
        run(sys.stdin)
    else:
        # Context manager so the input file is closed even if run() raises.
        with open(sys.argv[1], 'r') as f:
            run(f)


if __name__ == "__main__":
    main()
