diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-07-30 21:52:56 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-07-30 21:52:56 -0700 |
commit | 8e9dd0046fa5d8a117568d15463ace7363323964 (patch) | |
tree | 419ffb981f72b7c943b5ba0d21514270e7eeb785 | |
parent | 615c81605190499db2fa98cb85610197d3ce5507 (diff) | |
download | fatcat-8e9dd0046fa5d8a117568d15463ace7363323964.tar.gz fatcat-8e9dd0046fa5d8a117568d15463ace7363323964.zip |
import vanilla ISSN url checker script
-rwxr-xr-x | extra/journal_metadata/check_issn_urls.py | 52 |
1 files changed, 52 insertions, 0 deletions
#!/usr/bin/env python3
"""
Takes a tsv filepath (see extract_issn_urls.py) or lines on stdin and dumps to
stdout.

The stdin thing means you can:

    parallel --bar --pipepart -a road_oa_issn_urls.tsv ./check_issn_urls.py > url_status.tsv

For each URL, do a request and record:

    ISSN, URL (or SURT?), HTTP first status, HTTP final status, final URL

Both HTTP status columns will be -1 (and the final URL "-") if the request
fails for any reason: the domain does not resolve, the connection is refused,
the request times out, the URL is malformed, etc.

"local HTTP status" is the HTTP status code modulo same-domain redirects. This
is intended to accommodate HTTPS upgrades, changes in web app URL schemes, etc.
Will be the same as HTTP status if redirect is non-local. (Not computed by
this script yet; only the first and final statuses are recorded.)

TODO: detect domain squatting/spam?
"""

import os
import sys
import requests


def check_url(url, timeout=30.0):
    """Fetch `url` and summarize the outcome as a tuple of strings.

    Args:
        url: the URL to request with HTTP GET (redirects are followed).
        timeout: seconds to wait for the server to connect/respond before
            giving up; passed straight through to `requests.get`. Without a
            timeout a single unresponsive host would stall the whole run.

    Returns:
        A 4-tuple of strings: (url, first status, final status, final URL).
        "First status" is the status of the first response in the redirect
        chain, or the final status if there were no redirects. If the request
        fails, returns (url, "-1", "-1", "-").
    """
    #print("Fetching: %s" % url)
    try:
        resp = requests.get(url, timeout=timeout)
    except Exception:
        # Deliberately broad: this is a best-effort crawl and any failure
        # (DNS, TLS, timeout, malformed URL, ...) is recorded as "-1" rather
        # than aborting the batch. Unlike a bare `except:`, this still lets
        # KeyboardInterrupt and SystemExit stop the script.
        return (url, "-1", "-1", "-")

    # resp.history holds the intermediate redirect responses, oldest first.
    if resp.history:
        first_status = resp.history[0].status_code
    else:
        first_status = resp.status_code
    # Always hand back a real tuple (not a one-shot map iterator) so both
    # return paths have the same type.
    return tuple(str(field) for field in
                 (url, first_status, resp.status_code, resp.url))


def run(tsvfile):
    """Check every URL listed in `tsvfile` and print one TSV row per URL.

    Args:
        tsvfile: an iterable of text lines, each with at least two
            tab-separated columns: the ISSN-L and the URL. Extra columns are
            ignored.

    Output rows (on stdout) are: ISSN-L, URL, first status, final status,
    final URL. Blank lines are skipped silently; lines without a URL column
    are skipped with a warning on stderr so stdout stays valid TSV.
    """
    for line in tsvfile:
        if not line.strip():
            continue
        records = line.split('\t')
        if len(records) < 2:
            print("skipping malformed line: %r" % line, file=sys.stderr)
            continue
        issnl = records[0]
        # The URL may be the last column, so strip the trailing newline.
        url = records[1].strip()
        print(issnl + '\t' + '\t'.join(check_url(url)))


if __name__ == "__main__":
    # Exactly one argument means "read this TSV file"; anything else falls
    # back to stdin (which is what `parallel --pipepart` feeds us).
    if len(sys.argv) != 2:
        run(sys.stdin)
    else:
        with open(sys.argv[1], 'r') as f:
            run(f)