diff options
-rwxr-xr-x | extra/journal_metadata/check_issn_urls.py | 52 |
1 files changed, 52 insertions, 0 deletions
#!/usr/bin/env python3
"""
Takes a tsv filepath (see extract_issn_urls.py) or lines on stdin and dumps to
stdout.

The stdin thing means you can:

    parallel --bar --pipepart -a road_oa_issn_urls.tsv ./check_issn_urls.py > url_status.tsv

For each URL, do a request and record:

    ISSN, URL (or SURT?), HTTP first status, HTTP final status, final URL

HTTP status will be -1 if domain does not even resolve.

"local HTTP status" is the HTTP status code modulo same-domain redirects. This
is intended to accomodate HTTPS upgrades, changes in web app URL schemes, etc.
Will be the same as HTTP status if redirect is non-local.

TODO: detect domain squating/spam?
"""

import sys

import requests

# Per-request timeout (seconds). Without one, a single dead host would hang
# this worker indefinitely when run in bulk under GNU parallel.
DEFAULT_TIMEOUT = 30.0


def check_url(url, timeout=DEFAULT_TIMEOUT):
    """Fetch *url* (following redirects) and summarize the outcome.

    Returns a 4-tuple of strings:
        (url, first_status, final_status, final_url)

    where first_status is the status code of the first response (the initial
    redirect, if any — resp.history holds intermediate responses oldest
    first), final_status is the code after all redirects, and final_url is
    the URL ultimately landed on. On any request failure (DNS failure,
    refused connection, timeout, ...), statuses are "-1" and final_url "-".
    """
    try:
        resp = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Narrow catch: a bare `except:` would also swallow
        # KeyboardInterrupt, making long runs impossible to cancel.
        return (url, "-1", "-1", "-")

    if resp.history:
        first_status = resp.history[0].status_code
    else:
        first_status = resp.status_code
    # Return a concrete tuple of strings (the original returned a lazy
    # `map` object on success but a tuple on failure).
    return (url, str(first_status), str(resp.status_code), resp.url)


def run(tsvfile):
    """Read `ISSN-L<TAB>URL` lines from *tsvfile*; print one TSV result row
    per line to stdout: ISSN-L, URL, first status, final status, final URL.

    Lines without at least two tab-separated fields are skipped (the
    original raised IndexError and killed the whole run).
    """
    for line in tsvfile:
        records = line.split('\t')
        if len(records) < 2:
            continue
        issnl = records[0]
        url = records[1].strip()
        print(issnl + '\t' + '\t'.join(check_url(url)))


if __name__ == "__main__":
    # With no filename argument, read from stdin (enables `parallel --pipepart`).
    if len(sys.argv) != 2:
        run(sys.stdin)
    else:
        # Context manager ensures the file is closed (original leaked the handle).
        with open(sys.argv[1], 'r') as f:
            run(f)