aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2019-07-30 21:52:56 -0700
committerBryan Newbold <bnewbold@robocracy.org>2019-07-30 21:52:56 -0700
commit5cb3aae1cc91176c94d0256e3095860c6ba9cffe (patch)
treefa1db43e4a2c07d2162432e699ce4e430bb62c6c
parent9f6da8e7e8ecd64491c2f2f5f72d99c50b3ee20e (diff)
downloadchocula-5cb3aae1cc91176c94d0256e3095860c6ba9cffe.tar.gz
chocula-5cb3aae1cc91176c94d0256e3095860c6ba9cffe.zip
import vanilla ISSN url checker script
-rwxr-xr-xcheck_issn_urls.py52
1 files changed, 52 insertions, 0 deletions
diff --git a/check_issn_urls.py b/check_issn_urls.py
new file mode 100755
index 0000000..009e18b
--- /dev/null
+++ b/check_issn_urls.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+"""
+Takes a tsv filepath (see extract_issn_urls.py) or lines on stdin and dumps to
+stdout.
+
+The stdin thing means you can:
+
+ parallel --bar --pipepart -a road_oa_issn_urls.tsv ./check_issn_urls.py > url_status.tsv
+
+For each URL, do a request and record:
+
+ ISSN, URL (or SURT?), HTTP first status, HTTP final status, final URL
+
+HTTP status will be -1 if the request fails (e.g. the domain does not even resolve).
+
+"local HTTP status" is the HTTP status code modulo same-domain redirects. This
+is intended to accommodate HTTPS upgrades, changes in web app URL schemes, etc.
+Will be the same as HTTP status if redirect is non-local.
+
+TODO: detect domain squatting/spam?
+"""
+
+import os
+import sys
+import requests
+
+def check_url(url):
+ #print("Fetching: %s" % url)
+ try:
+ resp = requests.get(url)
+ except:
+ return (url, "-1", "-1", '-')
+
+ if len(resp.history) > 0:
+ first_status = resp.history[0].status_code
+ else:
+ first_status = resp.status_code
+ return map(str, (url, first_status, resp.status_code, resp.url))
+
+def run(tsvfile):
+ for line in tsvfile:
+ records = line.split('\t')
+ issnl = records[0]
+ url = records[1].strip()
+ print(issnl + '\t' + '\t'.join(check_url(url)))
+
+if __name__=="__main__":
+ if len(sys.argv) != 2:
+ f = sys.stdin
+ else:
+ f = open(sys.argv[1], 'r')
+ run(f)