aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2019-07-30 21:52:56 -0700
committerBryan Newbold <bnewbold@robocracy.org>2019-07-30 21:52:56 -0700
commit8e9dd0046fa5d8a117568d15463ace7363323964 (patch)
tree419ffb981f72b7c943b5ba0d21514270e7eeb785
parent615c81605190499db2fa98cb85610197d3ce5507 (diff)
downloadfatcat-8e9dd0046fa5d8a117568d15463ace7363323964.tar.gz
fatcat-8e9dd0046fa5d8a117568d15463ace7363323964.zip
import vanilla ISSN url checker script
-rwxr-xr-xextra/journal_metadata/check_issn_urls.py52
1 files changed, 52 insertions, 0 deletions
diff --git a/extra/journal_metadata/check_issn_urls.py b/extra/journal_metadata/check_issn_urls.py
new file mode 100755
index 00000000..009e18b6
--- /dev/null
+++ b/extra/journal_metadata/check_issn_urls.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+"""
+Takes a tsv filepath (see extract_issn_urls.py) or lines on stdin and dumps to
+stdout.
+
+The stdin thing means you can:
+
+ parallel --bar --pipepart -a road_oa_issn_urls.tsv ./check_issn_urls.py > url_status.tsv
+
+For each URL, do a request and record:
+
+ ISSN, URL (or SURT?), HTTP first status, HTTP final status, final URL
+
+HTTP status will be -1 if the request fails outright (for example, if the
+domain does not even resolve).
+
+"local HTTP status" is the HTTP status code modulo same-domain redirects. This
+is intended to accommodate HTTPS upgrades, changes in web app URL schemes, etc.
+Will be the same as HTTP status if redirect is non-local.
+
+TODO: detect domain squatting/spam?
+"""
+
+import os
+import sys
+import requests
+
def check_url(url, timeout=30):
    """
    Fetch `url` with an HTTP GET and summarize the outcome.

    Args:
        url: the URL to request.
        timeout: seconds passed to requests.get() as its timeout. Without
            one, a single unresponsive server would stall the whole batch.

    Returns:
        A 4-tuple of strings: (url, first status, final status, final URL).
        "First status" is the status code of the first response in the
        redirect chain; it equals the final status when no redirect
        happened. If the request fails for any reason, both statuses are
        "-1" and the final URL is "-".
    """
    try:
        resp = requests.get(url, timeout=timeout)
    except Exception:
        # Deliberately broad: this is a best-effort checker, and any failure
        # to fetch (DNS, connection, timeout, malformed URL, ...) is recorded
        # as "-1". Unlike a bare `except:`, this still lets KeyboardInterrupt
        # and SystemExit propagate so the script can be stopped.
        return (url, "-1", "-1", "-")

    # resp.history holds the intermediate (redirect) responses, oldest first.
    if resp.history:
        first_status = resp.history[0].status_code
    else:
        first_status = resp.status_code
    # Return a real tuple (not a lazy map object) so both code paths hand
    # back the same type.
    return tuple(str(field) for field in (url, first_status, resp.status_code, resp.url))
+
def run(tsvfile):
    """
    Check the URL on every line of `tsvfile` and print one result row each.

    Args:
        tsvfile: an iterable of text lines (an open file, sys.stdin, or a
            list of strings). Each line is tab-separated, with the ISSN-L in
            the first column and the URL in the second; any further columns
            are ignored.

    For every usable line, prints the ISSN-L followed by the fields returned
    by check_url(), tab-separated, to stdout. Blank lines are skipped
    silently; lines without a second column are reported on stderr and
    skipped, so one bad row does not abort the rest of the batch.
    """
    for line in tsvfile:
        if not line.strip():
            continue
        records = line.split('\t')
        if len(records) < 2:
            print("skipping malformed line: %r" % line, file=sys.stderr)
            continue
        issnl = records[0]
        url = records[1].strip()
        print(issnl + '\t' + '\t'.join(check_url(url)))
+
if __name__ == "__main__":
    # With exactly one argument, read the TSV from that path; otherwise read
    # lines from stdin (which is what `parallel --pipepart` feeds us).
    if len(sys.argv) == 2:
        # Context manager so the input file is closed even if run() raises.
        with open(sys.argv[1], 'r') as f:
            run(f)
    else:
        run(sys.stdin)