From ed452c219ca29c06426b03691a9369724aa7e251 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 30 Jul 2019 23:08:35 -0700 Subject: major improvements to ISSN URL checker --- check_issn_urls.py | 141 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 121 insertions(+), 20 deletions(-) diff --git a/check_issn_urls.py b/check_issn_urls.py index 009e18b..8c99997 100755 --- a/check_issn_urls.py +++ b/check_issn_urls.py @@ -1,48 +1,149 @@ #!/usr/bin/env python3 """ -Takes a tsv filepath (see extract_issn_urls.py) or lines on stdin and dumps to -stdout. +Check journal homepage status (live web and wayback) -The stdin thing means you can: - parallel --bar --pipepart -a road_oa_issn_urls.tsv ./check_issn_urls.py > url_status.tsv +Takes a tsv filepath or lines on stdin and dumps to stdout. The stdin thing means you can: -For each URL, do a request and record: + parallel -j100 --bar --pipepart -a urls_to_crawl.tsv ./check_issn_urls.py > url_status.json - ISSN, URL (or SURT?), HTTP first status, HTTP final status, final URL +Input columns (no header): -HTTP status will be -1 if domain does not even resolve. + ISSN-L, URL + +For each URL, do a request and record, as JSON: -"local HTTP status" is the HTTP status code modulo same-domain redirects. This -is intended to accomodate HTTPS upgrades, changes in web app URL schemes, etc. -Will be the same as HTTP status if redirect is non-local. 
+ issnl: passthrough + url: passthrough + status_code: initial HTTP crawl status + terminal_url: final URL (or original if no redirects) + terminal_status_code: final HTTP status (or initial status if no redirects) + terminal_content_type: content type (mimetype) + software_platform: slug of hosting platform, if detected + issnl_in_body: whether raw issnl appears in body text + blocked: whether we think crawler was "blocked" + gwb_url_success_dt: latest wayback datetime that an HTTP 200 exists for url + gwb_terminal_url_success_dt: latest wayback datetime that an HTTP 200 exists for terminal_url -TODO: detect domain squating/spam? +HTTP status will be -1 if domain does not even resolve. """ import os import sys +import json import requests -def check_url(url): + +def sniff_platform(resp): + """ + This function would try to figure out what software platform (eg, OJS) the + site is running. + TODO: unimplemented + """ + # these are mostly here to filter out huge platforms and stop sniffing + domain_map = { + 'jstor.org/': 'jstor', + 'springer.com/': 'springer', + 'springerlink.com/': 'springer', + 'tandfonline.com/': 't_and_f', + 'elsevier.com/': 'elsevier', + 'wiley.com/': 'wiley', + 'sciencedirect.com/': 'elsevier', + 'sagepub.com/': 'sage', + 'hypotheses.org/': 'hypothesis', + 'tandf.co.uk/': 't_and_f', + 'scielo': 'scielo', + } + for domain, platform in domain_map.items(): + if domain in resp.url: + return platform + if ' 0: - first_status = resp.history[0].status_code + if resp.history: + info['status_code'] = resp.history[0].status_code else: - first_status = resp.status_code - return map(str, (url, first_status, resp.status_code, resp.url)) + info['status_code'] = resp.status_code + + info['terminal_status_code'] = resp.status_code + info['terminal_url'] = resp.url + content_type = resp.headers.get('Content-Type') + if content_type: + info['terminal_content_type'] = content_type.split(';')[0] + info['issnl_in_body'] = bool(issnl in resp.text) + info['gwb_url_success_dt'] = check_gwb(url, 
match_type='exact') + info['gwb_terminal_url_success_dt'] = check_gwb(info['terminal_url'], match_type='exact') + info['blocked'] = sniff_blocked(resp) + info['software_platform'] = sniff_platform(resp) + #info['gwb_host_success_dt'] = check_gwb(url, match_type='host') + return info def run(tsvfile): for line in tsvfile: records = line.split('\t') issnl = records[0] url = records[1].strip() - print(issnl + '\t' + '\t'.join(check_url(url))) + print(json.dumps(check_url(issnl, url))) if __name__=="__main__": if len(sys.argv) != 2: -- cgit v1.2.3