#!/usr/bin/env python3
"""
Check journal homepage status (live web and wayback).

Takes a TSV filepath or lines on stdin and dumps JSON lines to stdout. The
stdin option makes parallelization easy; for example:

    # be sure to randomize order if you are going to use high parallelism so no
    # individual domain gets swamped. also remember this hits CDX API multiple
    # times.
    parallel -j10 --bar --pipepart -a urls_to_crawl.shuf.tsv ./check_issn_urls.py > url_status.json

Input columns (no header):

    ISSN-L, URL

For each URL, do a request and record, as JSON:

    issnl: passthrough
    url: passthrough
    status_code: initial HTTP crawl status
    terminal_url: final URL (or original URL if there were no redirects)
    terminal_status_code: final HTTP status code
    terminal_content_type: content type (mimetype) of the final response
    platform_software: slug of hosting platform, if detected
    issnl_in_body: whether the raw ISSN-L appears in the body text
    blocked: whether we think the crawler was "blocked"
    gwb_url_success_dt: latest wayback datetime at which an HTTP 200 capture exists
    gwb_terminal_url_success_dt: latest wayback datetime at which an HTTP 200 capture exists (for the terminal URL)

HTTP status will be -1 if the domain does not even resolve.
"""

import sys
import json
import time

import requests


def sniff_platform(resp):
    """
    This function would try to figure out what software platform (eg, OJS)
    the site is running.

    TODO: unimplemented
    """
    # these are mostly here to filter out huge platforms and stop sniffing
    domain_map = {
        "jstor.org/": "jstor",
        "springer.com/": "springer",
        "springerlink.com/": "springer",
        "tandfonline.com/": "t_and_f",
        "elsevier.com/": "elsevier",
        "wiley.com/": "wiley",
        "sciencedirect.com/": "elsevier",
        "sagepub.com/": "sage",
        "hypotheses.org/": "hypothesis",
        "tandf.co.uk/": "t_and_f",
        "scielo": "scielo",
    }
    for domain, platform in domain_map.items():
        if domain in resp.url:
            return platform
    if '