#!/usr/bin/env python3
"""
Check journal homepage status (live web and wayback)

Takes a tsv filepath or lines on stdin and dumps to stdout.

The stdin option means you can do things like:

    # be sure to randomize order if you are going to use high parallelism so no
    # individual domain gets swamped. also remember this hits CDX API multiple
    # times.
    parallel -j10 --bar --pipepart -a urls_to_crawl.shuf.tsv ./check_issn_urls.py > url_status.json

Input columns (no header):

    ISSN-L, URL

For each URL, do a request and record, as JSON:

    issnl: passthrough
    url: passthrough
    status_code: initial HTTP crawl status
    terminal_url: final URL (or original URL if no redirects)
    terminal_status_code: final HTTP status code (or initial status if no redirects)
    terminal_content_type: content type (mimetype)
    platform_software: slug of hosting platform, if detected
    issnl_in_body: whether the raw ISSN-L appears in the body text
    blocked: whether we think the crawler was "blocked"
    gwb_url_success_dt: latest wayback datetime with an HTTP 200 capture of the URL
    gwb_terminal_url_success_dt: latest wayback datetime with an HTTP 200 capture of the terminal URL

HTTP status will be -1 if the domain does not even resolve.
"""

import os
import sys
import json
import time

import requests


def sniff_platform(resp):
    """
    This function tries to figure out what software platform (eg, OJS) the
    site is running.
    TODO: unimplemented
    """
    # these are mostly here to filter out huge platforms and stop sniffing
    domain_map = {
        'jstor.org/': 'jstor',
        'springer.com/': 'springer',
        'springerlink.com/': 'springer',
        'tandfonline.com/': 't_and_f',
        'elsevier.com/': 'elsevier',
        'wiley.com/': 'wiley',
        'sciencedirect.com/': 'elsevier',
        'sagepub.com/': 'sage',
        'hypotheses.org/': 'hypothesis',
        'tandf.co.uk/': 't_and_f',
        'scielo': 'scielo',
    }
    for domain, platform in domain_map.items():
        if domain in resp.url:
            return platform
    if '