author     Bryan Newbold <bnewbold@robocracy.org>  2019-07-30 23:08:35 -0700
committer  Bryan Newbold <bnewbold@robocracy.org>  2019-07-30 23:08:35 -0700
commit     c3727c86008dc9aa64ab4e61037d27a9be9d3ea6 (patch)
tree       76b330d577350b5eb070c5d47f84c1680e2b807e
parent     8e9dd0046fa5d8a117568d15463ace7363323964 (diff)
download   fatcat-c3727c86008dc9aa64ab4e61037d27a9be9d3ea6.tar.gz
           fatcat-c3727c86008dc9aa64ab4e61037d27a9be9d3ea6.zip
major improvements to ISSN URL checker
-rwxr-xr-x  extra/journal_metadata/check_issn_urls.py  141
1 file changed, 121 insertions(+), 20 deletions(-)
diff --git a/extra/journal_metadata/check_issn_urls.py b/extra/journal_metadata/check_issn_urls.py
index 009e18b6..8c99997e 100755
--- a/extra/journal_metadata/check_issn_urls.py
+++ b/extra/journal_metadata/check_issn_urls.py
@@ -1,48 +1,149 @@
#!/usr/bin/env python3
"""
-Takes a tsv filepath (see extract_issn_urls.py) or lines on stdin and dumps to
-stdout.
+Check journal homepage status (live web and wayback)
-The stdin thing means you can:
- parallel --bar --pipepart -a road_oa_issn_urls.tsv ./check_issn_urls.py > url_status.tsv
+Takes a tsv filepath or lines on stdin and dumps to stdout. The stdin thing means you can:
-For each URL, do a request and record:
+ parallel -j100 --bar --pipepart -a urls_to_crawl.tsv ./check_issn_urls.py > url_status.json
- ISSN, URL (or SURT?), HTTP first status, HTTP final status, final URL
+Input columns (no header):
-HTTP status will be -1 if domain does not even resolve.
+ ISSN-L, URL
+
+For each URL, do a request and record, as JSON:
-"local HTTP status" is the HTTP status code modulo same-domain redirects. This
-is intended to accomodate HTTPS upgrades, changes in web app URL schemes, etc.
-Will be the same as HTTP status if redirect is non-local.
+ issnl: passthrough
+ url: passthrough
+ status_code: initial HTTP crawl status
+ terminal_url: final URL (or original if no redirects)
+    terminal_status_code: final HTTP status code (after any redirects)
+ terminal_content_type: content type (mimetype)
+    software_platform: slug of hosting platform, if detected
+ issnl_in_body: whether raw issnl appears in body text
+ blocked: whether we think crawler was "blocked"
+    gwb_url_success_dt: latest wayback datetime with an HTTP 200 capture of the original URL
+    gwb_terminal_url_success_dt: latest wayback datetime with an HTTP 200 capture of the terminal URL
-TODO: detect domain squating/spam?
+HTTP status will be -1 if domain does not even resolve.
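+
+An illustrative output record (values are invented for this example, not taken
+from a real crawl); the script prints one JSON object per line, wrapped here
+for readability:
+
+    {"issnl": "1234-5678", "url": "http://example.com/journal",
+     "status_code": 301, "terminal_url": "https://example.com/journal",
+     "terminal_status_code": 200, "terminal_content_type": "text/html",
+     "issnl_in_body": true, "blocked": null, "software_platform": null,
+     "gwb_url_success_dt": "20190501123456", "gwb_terminal_url_success_dt": null}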
"""
import os
import sys
+import json
import requests
-def check_url(url):
+
+def sniff_platform(resp):
+ """
+ This function would try to figure out what software platform (eg, OJS) the
+ site is running.
+ TODO: unimplemented
+ """
+ # these are mostly here to filter out huge platforms and stop sniffing
+ domain_map = {
+ 'jstor.org/': 'jstor',
+ 'springer.com/': 'springer',
+ 'springerlink.com/': 'springer',
+ 'tandfonline.com/': 't_and_f',
+ 'elsevier.com/': 'elsevier',
+ 'wiley.com/': 'wiley',
+ 'sciencedirect.com/': 'elsevier',
+ 'sagepub.com/': 'sage',
+ 'hypotheses.org/': 'hypothesis',
+ 'tandf.co.uk/': 't_and_f',
+ 'scielo': 'scielo',
+ }
+ for domain, platform in domain_map.items():
+ if domain in resp.url:
+ return platform
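+    # Open Journal Systems installs typically advertise themselves with a
+    # <meta name="generator" ...> tag in the page HTML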
+ if '<meta name="generator" content="Open Journal Systems' in resp.text:
+ return "ojs"
+ return None
+
+def sniff_blocked(resp):
+ """
+ This function would try to figure out if we got blocked: soft-block, hard
+ block, etc.
+ TODO: unimplemented
+ """
+ if resp.status_code in (403, 420):
+ return True
+ # JSTOR does this
+ if 'Our systems have detected unusual traffic activity from your network. Please complete this reCAPTCHA' in resp.text:
+ return True
+ if resp.status_code == 416 and 'something about your browser made us think you were a bot' in resp.text:
+ return True
+ return None
+
+def check_gwb(url, match_type='exact'):
+ if '//web.archive.org/' in url:
+ return None
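+    # query the wayback CDX API for HTTP 200 captures of this URL; a negative
+    # limit is intended to return only the most recent matching capture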
+ resp = requests.get('https://web.archive.org/cdx/search/cdx', params={
+ 'url': url,
+ 'matchType': match_type,
+ 'limit': -1,
+ 'filter': 'statuscode:200'
+ })
+ line = resp.text.strip().split('\n')[0]
+ if line:
+ dt = line.split()[1]
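+        # sanity check: raises ValueError if the CDX datetime field is not numeric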
+ int(dt)
+ return dt
+ else:
+ return None
+
+
+def check_url(issnl, url):
#print("Fetching: %s" % url)
+ info = dict(issnl=issnl, url=url)
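+    # request-level failures below are recorded as status_code -1 plus a short error name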
try:
- resp = requests.get(url)
- except:
- return (url, "-1", "-1", '-')
+ resp = requests.get(url, timeout=30., headers={'User-Agent': 'ia_bot/0.0 (python requests) journal-live-check; contact:info@archive.org'})
+ except requests.exceptions.TooManyRedirects:
+ info['error'] = 'TooManyRedirects'
+ info['terminal_status_code'] = info['status_code'] = -1
+ return info
+ except requests.exceptions.SSLError:
+ info['error'] = 'SSLError'
+ info['terminal_status_code'] = info['status_code'] = -1
+ return info
+ except requests.exceptions.ReadTimeout:
+ info['error'] = 'ReadTimeout'
+ info['terminal_status_code'] = info['status_code'] = -1
+ return info
+ except requests.exceptions.ConnectionError:
+ info['error'] = 'ConnectionError'
+ info['terminal_status_code'] = info['status_code'] = -1
+ return info
+ except requests.exceptions.ChunkedEncodingError:
+ info['error'] = 'ChunkedEncodingError'
+ info['terminal_status_code'] = info['status_code'] = -1
+ return info
- if len(resp.history) > 0:
- first_status = resp.history[0].status_code
+ if resp.history:
+ info['status_code'] = resp.history[0].status_code
else:
- first_status = resp.status_code
- return map(str, (url, first_status, resp.status_code, resp.url))
+ info['status_code'] = resp.status_code
+
+ info['terminal_status_code'] = resp.status_code
+ info['terminal_url'] = resp.url
+ content_type = resp.headers.get('Content-Type')
+ if content_type:
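+        # keep just the mimetype, dropping any '; charset=...' suffix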
+ info['terminal_content_type'] = content_type.split(';')[0]
+ info['issnl_in_body'] = bool(issnl in resp.text)
+ info['gwb_url_success_dt'] = check_gwb(url, match_type='exact')
+ info['gwb_terminal_url_success_dt'] = check_gwb(info['terminal_url'], match_type='exact')
+ info['blocked'] = sniff_blocked(resp)
+ info['software_platform'] = sniff_platform(resp)
+ #info['gwb_host_success_dt'] = check_gwb(url, match_type='host')
+ return info
def run(tsvfile):
for line in tsvfile:
records = line.split('\t')
issnl = records[0]
url = records[1].strip()
- print(issnl + '\t' + '\t'.join(check_url(url)))
+ print(json.dumps(check_url(issnl, url)))
if __name__=="__main__":
if len(sys.argv) != 2: