1 files changed, 77 insertions, 56 deletions
diff --git a/check_issn_urls.py b/check_issn_urls.py
index 1135d6c..23169f1 100755
--- a/check_issn_urls.py
+++ b/check_issn_urls.py
@@ -45,17 +45,17 @@ def sniff_platform(resp):
     """
     # these are mostly here to filter out huge platforms and stop sniffing
     domain_map = {
-        'jstor.org/': 'jstor',
-        'springer.com/': 'springer',
-        'springerlink.com/': 'springer',
-        'tandfonline.com/': 't_and_f',
-        'elsevier.com/': 'elsevier',
-        'wiley.com/': 'wiley',
-        'sciencedirect.com/': 'elsevier',
-        'sagepub.com/': 'sage',
-        'hypotheses.org/': 'hypothesis',
-        'tandf.co.uk/': 't_and_f',
-        'scielo': 'scielo',
+        "jstor.org/": "jstor",
+        "springer.com/": "springer",
+        "springerlink.com/": "springer",
+        "tandfonline.com/": "t_and_f",
+        "elsevier.com/": "elsevier",
+        "wiley.com/": "wiley",
+        "sciencedirect.com/": "elsevier",
+        "sagepub.com/": "sage",
+        "hypotheses.org/": "hypothesis",
+        "tandf.co.uk/": "t_and_f",
+        "scielo": "scielo",
     }
     for domain, platform in domain_map.items():
         if domain in resp.url:
@@ -64,6 +64,7 @@ def sniff_platform(resp):
         return "ojs"
     return None
 
+
 def sniff_blocked(resp):
     """
     This function would try to figure out if we got blocked: soft-block, hard
@@ -73,23 +74,33 @@ def sniff_blocked(resp):
     if resp.status_code in (403, 420):
         return True
     # JSTOR does this
-    if 'Our systems have detected unusual traffic activity from your network. Please complete this reCAPTCHA' in resp.text:
+    if (
+        "Our systems have detected unusual traffic activity from your network. Please complete this reCAPTCHA"
+        in resp.text
+    ):
         return True
-    if resp.status_code == 416 and 'something about your browser made us think you were a bot' in resp.text:
+    if (
+        resp.status_code == 416
+        and "something about your browser made us think you were a bot" in resp.text
+    ):
         return True
     return None
 
-def check_gwb(url, match_type='exact'):
-    if '//web.archive.org/' in url:
+
+def check_gwb(url, match_type="exact"):
+    if "//web.archive.org/" in url:
         return None
     # crude/bad retry loop to work around CDX API throttling
     for i in range(5):
-        resp = requests.get('https://web.archive.org/cdx/search/cdx', params={
-            'url': url,
-            'matchType': match_type,
-            'limit': -1,
-            'filter': 'statuscode:200'
-        })
+        resp = requests.get(
+            "https://web.archive.org/cdx/search/cdx",
+            params={
+                "url": url,
+                "matchType": match_type,
+                "limit": -1,
+                "filter": "statuscode:200",
+            },
+        )
         if resp.status_code == 200:
             break
         time.sleep(5)
@@ -98,81 +109,91 @@ def check_gwb(url, match_type='exact'):
         # TODO: this isn't really correct, but not sure what to return/record
         # if we failed through all timeouts
         return None
-    line = resp.text.strip().split('\n')[0]
+    line = resp.text.strip().split("\n")[0]
     if line:
         dt = line.split()[1]
         int(dt)
         return dt
     else:
         return None
-    
+
 
 def check_url(issnl, url):
-    #print("Fetching: %s" % url)
+    # print("Fetching: %s" % url)
     info = dict(issnl=issnl, url=url)
     try:
-        resp = requests.get(url, timeout=30., headers={'User-Agent': 'ia_bot/0.0 (python requests) journal-live-check; contact:info@archive.org'})
+        resp = requests.get(
+            url,
+            timeout=30.0,
+            headers={
+                "User-Agent": "ia_bot/0.0 (python requests) journal-live-check; contact:info@archive.org"
+            },
+        )
     except requests.exceptions.TooManyRedirects:
-        info['error'] = 'TooManyRedirects'
-        info['terminal_status_code'] = info['status_code'] = -1
+        info["error"] = "TooManyRedirects"
+        info["terminal_status_code"] = info["status_code"] = -1
         return info
     except requests.exceptions.SSLError:
-        info['error'] = 'SSLError'
-        info['terminal_status_code'] = info['status_code'] = -1
+        info["error"] = "SSLError"
+        info["terminal_status_code"] = info["status_code"] = -1
         return info
     except requests.exceptions.ReadTimeout:
-        info['error'] = 'ReadTimeout'
-        info['terminal_status_code'] = info['status_code'] = -1
+        info["error"] = "ReadTimeout"
+        info["terminal_status_code"] = info["status_code"] = -1
         return info
     except requests.exceptions.ConnectionError:
-        info['error'] = 'ConnectionError'
-        info['terminal_status_code'] = info['status_code'] = -1
+        info["error"] = "ConnectionError"
+        info["terminal_status_code"] = info["status_code"] = -1
         return info
     except requests.exceptions.ChunkedEncodingError:
-        info['error'] = 'ChunkedEncodingError'
-        info['terminal_status_code'] = info['status_code'] = -1
+        info["error"] = "ChunkedEncodingError"
+        info["terminal_status_code"] = info["status_code"] = -1
         return info
     except requests.exceptions.ContentDecodingError:
-        info['error'] = 'ContentDecodingError'
-        info['terminal_status_code'] = info['status_code'] = -1
+        info["error"] = "ContentDecodingError"
+        info["terminal_status_code"] = info["status_code"] = -1
         return info
     except requests.exceptions.InvalidSchema:
-        info['error'] = 'InvalidSchema'
-        info['terminal_status_code'] = info['status_code'] = -1
+        info["error"] = "InvalidSchema"
+        info["terminal_status_code"] = info["status_code"] = -1
         return info
     except UnicodeDecodeError:
-        info['error'] = 'UnicodeDecodeError'
-        info['terminal_status_code'] = info['status_code'] = -1
+        info["error"] = "UnicodeDecodeError"
+        info["terminal_status_code"] = info["status_code"] = -1
         return info
 
     if resp.history:
-        info['status_code'] = resp.history[0].status_code
+        info["status_code"] = resp.history[0].status_code
     else:
-        info['status_code'] = resp.status_code
+        info["status_code"] = resp.status_code
 
-    info['terminal_status_code'] = resp.status_code
-    info['terminal_url'] = resp.url
-    content_type = resp.headers.get('Content-Type')
+    info["terminal_status_code"] = resp.status_code
+    info["terminal_url"] = resp.url
+    content_type = resp.headers.get("Content-Type")
     if content_type:
-        info['terminal_content_type'] = content_type.split(';')[0]
-    info['issnl_in_body'] = bool(issnl in resp.text)
-    info['gwb_url_success_dt'] = check_gwb(url, match_type='exact')
-    info['gwb_terminal_url_success_dt'] = check_gwb(info['terminal_url'], match_type='exact')
-    info['blocked'] = sniff_blocked(resp)
-    info['software_platform'] = sniff_platform(resp)
-    #info['gwb_host_success_dt'] = check_gwb(url, match_type='host')
+        info["terminal_content_type"] = content_type.split(";")[0]
+    info["issnl_in_body"] = bool(issnl in resp.text)
+    info["gwb_url_success_dt"] = check_gwb(url, match_type="exact")
+    info["gwb_terminal_url_success_dt"] = check_gwb(
+        info["terminal_url"], match_type="exact"
+    )
+    info["blocked"] = sniff_blocked(resp)
+    info["software_platform"] = sniff_platform(resp)
+    # info['gwb_host_success_dt'] = check_gwb(url, match_type='host')
     return info
 
+
 def run(tsvfile):
     for line in tsvfile:
-        records = line.split('\t')
+        records = line.split("\t")
         issnl = records[0]
         url = records[1].strip()
         print(json.dumps(check_url(issnl, url)))
 
-if __name__=="__main__":
+
+if __name__ == "__main__":
     if len(sys.argv) != 2:
         f = sys.stdin
     else:
-        f = open(sys.argv[1], 'r')
+        f = open(sys.argv[1], "r")
     run(f)