From 3cefbeb3c56e4d9fc3cfe566567feccc386bb212 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 31 Jul 2019 21:57:07 -0700 Subject: webarchive_urls separate from regular URLs --- chocula.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/chocula.py b/chocula.py index 2bd111e..d1be6ae 100755 --- a/chocula.py +++ b/chocula.py @@ -1013,6 +1013,10 @@ class ChoculaDatabase(): counts['total'] += 1 url = row['url'] assert(url) + if row.get('gwb_url_success_dt') == 'error': + row['gwb_url_success_dt'] = None + if row.get('gwb_terminal_url_success_dt') == 'error': + row['gwb_terminal_url_success_dt'] = None self.c.execute("UPDATE homepage SET status_code=?, crawl_error=?, terminal_url=?, terminal_status_code=?, platform_software=?, issnl_in_body=?, blocked=?, gwb_url_success_dt=?, gwb_terminal_url_success_dt=? WHERE url=?", (row['status_code'], row.get('crawl_error'), @@ -1303,10 +1307,21 @@ class ChoculaDatabase(): extra['sherpa'] = dict(color=row['sherpa_color']) urls = [] + webarchive_urls = [] cur = self.db.execute("SELECT * FROM homepage WHERE issnl = ?;", [row['issnl']]) for hrow in cur: + if 'web.archive.org/web' in hrow['url']: + webarchive_urls.append(hrow['url']) + urls.append(hrow['url']) + continue + if hrow['host'] in ('www.google.com', 'books.google.com'): + # individual books or google searches, not journal/conference homepages + continue + if '/oai/request' in hrow['url']: + # OAI-PMH endpoints, not homepages + continue if not row['any_live_homepage'] and hrow['gwb_url_success_dt'] and hrow['gwb_url_success_dt'] != 'error': - urls.append("https://web.archive.org/web/{}/{}".format(hrow['gwb_url_success_dt'], hrow['url'])) + webarchive_urls.append("https://web.archive.org/web/{}/{}".format(hrow['gwb_url_success_dt'], hrow['url'])) continue if hrow['blocked']: urls.append(hrow['url']) @@ -1318,6 +1333,11 @@ class ChoculaDatabase(): else: urls.append(hrow['url']) continue + # didn't even crawl and no match? add anyways as a pass-through + if not hrow['status_code']: + urls.append(hrow['url']) + continue + extra['webarchive_urls'] = urls extra['urls'] = urls out['extra'] = extra print(json.dumps(out)) -- cgit v1.2.3