aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2019-07-31 21:57:07 -0700
committerBryan Newbold <bnewbold@archive.org>2019-07-31 21:57:07 -0700
commit3cefbeb3c56e4d9fc3cfe566567feccc386bb212 (patch)
tree6949955914d396920446c4f43b5e5127242d8a09
parente58a5c98b3254f7d0e7a999edd7b1868edc87318 (diff)
downloadchocula-3cefbeb3c56e4d9fc3cfe566567feccc386bb212.tar.gz
chocula-3cefbeb3c56e4d9fc3cfe566567feccc386bb212.zip
webarchive_urls separate from regular URLs
-rwxr-xr-xchocula.py22
1 files changed, 21 insertions, 1 deletions
diff --git a/chocula.py b/chocula.py
index 2bd111e..d1be6ae 100755
--- a/chocula.py
+++ b/chocula.py
@@ -1013,6 +1013,10 @@ class ChoculaDatabase():
counts['total'] += 1
url = row['url']
assert(url)
+ if row.get('gwb_url_success_dt') == 'error':
+ row['gwb_url_success_dt'] = None
+ if row.get('gwb_terminal_url_success_dt') == 'error':
+ row['gwb_terminal_url_success_dt'] = None
self.c.execute("UPDATE homepage SET status_code=?, crawl_error=?, terminal_url=?, terminal_status_code=?, platform_software=?, issnl_in_body=?, blocked=?, gwb_url_success_dt=?, gwb_terminal_url_success_dt=? WHERE url=?",
(row['status_code'],
row.get('crawl_error'),
@@ -1303,10 +1307,21 @@ class ChoculaDatabase():
extra['sherpa'] = dict(color=row['sherpa_color'])
urls = []
+ webarchive_urls = []
cur = self.db.execute("SELECT * FROM homepage WHERE issnl = ?;", [row['issnl']])
for hrow in cur:
+ if 'web.archive.org/web' in hrow['url']:
+ webarchive_urls.append(hrow['url'])
+ urls.append(hrow['url'])
+ continue
+ if hrow['host'] in ('www.google.com', 'books.google.com'):
+ # individual books or google searches, not journal/conference homepages
+ continue
+ if '/oai/request' in hrow['url']:
+ # OAI-PMH endpoints, not homepages
+ continue
if not row['any_live_homepage'] and hrow['gwb_url_success_dt'] and hrow['gwb_url_success_dt'] != 'error':
- urls.append("https://web.archive.org/web/{}/{}".format(hrow['gwb_url_success_dt'], hrow['url']))
+ webarchive_urls.append("https://web.archive.org/web/{}/{}".format(hrow['gwb_url_success_dt'], hrow['url']))
continue
if hrow['blocked']:
urls.append(hrow['url'])
@@ -1318,6 +1333,11 @@ class ChoculaDatabase():
else:
urls.append(hrow['url'])
continue
+            # didn't even crawl and no match? add anyway as a pass-through
+ if not hrow['status_code']:
+ urls.append(hrow['url'])
+ continue
+            extra['webarchive_urls'] = webarchive_urls
extra['urls'] = urls
out['extra'] = extra
print(json.dumps(out))