about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-11-30 14:21:11 -0800
committer Bryan Newbold <bnewbold@archive.org> 2021-11-30 14:21:11 -0800
commit 55539e94edc9a49fca1dafdd9468966abd33fe10 (patch)
tree 719bf5586b59f6ee65f14335768c46cf3a1177d3
parent 9542ab3ea9145e937e412bb707d96ab031b13e31 (diff)
download chocula-55539e94edc9a49fca1dafdd9468966abd33fe10.tar.gz
chocula-55539e94edc9a49fca1dafdd9468966abd33fe10.zip
simplify homepage URL handling code a bit
-rw-r--r-- chocula/database.py 26
1 file changed, 14 insertions, 12 deletions
diff --git a/chocula/database.py b/chocula/database.py
index 9d7bfb1..0af64a5 100644
--- a/chocula/database.py
+++ b/chocula/database.py
@@ -323,9 +323,17 @@ class ChoculaDatabase:
counts["total"] += 1
url = row["url"]
assert url
- if row.get("gwb_url_success_dt") == "error":
+ if not (
+ row.get("gwb_url_success_dt")
+ and row["gwb_url_success_dt"].isdigit()
+ and len(row["gwb_url_success_dt"]) == 14
+ ):
row["gwb_url_success_dt"] = None
- if row.get("gwb_terminal_url_success_dt") == "error":
+ if not (
+ row.get("gwb_terminal_url_success_dt")
+ and row["gwb_terminal_url_success_dt"].isdigit()
+ and len(row["gwb_terminal_url_success_dt"]) == 14
+ ):
row["gwb_terminal_url_success_dt"] = None
cur.execute(
"UPDATE homepage SET status_code=?, crawl_error=?, terminal_url=?, terminal_status_code=?, platform_software=?, issnl_in_body=?, blocked=?, gwb_url_success_dt=?, gwb_terminal_url_success_dt=? WHERE url=?",
@@ -724,17 +732,15 @@ class ChoculaDatabase:
"SELECT * FROM homepage WHERE issnl = ?;", [row["issnl"]]
)
for hrow in cur:
- if "://doaj.org/" in hrow["url"] or "://www.doaj.org/" in hrow["url"]:
- continue
- if "://www.ncbi.nlm.nih.gov/" in hrow["url"]:
- continue
if "LOCKSS_RESOLVER" in hrow["url"]:
continue
if "web.archive.org/web" in hrow["url"]:
webarchive_urls.append(hrow["url"])
- urls.append(hrow["url"])
continue
if hrow["host"] in (
+ "doaj.org",
+ "www.doaj.org",
+ "www.ncbi.nlm.nih.gov",
"www.google.com",
"books.google.com",
"translate.google.com",
@@ -775,11 +781,7 @@ class ChoculaDatabase:
if "/oai/request" in hrow["url"]:
# OAI-PMH endpoints, not homepages
continue
- if (
- not row["any_live_homepage"]
- and hrow["gwb_url_success_dt"]
- and hrow["gwb_url_success_dt"] != "error"
- ):
+ if not row["any_live_homepage"] and hrow["gwb_url_success_dt"]:
webarchive_urls.append(
"https://web.archive.org/web/{}/{}".format(
hrow["gwb_url_success_dt"], hrow["url"]