about summary refs log tree commit diff stats
path: root/chocula
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-07-31 15:24:22 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-08-03 10:50:14 -0700
commit ee65634b8f7d8cc482b8bbe521f7ad9e62e4211c (patch)
tree 304b49d45aa90a355a43baef04272ce9d80796f0 /chocula
parent 99e67b079f8d3267e9bbd53bbdb9cf729be61d87 (diff)
download chocula-ee65634b8f7d8cc482b8bbe521f7ad9e62e4211c.tar.gz
chocula-ee65634b8f7d8cc482b8bbe521f7ad9e62e4211c.zip
more blocked URLs and domains
Diffstat (limited to 'chocula')
-rw-r--r-- chocula/database.py 29
1 files changed, 29 insertions, 0 deletions
diff --git a/chocula/database.py b/chocula/database.py
index 0820e0f..fa8d105 100644
--- a/chocula/database.py
+++ b/chocula/database.py
@@ -53,7 +53,10 @@ class HomepageUrl:
or "://firstsearch.oclc.org" in url
or "://bibpurl.oclc.org" in url
or "://books.google.com" in url
+ or "://translate.google.com" in url
or "://search.ebscohost.com" in url
+ or "://search.proquest.com" in url
+ or "://gateway.proquest.com" in url
):
return None
if url.startswith("www."):
@@ -533,6 +536,11 @@ class ChoculaDatabase:
out["any_live_homepage"] = True
if hrow["gwb_url_success_dt"] or hrow["gwb_terminal_url_success_dt"]:
out["any_gwb_homepage"] = True
+ if not out.get("platform"):
+ if hrow["domain"] == "wordpress.com":
+ out["platform"] = "wordpress"
+ elif hrow["domain"] == "hypotheses.org":
+ out["platform"] = "hypotheses"
if out.get("wikidata_qid"):
assert out["wikidata_qid"].startswith("Q")
@@ -571,6 +579,8 @@ class ChoculaDatabase:
or "association" in pl
or "academy of " in pl
or "institute of" in pl
+ or "ieee" in pl
+ or "ieee" in out.get("name", "")
):
out["publisher_type"] = "society"
elif publisher in UNI_PRESS_PUBLISHERS or "university " in pl:
@@ -702,6 +712,13 @@ class ChoculaDatabase:
if hrow["host"] in (
"www.google.com",
"books.google.com",
+ "translate.google.com",
+ "drive.google.com",
+ "mail.google.com",
+ "play.google.com",
+ "news.google.com",
+ "docs.google.com",
+ "goo.gl",
"www.loc.gov",
"search.ebscohost.com",
"bibpurl.oclc.org",
@@ -714,6 +731,18 @@ class ChoculaDatabase:
"umi.com",
"search.informit.com.au",
"search.ebscohost.com",
+ "search.proquest.com",
+ "gateway.proquest.com",
+ "purl.access.gpo.gov",
+ "arxiv.org",
+ "pubmedcentral.nih.gov",
+ "ncbi.nlm.nih.gov",
+ "heinonline.org",
+ "www.heinonline.org",
+ "crcnetbase.com",
+ "nla.gov.au",
+ "purl.nla.gov.au",
+ "www.bibliothek.uni-regensburg.de",
):
# individual books or google searches, not journal/conference homepages
# LOC scanned newspapers