Diffstat (limited to 'chocula/database.py')
-rw-r--r--  chocula/database.py  604
1 file changed, 356 insertions(+), 248 deletions(-)
diff --git a/chocula/database.py b/chocula/database.py
index f620515..11632b9 100644
--- a/chocula/database.py
+++ b/chocula/database.py
@@ -1,4 +1,3 @@
-
from __future__ import annotations
import sys
@@ -47,41 +46,49 @@ class HomepageUrl:
"""
Returns None if url is really bad (not a URL).
"""
- if not url or 'mailto:' in url.lower() or url.lower() in ('http://n/a', 'http://na/', 'http://na'):
+ if (
+ not url
+ or "mailto:" in url.lower()
+ or url.lower() in ("http://n/a", "http://na/", "http://na")
+ ):
return None
- if url.startswith('www.'):
+ if url.startswith("www."):
url = "http://" + url
- if url.startswith('ttp://') or url.startswith('ttps://'):
+ if url.startswith("ttp://") or url.startswith("ttps://"):
url = "h" + url
- url.replace('Http://', 'http://')
+ url = url.replace("Http://", "http://")  # assign result; str.replace returns a new string
url = str(urlcanon.semantic_precise(url))
- if url == 'http://na/':
+ if url == "http://na/":
# sort of redundant with above, but some only match after canonicalization
return None
url_surt = surt.surt(url)
tld = tldextract.extract(url)
- host = '.'.join(tld)
- if host.startswith('.'):
+ host = ".".join(tld)
+ if host.startswith("."):
host = host[1:]
- return HomepageUrl(url=url,
- surt=url_surt,
- host=host,
- domain=tld.registered_domain,
- suffix=tld.suffix)
+ return HomepageUrl(
+ url=url,
+ surt=url_surt,
+ host=host,
+ domain=tld.registered_domain,
+ suffix=tld.suffix,
+ )
+
def test_from_url():
-
- assert HomepageUrl.from_url("http://thing.core.ac.uk").domain == 'core.ac.uk'
- assert HomepageUrl.from_url("http://thing.core.ac.uk").host == 'thing.core.ac.uk'
- assert HomepageUrl.from_url("http://thing.core.ac.uk").suffix== 'ac.uk'
- assert HomepageUrl.from_url("google.com").suffix == 'com'
- assert HomepageUrl.from_url("google.com").host == 'google.com'
+ assert HomepageUrl.from_url("http://thing.core.ac.uk").domain == "core.ac.uk"
+ assert HomepageUrl.from_url("http://thing.core.ac.uk").host == "thing.core.ac.uk"
+ assert HomepageUrl.from_url("http://thing.core.ac.uk").suffix == "ac.uk"
+
+ assert HomepageUrl.from_url("google.com").suffix == "com"
+ assert HomepageUrl.from_url("google.com").host == "google.com"
assert HomepageUrl.from_url("mailto:bnewbold@bogus.com") == None
- assert HomepageUrl.from_url("thing.com").url == 'http://thing.com/'
- assert HomepageUrl.from_url("Http://thing.com///").url == 'http://thing.com/'
+ assert HomepageUrl.from_url("thing.com").url == "http://thing.com/"
+ assert HomepageUrl.from_url("Http://thing.com///").url == "http://thing.com/"
+
@dataclass
class UrlCrawlStatus:
@@ -95,6 +102,7 @@ class UrlCrawlStatus:
gwb_url_success_dt: Optional[str]
gwb_terminal_url_success_dt: Optional[str]
+
@dataclass
class DirectoryInfo:
directory_slug: str
@@ -127,10 +135,19 @@ class DirectoryInfo:
"""
if not self.issnl:
raise ValueError
- extra_dict = self.extra
-
- for k in ('issne', 'issnp', 'name', 'publisher', 'abbrev', 'platform',
- 'country', 'langs', 'original_name'):
+ extra_dict = self.extra
+
+ for k in (
+ "issne",
+ "issnp",
+ "name",
+ "publisher",
+ "abbrev",
+ "platform",
+ "country",
+ "langs",
+ "original_name",
+ ):
if self.__dict__[k]:
extra_dict[k] = self.__dict__[k]
@@ -151,7 +168,7 @@ class DirectoryInfo:
raise NotImplementedError()
-class IssnDatabase():
+class IssnDatabase:
"""
Holds complete ISSN/ISSN-L table and helps with lookups and munging of raw
ISSN strings
@@ -163,7 +180,7 @@ class IssnDatabase():
def read_issn_map_file(self, issn_map_path: str):
print("##### Loading ISSN-L map file...", file=sys.stderr)
- with open(issn_map_path, 'r') as issn_map_file:
+ with open(issn_map_path, "r") as issn_map_file:
for line in issn_map_file:
if line.startswith("ISSN") or len(line) == 0:
continue
@@ -209,7 +226,7 @@ class IssnDatabase():
return info
-class ChoculaDatabase():
+class ChoculaDatabase:
"""
Wraps a sqlite3 database
"""
@@ -218,7 +235,7 @@ class ChoculaDatabase():
"""
To create a temporary database, pass ":memory:" as db_file
"""
- self.db = sqlite3.connect(db_file, isolation_level='EXCLUSIVE')
+ self.db = sqlite3.connect(db_file, isolation_level="EXCLUSIVE")
self.data = dict()
self.issn_db = issn_db
@@ -247,8 +264,7 @@ class ChoculaDatabase():
cur = self.db.cursor()
try:
- cur.execute("INSERT INTO directory VALUES (?,?,?,?,?)",
- info.to_db_tuple())
+ cur.execute("INSERT INTO directory VALUES (?,?,?,?,?)", info.to_db_tuple())
except sqlite3.IntegrityError as ie:
if str(ie).startswith("UNIQUE"):
return "duplicate"
@@ -264,7 +280,8 @@ class ChoculaDatabase():
try:
cur.execute(
"INSERT OR REPLACE INTO homepage (issnl, surt, url, host, domain, suffix) VALUES (?,?,?,?,?,?)",
- homepage.to_db_tuple(issnl))
+ homepage.to_db_tuple(issnl),
+ )
except sqlite3.IntegrityError as ie:
if str(ie).startswith("UNIQUE"):
return "duplicate"
@@ -276,29 +293,33 @@ class ChoculaDatabase():
print("##### Loading IA Homepage Crawl Results...")
counts: Counter = Counter()
cur = self.db.cursor()
- for line in open(config.homepage_status.filepath, 'r'):
+ for line in open(config.homepage_status.filepath, "r"):
if not line.strip():
continue
row = json.loads(line)
- counts['total'] += 1
- url = row['url']
- assert(url)
- if row.get('gwb_url_success_dt') == 'error':
- row['gwb_url_success_dt'] = None
- if row.get('gwb_terminal_url_success_dt') == 'error':
- row['gwb_terminal_url_success_dt'] = None
- cur.execute("UPDATE homepage SET status_code=?, crawl_error=?, terminal_url=?, terminal_status_code=?, platform_software=?, issnl_in_body=?, blocked=?, gwb_url_success_dt=?, gwb_terminal_url_success_dt=? WHERE url=?",
- (row['status_code'],
- row.get('crawl_error'),
- row.get('terminal_url'),
- row.get('terminal_status_code'),
- row.get('platform_software'),
- row.get('issnl_in_body'),
- row.get('blocked'),
- row.get('gwb_url_success_dt'),
- row.get('gwb_terminal_url_success_dt'),
- url))
- counts['updated'] += 1
+ counts["total"] += 1
+ url = row["url"]
+ assert url
+ if row.get("gwb_url_success_dt") == "error":
+ row["gwb_url_success_dt"] = None
+ if row.get("gwb_terminal_url_success_dt") == "error":
+ row["gwb_terminal_url_success_dt"] = None
+ cur.execute(
+ "UPDATE homepage SET status_code=?, crawl_error=?, terminal_url=?, terminal_status_code=?, platform_software=?, issnl_in_body=?, blocked=?, gwb_url_success_dt=?, gwb_terminal_url_success_dt=? WHERE url=?",
+ (
+ row["status_code"],
+ row.get("crawl_error"),
+ row.get("terminal_url"),
+ row.get("terminal_status_code"),
+ row.get("platform_software"),
+ row.get("issnl_in_body"),
+ row.get("blocked"),
+ row.get("gwb_url_success_dt"),
+ row.get("gwb_terminal_url_success_dt"),
+ url,
+ ),
+ )
+ counts["updated"] += 1
cur.close()
self.db.commit()
return counts
@@ -306,51 +327,54 @@ class ChoculaDatabase():
def load_fatcat_containers(self, config: ChoculaConfig) -> Counter:
print("##### Loading Fatcat Container Entities...")
# JSON
- json_file = open(config.fatcat_containers.filepath, 'r')
+ json_file = open(config.fatcat_containers.filepath, "r")
counts: Counter = Counter()
cur = self.db.cursor()
for line in json_file:
if not line:
continue
row = json.loads(line)
- if row['state'] != 'active':
+ if row["state"] != "active":
continue
- counts['total'] += 1
- extra = row.get('extra', dict())
- issne = extra.get('issne')
- issnp = extra.get('issnp')
- country = extra.get('country')
- languages = extra.get('languages', [])
+ counts["total"] += 1
+ extra = row.get("extra", dict())
+ issne = extra.get("issne")
+ issnp = extra.get("issnp")
+ country = extra.get("country")
+ languages = extra.get("languages", [])
lang = None
if languages:
lang = languages[0]
try:
- cur.execute("INSERT OR REPLACE INTO fatcat_container (issnl, ident, revision, issne, issnp, wikidata_qid, name, container_type, publisher, country, lang) VALUES (?,?,?,?,?,?,?,?,?,?,?)",
- (row.get('issnl'),
- row['ident'],
- row['revision'],
- issne,
- issnp,
- row.get('wikidata_qid'),
- row['name'],
- row.get('container_type'),
- extra.get('publisher'),
- country,
- lang,
- ))
+ cur.execute(
+ "INSERT OR REPLACE INTO fatcat_container (issnl, ident, revision, issne, issnp, wikidata_qid, name, container_type, publisher, country, lang) VALUES (?,?,?,?,?,?,?,?,?,?,?)",
+ (
+ row.get("issnl"),
+ row["ident"],
+ row["revision"],
+ issne,
+ issnp,
+ row.get("wikidata_qid"),
+ row["name"],
+ row.get("container_type"),
+ extra.get("publisher"),
+ country,
+ lang,
+ ),
+ )
except sqlite3.IntegrityError as ie:
if str(ie).startswith("UNIQUE"):
counts["existing"] += 1
continue
else:
raise ie
- counts['inserted'] += 1
- if row.get('issnl'):
- urls = extra.get('urls', [])
+ counts["inserted"] += 1
+ if row.get("issnl"):
+ urls = extra.get("urls", [])
for url in urls:
homepage = HomepageUrl.from_url(url)
if homepage:
- self.insert_homepage(row.get('issnl'), homepage, cur)
+ self.insert_homepage(row.get("issnl"), homepage, cur)
cur.close()
self.db.commit()
return counts
@@ -358,22 +382,31 @@ class ChoculaDatabase():
def load_fatcat_stats(self, config: ChoculaConfig) -> Counter:
print("##### Loading Fatcat Container Stats...")
# JSON
- json_file = open(config.fatcat_stats.filepath, 'r')
+ json_file = open(config.fatcat_stats.filepath, "r")
counts: Counter = Counter()
cur = self.db.cursor()
for line in json_file:
if not line:
continue
row = json.loads(line)
- total = int(row['total'])
+ total = int(row["total"])
ia_frac: Optional[float] = None
preserved_frac: Optional[float] = None
if total > 0:
- ia_frac = float(row['in_web'])/total
- preserved_frac = float(row['is_preserved'])/total
- cur.execute("UPDATE fatcat_container SET release_count = ?, ia_count = ?, ia_frac = ?, preserved_count = ?, preserved_frac = ? WHERE issnl = ?",
- (total, row['in_web'], ia_frac, row['is_preserved'], preserved_frac, row['issnl']))
- counts['updated'] += 1
+ ia_frac = float(row["in_web"]) / total
+ preserved_frac = float(row["is_preserved"]) / total
+ cur.execute(
+ "UPDATE fatcat_container SET release_count = ?, ia_count = ?, ia_frac = ?, preserved_count = ?, preserved_frac = ? WHERE issnl = ?",
+ (
+ total,
+ row["in_web"],
+ ia_frac,
+ row["is_preserved"],
+ preserved_frac,
+ row["issnl"],
+ ),
+ )
+ counts["updated"] += 1
cur.close()
self.db.commit()
return counts
@@ -384,10 +417,10 @@ class ChoculaDatabase():
self.db.row_factory = sqlite3.Row
cur = self.db.execute("SELECT issnl, url FROM homepage;")
for hrow in cur:
- assert(hrow['url'])
- assert(len(hrow['url'].split()) == 1)
- counts['total'] += 1
- print('\t'.join((hrow['issnl'], hrow['url'])))
+ assert hrow["url"]
+ assert len(hrow["url"].split()) == 1
+ counts["total"] += 1
+ print("\t".join((hrow["issnl"], hrow["url"])))
return counts
def summarize(self) -> Counter:
@@ -395,135 +428,189 @@ class ChoculaDatabase():
counts: Counter = Counter()
cur = self.db.cursor()
self.db.row_factory = sqlite3.Row
- index_issnls = list(cur.execute('SELECT DISTINCT issnl FROM directory'))
- fatcat_issnls = list(cur.execute('SELECT DISTINCT issnl FROM fatcat_container WHERE issnl IS NOT null'))
+ index_issnls = list(cur.execute("SELECT DISTINCT issnl FROM directory"))
+ fatcat_issnls = list(
+ cur.execute(
+ "SELECT DISTINCT issnl FROM fatcat_container WHERE issnl IS NOT null"
+ )
+ )
all_issnls = set([i[0] for i in index_issnls + fatcat_issnls])
print("{} total ISSN-Ls".format(len(all_issnls)))
for issnl in all_issnls:
- #print(issnl)
- counts['total'] += 1
+ # print(issnl)
+ counts["total"] += 1
out = dict()
# check if ISSN-L is good. this is here because of fatcat import
- out['known_issnl'] = (self.issn_db.issn2issnl(issnl) == issnl)
- if not out['known_issnl']:
- counts['unknown-issnl'] += 1
- out['valid_issnl'] = stdnum.issn.is_valid(issnl)
- if not out['valid_issnl']:
- counts['invalid-issnl'] += 1
-
- fatcat_row = list(self.db.execute("SELECT * FROM fatcat_container WHERE issnl = ?;", [issnl]))
+ out["known_issnl"] = self.issn_db.issn2issnl(issnl) == issnl
+ if not out["known_issnl"]:
+ counts["unknown-issnl"] += 1
+ out["valid_issnl"] = stdnum.issn.is_valid(issnl)
+ if not out["valid_issnl"]:
+ counts["invalid-issnl"] += 1
+
+ fatcat_row = list(
+ self.db.execute(
+ "SELECT * FROM fatcat_container WHERE issnl = ?;", [issnl]
+ )
+ )
if fatcat_row:
frow = fatcat_row[0]
- out['fatcat_ident'] = frow['ident']
- for k in ('name', 'publisher', 'issne', 'issnp', 'wikidata_qid', 'lang', 'country', 'release_count', 'ia_count', 'ia_frac', 'kbart_count', 'kbart_frac', 'preserved_count', 'preserved_frac'):
+ out["fatcat_ident"] = frow["ident"]
+ for k in (
+ "name",
+ "publisher",
+ "issne",
+ "issnp",
+ "wikidata_qid",
+ "lang",
+ "country",
+ "release_count",
+ "ia_count",
+ "ia_frac",
+ "kbart_count",
+ "kbart_frac",
+ "preserved_count",
+ "preserved_frac",
+ ):
if not out.get(k) and frow[k] != None:
out[k] = frow[k]
cur = self.db.execute("SELECT * FROM directory WHERE issnl = ?;", [issnl])
for irow in cur:
- if irow['slug'] in ('crossref',):
- out['has_dois'] = True
+ if irow["slug"] in ("crossref",):
+ out["has_dois"] = True
# TODO: other DOI registrars (japan, datacite)
- if irow['slug'] == 'wikidata':
- out['wikidata_qid'] = irow['identifier']
- for k in ('name',):
+ if irow["slug"] == "wikidata":
+ out["wikidata_qid"] = irow["identifier"]
+ for k in ("name",):
if not out.get(k) and irow[k]:
out[k] = irow[k]
- if irow['extra']:
- extra = json.loads(irow['extra'])
- for k in ('country', 'lang', 'issne', 'issnp', 'publisher', 'platform'):
+ if irow["extra"]:
+ extra = json.loads(irow["extra"])
+ for k in (
+ "country",
+ "lang",
+ "issne",
+ "issnp",
+ "publisher",
+ "platform",
+ ):
if not out.get(k) and extra.get(k):
out[k] = extra[k]
- if irow['slug'] in ('doaj','road','szczepanski', 'gold_oa'):
- out['is_oa'] = True
- if irow['slug'] == 'ezb':
- ezb_extra = json.loads(irow['extra'])
- if ezb_extra['ezb_color'] == 'green':
- out['is_oa'] = True
- if irow['slug'] == 'sherpa_romeo':
- extra = json.loads(irow['extra'])
- out['sherpa_color'] = extra['sherpa_romeo']['color']
- if extra['sherpa_romeo']['color'] == 'green':
- out['is_oa'] = True
+ if irow["slug"] in ("doaj", "road", "szczepanski", "gold_oa"):
+ out["is_oa"] = True
+ if irow["slug"] == "ezb":
+ ezb_extra = json.loads(irow["extra"])
+ if ezb_extra["ezb_color"] == "green":
+ out["is_oa"] = True
+ if irow["slug"] == "sherpa_romeo":
+ extra = json.loads(irow["extra"])
+ out["sherpa_color"] = extra["sherpa_romeo"]["color"]
+ if extra["sherpa_romeo"]["color"] == "green":
+ out["is_oa"] = True
# filter out "NA" ISSNs
- for k in ('issne', 'issnp'):
- if out.get(k) and (len(out[k]) != 9 or out[k][4] != '-'):
+ for k in ("issne", "issnp"):
+ if out.get(k) and (len(out[k]) != 9 or out[k][4] != "-"):
out.pop(k)
cur = self.db.execute("SELECT * FROM homepage WHERE issnl = ?;", [issnl])
for hrow in cur:
- out['any_homepage'] = True
- if hrow['terminal_status_code'] == 200 and hrow['host'] != 'web.archive.org':
- out['any_live_homepage'] = True
- if hrow['gwb_url_success_dt'] or hrow['gwb_terminal_url_success_dt']:
- out['any_gwb_homepage'] = True
-
- if out.get('wikidata_qid'):
- assert out['wikidata_qid'].startswith('Q')
- assert out['wikidata_qid'][1].isdigit()
- assert out['wikidata_qid'][-1].isdigit()
+ out["any_homepage"] = True
+ if (
+ hrow["terminal_status_code"] == 200
+ and hrow["host"] != "web.archive.org"
+ ):
+ out["any_live_homepage"] = True
+ if hrow["gwb_url_success_dt"] or hrow["gwb_terminal_url_success_dt"]:
+ out["any_gwb_homepage"] = True
+
+ if out.get("wikidata_qid"):
+ assert out["wikidata_qid"].startswith("Q")
+ assert out["wikidata_qid"][1].isdigit()
+ assert out["wikidata_qid"][-1].isdigit()
# define publisher types
- publisher = out.get('publisher')
- pl = out.get('publisher', '').lower().strip()
- if out.get('platform') == 'scielo':
- out['publisher_type'] = 'scielo'
- elif publisher in BIG5_PUBLISHERS or 'elsevier' in pl or 'springer' in pl or 'wiley' in pl:
- out['publisher_type'] = 'big5'
+ publisher = out.get("publisher")
+ pl = out.get("publisher", "").lower().strip()
+ if out.get("platform") == "scielo":
+ out["publisher_type"] = "scielo"
+ elif (
+ publisher in BIG5_PUBLISHERS
+ or "elsevier" in pl
+ or "springer" in pl
+ or "wiley" in pl
+ ):
+ out["publisher_type"] = "big5"
elif publisher in OA_PUBLISHERS:
- out['publisher_type'] = 'oa'
- elif publisher in COMMERCIAL_PUBLISHERS or 'wolters kluwer' in pl or 'wolters-kluwer' in pl:
- out['publisher_type'] = 'commercial'
+ out["publisher_type"] = "oa"
+ elif (
+ publisher in COMMERCIAL_PUBLISHERS
+ or "wolters kluwer" in pl
+ or "wolters-kluwer" in pl
+ ):
+ out["publisher_type"] = "commercial"
elif publisher in ARCHIVE_PUBLISHERS:
- out['publisher_type'] = 'archive'
+ out["publisher_type"] = "archive"
elif publisher in REPOSITORY_PUBLISHERS:
- out['publisher_type'] = 'repository'
+ out["publisher_type"] = "repository"
elif publisher in OTHER_PUBLISHERS:
- out['publisher_type'] = 'other'
- elif publisher in SOCIETY_PUBLISHERS or 'society' in pl or 'association' in pl or 'academy of ' in pl or 'institute of' in pl:
- out['publisher_type'] = 'society'
- elif publisher in UNI_PRESS_PUBLISHERS or 'university ' in pl:
- out['publisher_type'] = 'unipress'
- elif 'scielo' in pl:
- out['publisher_type'] = 'scielo'
- elif out.get('is_oa') and (not out.get('has_dois') or out.get('lang') not in (None, 'en', 'de', 'fr', 'ja') or out.get('country') not in (None, 'us', 'gb', 'nl', 'cn', 'jp', 'de')):
+ out["publisher_type"] = "other"
+ elif (
+ publisher in SOCIETY_PUBLISHERS
+ or "society" in pl
+ or "association" in pl
+ or "academy of " in pl
+ or "institute of" in pl
+ ):
+ out["publisher_type"] = "society"
+ elif publisher in UNI_PRESS_PUBLISHERS or "university " in pl:
+ out["publisher_type"] = "unipress"
+ elif "scielo" in pl:
+ out["publisher_type"] = "scielo"
+ elif out.get("is_oa") and (
+ not out.get("has_dois")
+ or out.get("lang") not in (None, "en", "de", "fr", "ja")
+ or out.get("country") not in (None, "us", "gb", "nl", "cn", "jp", "de")
+ ):
# current informal definition of longtail
- out['publisher_type'] = 'longtail'
- out['is_longtail'] = True
-
- cur.execute("INSERT OR REPLACE INTO journal (issnl, issne, issnp, wikidata_qid, fatcat_ident, name, publisher, country, lang, is_oa, sherpa_color, is_longtail, is_active, publisher_type, has_dois, any_homepage, any_live_homepage, any_gwb_homepage, known_issnl, valid_issnl, release_count, ia_count, ia_frac, kbart_count, kbart_frac, preserved_count, preserved_frac) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
- (issnl,
- out.get('issne'),
- out.get('issnp'),
- out.get('wikidata_qid'),
- out.get('fatcat_ident'),
- out.get('name'),
- out.get('publisher'),
- out.get('country'),
- out.get('lang'),
- out.get('is_oa', False),
- out.get('sherpa_color'),
- out.get('is_longtail', False),
- out.get('is_active'),
- out.get('publisher_type'),
- out.get('has_dois', False),
- out.get('any_homepage', False),
- out.get('any_live_homepage', False),
- out.get('any_gwb_homepage', False),
- out.get('known_issnl'),
- out.get('valid_issnl'),
-
- out.get('release_count'),
- out.get('ia_count'),
- out.get('ia_frac'),
- out.get('kbart_count'),
- out.get('kbart_frac'),
- out.get('preserved_count'),
- out.get('preserved_frac'),
- ))
+ out["publisher_type"] = "longtail"
+ out["is_longtail"] = True
+
+ cur.execute(
+ "INSERT OR REPLACE INTO journal (issnl, issne, issnp, wikidata_qid, fatcat_ident, name, publisher, country, lang, is_oa, sherpa_color, is_longtail, is_active, publisher_type, has_dois, any_homepage, any_live_homepage, any_gwb_homepage, known_issnl, valid_issnl, release_count, ia_count, ia_frac, kbart_count, kbart_frac, preserved_count, preserved_frac) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
+ (
+ issnl,
+ out.get("issne"),
+ out.get("issnp"),
+ out.get("wikidata_qid"),
+ out.get("fatcat_ident"),
+ out.get("name"),
+ out.get("publisher"),
+ out.get("country"),
+ out.get("lang"),
+ out.get("is_oa", False),
+ out.get("sherpa_color"),
+ out.get("is_longtail", False),
+ out.get("is_active"),
+ out.get("publisher_type"),
+ out.get("has_dois", False),
+ out.get("any_homepage", False),
+ out.get("any_live_homepage", False),
+ out.get("any_gwb_homepage", False),
+ out.get("known_issnl"),
+ out.get("valid_issnl"),
+ out.get("release_count"),
+ out.get("ia_count"),
+ out.get("ia_frac"),
+ out.get("kbart_count"),
+ out.get("kbart_frac"),
+ out.get("preserved_count"),
+ out.get("preserved_frac"),
+ ),
+ )
cur.close()
self.db.commit()
return counts
@@ -534,125 +621,146 @@ class ChoculaDatabase():
for idx, col in enumerate(cursor.description):
d[col[0]] = row[idx]
return d
+
counts: Counter = Counter()
self.db.row_factory = dict_factory
cur = self.db.cursor()
- for row in cur.execute('SELECT * FROM journal'):
+ for row in cur.execute("SELECT * FROM journal"):
print(json.dumps(row))
- counts['total'] += 1
+ counts["total"] += 1
return counts
def export_fatcat(self):
counts: Counter = Counter()
self.db.row_factory = sqlite3.Row
cur = self.db.cursor()
- for row in cur.execute('SELECT * FROM journal WHERE valid_issnl = 1'):
- counts['total'] += 1
+ for row in cur.execute("SELECT * FROM journal WHERE valid_issnl = 1"):
+ counts["total"] += 1
- name = row['name']
+ name = row["name"]
if name:
name = name.strip()
- if not row['name']:
- counts['empty-name'] += 1
+ if not row["name"]:
+ counts["empty-name"] += 1
continue
if len(name) <= 2:
- counts['short-name'] += 1
+ counts["short-name"] += 1
continue
- publisher = row['publisher']
+ publisher = row["publisher"]
if publisher:
publisher = publisher.strip() or None
out = dict(
- issnl=row['issnl'],
- wikidata_qid=row['wikidata_qid'],
- ident=row['fatcat_ident'],
+ issnl=row["issnl"],
+ wikidata_qid=row["wikidata_qid"],
+ ident=row["fatcat_ident"],
publisher=publisher,
name=name,
- _known_issnl=row['known_issnl'])
+ _known_issnl=row["known_issnl"],
+ )
extra = dict(
- issnp=row['issnp'],
- issne=row['issne'],
- country=row['country'],
+ issnp=row["issnp"], issne=row["issne"], country=row["country"],
)
- if row['lang']:
- extra['languages'] = [row['lang'],]
- if row['sherpa_color']:
- extra['sherpa_romeo'] = dict(color=row['sherpa_color'])
+ if row["lang"]:
+ extra["languages"] = [
+ row["lang"],
+ ]
+ if row["sherpa_color"]:
+ extra["sherpa_romeo"] = dict(color=row["sherpa_color"])
urls = []
webarchive_urls = []
- cur = self.db.execute("SELECT * FROM homepage WHERE issnl = ?;", [row['issnl']])
+ cur = self.db.execute(
+ "SELECT * FROM homepage WHERE issnl = ?;", [row["issnl"]]
+ )
for hrow in cur:
- if '://doaj.org/' in hrow['url'] or '://www.doaj.org/' in hrow['url']:
+ if "://doaj.org/" in hrow["url"] or "://www.doaj.org/" in hrow["url"]:
continue
- if '://www.ncbi.nlm.nih.gov/' in hrow['url']:
+ if "://www.ncbi.nlm.nih.gov/" in hrow["url"]:
continue
- if 'web.archive.org/web' in hrow['url']:
- webarchive_urls.append(hrow['url'])
- urls.append(hrow['url'])
+ if "web.archive.org/web" in hrow["url"]:
+ webarchive_urls.append(hrow["url"])
+ urls.append(hrow["url"])
continue
- if hrow['host'] in ('www.google.com', 'books.google.com'):
+ if hrow["host"] in ("www.google.com", "books.google.com"):
# individual books or google searches, not journal/conference homepages
continue
- if '/oai/request' in hrow['url']:
+ if "/oai/request" in hrow["url"]:
# OAI-PMH endpoints, not homepages
continue
- if not row['any_live_homepage'] and hrow['gwb_url_success_dt'] and hrow['gwb_url_success_dt'] != 'error':
- webarchive_urls.append("https://web.archive.org/web/{}/{}".format(hrow['gwb_url_success_dt'], hrow['url']))
+ if (
+ not row["any_live_homepage"]
+ and hrow["gwb_url_success_dt"]
+ and hrow["gwb_url_success_dt"] != "error"
+ ):
+ webarchive_urls.append(
+ "https://web.archive.org/web/{}/{}".format(
+ hrow["gwb_url_success_dt"], hrow["url"]
+ )
+ )
continue
- if hrow['blocked']:
- urls.append(hrow['url'])
+ if hrow["blocked"]:
+ urls.append(hrow["url"])
continue
- if hrow['terminal_status_code'] == 200:
- if hrow['terminal_url'] == hrow['url'].replace('http://', 'https://') or hrow['terminal_url'] == hrow['url'] + "/":
+ if hrow["terminal_status_code"] == 200:
+ if (
+ hrow["terminal_url"]
+ == hrow["url"].replace("http://", "https://")
+ or hrow["terminal_url"] == hrow["url"] + "/"
+ ):
# check for trivial redirects; use post-redirect URL in those cases
- urls.append(hrow['terminal_url'])
+ urls.append(hrow["terminal_url"])
else:
- urls.append(hrow['url'])
+ urls.append(hrow["url"])
continue
# didn't even crawl and no match? add anyways as a pass-through
- if not hrow['status_code']:
- urls.append(hrow['url'])
+ if not hrow["status_code"]:
+ urls.append(hrow["url"])
continue
- extra['webarchive_urls'] = webarchive_urls
- extra['urls'] = urls
+ extra["webarchive_urls"] = webarchive_urls
+ extra["urls"] = urls
- cur = self.db.execute("SELECT * FROM directory WHERE issnl = ?;", [row['issnl']])
+ cur = self.db.execute(
+ "SELECT * FROM directory WHERE issnl = ?;", [row["issnl"]]
+ )
for drow in cur:
- if drow['slug'] == 'ezb':
- ezb = json.loads(drow['extra'])
- extra['ezb'] = dict(ezb_id=drow['identifier'], color=ezb['ezb_color'])
- elif drow['slug'] == 'szczepanski':
- extra['szczepanski'] = drow['extra']
- elif drow['slug'] == 'doaj':
- extra['doaj'] = json.loads(drow['extra'])
- elif drow['slug'] == 'scielo':
- extra['scielo'] = json.loads(drow['extra'])
- elif drow['slug'] == 'sim':
- extra['ia'] = extra.get('ia', {})
- extra['ia']['sim'] = json.loads(drow['extra'])
- extra['ia']['sim']['sim_pubid'] = drow['identifier']
- elif drow['slug'] in ('lockss', 'clockss', 'portico', 'jstor'):
- extra['kbart'] = extra.get('kbart', {})
- extra['kbart'][drow['slug']] = json.loads(drow['extra'])
-
- out['extra'] = extra
+ if drow["slug"] == "ezb":
+ ezb = json.loads(drow["extra"])
+ extra["ezb"] = dict(
+ ezb_id=drow["identifier"], color=ezb["ezb_color"]
+ )
+ elif drow["slug"] == "szczepanski":
+ extra["szczepanski"] = drow["extra"]
+ elif drow["slug"] == "doaj":
+ extra["doaj"] = json.loads(drow["extra"])
+ elif drow["slug"] == "scielo":
+ extra["scielo"] = json.loads(drow["extra"])
+ elif drow["slug"] == "sim":
+ extra["ia"] = extra.get("ia", {})
+ extra["ia"]["sim"] = json.loads(drow["extra"])
+ extra["ia"]["sim"]["sim_pubid"] = drow["identifier"]
+ elif drow["slug"] in ("lockss", "clockss", "portico", "jstor"):
+ extra["kbart"] = extra.get("kbart", {})
+ extra["kbart"][drow["slug"]] = json.loads(drow["extra"])
+
+ out["extra"] = extra
print(json.dumps(out))
return counts
def init_db(self):
print("### Creating Database...", file=sys.stderr)
- self.db.executescript("""
+ self.db.executescript(
+ """
PRAGMA main.page_size = 4096;
PRAGMA main.cache_size = 20000;
PRAGMA main.locking_mode = EXCLUSIVE;
PRAGMA main.synchronous = OFF;
- """)
- with open('chocula_schema.sql', 'r') as fschema:
+ """
+ )
+ with open("chocula_schema.sql", "r") as fschema:
self.db.executescript(fschema.read())
print("Done!", file=sys.stderr)
-
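
For orientation, the changes above are consistent with a mechanical reformat (double quotes, trailing commas, wrapped call arguments) rather than behavior changes. Below is a minimal usage sketch of the touched classes, pieced together from the docstrings visible in this diff; the IssnDatabase constructor argument and the map-file path are assumptions, since this commit does not show IssnDatabase.__init__:

    from chocula.database import HomepageUrl, ChoculaDatabase, IssnDatabase

    # HomepageUrl.from_url() normalizes messy URLs and returns None for junk,
    # as exercised by test_from_url() in the first hunk.
    hp = HomepageUrl.from_url("Http://thing.com///")
    assert hp is not None and hp.url == "http://thing.com/"
    assert HomepageUrl.from_url("mailto:bnewbold@bogus.com") is None

    # Per the ChoculaDatabase docstring, pass ":memory:" as db_file for a
    # temporary (in-memory) sqlite3 database. The issn_db argument matches
    # `self.issn_db = issn_db` in __init__; the IssnDatabase constructor
    # signature and the map-file path here are hypothetical.
    issn_db = IssnDatabase("ISSN-to-ISSN-L.txt")
    db = ChoculaDatabase(db_file=":memory:", issn_db=issn_db)
    db.init_db()  # sets PRAGMAs, then loads tables from chocula_schema.sql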