Diffstat (limited to 'chocula/database.py')
-rw-r--r-- | chocula/database.py | 604
1 file changed, 356 insertions, 248 deletions
diff --git a/chocula/database.py b/chocula/database.py
index f620515..11632b9 100644
--- a/chocula/database.py
+++ b/chocula/database.py
@@ -1,4 +1,3 @@
-
 from __future__ import annotations
 
 import sys
@@ -47,41 +46,49 @@ class HomepageUrl:
         """
         Returns None if url is really bad (not a URL).
         """
-        if not url or 'mailto:' in url.lower() or url.lower() in ('http://n/a', 'http://na/', 'http://na'):
+        if (
+            not url
+            or "mailto:" in url.lower()
+            or url.lower() in ("http://n/a", "http://na/", "http://na")
+        ):
             return None
-        if url.startswith('www.'):
+        if url.startswith("www."):
             url = "http://" + url
-        if url.startswith('ttp://') or url.startswith('ttps://'):
+        if url.startswith("ttp://") or url.startswith("ttps://"):
             url = "h" + url
-        url.replace('Http://', 'http://')
+        url.replace("Http://", "http://")
 
         url = str(urlcanon.semantic_precise(url))
-        if url == 'http://na/':
+        if url == "http://na/":
             # sort of redundant with above, but some only match after canonicalization
             return None
         url_surt = surt.surt(url)
         tld = tldextract.extract(url)
-        host = '.'.join(tld)
-        if host.startswith('.'):
+        host = ".".join(tld)
+        if host.startswith("."):
             host = host[1:]
-        return HomepageUrl(url=url,
-                           surt=url_surt,
-                           host=host,
-                           domain=tld.registered_domain,
-                           suffix=tld.suffix)
+        return HomepageUrl(
+            url=url,
+            surt=url_surt,
+            host=host,
+            domain=tld.registered_domain,
+            suffix=tld.suffix,
+        )
+
 
 def test_from_url():
-
-    assert HomepageUrl.from_url("http://thing.core.ac.uk").domain == 'core.ac.uk'
-    assert HomepageUrl.from_url("http://thing.core.ac.uk").host == 'thing.core.ac.uk'
-    assert HomepageUrl.from_url("http://thing.core.ac.uk").suffix== 'ac.uk'
-    assert HomepageUrl.from_url("google.com").suffix == 'com'
-    assert HomepageUrl.from_url("google.com").host == 'google.com'
+    assert HomepageUrl.from_url("http://thing.core.ac.uk").domain == "core.ac.uk"
+    assert HomepageUrl.from_url("http://thing.core.ac.uk").host == "thing.core.ac.uk"
+    assert HomepageUrl.from_url("http://thing.core.ac.uk").suffix == "ac.uk"
+
+    assert HomepageUrl.from_url("google.com").suffix == "com"
+    assert HomepageUrl.from_url("google.com").host == "google.com"
 
     assert HomepageUrl.from_url("mailto:bnewbold@bogus.com") == None
-    assert HomepageUrl.from_url("thing.com").url == 'http://thing.com/'
-    assert HomepageUrl.from_url("Http://thing.com///").url == 'http://thing.com/'
+    assert HomepageUrl.from_url("thing.com").url == "http://thing.com/"
+    assert HomepageUrl.from_url("Http://thing.com///").url == "http://thing.com/"
+
 
 @dataclass
 class UrlCrawlStatus:
@@ -95,6 +102,7 @@ class UrlCrawlStatus:
     gwb_url_success_dt: Optional[str]
     gwb_terminal_url_success_dt: Optional[str]
 
+
 @dataclass
 class DirectoryInfo:
     directory_slug: str
@@ -127,10 +135,19 @@ class DirectoryInfo:
         """
         if not self.issnl:
             raise ValueError
-        extra_dict = self.extra
-
-        for k in ('issne', 'issnp', 'name', 'publisher', 'abbrev', 'platform',
-                  'country', 'langs', 'original_name'):
+        extra_dict = self.extra
+
+        for k in (
+            "issne",
+            "issnp",
+            "name",
+            "publisher",
+            "abbrev",
+            "platform",
+            "country",
+            "langs",
+            "original_name",
+        ):
             if self.__dict__[k]:
                 extra_dict[k] = self.__dict__[k]
 
@@ -151,7 +168,7 @@ class DirectoryInfo:
         raise NotImplementedError()
 
 
-class IssnDatabase():
+class IssnDatabase:
     """
     Holds complete ISSN/ISSN-L table and helps with lookups and munging of
     raw ISSN strings
@@ -163,7 +180,7 @@ class IssnDatabase():
 
     def read_issn_map_file(self, issn_map_path: str):
         print("##### Loading ISSN-L map file...", file=sys.stderr)
-        with open(issn_map_path, 'r') as issn_map_file:
+        with open(issn_map_path, "r") as issn_map_file:
             for line in issn_map_file:
                 if line.startswith("ISSN") or len(line) == 0:
                     continue
@@ -209,7 +226,7 @@ class IssnDatabase():
         return info
 
 
-class ChoculaDatabase():
+class ChoculaDatabase:
     """
     Wraps a sqlite3 database
     """
@@ -218,7 +235,7 @@ class ChoculaDatabase():
         """
         To create a temporary database, pass ":memory:" as db_file
         """
-        self.db = sqlite3.connect(db_file, isolation_level='EXCLUSIVE')
+        self.db = sqlite3.connect(db_file, isolation_level="EXCLUSIVE")
         self.data = dict()
         self.issn_db = issn_db
 
@@ -247,8 +264,7 @@ class ChoculaDatabase():
 
         cur = self.db.cursor()
         try:
-            cur.execute("INSERT INTO directory VALUES (?,?,?,?,?)",
-                info.to_db_tuple())
+            cur.execute("INSERT INTO directory VALUES (?,?,?,?,?)", info.to_db_tuple())
         except sqlite3.IntegrityError as ie:
             if str(ie).startswith("UNIQUE"):
                 return "duplicate"
@@ -264,7 +280,8 @@ class ChoculaDatabase():
         try:
             cur.execute(
                 "INSERT OR REPLACE INTO homepage (issnl, surt, url, host, domain, suffix) VALUES (?,?,?,?,?,?)",
-                homepage.to_db_tuple(issnl))
+                homepage.to_db_tuple(issnl),
+            )
         except sqlite3.IntegrityError as ie:
             if str(ie).startswith("UNIQUE"):
                 return "duplicate"
@@ -276,29 +293,33 @@ class ChoculaDatabase():
         print("##### Loading IA Homepage Crawl Results...")
         counts: Counter = Counter()
         cur = self.db.cursor()
-        for line in open(config.homepage_status.filepath, 'r'):
+        for line in open(config.homepage_status.filepath, "r"):
             if not line.strip():
                 continue
             row = json.loads(line)
-            counts['total'] += 1
-            url = row['url']
-            assert(url)
-            if row.get('gwb_url_success_dt') == 'error':
-                row['gwb_url_success_dt'] = None
-            if row.get('gwb_terminal_url_success_dt') == 'error':
-                row['gwb_terminal_url_success_dt'] = None
-            cur.execute("UPDATE homepage SET status_code=?, crawl_error=?, terminal_url=?, terminal_status_code=?, platform_software=?, issnl_in_body=?, blocked=?, gwb_url_success_dt=?, gwb_terminal_url_success_dt=? WHERE url=?",
-                (row['status_code'],
-                 row.get('crawl_error'),
-                 row.get('terminal_url'),
-                 row.get('terminal_status_code'),
-                 row.get('platform_software'),
-                 row.get('issnl_in_body'),
-                 row.get('blocked'),
-                 row.get('gwb_url_success_dt'),
-                 row.get('gwb_terminal_url_success_dt'),
-                 url))
-            counts['updated'] += 1
+            counts["total"] += 1
+            url = row["url"]
+            assert url
+            if row.get("gwb_url_success_dt") == "error":
+                row["gwb_url_success_dt"] = None
+            if row.get("gwb_terminal_url_success_dt") == "error":
+                row["gwb_terminal_url_success_dt"] = None
+            cur.execute(
+                "UPDATE homepage SET status_code=?, crawl_error=?, terminal_url=?, terminal_status_code=?, platform_software=?, issnl_in_body=?, blocked=?, gwb_url_success_dt=?, gwb_terminal_url_success_dt=? WHERE url=?",
+                (
+                    row["status_code"],
+                    row.get("crawl_error"),
+                    row.get("terminal_url"),
+                    row.get("terminal_status_code"),
+                    row.get("platform_software"),
+                    row.get("issnl_in_body"),
+                    row.get("blocked"),
+                    row.get("gwb_url_success_dt"),
+                    row.get("gwb_terminal_url_success_dt"),
+                    url,
+                ),
+            )
+            counts["updated"] += 1
         cur.close()
         self.db.commit()
         return counts
@@ -306,51 +327,54 @@ class ChoculaDatabase():
     def load_fatcat_containers(self, config: ChoculaConfig) -> Counter:
         print("##### Loading Fatcat Container Entities...")
        # JSON
-        json_file = open(config.fatcat_containers.filepath, 'r')
+        json_file = open(config.fatcat_containers.filepath, "r")
         counts: Counter = Counter()
         cur = self.db.cursor()
         for line in json_file:
             if not line:
                 continue
             row = json.loads(line)
-            if row['state'] != 'active':
+            if row["state"] != "active":
                 continue
-            counts['total'] += 1
-            extra = row.get('extra', dict())
-            issne = extra.get('issne')
-            issnp = extra.get('issnp')
-            country = extra.get('country')
-            languages = extra.get('languages', [])
+            counts["total"] += 1
+            extra = row.get("extra", dict())
+            issne = extra.get("issne")
+            issnp = extra.get("issnp")
+            country = extra.get("country")
+            languages = extra.get("languages", [])
             lang = None
             if languages:
                 lang = languages[0]
             try:
-                cur.execute("INSERT OR REPLACE INTO fatcat_container (issnl, ident, revision, issne, issnp, wikidata_qid, name, container_type, publisher, country, lang) VALUES (?,?,?,?,?,?,?,?,?,?,?)",
-                    (row.get('issnl'),
-                     row['ident'],
-                     row['revision'],
-                     issne,
-                     issnp,
-                     row.get('wikidata_qid'),
-                     row['name'],
-                     row.get('container_type'),
-                     extra.get('publisher'),
-                     country,
-                     lang,
-                    ))
+                cur.execute(
+                    "INSERT OR REPLACE INTO fatcat_container (issnl, ident, revision, issne, issnp, wikidata_qid, name, container_type, publisher, country, lang) VALUES (?,?,?,?,?,?,?,?,?,?,?)",
+                    (
+                        row.get("issnl"),
+                        row["ident"],
+                        row["revision"],
+                        issne,
+                        issnp,
+                        row.get("wikidata_qid"),
+                        row["name"],
+                        row.get("container_type"),
+                        extra.get("publisher"),
+                        country,
+                        lang,
+                    ),
+                )
             except sqlite3.IntegrityError as ie:
                 if str(ie).startswith("UNIQUE"):
                     counts["existing"] += 1
                     continue
                 else:
                     raise ie
-            counts['inserted'] += 1
-            if row.get('issnl'):
-                urls = extra.get('urls', [])
+            counts["inserted"] += 1
+            if row.get("issnl"):
+                urls = extra.get("urls", [])
                 for url in urls:
                     homepage = HomepageUrl.from_url(url)
                     if homepage:
-                        self.insert_homepage(row.get('issnl'), homepage, cur)
+                        self.insert_homepage(row.get("issnl"), homepage, cur)
         cur.close()
         self.db.commit()
         return counts
@@ -358,22 +382,31 @@ class ChoculaDatabase():
     def load_fatcat_stats(self, config: ChoculaConfig) -> Counter:
         print("##### Loading Fatcat Container Stats...")
         # JSON
-        json_file = open(config.fatcat_stats.filepath, 'r')
+        json_file = open(config.fatcat_stats.filepath, "r")
         counts: Counter = Counter()
         cur = self.db.cursor()
         for line in json_file:
             if not line:
                 continue
             row = json.loads(line)
-            total = int(row['total'])
+            total = int(row["total"])
             ia_frac: Optional[float] = None
             preserved_frac: Optional[float] = None
             if total > 0:
-                ia_frac = float(row['in_web'])/total
-                preserved_frac = float(row['is_preserved'])/total
-            cur.execute("UPDATE fatcat_container SET release_count = ?, ia_count = ?, ia_frac = ?, preserved_count = ?, preserved_frac = ? WHERE issnl = ?",
-                (total, row['in_web'], ia_frac, row['is_preserved'], preserved_frac, row['issnl']))
-            counts['updated'] += 1
+                ia_frac = float(row["in_web"]) / total
+                preserved_frac = float(row["is_preserved"]) / total
+            cur.execute(
+                "UPDATE fatcat_container SET release_count = ?, ia_count = ?, ia_frac = ?, preserved_count = ?, preserved_frac = ? WHERE issnl = ?",
+                (
+                    total,
+                    row["in_web"],
+                    ia_frac,
+                    row["is_preserved"],
+                    preserved_frac,
+                    row["issnl"],
+                ),
+            )
+            counts["updated"] += 1
         cur.close()
         self.db.commit()
         return counts
@@ -384,10 +417,10 @@ class ChoculaDatabase():
         self.db.row_factory = sqlite3.Row
         cur = self.db.execute("SELECT issnl, url FROM homepage;")
         for hrow in cur:
-            assert(hrow['url'])
-            assert(len(hrow['url'].split()) == 1)
-            counts['total'] += 1
-            print('\t'.join((hrow['issnl'], hrow['url'])))
+            assert hrow["url"]
+            assert len(hrow["url"].split()) == 1
+            counts["total"] += 1
+            print("\t".join((hrow["issnl"], hrow["url"])))
         return counts
 
     def summarize(self) -> Counter:
@@ -395,135 +428,189 @@ class ChoculaDatabase():
         counts: Counter = Counter()
         cur = self.db.cursor()
         self.db.row_factory = sqlite3.Row
-        index_issnls = list(cur.execute('SELECT DISTINCT issnl FROM directory'))
-        fatcat_issnls = list(cur.execute('SELECT DISTINCT issnl FROM fatcat_container WHERE issnl IS NOT null'))
+        index_issnls = list(cur.execute("SELECT DISTINCT issnl FROM directory"))
+        fatcat_issnls = list(
+            cur.execute(
+                "SELECT DISTINCT issnl FROM fatcat_container WHERE issnl IS NOT null"
+            )
+        )
         all_issnls = set([i[0] for i in index_issnls + fatcat_issnls])
         print("{} total ISSN-Ls".format(len(all_issnls)))
         for issnl in all_issnls:
-            #print(issnl)
-            counts['total'] += 1
+            # print(issnl)
+            counts["total"] += 1
 
             out = dict()
 
             # check if ISSN-L is good. this is here because of fatcat import
-            out['known_issnl'] = (self.issn_db.issn2issnl(issnl) == issnl)
-            if not out['known_issnl']:
-                counts['unknown-issnl'] += 1
-            out['valid_issnl'] = stdnum.issn.is_valid(issnl)
-            if not out['valid_issnl']:
-                counts['invalid-issnl'] += 1
-
-            fatcat_row = list(self.db.execute("SELECT * FROM fatcat_container WHERE issnl = ?;", [issnl]))
+            out["known_issnl"] = self.issn_db.issn2issnl(issnl) == issnl
+            if not out["known_issnl"]:
+                counts["unknown-issnl"] += 1
+            out["valid_issnl"] = stdnum.issn.is_valid(issnl)
+            if not out["valid_issnl"]:
+                counts["invalid-issnl"] += 1
+
+            fatcat_row = list(
+                self.db.execute(
+                    "SELECT * FROM fatcat_container WHERE issnl = ?;", [issnl]
+                )
+            )
             if fatcat_row:
                 frow = fatcat_row[0]
-                out['fatcat_ident'] = frow['ident']
-                for k in ('name', 'publisher', 'issne', 'issnp', 'wikidata_qid', 'lang', 'country', 'release_count', 'ia_count', 'ia_frac', 'kbart_count', 'kbart_frac', 'preserved_count', 'preserved_frac'):
+                out["fatcat_ident"] = frow["ident"]
+                for k in (
+                    "name",
+                    "publisher",
+                    "issne",
+                    "issnp",
+                    "wikidata_qid",
+                    "lang",
+                    "country",
+                    "release_count",
+                    "ia_count",
+                    "ia_frac",
+                    "kbart_count",
+                    "kbart_frac",
+                    "preserved_count",
+                    "preserved_frac",
+                ):
                     if not out.get(k) and frow[k] != None:
                         out[k] = frow[k]
 
             cur = self.db.execute("SELECT * FROM directory WHERE issnl = ?;", [issnl])
             for irow in cur:
-                if irow['slug'] in ('crossref',):
-                    out['has_dois'] = True
+                if irow["slug"] in ("crossref",):
+                    out["has_dois"] = True
                 # TODO: other DOI registrars (japan, datacite)
-                if irow['slug'] == 'wikidata':
-                    out['wikidata_qid'] = irow['identifier']
-                for k in ('name',):
+                if irow["slug"] == "wikidata":
+                    out["wikidata_qid"] = irow["identifier"]
+                for k in ("name",):
                     if not out.get(k) and irow[k]:
                         out[k] = irow[k]
-                if irow['extra']:
-                    extra = json.loads(irow['extra'])
-                    for k in ('country', 'lang', 'issne', 'issnp', 'publisher', 'platform'):
+                if irow["extra"]:
+                    extra = json.loads(irow["extra"])
+                    for k in (
+                        "country",
+                        "lang",
+                        "issne",
+                        "issnp",
+                        "publisher",
+                        "platform",
+                    ):
                         if not out.get(k) and extra.get(k):
                             out[k] = extra[k]
-                if irow['slug'] in ('doaj','road','szczepanski', 'gold_oa'):
-                    out['is_oa'] = True
-                if irow['slug'] == 'ezb':
-                    ezb_extra = json.loads(irow['extra'])
-                    if ezb_extra['ezb_color'] == 'green':
-                        out['is_oa'] = True
-                if irow['slug'] == 'sherpa_romeo':
-                    extra = json.loads(irow['extra'])
-                    out['sherpa_color'] = extra['sherpa_romeo']['color']
-                    if extra['sherpa_romeo']['color'] == 'green':
-                        out['is_oa'] = True
+                if irow["slug"] in ("doaj", "road", "szczepanski", "gold_oa"):
+                    out["is_oa"] = True
+                if irow["slug"] == "ezb":
+                    ezb_extra = json.loads(irow["extra"])
+                    if ezb_extra["ezb_color"] == "green":
+                        out["is_oa"] = True
+                if irow["slug"] == "sherpa_romeo":
+                    extra = json.loads(irow["extra"])
+                    out["sherpa_color"] = extra["sherpa_romeo"]["color"]
+                    if extra["sherpa_romeo"]["color"] == "green":
+                        out["is_oa"] = True
 
             # filter out "NA" ISSNs
-            for k in ('issne', 'issnp'):
-                if out.get(k) and (len(out[k]) != 9 or out[k][4] != '-'):
+            for k in ("issne", "issnp"):
+                if out.get(k) and (len(out[k]) != 9 or out[k][4] != "-"):
                     out.pop(k)
 
             cur = self.db.execute("SELECT * FROM homepage WHERE issnl = ?;", [issnl])
             for hrow in cur:
-                out['any_homepage'] = True
-                if hrow['terminal_status_code'] == 200 and hrow['host'] != 'web.archive.org':
-                    out['any_live_homepage'] = True
-                if hrow['gwb_url_success_dt'] or hrow['gwb_terminal_url_success_dt']:
-                    out['any_gwb_homepage'] = True
-
-            if out.get('wikidata_qid'):
-                assert out['wikidata_qid'].startswith('Q')
-                assert out['wikidata_qid'][1].isdigit()
-                assert out['wikidata_qid'][-1].isdigit()
+                out["any_homepage"] = True
+                if (
+                    hrow["terminal_status_code"] == 200
+                    and hrow["host"] != "web.archive.org"
+                ):
+                    out["any_live_homepage"] = True
+                if hrow["gwb_url_success_dt"] or hrow["gwb_terminal_url_success_dt"]:
+                    out["any_gwb_homepage"] = True
+
+            if out.get("wikidata_qid"):
+                assert out["wikidata_qid"].startswith("Q")
+                assert out["wikidata_qid"][1].isdigit()
+                assert out["wikidata_qid"][-1].isdigit()
 
             # define publisher types
-            publisher = out.get('publisher')
-            pl = out.get('publisher', '').lower().strip()
-            if out.get('platform') == 'scielo':
-                out['publisher_type'] = 'scielo'
-            elif publisher in BIG5_PUBLISHERS or 'elsevier' in pl or 'springer' in pl or 'wiley' in pl:
-                out['publisher_type'] = 'big5'
+            publisher = out.get("publisher")
+            pl = out.get("publisher", "").lower().strip()
+            if out.get("platform") == "scielo":
+                out["publisher_type"] = "scielo"
+            elif (
+                publisher in BIG5_PUBLISHERS
+                or "elsevier" in pl
+                or "springer" in pl
+                or "wiley" in pl
+            ):
+                out["publisher_type"] = "big5"
             elif publisher in OA_PUBLISHERS:
-                out['publisher_type'] = 'oa'
-            elif publisher in COMMERCIAL_PUBLISHERS or 'wolters kluwer' in pl or 'wolters-kluwer' in pl:
-                out['publisher_type'] = 'commercial'
+                out["publisher_type"] = "oa"
+            elif (
+                publisher in COMMERCIAL_PUBLISHERS
+                or "wolters kluwer" in pl
+                or "wolters-kluwer" in pl
+            ):
+                out["publisher_type"] = "commercial"
             elif publisher in ARCHIVE_PUBLISHERS:
-                out['publisher_type'] = 'archive'
+                out["publisher_type"] = "archive"
             elif publisher in REPOSITORY_PUBLISHERS:
-                out['publisher_type'] = 'repository'
+                out["publisher_type"] = "repository"
             elif publisher in OTHER_PUBLISHERS:
-                out['publisher_type'] = 'other'
-            elif publisher in SOCIETY_PUBLISHERS or 'society' in pl or 'association' in pl or 'academy of ' in pl or 'institute of' in pl:
-                out['publisher_type'] = 'society'
-            elif publisher in UNI_PRESS_PUBLISHERS or 'university ' in pl:
-                out['publisher_type'] = 'unipress'
-            elif 'scielo' in pl:
-                out['publisher_type'] = 'scielo'
-            elif out.get('is_oa') and (not out.get('has_dois') or out.get('lang') not in (None, 'en', 'de', 'fr', 'ja') or out.get('country') not in (None, 'us', 'gb', 'nl', 'cn', 'jp', 'de')):
+                out["publisher_type"] = "other"
+            elif (
+                publisher in SOCIETY_PUBLISHERS
+                or "society" in pl
+                or "association" in pl
+                or "academy of " in pl
+                or "institute of" in pl
+            ):
+                out["publisher_type"] = "society"
+            elif publisher in UNI_PRESS_PUBLISHERS or "university " in pl:
+                out["publisher_type"] = "unipress"
+            elif "scielo" in pl:
+                out["publisher_type"] = "scielo"
+            elif out.get("is_oa") and (
+                not out.get("has_dois")
+                or out.get("lang") not in (None, "en", "de", "fr", "ja")
+                or out.get("country") not in (None, "us", "gb", "nl", "cn", "jp", "de")
+            ):
                 # current informal definition of longtail
-                out['publisher_type'] = 'longtail'
-                out['is_longtail'] = True
-
-            cur.execute("INSERT OR REPLACE INTO journal (issnl, issne, issnp, wikidata_qid, fatcat_ident, name, publisher, country, lang, is_oa, sherpa_color, is_longtail, is_active, publisher_type, has_dois, any_homepage, any_live_homepage, any_gwb_homepage, known_issnl, valid_issnl, release_count, ia_count, ia_frac, kbart_count, kbart_frac, preserved_count, preserved_frac) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
-                (issnl,
-                 out.get('issne'),
-                 out.get('issnp'),
-                 out.get('wikidata_qid'),
-                 out.get('fatcat_ident'),
-                 out.get('name'),
-                 out.get('publisher'),
-                 out.get('country'),
-                 out.get('lang'),
-                 out.get('is_oa', False),
-                 out.get('sherpa_color'),
-                 out.get('is_longtail', False),
-                 out.get('is_active'),
-                 out.get('publisher_type'),
-                 out.get('has_dois', False),
-                 out.get('any_homepage', False),
-                 out.get('any_live_homepage', False),
-                 out.get('any_gwb_homepage', False),
-                 out.get('known_issnl'),
-                 out.get('valid_issnl'),
-
-                 out.get('release_count'),
-                 out.get('ia_count'),
-                 out.get('ia_frac'),
-                 out.get('kbart_count'),
-                 out.get('kbart_frac'),
-                 out.get('preserved_count'),
-                 out.get('preserved_frac'),
-                ))
+                out["publisher_type"] = "longtail"
+                out["is_longtail"] = True
+
+            cur.execute(
+                "INSERT OR REPLACE INTO journal (issnl, issne, issnp, wikidata_qid, fatcat_ident, name, publisher, country, lang, is_oa, sherpa_color, is_longtail, is_active, publisher_type, has_dois, any_homepage, any_live_homepage, any_gwb_homepage, known_issnl, valid_issnl, release_count, ia_count, ia_frac, kbart_count, kbart_frac, preserved_count, preserved_frac) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
+                (
+                    issnl,
+                    out.get("issne"),
+                    out.get("issnp"),
+                    out.get("wikidata_qid"),
+                    out.get("fatcat_ident"),
+                    out.get("name"),
+                    out.get("publisher"),
+                    out.get("country"),
+                    out.get("lang"),
+                    out.get("is_oa", False),
+                    out.get("sherpa_color"),
+                    out.get("is_longtail", False),
+                    out.get("is_active"),
+                    out.get("publisher_type"),
+                    out.get("has_dois", False),
+                    out.get("any_homepage", False),
+                    out.get("any_live_homepage", False),
+                    out.get("any_gwb_homepage", False),
+                    out.get("known_issnl"),
+                    out.get("valid_issnl"),
+                    out.get("release_count"),
+                    out.get("ia_count"),
+                    out.get("ia_frac"),
+                    out.get("kbart_count"),
+                    out.get("kbart_frac"),
                    out.get("preserved_count"),
+                    out.get("preserved_frac"),
+                ),
+            )
         cur.close()
         self.db.commit()
         return counts
@@ -534,125 +621,146 @@ class ChoculaDatabase():
             for idx, col in enumerate(cursor.description):
                 d[col[0]] = row[idx]
             return d
+
         counts: Counter = Counter()
         self.db.row_factory = dict_factory
         cur = self.db.cursor()
-        for row in cur.execute('SELECT * FROM journal'):
+        for row in cur.execute("SELECT * FROM journal"):
             print(json.dumps(row))
-            counts['total'] += 1
+            counts["total"] += 1
         return counts
 
     def export_fatcat(self):
         counts: Counter = Counter()
         self.db.row_factory = sqlite3.Row
         cur = self.db.cursor()
-        for row in cur.execute('SELECT * FROM journal WHERE valid_issnl = 1'):
-            counts['total'] += 1
+        for row in cur.execute("SELECT * FROM journal WHERE valid_issnl = 1"):
+            counts["total"] += 1
 
-            name = row['name']
+            name = row["name"]
             if name:
                 name = name.strip()
 
-            if not row['name']:
-                counts['empty-name'] += 1
+            if not row["name"]:
+                counts["empty-name"] += 1
                 continue
 
             if len(name) <= 2:
-                counts['short-name'] += 1
+                counts["short-name"] += 1
                 continue
 
-            publisher = row['publisher']
+            publisher = row["publisher"]
             if publisher:
                 publisher = publisher.strip() or None
 
             out = dict(
-                issnl=row['issnl'],
-                wikidata_qid=row['wikidata_qid'],
-                ident=row['fatcat_ident'],
+                issnl=row["issnl"],
+                wikidata_qid=row["wikidata_qid"],
+                ident=row["fatcat_ident"],
                 publisher=publisher,
                 name=name,
-                _known_issnl=row['known_issnl'])
+                _known_issnl=row["known_issnl"],
+            )
 
             extra = dict(
-                issnp=row['issnp'],
-                issne=row['issne'],
-                country=row['country'],
+                issnp=row["issnp"], issne=row["issne"], country=row["country"],
             )
-            if row['lang']:
-                extra['languages'] = [row['lang'],]
-            if row['sherpa_color']:
-                extra['sherpa_romeo'] = dict(color=row['sherpa_color'])
+            if row["lang"]:
+                extra["languages"] = [
+                    row["lang"],
+                ]
+            if row["sherpa_color"]:
+                extra["sherpa_romeo"] = dict(color=row["sherpa_color"])
 
             urls = []
             webarchive_urls = []
-            cur = self.db.execute("SELECT * FROM homepage WHERE issnl = ?;", [row['issnl']])
+            cur = self.db.execute(
+                "SELECT * FROM homepage WHERE issnl = ?;", [row["issnl"]]
+            )
             for hrow in cur:
-                if '://doaj.org/' in hrow['url'] or '://www.doaj.org/' in hrow['url']:
+                if "://doaj.org/" in hrow["url"] or "://www.doaj.org/" in hrow["url"]:
                     continue
-                if '://www.ncbi.nlm.nih.gov/' in hrow['url']:
+                if "://www.ncbi.nlm.nih.gov/" in hrow["url"]:
                     continue
-                if 'web.archive.org/web' in hrow['url']:
-                    webarchive_urls.append(hrow['url'])
-                    urls.append(hrow['url'])
+                if "web.archive.org/web" in hrow["url"]:
+                    webarchive_urls.append(hrow["url"])
+                    urls.append(hrow["url"])
                     continue
-                if hrow['host'] in ('www.google.com', 'books.google.com'):
+                if hrow["host"] in ("www.google.com", "books.google.com"):
                     # individual books or google searches, not journal/conference homepages
                     continue
-                if '/oai/request' in hrow['url']:
+                if "/oai/request" in hrow["url"]:
                     # OAI-PMH endpoints, not homepages
                     continue
-                if not row['any_live_homepage'] and hrow['gwb_url_success_dt'] and hrow['gwb_url_success_dt'] != 'error':
-                    webarchive_urls.append("https://web.archive.org/web/{}/{}".format(hrow['gwb_url_success_dt'], hrow['url']))
+                if (
+                    not row["any_live_homepage"]
+                    and hrow["gwb_url_success_dt"]
+                    and hrow["gwb_url_success_dt"] != "error"
+                ):
+                    webarchive_urls.append(
+                        "https://web.archive.org/web/{}/{}".format(
+                            hrow["gwb_url_success_dt"], hrow["url"]
+                        )
+                    )
                     continue
-                if hrow['blocked']:
-                    urls.append(hrow['url'])
+                if hrow["blocked"]:
+                    urls.append(hrow["url"])
                     continue
-                if hrow['terminal_status_code'] == 200:
-                    if hrow['terminal_url'] == hrow['url'].replace('http://', 'https://') or hrow['terminal_url'] == hrow['url'] + "/":
+                if hrow["terminal_status_code"] == 200:
+                    if (
+                        hrow["terminal_url"]
+                        == hrow["url"].replace("http://", "https://")
+                        or hrow["terminal_url"] == hrow["url"] + "/"
+                    ):
                         # check for trivial redirects; use post-redirect URL in those cases
-                        urls.append(hrow['terminal_url'])
+                        urls.append(hrow["terminal_url"])
                    else:
-                        urls.append(hrow['url'])
+                        urls.append(hrow["url"])
                     continue
                 # didn't even crawl and no match? add anyways as a pass-through
-                if not hrow['status_code']:
-                    urls.append(hrow['url'])
+                if not hrow["status_code"]:
+                    urls.append(hrow["url"])
                     continue
-            extra['webarchive_urls'] = webarchive_urls
-            extra['urls'] = urls
+            extra["webarchive_urls"] = webarchive_urls
+            extra["urls"] = urls
 
-            cur = self.db.execute("SELECT * FROM directory WHERE issnl = ?;", [row['issnl']])
+            cur = self.db.execute(
+                "SELECT * FROM directory WHERE issnl = ?;", [row["issnl"]]
+            )
             for drow in cur:
-                if drow['slug'] == 'ezb':
-                    ezb = json.loads(drow['extra'])
-                    extra['ezb'] = dict(ezb_id=drow['identifier'], color=ezb['ezb_color'])
-                elif drow['slug'] == 'szczepanski':
-                    extra['szczepanski'] = drow['extra']
-                elif drow['slug'] == 'doaj':
-                    extra['doaj'] = json.loads(drow['extra'])
-                elif drow['slug'] == 'scielo':
-                    extra['scielo'] = json.loads(drow['extra'])
-                elif drow['slug'] == 'sim':
-                    extra['ia'] = extra.get('ia', {})
-                    extra['ia']['sim'] = json.loads(drow['extra'])
-                    extra['ia']['sim']['sim_pubid'] = drow['identifier']
-                elif drow['slug'] in ('lockss', 'clockss', 'portico', 'jstor'):
-                    extra['kbart'] = extra.get('kbart', {})
-                    extra['kbart'][drow['slug']] = json.loads(drow['extra'])
-
-            out['extra'] = extra
+                if drow["slug"] == "ezb":
+                    ezb = json.loads(drow["extra"])
+                    extra["ezb"] = dict(
+                        ezb_id=drow["identifier"], color=ezb["ezb_color"]
+                    )
+                elif drow["slug"] == "szczepanski":
+                    extra["szczepanski"] = drow["extra"]
+                elif drow["slug"] == "doaj":
+                    extra["doaj"] = json.loads(drow["extra"])
+                elif drow["slug"] == "scielo":
+                    extra["scielo"] = json.loads(drow["extra"])
+                elif drow["slug"] == "sim":
+                    extra["ia"] = extra.get("ia", {})
+                    extra["ia"]["sim"] = json.loads(drow["extra"])
+                    extra["ia"]["sim"]["sim_pubid"] = drow["identifier"]
+                elif drow["slug"] in ("lockss", "clockss", "portico", "jstor"):
+                    extra["kbart"] = extra.get("kbart", {})
+                    extra["kbart"][drow["slug"]] = json.loads(drow["extra"])
+
+            out["extra"] = extra
             print(json.dumps(out))
         return counts
 
     def init_db(self):
         print("### Creating Database...", file=sys.stderr)
-        self.db.executescript("""
+        self.db.executescript(
+            """
             PRAGMA main.page_size = 4096;
             PRAGMA main.cache_size = 20000;
            PRAGMA main.locking_mode = EXCLUSIVE;
             PRAGMA main.synchronous = OFF;
-        """)
-        with open('chocula_schema.sql', 'r') as fschema:
+        """
+        )
+        with open("chocula_schema.sql", "r") as fschema:
            self.db.executescript(fschema.read())
         print("Done!", file=sys.stderr)
-