-rwxr-xr-x | check_issn_urls.py                  | 133
-rw-r--r-- | chocula/__init__.py                 |   2
-rw-r--r-- | chocula/__main__.py                 |  87
-rw-r--r-- | chocula/common.py                   |  60
-rw-r--r-- | chocula/config.py                   |   3
-rw-r--r-- | chocula/database.py                 | 604
-rw-r--r-- | chocula/directories/__init__.py     |  17
-rw-r--r-- | chocula/directories/crossref.py     |  15
-rw-r--r-- | chocula/directories/doaj.py         |  58
-rw-r--r-- | chocula/directories/entrez.py       |  14
-rw-r--r-- | chocula/directories/ezb.py          |  29
-rw-r--r-- | chocula/directories/gold_oa.py      |  13
-rw-r--r-- | chocula/directories/norwegian.py    |  31
-rw-r--r-- | chocula/directories/openapc.py      |  19
-rw-r--r-- | chocula/directories/road.py         |  29
-rw-r--r-- | chocula/directories/scielo.py       |  32
-rw-r--r-- | chocula/directories/sherpa_romeo.py |  33
-rw-r--r-- | chocula/directories/sim.py          |  41
-rw-r--r-- | chocula/directories/szczepanski.py  |  25
-rw-r--r-- | chocula/directories/wikidata.py     |  27
-rw-r--r-- | chocula/kbart.py                    |   3
-rw-r--r-- | chocula/util.py                     | 237
-rw-r--r-- | tests/test_database.py              |  10
-rw-r--r-- | tests/test_directories.py           |   9
24 files changed, 853 insertions(+), 678 deletions(-)
diff --git a/check_issn_urls.py b/check_issn_urls.py index 1135d6c..23169f1 100755 --- a/check_issn_urls.py +++ b/check_issn_urls.py @@ -45,17 +45,17 @@ def sniff_platform(resp): """ # these are mostly here to filter out huge platforms and stop sniffing domain_map = { - 'jstor.org/': 'jstor', - 'springer.com/': 'springer', - 'springerlink.com/': 'springer', - 'tandfonline.com/': 't_and_f', - 'elsevier.com/': 'elsevier', - 'wiley.com/': 'wiley', - 'sciencedirect.com/': 'elsevier', - 'sagepub.com/': 'sage', - 'hypotheses.org/': 'hypothesis', - 'tandf.co.uk/': 't_and_f', - 'scielo': 'scielo', + "jstor.org/": "jstor", + "springer.com/": "springer", + "springerlink.com/": "springer", + "tandfonline.com/": "t_and_f", + "elsevier.com/": "elsevier", + "wiley.com/": "wiley", + "sciencedirect.com/": "elsevier", + "sagepub.com/": "sage", + "hypotheses.org/": "hypothesis", + "tandf.co.uk/": "t_and_f", + "scielo": "scielo", } for domain, platform in domain_map.items(): if domain in resp.url: @@ -64,6 +64,7 @@ def sniff_platform(resp): return "ojs" return None + def sniff_blocked(resp): """ This function would try to figure out if we got blocked: soft-block, hard @@ -73,23 +74,33 @@ def sniff_blocked(resp): if resp.status_code in (403, 420): return True # JSTOR does this - if 'Our systems have detected unusual traffic activity from your network. Please complete this reCAPTCHA' in resp.text: + if ( + "Our systems have detected unusual traffic activity from your network. Please complete this reCAPTCHA" + in resp.text + ): return True - if resp.status_code == 416 and 'something about your browser made us think you were a bot' in resp.text: + if ( + resp.status_code == 416 + and "something about your browser made us think you were a bot" in resp.text + ): return True return None -def check_gwb(url, match_type='exact'): - if '//web.archive.org/' in url: + +def check_gwb(url, match_type="exact"): + if "//web.archive.org/" in url: return None # crude/bad retry loop to work around CDX API throttling for i in range(5): - resp = requests.get('https://web.archive.org/cdx/search/cdx', params={ - 'url': url, - 'matchType': match_type, - 'limit': -1, - 'filter': 'statuscode:200' - }) + resp = requests.get( + "https://web.archive.org/cdx/search/cdx", + params={ + "url": url, + "matchType": match_type, + "limit": -1, + "filter": "statuscode:200", + }, + ) if resp.status_code == 200: break time.sleep(5) @@ -98,81 +109,91 @@ def check_gwb(url, match_type='exact'): # TODO: this isn't really correct, but not sure what to return/record # if we failed through all timeouts return None - line = resp.text.strip().split('\n')[0] + line = resp.text.strip().split("\n")[0] if line: dt = line.split()[1] int(dt) return dt else: return None - + def check_url(issnl, url): - #print("Fetching: %s" % url) + # print("Fetching: %s" % url) info = dict(issnl=issnl, url=url) try: - resp = requests.get(url, timeout=30., headers={'User-Agent': 'ia_bot/0.0 (python requests) journal-live-check; contact:info@archive.org'}) + resp = requests.get( + url, + timeout=30.0, + headers={ + "User-Agent": "ia_bot/0.0 (python requests) journal-live-check; contact:info@archive.org" + }, + ) except requests.exceptions.TooManyRedirects: - info['error'] = 'TooManyRedirects' - info['terminal_status_code'] = info['status_code'] = -1 + info["error"] = "TooManyRedirects" + info["terminal_status_code"] = info["status_code"] = -1 return info except requests.exceptions.SSLError: - info['error'] = 'SSLError' - info['terminal_status_code'] = info['status_code'] = -1 + 
info["error"] = "SSLError" + info["terminal_status_code"] = info["status_code"] = -1 return info except requests.exceptions.ReadTimeout: - info['error'] = 'ReadTimeout' - info['terminal_status_code'] = info['status_code'] = -1 + info["error"] = "ReadTimeout" + info["terminal_status_code"] = info["status_code"] = -1 return info except requests.exceptions.ConnectionError: - info['error'] = 'ConnectionError' - info['terminal_status_code'] = info['status_code'] = -1 + info["error"] = "ConnectionError" + info["terminal_status_code"] = info["status_code"] = -1 return info except requests.exceptions.ChunkedEncodingError: - info['error'] = 'ChunkedEncodingError' - info['terminal_status_code'] = info['status_code'] = -1 + info["error"] = "ChunkedEncodingError" + info["terminal_status_code"] = info["status_code"] = -1 return info except requests.exceptions.ContentDecodingError: - info['error'] = 'ContentDecodingError' - info['terminal_status_code'] = info['status_code'] = -1 + info["error"] = "ContentDecodingError" + info["terminal_status_code"] = info["status_code"] = -1 return info except requests.exceptions.InvalidSchema: - info['error'] = 'InvalidSchema' - info['terminal_status_code'] = info['status_code'] = -1 + info["error"] = "InvalidSchema" + info["terminal_status_code"] = info["status_code"] = -1 return info except UnicodeDecodeError: - info['error'] = 'UnicodeDecodeError' - info['terminal_status_code'] = info['status_code'] = -1 + info["error"] = "UnicodeDecodeError" + info["terminal_status_code"] = info["status_code"] = -1 return info if resp.history: - info['status_code'] = resp.history[0].status_code + info["status_code"] = resp.history[0].status_code else: - info['status_code'] = resp.status_code + info["status_code"] = resp.status_code - info['terminal_status_code'] = resp.status_code - info['terminal_url'] = resp.url - content_type = resp.headers.get('Content-Type') + info["terminal_status_code"] = resp.status_code + info["terminal_url"] = resp.url + content_type = resp.headers.get("Content-Type") if content_type: - info['terminal_content_type'] = content_type.split(';')[0] - info['issnl_in_body'] = bool(issnl in resp.text) - info['gwb_url_success_dt'] = check_gwb(url, match_type='exact') - info['gwb_terminal_url_success_dt'] = check_gwb(info['terminal_url'], match_type='exact') - info['blocked'] = sniff_blocked(resp) - info['software_platform'] = sniff_platform(resp) - #info['gwb_host_success_dt'] = check_gwb(url, match_type='host') + info["terminal_content_type"] = content_type.split(";")[0] + info["issnl_in_body"] = bool(issnl in resp.text) + info["gwb_url_success_dt"] = check_gwb(url, match_type="exact") + info["gwb_terminal_url_success_dt"] = check_gwb( + info["terminal_url"], match_type="exact" + ) + info["blocked"] = sniff_blocked(resp) + info["software_platform"] = sniff_platform(resp) + # info['gwb_host_success_dt'] = check_gwb(url, match_type='host') return info + def run(tsvfile): for line in tsvfile: - records = line.split('\t') + records = line.split("\t") issnl = records[0] url = records[1].strip() print(json.dumps(check_url(issnl, url))) -if __name__=="__main__": + +if __name__ == "__main__": if len(sys.argv) != 2: f = sys.stdin else: - f = open(sys.argv[1], 'r') + f = open(sys.argv[1], "r") run(f) diff --git a/chocula/__init__.py b/chocula/__init__.py index 38e61c8..2191320 100644 --- a/chocula/__init__.py +++ b/chocula/__init__.py @@ -1,6 +1,4 @@ - from chocula.config import ChoculaConfig from chocula.database import ChoculaDatabase, IssnDatabase from 
chocula.directories import * from chocula.kbart import * - diff --git a/chocula/__main__.py b/chocula/__main__.py index f897dd1..92f2e6f 100644 --- a/chocula/__main__.py +++ b/chocula/__main__.py @@ -48,8 +48,13 @@ import sys import csv import argparse -from chocula import ChoculaDatabase, ChoculaConfig, IssnDatabase,\ - ALL_CHOCULA_DIR_CLASSES, ALL_CHOCULA_KBART_CLASSES +from chocula import ( + ChoculaDatabase, + ChoculaConfig, + IssnDatabase, + ALL_CHOCULA_DIR_CLASSES, + ALL_CHOCULA_KBART_CLASSES, +) def run_everything(config, database): @@ -70,6 +75,7 @@ def run_everything(config, database): database.summarize() print("### Done with everything!") + def run_directory(config, database, source): for cls in ALL_CHOCULA_DIR_CLASSES: if cls.source_slug == source: @@ -79,6 +85,7 @@ def run_directory(config, database, source): return raise NotImplementedError(f"unknown source: {source}") + def run_kbart(config, database, source): for cls in ALL_CHOCULA_KBART_CLASSES: if cls.source_slug == source: @@ -88,63 +95,65 @@ def run_kbart(config, database, source): return raise NotImplementedError(f"unknown source: {source}") + def run_load(config, database, source): - if source == 'fatcat_stats': + if source == "fatcat_stats": print(database.load_fatcat_stats(config)) - elif source == 'fatcat_containers': + elif source == "fatcat_containers": print(database.load_fatcat_containers(config)) - elif source == 'homepage_status': + elif source == "homepage_status": print(database.load_homepage_status(config)) else: raise NotImplementedError(f"unknown source: {source}") + def main(): parser = argparse.ArgumentParser( - prog="python -m chocula", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + prog="python -m chocula", formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) subparsers = parser.add_subparsers() - parser.add_argument("--db-file", - help="sqlite database file", - default='chocula.sqlite', - type=str) + parser.add_argument( + "--db-file", help="sqlite database file", default="chocula.sqlite", type=str + ) - sub = subparsers.add_parser('everything', - help="run all the commands") - sub.set_defaults(func='everything') + sub = subparsers.add_parser("everything", help="run all the commands") + sub.set_defaults(func="everything") - sub = subparsers.add_parser('init_db', - help="create sqlite3 output file and tables") - sub.set_defaults(func='init_db') + sub = subparsers.add_parser("init_db", help="create sqlite3 output file and tables") + sub.set_defaults(func="init_db") - sub = subparsers.add_parser('summarize', - help="aggregate metadata from all tables into 'journals' table") - sub.set_defaults(func='summarize') + sub = subparsers.add_parser( + "summarize", help="aggregate metadata from all tables into 'journals' table" + ) + sub.set_defaults(func="summarize") - sub = subparsers.add_parser('export', - help="dump JSON output") - sub.set_defaults(func='export') + sub = subparsers.add_parser("export", help="dump JSON output") + sub.set_defaults(func="export") - sub = subparsers.add_parser('export_fatcat', - help="dump JSON output in a format that can load into fatcat") - sub.set_defaults(func='export_fatcat') + sub = subparsers.add_parser( + "export_fatcat", help="dump JSON output in a format that can load into fatcat" + ) + sub.set_defaults(func="export_fatcat") - sub = subparsers.add_parser('export_urls', - help="dump homepage URLs (eg, to crawl for status)") - sub.set_defaults(func='export_urls') + sub = subparsers.add_parser( + "export_urls", help="dump homepage URLs (eg, to crawl for 
status)" + ) + sub.set_defaults(func="export_urls") - sub = subparsers.add_parser('directory', - help="index directory metadata from a given source") + sub = subparsers.add_parser( + "directory", help="index directory metadata from a given source" + ) sub.add_argument("source", type=str, help="short name of source to index") sub.set_defaults(func=run_directory) - sub = subparsers.add_parser('load', - help="load metadata of a given type") + sub = subparsers.add_parser("load", help="load metadata of a given type") sub.add_argument("source", type=str, help="short name of source to index") sub.set_defaults(func=run_load) - sub = subparsers.add_parser('kbart', - help="index KBART holding metadata for a given source") + sub = subparsers.add_parser( + "kbart", help="index KBART holding metadata for a given source" + ) sub.add_argument("source", type=str, help="short name of source to index") sub.set_defaults(func=run_kbart) @@ -155,11 +164,11 @@ def main(): config = ChoculaConfig.from_file() issn_db: Optional[IssnDatabase] = None - if args.func in ('everything', 'summarize', run_directory, run_kbart): + if args.func in ("everything", "summarize", run_directory, run_kbart): issn_db = IssnDatabase(config.issnl.filepath) cdb = ChoculaDatabase(args.db_file, issn_db) - if args.func == 'everything': + if args.func == "everything": run_everything(config, cdb) elif args.func in (run_directory, run_load, run_kbart): args.func(config, cdb, args.source) @@ -168,6 +177,6 @@ def main(): func = getattr(cdb, args.func) print(func(), file=sys.stderr) -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() diff --git a/chocula/common.py b/chocula/common.py index a5b3739..455649a 100644 --- a/chocula/common.py +++ b/chocula/common.py @@ -1,4 +1,3 @@ - import sys import csv import datetime @@ -17,7 +16,8 @@ from chocula.database import DirectoryInfo, IssnDatabase, HomepageUrl csv.field_size_limit(1310720) THIS_YEAR = datetime.date.today().year -class DirectoryLoader(): + +class DirectoryLoader: source_slug: str = "GENERIC" @@ -35,7 +35,7 @@ class DirectoryLoader(): counts: Counter = Counter() cur = db.db.cursor() for record in self.open_file(): - counts['total'] += 1 + counts["total"] += 1 info = self.parse_record(record) if info: status = db.insert_directory(info, cur=cur) @@ -44,6 +44,7 @@ class DirectoryLoader(): db.db.commit() return counts + @dataclass class KbartRecord: issnl: Optional[str] @@ -60,7 +61,7 @@ class KbartRecord: year_spans: List[Any] -class KbartLoader(): +class KbartLoader: source_slug: str = "GENERIC" @@ -68,19 +69,19 @@ class KbartLoader(): self.config = config def file_path(self) -> str: - #return self.config.TEMPLATE.filepath) + # return self.config.TEMPLATE.filepath) raise NotImplementedError() def open_file(self) -> Iterable: - raw_file = open(self.file_path(), 'rb').read().decode(errors='replace') + raw_file = open(self.file_path(), "rb").read().decode(errors="replace") fixed_file = ftfy.fix_text(raw_file) - reader = csv.DictReader(fixed_file.split('\n'), delimiter='\t') + reader = csv.DictReader(fixed_file.split("\n"), delimiter="\t") return reader def parse_record(self, row: dict, issn_db: IssnDatabase) -> Optional[KbartRecord]: - issne: Optional[str] = clean_issn(row['online_identifier'] or "") - issnp: Optional[str] = clean_issn(row['print_identifier'] or "") + issne: Optional[str] = clean_issn(row["online_identifier"] or "") + issnp: Optional[str] = clean_issn(row["print_identifier"] or "") issnl: Optional[str] = None if issne: issnl = issn_db.issn2issnl(issne) @@ 
-88,31 +89,31 @@ class KbartLoader(): issnl = issn_db.issn2issnl(issnp) start_year: Optional[int] = None end_year: Optional[int] = None - if row['date_first_issue_online']: - start_year = int(row['date_first_issue_online'][:4]) - if row['date_last_issue_online']: - end_year = int(row['date_last_issue_online'][:4]) - end_volume = row['num_last_vol_online'] + if row["date_first_issue_online"]: + start_year = int(row["date_first_issue_online"][:4]) + if row["date_last_issue_online"]: + end_year = int(row["date_last_issue_online"][:4]) + end_volume = row["num_last_vol_online"] # hack to handle open-ended preservation - if end_year is None and end_volume and '(present)' in end_volume: + if end_year is None and end_volume and "(present)" in end_volume: end_year = THIS_YEAR record = KbartRecord( issnl=issnl, issnp=issnp, issne=issne, - title=clean_str(row['publication_title']), - publisher=clean_str(row['publisher_name']), - url=HomepageUrl.from_url(row['title_url']), - embargo=clean_str(row['embargo_info']), + title=clean_str(row["publication_title"]), + publisher=clean_str(row["publisher_name"]), + url=HomepageUrl.from_url(row["title_url"]), + embargo=clean_str(row["embargo_info"]), start_year=start_year, end_year=end_year, - start_volume=clean_str(row['num_first_vol_online']), - end_volume=clean_str(row['num_last_vol_online']), + start_volume=clean_str(row["num_first_vol_online"]), + end_volume=clean_str(row["num_last_vol_online"]), year_spans=[], ) - if record.start_volume == 'null': + if record.start_volume == "null": record.start_volume = None - if record.end_volume == 'null': + if record.end_volume == "null": record.end_volume = None return record @@ -126,18 +127,18 @@ class KbartLoader(): counts: Counter = Counter() kbart_dict: Dict[str, KbartRecord] = dict() for row in self.open_file(): - counts['total'] += 1 + counts["total"] += 1 record = self.parse_record(row, db.issn_db) if record is None: - counts['skip-parse'] += 1 + counts["skip-parse"] += 1 continue elif not record.issnl: - counts['skip-issnl'] += 1 + counts["skip-issnl"] += 1 continue elif record.start_year is None or record.end_year is None: - counts['partial-missing-years'] += 1 - counts['parsed'] += 1 + counts["partial-missing-years"] += 1 + counts["parsed"] += 1 existing = kbart_dict.get(record.issnl, record) if record.start_year and record.end_year: @@ -149,7 +150,7 @@ class KbartLoader(): record.year_spans = merge_spans(old_spans, new_spans) kbart_dict[record.issnl] = record - counts['unique-issnl'] = len(kbart_dict) + counts["unique-issnl"] = len(kbart_dict) cur = db.db.cursor() for issnl, record in kbart_dict.items(): info = DirectoryInfo( @@ -169,4 +170,3 @@ class KbartLoader(): cur.close() db.db.commit() return counts - diff --git a/chocula/config.py b/chocula/config.py index 2237404..3bd8ade 100644 --- a/chocula/config.py +++ b/chocula/config.py @@ -1,9 +1,8 @@ - from types import SimpleNamespace import toml -class ChoculaConfig(SimpleNamespace): +class ChoculaConfig(SimpleNamespace): @classmethod def from_file(cls, file_path="sources.toml", sources_dir="data/"): diff --git a/chocula/database.py b/chocula/database.py index f620515..11632b9 100644 --- a/chocula/database.py +++ b/chocula/database.py @@ -1,4 +1,3 @@ - from __future__ import annotations import sys @@ -47,41 +46,49 @@ class HomepageUrl: """ Returns None if url is really bad (not a URL). 
""" - if not url or 'mailto:' in url.lower() or url.lower() in ('http://n/a', 'http://na/', 'http://na'): + if ( + not url + or "mailto:" in url.lower() + or url.lower() in ("http://n/a", "http://na/", "http://na") + ): return None - if url.startswith('www.'): + if url.startswith("www."): url = "http://" + url - if url.startswith('ttp://') or url.startswith('ttps://'): + if url.startswith("ttp://") or url.startswith("ttps://"): url = "h" + url - url.replace('Http://', 'http://') + url.replace("Http://", "http://") url = str(urlcanon.semantic_precise(url)) - if url == 'http://na/': + if url == "http://na/": # sort of redundant with above, but some only match after canonicalization return None url_surt = surt.surt(url) tld = tldextract.extract(url) - host = '.'.join(tld) - if host.startswith('.'): + host = ".".join(tld) + if host.startswith("."): host = host[1:] - return HomepageUrl(url=url, - surt=url_surt, - host=host, - domain=tld.registered_domain, - suffix=tld.suffix) + return HomepageUrl( + url=url, + surt=url_surt, + host=host, + domain=tld.registered_domain, + suffix=tld.suffix, + ) + def test_from_url(): - - assert HomepageUrl.from_url("http://thing.core.ac.uk").domain == 'core.ac.uk' - assert HomepageUrl.from_url("http://thing.core.ac.uk").host == 'thing.core.ac.uk' - assert HomepageUrl.from_url("http://thing.core.ac.uk").suffix== 'ac.uk' - assert HomepageUrl.from_url("google.com").suffix == 'com' - assert HomepageUrl.from_url("google.com").host == 'google.com' + assert HomepageUrl.from_url("http://thing.core.ac.uk").domain == "core.ac.uk" + assert HomepageUrl.from_url("http://thing.core.ac.uk").host == "thing.core.ac.uk" + assert HomepageUrl.from_url("http://thing.core.ac.uk").suffix == "ac.uk" + + assert HomepageUrl.from_url("google.com").suffix == "com" + assert HomepageUrl.from_url("google.com").host == "google.com" assert HomepageUrl.from_url("mailto:bnewbold@bogus.com") == None - assert HomepageUrl.from_url("thing.com").url == 'http://thing.com/' - assert HomepageUrl.from_url("Http://thing.com///").url == 'http://thing.com/' + assert HomepageUrl.from_url("thing.com").url == "http://thing.com/" + assert HomepageUrl.from_url("Http://thing.com///").url == "http://thing.com/" + @dataclass class UrlCrawlStatus: @@ -95,6 +102,7 @@ class UrlCrawlStatus: gwb_url_success_dt: Optional[str] gwb_terminal_url_success_dt: Optional[str] + @dataclass class DirectoryInfo: directory_slug: str @@ -127,10 +135,19 @@ class DirectoryInfo: """ if not self.issnl: raise ValueError - extra_dict = self.extra - - for k in ('issne', 'issnp', 'name', 'publisher', 'abbrev', 'platform', - 'country', 'langs', 'original_name'): + extra_dict = self.extra + + for k in ( + "issne", + "issnp", + "name", + "publisher", + "abbrev", + "platform", + "country", + "langs", + "original_name", + ): if self.__dict__[k]: extra_dict[k] = self.__dict__[k] @@ -151,7 +168,7 @@ class DirectoryInfo: raise NotImplementedError() -class IssnDatabase(): +class IssnDatabase: """ Holds complete ISSN/ISSN-L table and helps with lookups and munging of raw ISSN strings @@ -163,7 +180,7 @@ class IssnDatabase(): def read_issn_map_file(self, issn_map_path: str): print("##### Loading ISSN-L map file...", file=sys.stderr) - with open(issn_map_path, 'r') as issn_map_file: + with open(issn_map_path, "r") as issn_map_file: for line in issn_map_file: if line.startswith("ISSN") or len(line) == 0: continue @@ -209,7 +226,7 @@ class IssnDatabase(): return info -class ChoculaDatabase(): +class ChoculaDatabase: """ Wraps a sqlite3 database """ @@ 
-218,7 +235,7 @@ class ChoculaDatabase(): """ To create a temporary database, pass ":memory:" as db_file """ - self.db = sqlite3.connect(db_file, isolation_level='EXCLUSIVE') + self.db = sqlite3.connect(db_file, isolation_level="EXCLUSIVE") self.data = dict() self.issn_db = issn_db @@ -247,8 +264,7 @@ class ChoculaDatabase(): cur = self.db.cursor() try: - cur.execute("INSERT INTO directory VALUES (?,?,?,?,?)", - info.to_db_tuple()) + cur.execute("INSERT INTO directory VALUES (?,?,?,?,?)", info.to_db_tuple()) except sqlite3.IntegrityError as ie: if str(ie).startswith("UNIQUE"): return "duplicate" @@ -264,7 +280,8 @@ class ChoculaDatabase(): try: cur.execute( "INSERT OR REPLACE INTO homepage (issnl, surt, url, host, domain, suffix) VALUES (?,?,?,?,?,?)", - homepage.to_db_tuple(issnl)) + homepage.to_db_tuple(issnl), + ) except sqlite3.IntegrityError as ie: if str(ie).startswith("UNIQUE"): return "duplicate" @@ -276,29 +293,33 @@ class ChoculaDatabase(): print("##### Loading IA Homepage Crawl Results...") counts: Counter = Counter() cur = self.db.cursor() - for line in open(config.homepage_status.filepath, 'r'): + for line in open(config.homepage_status.filepath, "r"): if not line.strip(): continue row = json.loads(line) - counts['total'] += 1 - url = row['url'] - assert(url) - if row.get('gwb_url_success_dt') == 'error': - row['gwb_url_success_dt'] = None - if row.get('gwb_terminal_url_success_dt') == 'error': - row['gwb_terminal_url_success_dt'] = None - cur.execute("UPDATE homepage SET status_code=?, crawl_error=?, terminal_url=?, terminal_status_code=?, platform_software=?, issnl_in_body=?, blocked=?, gwb_url_success_dt=?, gwb_terminal_url_success_dt=? WHERE url=?", - (row['status_code'], - row.get('crawl_error'), - row.get('terminal_url'), - row.get('terminal_status_code'), - row.get('platform_software'), - row.get('issnl_in_body'), - row.get('blocked'), - row.get('gwb_url_success_dt'), - row.get('gwb_terminal_url_success_dt'), - url)) - counts['updated'] += 1 + counts["total"] += 1 + url = row["url"] + assert url + if row.get("gwb_url_success_dt") == "error": + row["gwb_url_success_dt"] = None + if row.get("gwb_terminal_url_success_dt") == "error": + row["gwb_terminal_url_success_dt"] = None + cur.execute( + "UPDATE homepage SET status_code=?, crawl_error=?, terminal_url=?, terminal_status_code=?, platform_software=?, issnl_in_body=?, blocked=?, gwb_url_success_dt=?, gwb_terminal_url_success_dt=? 
WHERE url=?", + ( + row["status_code"], + row.get("crawl_error"), + row.get("terminal_url"), + row.get("terminal_status_code"), + row.get("platform_software"), + row.get("issnl_in_body"), + row.get("blocked"), + row.get("gwb_url_success_dt"), + row.get("gwb_terminal_url_success_dt"), + url, + ), + ) + counts["updated"] += 1 cur.close() self.db.commit() return counts @@ -306,51 +327,54 @@ class ChoculaDatabase(): def load_fatcat_containers(self, config: ChoculaConfig) -> Counter: print("##### Loading Fatcat Container Entities...") # JSON - json_file = open(config.fatcat_containers.filepath, 'r') + json_file = open(config.fatcat_containers.filepath, "r") counts: Counter = Counter() cur = self.db.cursor() for line in json_file: if not line: continue row = json.loads(line) - if row['state'] != 'active': + if row["state"] != "active": continue - counts['total'] += 1 - extra = row.get('extra', dict()) - issne = extra.get('issne') - issnp = extra.get('issnp') - country = extra.get('country') - languages = extra.get('languages', []) + counts["total"] += 1 + extra = row.get("extra", dict()) + issne = extra.get("issne") + issnp = extra.get("issnp") + country = extra.get("country") + languages = extra.get("languages", []) lang = None if languages: lang = languages[0] try: - cur.execute("INSERT OR REPLACE INTO fatcat_container (issnl, ident, revision, issne, issnp, wikidata_qid, name, container_type, publisher, country, lang) VALUES (?,?,?,?,?,?,?,?,?,?,?)", - (row.get('issnl'), - row['ident'], - row['revision'], - issne, - issnp, - row.get('wikidata_qid'), - row['name'], - row.get('container_type'), - extra.get('publisher'), - country, - lang, - )) + cur.execute( + "INSERT OR REPLACE INTO fatcat_container (issnl, ident, revision, issne, issnp, wikidata_qid, name, container_type, publisher, country, lang) VALUES (?,?,?,?,?,?,?,?,?,?,?)", + ( + row.get("issnl"), + row["ident"], + row["revision"], + issne, + issnp, + row.get("wikidata_qid"), + row["name"], + row.get("container_type"), + extra.get("publisher"), + country, + lang, + ), + ) except sqlite3.IntegrityError as ie: if str(ie).startswith("UNIQUE"): counts["existing"] += 1 continue else: raise ie - counts['inserted'] += 1 - if row.get('issnl'): - urls = extra.get('urls', []) + counts["inserted"] += 1 + if row.get("issnl"): + urls = extra.get("urls", []) for url in urls: homepage = HomepageUrl.from_url(url) if homepage: - self.insert_homepage(row.get('issnl'), homepage, cur) + self.insert_homepage(row.get("issnl"), homepage, cur) cur.close() self.db.commit() return counts @@ -358,22 +382,31 @@ class ChoculaDatabase(): def load_fatcat_stats(self, config: ChoculaConfig) -> Counter: print("##### Loading Fatcat Container Stats...") # JSON - json_file = open(config.fatcat_stats.filepath, 'r') + json_file = open(config.fatcat_stats.filepath, "r") counts: Counter = Counter() cur = self.db.cursor() for line in json_file: if not line: continue row = json.loads(line) - total = int(row['total']) + total = int(row["total"]) ia_frac: Optional[float] = None preserved_frac: Optional[float] = None if total > 0: - ia_frac = float(row['in_web'])/total - preserved_frac = float(row['is_preserved'])/total - cur.execute("UPDATE fatcat_container SET release_count = ?, ia_count = ?, ia_frac = ?, preserved_count = ?, preserved_frac = ? 
WHERE issnl = ?", - (total, row['in_web'], ia_frac, row['is_preserved'], preserved_frac, row['issnl'])) - counts['updated'] += 1 + ia_frac = float(row["in_web"]) / total + preserved_frac = float(row["is_preserved"]) / total + cur.execute( + "UPDATE fatcat_container SET release_count = ?, ia_count = ?, ia_frac = ?, preserved_count = ?, preserved_frac = ? WHERE issnl = ?", + ( + total, + row["in_web"], + ia_frac, + row["is_preserved"], + preserved_frac, + row["issnl"], + ), + ) + counts["updated"] += 1 cur.close() self.db.commit() return counts @@ -384,10 +417,10 @@ class ChoculaDatabase(): self.db.row_factory = sqlite3.Row cur = self.db.execute("SELECT issnl, url FROM homepage;") for hrow in cur: - assert(hrow['url']) - assert(len(hrow['url'].split()) == 1) - counts['total'] += 1 - print('\t'.join((hrow['issnl'], hrow['url']))) + assert hrow["url"] + assert len(hrow["url"].split()) == 1 + counts["total"] += 1 + print("\t".join((hrow["issnl"], hrow["url"]))) return counts def summarize(self) -> Counter: @@ -395,135 +428,189 @@ class ChoculaDatabase(): counts: Counter = Counter() cur = self.db.cursor() self.db.row_factory = sqlite3.Row - index_issnls = list(cur.execute('SELECT DISTINCT issnl FROM directory')) - fatcat_issnls = list(cur.execute('SELECT DISTINCT issnl FROM fatcat_container WHERE issnl IS NOT null')) + index_issnls = list(cur.execute("SELECT DISTINCT issnl FROM directory")) + fatcat_issnls = list( + cur.execute( + "SELECT DISTINCT issnl FROM fatcat_container WHERE issnl IS NOT null" + ) + ) all_issnls = set([i[0] for i in index_issnls + fatcat_issnls]) print("{} total ISSN-Ls".format(len(all_issnls))) for issnl in all_issnls: - #print(issnl) - counts['total'] += 1 + # print(issnl) + counts["total"] += 1 out = dict() # check if ISSN-L is good. 
this is here because of fatcat import - out['known_issnl'] = (self.issn_db.issn2issnl(issnl) == issnl) - if not out['known_issnl']: - counts['unknown-issnl'] += 1 - out['valid_issnl'] = stdnum.issn.is_valid(issnl) - if not out['valid_issnl']: - counts['invalid-issnl'] += 1 - - fatcat_row = list(self.db.execute("SELECT * FROM fatcat_container WHERE issnl = ?;", [issnl])) + out["known_issnl"] = self.issn_db.issn2issnl(issnl) == issnl + if not out["known_issnl"]: + counts["unknown-issnl"] += 1 + out["valid_issnl"] = stdnum.issn.is_valid(issnl) + if not out["valid_issnl"]: + counts["invalid-issnl"] += 1 + + fatcat_row = list( + self.db.execute( + "SELECT * FROM fatcat_container WHERE issnl = ?;", [issnl] + ) + ) if fatcat_row: frow = fatcat_row[0] - out['fatcat_ident'] = frow['ident'] - for k in ('name', 'publisher', 'issne', 'issnp', 'wikidata_qid', 'lang', 'country', 'release_count', 'ia_count', 'ia_frac', 'kbart_count', 'kbart_frac', 'preserved_count', 'preserved_frac'): + out["fatcat_ident"] = frow["ident"] + for k in ( + "name", + "publisher", + "issne", + "issnp", + "wikidata_qid", + "lang", + "country", + "release_count", + "ia_count", + "ia_frac", + "kbart_count", + "kbart_frac", + "preserved_count", + "preserved_frac", + ): if not out.get(k) and frow[k] != None: out[k] = frow[k] cur = self.db.execute("SELECT * FROM directory WHERE issnl = ?;", [issnl]) for irow in cur: - if irow['slug'] in ('crossref',): - out['has_dois'] = True + if irow["slug"] in ("crossref",): + out["has_dois"] = True # TODO: other DOI registrars (japan, datacite) - if irow['slug'] == 'wikidata': - out['wikidata_qid'] = irow['identifier'] - for k in ('name',): + if irow["slug"] == "wikidata": + out["wikidata_qid"] = irow["identifier"] + for k in ("name",): if not out.get(k) and irow[k]: out[k] = irow[k] - if irow['extra']: - extra = json.loads(irow['extra']) - for k in ('country', 'lang', 'issne', 'issnp', 'publisher', 'platform'): + if irow["extra"]: + extra = json.loads(irow["extra"]) + for k in ( + "country", + "lang", + "issne", + "issnp", + "publisher", + "platform", + ): if not out.get(k) and extra.get(k): out[k] = extra[k] - if irow['slug'] in ('doaj','road','szczepanski', 'gold_oa'): - out['is_oa'] = True - if irow['slug'] == 'ezb': - ezb_extra = json.loads(irow['extra']) - if ezb_extra['ezb_color'] == 'green': - out['is_oa'] = True - if irow['slug'] == 'sherpa_romeo': - extra = json.loads(irow['extra']) - out['sherpa_color'] = extra['sherpa_romeo']['color'] - if extra['sherpa_romeo']['color'] == 'green': - out['is_oa'] = True + if irow["slug"] in ("doaj", "road", "szczepanski", "gold_oa"): + out["is_oa"] = True + if irow["slug"] == "ezb": + ezb_extra = json.loads(irow["extra"]) + if ezb_extra["ezb_color"] == "green": + out["is_oa"] = True + if irow["slug"] == "sherpa_romeo": + extra = json.loads(irow["extra"]) + out["sherpa_color"] = extra["sherpa_romeo"]["color"] + if extra["sherpa_romeo"]["color"] == "green": + out["is_oa"] = True # filter out "NA" ISSNs - for k in ('issne', 'issnp'): - if out.get(k) and (len(out[k]) != 9 or out[k][4] != '-'): + for k in ("issne", "issnp"): + if out.get(k) and (len(out[k]) != 9 or out[k][4] != "-"): out.pop(k) cur = self.db.execute("SELECT * FROM homepage WHERE issnl = ?;", [issnl]) for hrow in cur: - out['any_homepage'] = True - if hrow['terminal_status_code'] == 200 and hrow['host'] != 'web.archive.org': - out['any_live_homepage'] = True - if hrow['gwb_url_success_dt'] or hrow['gwb_terminal_url_success_dt']: - out['any_gwb_homepage'] = True - - if 
out.get('wikidata_qid'): - assert out['wikidata_qid'].startswith('Q') - assert out['wikidata_qid'][1].isdigit() - assert out['wikidata_qid'][-1].isdigit() + out["any_homepage"] = True + if ( + hrow["terminal_status_code"] == 200 + and hrow["host"] != "web.archive.org" + ): + out["any_live_homepage"] = True + if hrow["gwb_url_success_dt"] or hrow["gwb_terminal_url_success_dt"]: + out["any_gwb_homepage"] = True + + if out.get("wikidata_qid"): + assert out["wikidata_qid"].startswith("Q") + assert out["wikidata_qid"][1].isdigit() + assert out["wikidata_qid"][-1].isdigit() # define publisher types - publisher = out.get('publisher') - pl = out.get('publisher', '').lower().strip() - if out.get('platform') == 'scielo': - out['publisher_type'] = 'scielo' - elif publisher in BIG5_PUBLISHERS or 'elsevier' in pl or 'springer' in pl or 'wiley' in pl: - out['publisher_type'] = 'big5' + publisher = out.get("publisher") + pl = out.get("publisher", "").lower().strip() + if out.get("platform") == "scielo": + out["publisher_type"] = "scielo" + elif ( + publisher in BIG5_PUBLISHERS + or "elsevier" in pl + or "springer" in pl + or "wiley" in pl + ): + out["publisher_type"] = "big5" elif publisher in OA_PUBLISHERS: - out['publisher_type'] = 'oa' - elif publisher in COMMERCIAL_PUBLISHERS or 'wolters kluwer' in pl or 'wolters-kluwer' in pl: - out['publisher_type'] = 'commercial' + out["publisher_type"] = "oa" + elif ( + publisher in COMMERCIAL_PUBLISHERS + or "wolters kluwer" in pl + or "wolters-kluwer" in pl + ): + out["publisher_type"] = "commercial" elif publisher in ARCHIVE_PUBLISHERS: - out['publisher_type'] = 'archive' + out["publisher_type"] = "archive" elif publisher in REPOSITORY_PUBLISHERS: - out['publisher_type'] = 'repository' + out["publisher_type"] = "repository" elif publisher in OTHER_PUBLISHERS: - out['publisher_type'] = 'other' - elif publisher in SOCIETY_PUBLISHERS or 'society' in pl or 'association' in pl or 'academy of ' in pl or 'institute of' in pl: - out['publisher_type'] = 'society' - elif publisher in UNI_PRESS_PUBLISHERS or 'university ' in pl: - out['publisher_type'] = 'unipress' - elif 'scielo' in pl: - out['publisher_type'] = 'scielo' - elif out.get('is_oa') and (not out.get('has_dois') or out.get('lang') not in (None, 'en', 'de', 'fr', 'ja') or out.get('country') not in (None, 'us', 'gb', 'nl', 'cn', 'jp', 'de')): + out["publisher_type"] = "other" + elif ( + publisher in SOCIETY_PUBLISHERS + or "society" in pl + or "association" in pl + or "academy of " in pl + or "institute of" in pl + ): + out["publisher_type"] = "society" + elif publisher in UNI_PRESS_PUBLISHERS or "university " in pl: + out["publisher_type"] = "unipress" + elif "scielo" in pl: + out["publisher_type"] = "scielo" + elif out.get("is_oa") and ( + not out.get("has_dois") + or out.get("lang") not in (None, "en", "de", "fr", "ja") + or out.get("country") not in (None, "us", "gb", "nl", "cn", "jp", "de") + ): # current informal definition of longtail - out['publisher_type'] = 'longtail' - out['is_longtail'] = True - - cur.execute("INSERT OR REPLACE INTO journal (issnl, issne, issnp, wikidata_qid, fatcat_ident, name, publisher, country, lang, is_oa, sherpa_color, is_longtail, is_active, publisher_type, has_dois, any_homepage, any_live_homepage, any_gwb_homepage, known_issnl, valid_issnl, release_count, ia_count, ia_frac, kbart_count, kbart_frac, preserved_count, preserved_frac) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)", - (issnl, - out.get('issne'), - out.get('issnp'), - out.get('wikidata_qid'), - 
out.get('fatcat_ident'), - out.get('name'), - out.get('publisher'), - out.get('country'), - out.get('lang'), - out.get('is_oa', False), - out.get('sherpa_color'), - out.get('is_longtail', False), - out.get('is_active'), - out.get('publisher_type'), - out.get('has_dois', False), - out.get('any_homepage', False), - out.get('any_live_homepage', False), - out.get('any_gwb_homepage', False), - out.get('known_issnl'), - out.get('valid_issnl'), - - out.get('release_count'), - out.get('ia_count'), - out.get('ia_frac'), - out.get('kbart_count'), - out.get('kbart_frac'), - out.get('preserved_count'), - out.get('preserved_frac'), - )) + out["publisher_type"] = "longtail" + out["is_longtail"] = True + + cur.execute( + "INSERT OR REPLACE INTO journal (issnl, issne, issnp, wikidata_qid, fatcat_ident, name, publisher, country, lang, is_oa, sherpa_color, is_longtail, is_active, publisher_type, has_dois, any_homepage, any_live_homepage, any_gwb_homepage, known_issnl, valid_issnl, release_count, ia_count, ia_frac, kbart_count, kbart_frac, preserved_count, preserved_frac) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)", + ( + issnl, + out.get("issne"), + out.get("issnp"), + out.get("wikidata_qid"), + out.get("fatcat_ident"), + out.get("name"), + out.get("publisher"), + out.get("country"), + out.get("lang"), + out.get("is_oa", False), + out.get("sherpa_color"), + out.get("is_longtail", False), + out.get("is_active"), + out.get("publisher_type"), + out.get("has_dois", False), + out.get("any_homepage", False), + out.get("any_live_homepage", False), + out.get("any_gwb_homepage", False), + out.get("known_issnl"), + out.get("valid_issnl"), + out.get("release_count"), + out.get("ia_count"), + out.get("ia_frac"), + out.get("kbart_count"), + out.get("kbart_frac"), + out.get("preserved_count"), + out.get("preserved_frac"), + ), + ) cur.close() self.db.commit() return counts @@ -534,125 +621,146 @@ class ChoculaDatabase(): for idx, col in enumerate(cursor.description): d[col[0]] = row[idx] return d + counts: Counter = Counter() self.db.row_factory = dict_factory cur = self.db.cursor() - for row in cur.execute('SELECT * FROM journal'): + for row in cur.execute("SELECT * FROM journal"): print(json.dumps(row)) - counts['total'] += 1 + counts["total"] += 1 return counts def export_fatcat(self): counts: Counter = Counter() self.db.row_factory = sqlite3.Row cur = self.db.cursor() - for row in cur.execute('SELECT * FROM journal WHERE valid_issnl = 1'): - counts['total'] += 1 + for row in cur.execute("SELECT * FROM journal WHERE valid_issnl = 1"): + counts["total"] += 1 - name = row['name'] + name = row["name"] if name: name = name.strip() - if not row['name']: - counts['empty-name'] += 1 + if not row["name"]: + counts["empty-name"] += 1 continue if len(name) <= 2: - counts['short-name'] += 1 + counts["short-name"] += 1 continue - publisher = row['publisher'] + publisher = row["publisher"] if publisher: publisher = publisher.strip() or None out = dict( - issnl=row['issnl'], - wikidata_qid=row['wikidata_qid'], - ident=row['fatcat_ident'], + issnl=row["issnl"], + wikidata_qid=row["wikidata_qid"], + ident=row["fatcat_ident"], publisher=publisher, name=name, - _known_issnl=row['known_issnl']) + _known_issnl=row["known_issnl"], + ) extra = dict( - issnp=row['issnp'], - issne=row['issne'], - country=row['country'], + issnp=row["issnp"], issne=row["issne"], country=row["country"], ) - if row['lang']: - extra['languages'] = [row['lang'],] - if row['sherpa_color']: - extra['sherpa_romeo'] = 
dict(color=row['sherpa_color']) + if row["lang"]: + extra["languages"] = [ + row["lang"], + ] + if row["sherpa_color"]: + extra["sherpa_romeo"] = dict(color=row["sherpa_color"]) urls = [] webarchive_urls = [] - cur = self.db.execute("SELECT * FROM homepage WHERE issnl = ?;", [row['issnl']]) + cur = self.db.execute( + "SELECT * FROM homepage WHERE issnl = ?;", [row["issnl"]] + ) for hrow in cur: - if '://doaj.org/' in hrow['url'] or '://www.doaj.org/' in hrow['url']: + if "://doaj.org/" in hrow["url"] or "://www.doaj.org/" in hrow["url"]: continue - if '://www.ncbi.nlm.nih.gov/' in hrow['url']: + if "://www.ncbi.nlm.nih.gov/" in hrow["url"]: continue - if 'web.archive.org/web' in hrow['url']: - webarchive_urls.append(hrow['url']) - urls.append(hrow['url']) + if "web.archive.org/web" in hrow["url"]: + webarchive_urls.append(hrow["url"]) + urls.append(hrow["url"]) continue - if hrow['host'] in ('www.google.com', 'books.google.com'): + if hrow["host"] in ("www.google.com", "books.google.com"): # individual books or google searches, not journal/conference homepages continue - if '/oai/request' in hrow['url']: + if "/oai/request" in hrow["url"]: # OAI-PMH endpoints, not homepages continue - if not row['any_live_homepage'] and hrow['gwb_url_success_dt'] and hrow['gwb_url_success_dt'] != 'error': - webarchive_urls.append("https://web.archive.org/web/{}/{}".format(hrow['gwb_url_success_dt'], hrow['url'])) + if ( + not row["any_live_homepage"] + and hrow["gwb_url_success_dt"] + and hrow["gwb_url_success_dt"] != "error" + ): + webarchive_urls.append( + "https://web.archive.org/web/{}/{}".format( + hrow["gwb_url_success_dt"], hrow["url"] + ) + ) continue - if hrow['blocked']: - urls.append(hrow['url']) + if hrow["blocked"]: + urls.append(hrow["url"]) continue - if hrow['terminal_status_code'] == 200: - if hrow['terminal_url'] == hrow['url'].replace('http://', 'https://') or hrow['terminal_url'] == hrow['url'] + "/": + if hrow["terminal_status_code"] == 200: + if ( + hrow["terminal_url"] + == hrow["url"].replace("http://", "https://") + or hrow["terminal_url"] == hrow["url"] + "/" + ): # check for trivial redirects; use post-redirect URL in those cases - urls.append(hrow['terminal_url']) + urls.append(hrow["terminal_url"]) else: - urls.append(hrow['url']) + urls.append(hrow["url"]) continue # didn't even crawl and no match? 
add anyways as a pass-through - if not hrow['status_code']: - urls.append(hrow['url']) + if not hrow["status_code"]: + urls.append(hrow["url"]) continue - extra['webarchive_urls'] = webarchive_urls - extra['urls'] = urls + extra["webarchive_urls"] = webarchive_urls + extra["urls"] = urls - cur = self.db.execute("SELECT * FROM directory WHERE issnl = ?;", [row['issnl']]) + cur = self.db.execute( + "SELECT * FROM directory WHERE issnl = ?;", [row["issnl"]] + ) for drow in cur: - if drow['slug'] == 'ezb': - ezb = json.loads(drow['extra']) - extra['ezb'] = dict(ezb_id=drow['identifier'], color=ezb['ezb_color']) - elif drow['slug'] == 'szczepanski': - extra['szczepanski'] = drow['extra'] - elif drow['slug'] == 'doaj': - extra['doaj'] = json.loads(drow['extra']) - elif drow['slug'] == 'scielo': - extra['scielo'] = json.loads(drow['extra']) - elif drow['slug'] == 'sim': - extra['ia'] = extra.get('ia', {}) - extra['ia']['sim'] = json.loads(drow['extra']) - extra['ia']['sim']['sim_pubid'] = drow['identifier'] - elif drow['slug'] in ('lockss', 'clockss', 'portico', 'jstor'): - extra['kbart'] = extra.get('kbart', {}) - extra['kbart'][drow['slug']] = json.loads(drow['extra']) - - out['extra'] = extra + if drow["slug"] == "ezb": + ezb = json.loads(drow["extra"]) + extra["ezb"] = dict( + ezb_id=drow["identifier"], color=ezb["ezb_color"] + ) + elif drow["slug"] == "szczepanski": + extra["szczepanski"] = drow["extra"] + elif drow["slug"] == "doaj": + extra["doaj"] = json.loads(drow["extra"]) + elif drow["slug"] == "scielo": + extra["scielo"] = json.loads(drow["extra"]) + elif drow["slug"] == "sim": + extra["ia"] = extra.get("ia", {}) + extra["ia"]["sim"] = json.loads(drow["extra"]) + extra["ia"]["sim"]["sim_pubid"] = drow["identifier"] + elif drow["slug"] in ("lockss", "clockss", "portico", "jstor"): + extra["kbart"] = extra.get("kbart", {}) + extra["kbart"][drow["slug"]] = json.loads(drow["extra"]) + + out["extra"] = extra print(json.dumps(out)) return counts def init_db(self): print("### Creating Database...", file=sys.stderr) - self.db.executescript(""" + self.db.executescript( + """ PRAGMA main.page_size = 4096; PRAGMA main.cache_size = 20000; PRAGMA main.locking_mode = EXCLUSIVE; PRAGMA main.synchronous = OFF; - """) - with open('chocula_schema.sql', 'r') as fschema: + """ + ) + with open("chocula_schema.sql", "r") as fschema: self.db.executescript(fschema.read()) print("Done!", file=sys.stderr) - diff --git a/chocula/directories/__init__.py b/chocula/directories/__init__.py index a233a26..90e6f26 100644 --- a/chocula/directories/__init__.py +++ b/chocula/directories/__init__.py @@ -1,4 +1,3 @@ - from chocula.directories.crossref import CrossrefLoader from chocula.directories.doaj import DoajLoader from chocula.directories.entrez import EntrezLoader @@ -14,7 +13,17 @@ from chocula.directories.szczepanski import SzczepanskiLoader from chocula.directories.wikidata import WikidataLoader ALL_CHOCULA_DIR_CLASSES = [ - CrossrefLoader, DoajLoader, EntrezLoader,EzbLoader, GoldOALoader, - NorwegianLoader, OpenAPCLoader, RoadLoader, SherpaRomeoLoader, - SzczepanskiLoader, WikidataLoader, SimLoader, ScieloLoader, + CrossrefLoader, + DoajLoader, + EntrezLoader, + EzbLoader, + GoldOALoader, + NorwegianLoader, + OpenAPCLoader, + RoadLoader, + SherpaRomeoLoader, + SzczepanskiLoader, + WikidataLoader, + SimLoader, + ScieloLoader, ] diff --git a/chocula/directories/crossref.py b/chocula/directories/crossref.py index 4208008..a494021 100644 --- a/chocula/directories/crossref.py +++ b/chocula/directories/crossref.py @@ 
-1,4 +1,3 @@ - from typing import Iterable, Optional import csv @@ -23,14 +22,14 @@ class CrossrefLoader(DirectoryLoader): def parse_record(self, record) -> Optional[DirectoryInfo]: info = DirectoryInfo( directory_slug=self.source_slug, - issne=record['eissn'], - issnp=record['pissn'], - custom_id=record.get('doi').strip() or None, - name=clean_str(record.get('JournalTitle')), - publisher=clean_str(record.get('Publisher')), + issne=record["eissn"], + issnp=record["pissn"], + custom_id=record.get("doi").strip() or None, + name=clean_str(record.get("JournalTitle")), + publisher=clean_str(record.get("Publisher")), ) - if record['additionalIssns']: - info.raw_issn = record['additionalIssns'][0] + if record["additionalIssns"]: + info.raw_issn = record["additionalIssns"][0] return info diff --git a/chocula/directories/doaj.py b/chocula/directories/doaj.py index 7968dc2..795ce68 100644 --- a/chocula/directories/doaj.py +++ b/chocula/directories/doaj.py @@ -1,8 +1,13 @@ - from typing import Iterable, Optional, Dict, Any import csv -from chocula.util import clean_str, parse_mimetypes, parse_country, parse_lang, PLATFORM_MAP +from chocula.util import ( + clean_str, + parse_mimetypes, + parse_country, + parse_lang, + PLATFORM_MAP, +) from chocula.common import DirectoryLoader from chocula.database import DirectoryInfo, HomepageUrl @@ -81,40 +86,43 @@ class DoajLoader(DirectoryLoader): info = DirectoryInfo( directory_slug=self.source_slug, - issnp=row['Journal ISSN (print version)'], - issne=row['Journal EISSN (online version)'], - name=clean_str(row['Journal title']), - publisher=clean_str(row['Publisher']), - platform=PLATFORM_MAP.get(row['Platform, host or aggregator']), - country=parse_country(row['Country of publisher']), + issnp=row["Journal ISSN (print version)"], + issne=row["Journal EISSN (online version)"], + name=clean_str(row["Journal title"]), + publisher=clean_str(row["Publisher"]), + platform=PLATFORM_MAP.get(row["Platform, host or aggregator"]), + country=parse_country(row["Country of publisher"]), ) - lang = parse_lang(row['Full text language']) + lang = parse_lang(row["Full text language"]) if lang: info.langs.append(lang) extra: Dict[str, Any] = dict(doaj=dict()) - extra['mimetypes'] = parse_mimetypes(row['Full text formats']) - extra['doaj']['as_of'] = self.config.snapshot.date - if row['DOAJ Seal']: - extra['doaj']['seal'] = {"no": False, "yes": True}[row['DOAJ Seal'].lower()] + extra["mimetypes"] = parse_mimetypes(row["Full text formats"]) + extra["doaj"]["as_of"] = self.config.snapshot.date + if row["DOAJ Seal"]: + extra["doaj"]["seal"] = {"no": False, "yes": True}[row["DOAJ Seal"].lower()] - if row['Digital archiving policy or program(s)']: - extra['archive'] = [a.strip() for a in row['Digital archiving policy or program(s)'].split(',') if a.strip()] - elif row['Archiving: national library']: - extra['archive'] = ['national-library'] + if row["Digital archiving policy or program(s)"]: + extra["archive"] = [ + a.strip() + for a in row["Digital archiving policy or program(s)"].split(",") + if a.strip() + ] + elif row["Archiving: national library"]: + extra["archive"] = ["national-library"] - crawl_permission = row['Journal full-text crawl permission'] + crawl_permission = row["Journal full-text crawl permission"] if crawl_permission: - extra['crawl-permission'] = dict(Yes=True, No=False)[crawl_permission] - default_license = row['Journal license'] - if default_license and default_license.startswith('CC'): - extra['default_license'] = default_license.replace('CC ', 'CC-').strip() 
+ extra["crawl-permission"] = dict(Yes=True, No=False)[crawl_permission] + default_license = row["Journal license"] + if default_license and default_license.startswith("CC"): + extra["default_license"] = default_license.replace("CC ", "CC-").strip() - url = row['Journal URL'] + url = row["Journal URL"] if url: - homepage = HomepageUrl.from_url(row['Journal URL']) + homepage = HomepageUrl.from_url(row["Journal URL"]) if homepage: info.homepage_urls.append(homepage) return info - diff --git a/chocula/directories/entrez.py b/chocula/directories/entrez.py index b30f04d..f9f6d23 100644 --- a/chocula/directories/entrez.py +++ b/chocula/directories/entrez.py @@ -1,4 +1,3 @@ - from typing import Iterable, Optional import csv @@ -26,14 +25,13 @@ class EntrezLoader(DirectoryLoader): return csv.DictReader(open(self.config.entrez_simple.filepath)) def parse_record(self, record) -> Optional[DirectoryInfo]: - if not (record.get('ISSN (Online)') or record.get('ISSN (Print)')): + if not (record.get("ISSN (Online)") or record.get("ISSN (Print)")): return None return DirectoryInfo( directory_slug=self.source_slug, - issne=record.get('ISSN (Online)'), - issnp=record.get('ISSN (Print)'), - custom_id=record.get('NlmId').strip() or None, - name=clean_str(record.get('JournalTitle')), - abbrev=clean_str(record['IsoAbbr']), + issne=record.get("ISSN (Online)"), + issnp=record.get("ISSN (Print)"), + custom_id=record.get("NlmId").strip() or None, + name=clean_str(record.get("JournalTitle")), + abbrev=clean_str(record["IsoAbbr"]), ) - diff --git a/chocula/directories/ezb.py b/chocula/directories/ezb.py index 1573048..056350d 100644 --- a/chocula/directories/ezb.py +++ b/chocula/directories/ezb.py @@ -1,4 +1,3 @@ - from typing import Iterable, Optional import json @@ -16,7 +15,7 @@ class EzbLoader(DirectoryLoader): source_slug = "ezb" def open_file(self) -> Iterable: - return open(self.config.ezb.filepath, 'r') + return open(self.config.ezb.filepath, "r") def parse_record(self, row) -> Optional[DirectoryInfo]: @@ -26,21 +25,29 @@ class EzbLoader(DirectoryLoader): info = DirectoryInfo( directory_slug=self.source_slug, - issne=row.get('issne'), - issnp=row.get('issnp'), - custom_id=row['ezb_id'], - name=clean_str(row['title']), - publisher=clean_str(row.get('publisher')), + issne=row.get("issne"), + issnp=row.get("issnp"), + custom_id=row["ezb_id"], + name=clean_str(row["title"]), + publisher=clean_str(row.get("publisher")), ) info.extra = dict() - for k in ('ezb_color', 'subjects', 'keywords', 'zdb_id', - 'first_volume', 'first_issue', 'first_year', - 'appearance', 'costs'): + for k in ( + "ezb_color", + "subjects", + "keywords", + "zdb_id", + "first_volume", + "first_issue", + "first_year", + "appearance", + "costs", + ): if row.get(k): info.extra[k] = row[k] - url = HomepageUrl.from_url(row.get('url')) + url = HomepageUrl.from_url(row.get("url")) if url: info.homepage_urls.append(url) diff --git a/chocula/directories/gold_oa.py b/chocula/directories/gold_oa.py index a75944d..d0c6e8b 100644 --- a/chocula/directories/gold_oa.py +++ b/chocula/directories/gold_oa.py @@ -1,4 +1,3 @@ - from typing import Iterable, Optional import csv @@ -21,11 +20,11 @@ class GoldOALoader(DirectoryLoader): def parse_record(self, row) -> Optional[DirectoryInfo]: - if not (row.get('ISSN_L') and row.get('TITLE')): + if not (row.get("ISSN_L") and row.get("TITLE")): return None # TODO: also add for other non-direct indices - #for ind in ('WOS', 'SCOPUS'): + # for ind in ('WOS', 'SCOPUS'): # issnl, status = self.add_issn( # ind.lower(), # 
raw_issn=row['ISSN_L'], @@ -33,12 +32,12 @@ class GoldOALoader(DirectoryLoader): # ) extra = dict() - for ind in ('DOAJ', 'ROAD', 'PMC', 'OAPC', 'WOS', 'SCOPUS'): - extra['in_' + ind.lower()] = bool(int(row['JOURNAL_IN_' + ind])) + for ind in ("DOAJ", "ROAD", "PMC", "OAPC", "WOS", "SCOPUS"): + extra["in_" + ind.lower()] = bool(int(row["JOURNAL_IN_" + ind])) return DirectoryInfo( directory_slug=self.source_slug, - raw_issn=row['ISSN_L'], - name=clean_str(row['TITLE']), + raw_issn=row["ISSN_L"], + name=clean_str(row["TITLE"]), extra=extra, ) diff --git a/chocula/directories/norwegian.py b/chocula/directories/norwegian.py index 2b83961..2425318 100644 --- a/chocula/directories/norwegian.py +++ b/chocula/directories/norwegian.py @@ -1,4 +1,3 @@ - from typing import Iterable, Optional import csv @@ -52,29 +51,31 @@ class NorwegianLoader(DirectoryLoader): source_slug = "norwegian" def open_file(self) -> Iterable: - return csv.DictReader(open(self.config.norwegian.filepath, encoding="ISO-8859-1"), delimiter=";") + return csv.DictReader( + open(self.config.norwegian.filepath, encoding="ISO-8859-1"), delimiter=";" + ) def parse_record(self, row) -> Optional[DirectoryInfo]: info = DirectoryInfo( directory_slug=self.source_slug, - issnp=row['Print ISSN'], - issne=row['Online ISSN'], - country=parse_country(row['Country of publication']), - name=clean_str(row.get('International title')), - langs=[l for l in [parse_lang(row['Language'])] if l], + issnp=row["Print ISSN"], + issne=row["Online ISSN"], + country=parse_country(row["Country of publication"]), + name=clean_str(row.get("International title")), + langs=[l for l in [parse_lang(row["Language"])] if l], ) - info.extra['norwegian'] = dict(as_of=self.config.norwegian.date) - if row['Level 2019']: - info.extra['norwegian']['level'] = int(row['Level 2019']) + info.extra["norwegian"] = dict(as_of=self.config.norwegian.date) + if row["Level 2019"]: + info.extra["norwegian"]["level"] = int(row["Level 2019"]) - if row['Original title'] != row['International title']: - info.original_name = clean_str(row['Original title']) + if row["Original title"] != row["International title"]: + info.original_name = clean_str(row["Original title"]) - identifier=row['NSD tidsskrift_id'], - publisher=row['Publisher'], + identifier = (row["NSD tidsskrift_id"],) + publisher = (row["Publisher"],) - url = HomepageUrl.from_url(row['URL']) + url = HomepageUrl.from_url(row["URL"]) if url: info.homepage_urls.append(url) diff --git a/chocula/directories/openapc.py b/chocula/directories/openapc.py index c2acd95..99304c3 100644 --- a/chocula/directories/openapc.py +++ b/chocula/directories/openapc.py @@ -1,4 +1,3 @@ - from typing import Iterable, Optional import csv @@ -21,24 +20,22 @@ class OpenAPCLoader(DirectoryLoader): def parse_record(self, row) -> Optional[DirectoryInfo]: - if not row.get('issn'): + if not row.get("issn"): return None info = DirectoryInfo( directory_slug=self.source_slug, - issne=row['issn_electronic'], - issnp=row['issn_print'], - raw_issn=row['issn_l'] or row['issn'], - name=clean_str(row['journal_full_title']), - publisher=clean_str(row['publisher']), + issne=row["issn_electronic"], + issnp=row["issn_print"], + raw_issn=row["issn_l"] or row["issn"], + name=clean_str(row["journal_full_title"]), + publisher=clean_str(row["publisher"]), ) - info.extra['is_hybrid'] = bool(row['is_hybrid']) + info.extra["is_hybrid"] = bool(row["is_hybrid"]) - homepage = HomepageUrl.from_url(row['url']) + homepage = HomepageUrl.from_url(row["url"]) if homepage: 
info.homepage_urls.append(homepage) return info - - diff --git a/chocula/directories/road.py b/chocula/directories/road.py index 23cca65..bc550fd 100644 --- a/chocula/directories/road.py +++ b/chocula/directories/road.py @@ -1,4 +1,3 @@ - from typing import Iterable, Optional import csv @@ -26,27 +25,39 @@ class RoadLoader(DirectoryLoader): source_slug = "road" def open_file(self) -> Iterable: - return csv.DictReader(open(self.config.road.filepath), delimiter='\t', - fieldnames=("ISSN", "ISSN-L", "Short Title", "Title", "Publisher", "URL1", "URL2", "Region", "Lang1", "Lang2") + return csv.DictReader( + open(self.config.road.filepath), + delimiter="\t", + fieldnames=( + "ISSN", + "ISSN-L", + "Short Title", + "Title", + "Publisher", + "URL1", + "URL2", + "Region", + "Lang1", + "Lang2", + ), ) def parse_record(self, row) -> Optional[DirectoryInfo]: info = DirectoryInfo( directory_slug=self.source_slug, - raw_issn=row['ISSN-L'], - name=clean_str(row['Short Title']), - publisher=clean_str(row['Publisher']), - langs=[l for l in (row['Lang1'], row['Lang2']) if l], + raw_issn=row["ISSN-L"], + name=clean_str(row["Short Title"]), + publisher=clean_str(row["Publisher"]), + langs=[l for l in (row["Lang1"], row["Lang2"]) if l], ) # TODO: region mapping: "Europe and North America" # TODO: lang mapping: already alpha-3 # homepages - for url in [u for u in (row['URL1'], row['URL2']) if u]: + for url in [u for u in (row["URL1"], row["URL2"]) if u]: homepage = HomepageUrl.from_url(url) if homepage: info.homepage_urls.append(homepage) return info - diff --git a/chocula/directories/scielo.py b/chocula/directories/scielo.py index 247866b..0ed8fde 100644 --- a/chocula/directories/scielo.py +++ b/chocula/directories/scielo.py @@ -1,4 +1,3 @@ - from typing import Iterable, Optional import json @@ -17,32 +16,31 @@ class ScieloLoader(DirectoryLoader): def parse_record(self, line) -> Optional[DirectoryInfo]: record = json.loads(line) extra = dict( - status=clean_str(record.get('current_status')), - first_year=record.get('first_year'), - collection=record.get('collection_acronym'), + status=clean_str(record.get("current_status")), + first_year=record.get("first_year"), + collection=record.get("collection_acronym"), ) for k in list(extra.keys()): if extra[k] is None: extra.pop(k) country: Optional[str] = None - if record['publisher_country'] and len(record['publisher_country'][0]) == 2: - country = record['publisher_country'][0].lower() + if record["publisher_country"] and len(record["publisher_country"][0]) == 2: + country = record["publisher_country"][0].lower() info = DirectoryInfo( directory_slug=self.source_slug, - issne=clean_issn(record.get('electronic_issn') or ''), - issnp=clean_issn(record.get('print_issn') or ''), - custom_id=clean_str(record.get('scielo_issn')), - name=clean_str(record.get('fulltitle')), - publisher=clean_str((record.get('publisher_name') or [''])[0]), - abbrev=clean_str(record['abbreviated_iso_title']), - platform='scielo', - langs=list(filter(lambda s: len(s) == 2, record['languages'])), + issne=clean_issn(record.get("electronic_issn") or ""), + issnp=clean_issn(record.get("print_issn") or ""), + custom_id=clean_str(record.get("scielo_issn")), + name=clean_str(record.get("fulltitle")), + publisher=clean_str((record.get("publisher_name") or [""])[0]), + abbrev=clean_str(record["abbreviated_iso_title"]), + platform="scielo", + langs=list(filter(lambda s: len(s) == 2, record["languages"])), country=country, extra=extra, ) - if record['url']: - homepage = HomepageUrl.from_url(record['url']) 
+ if record["url"]: + homepage = HomepageUrl.from_url(record["url"]) if homepage: info.homepage_urls.append(homepage) return info - diff --git a/chocula/directories/sherpa_romeo.py b/chocula/directories/sherpa_romeo.py index e92dc69..a8ba1b0 100644 --- a/chocula/directories/sherpa_romeo.py +++ b/chocula/directories/sherpa_romeo.py @@ -1,4 +1,3 @@ - import sys from typing import Iterable, Optional, Dict, Any import csv @@ -27,32 +26,38 @@ class SherpaRomeoLoader(DirectoryLoader): # first load policies print("##### Loading SHERPA/ROMEO policies...", file=sys.stderr) - fixed_policy_file = ftfy.fix_file(open(self.config.sherpa_romeo_policies_simple.filepath, 'rb')) + fixed_policy_file = ftfy.fix_file( + open(self.config.sherpa_romeo_policies_simple.filepath, "rb") + ) policy_reader = csv.DictReader(fixed_policy_file) for row in policy_reader: - self.sherpa_policies[row['RoMEO Record ID']] = row + self.sherpa_policies[row["RoMEO Record ID"]] = row # then open regular file - raw_file = open(self.config.sherpa_romeo_journals_simple.filepath, 'rb').read().decode(errors='replace') + raw_file = ( + open(self.config.sherpa_romeo_journals_simple.filepath, "rb") + .read() + .decode(errors="replace") + ) fixed_file = ftfy.fix_text(raw_file) - return csv.DictReader(fixed_file.split('\n')) + return csv.DictReader(fixed_file.split("\n")) def parse_record(self, row) -> Optional[DirectoryInfo]: # super mangled :( - row.update(self.sherpa_policies[row['RoMEO Record ID']]) + row.update(self.sherpa_policies[row["RoMEO Record ID"]]) info = DirectoryInfo( directory_slug=self.source_slug, - issnp=row['ISSN'], - issne=row['ESSN'], - name=clean_str(row['Journal Title']), - publisher=clean_str(row['Publisher']), - country=parse_country(row['Country']), - custom_id=row['RoMEO Record ID'], + issnp=row["ISSN"], + issne=row["ESSN"], + name=clean_str(row["Journal Title"]), + publisher=clean_str(row["Publisher"]), + country=parse_country(row["Country"]), + custom_id=row["RoMEO Record ID"], ) - if row['RoMEO colour']: - info.extra['sherpa_romeo'] = dict(color=row['RoMEO colour']) + if row["RoMEO colour"]: + info.extra["sherpa_romeo"] = dict(color=row["RoMEO colour"]) return info diff --git a/chocula/directories/sim.py b/chocula/directories/sim.py index ff5cce3..97f84d2 100644 --- a/chocula/directories/sim.py +++ b/chocula/directories/sim.py @@ -1,8 +1,14 @@ - from typing import Iterable, Optional, Dict, Any import csv -from chocula.util import clean_str, parse_mimetypes, parse_country, parse_lang, PLATFORM_MAP, gaps_to_spans +from chocula.util import ( + clean_str, + parse_mimetypes, + parse_country, + parse_lang, + PLATFORM_MAP, + gaps_to_spans, +) from chocula.common import DirectoryLoader from chocula.database import DirectoryInfo, HomepageUrl @@ -37,35 +43,34 @@ class SimLoader(DirectoryLoader): # TODO: 'Pub Type' extra: Dict[str, Any] = {} - first_year = row['First Volume'] + first_year = row["First Volume"] if first_year: first_year = int(first_year) - extra['first_year'] = int(row['First Volume']) + extra["first_year"] = int(row["First Volume"]) else: first_year = None - last_year = row['Last Volume'] + last_year = row["Last Volume"] if last_year: last_year = int(last_year) - extra['last_year'] = last_year + extra["last_year"] = last_year else: last_year = None - gaps = [int(g) for g in row['NA Gaps'].split(';') if g.strip()] + gaps = [int(g) for g in row["NA Gaps"].split(";") if g.strip()] if gaps: - extra['gaps'] = gaps + extra["gaps"] = gaps if first_year and last_year: - extra['year_spans'] = 
gaps_to_spans(first_year, last_year, gaps) - extra['scholarly_peer_reviewed'] = row["Scholarly / Peer-\nReviewed"] - extra['peer_reviewed'] = row["Peer-\nReviewed"] - extra['pub_type'] = clean_str(row["Pub Type"]) + extra["year_spans"] = gaps_to_spans(first_year, last_year, gaps) + extra["scholarly_peer_reviewed"] = row["Scholarly / Peer-\nReviewed"] + extra["peer_reviewed"] = row["Peer-\nReviewed"] + extra["pub_type"] = clean_str(row["Pub Type"]) info = DirectoryInfo( directory_slug=self.source_slug, - name=clean_str(row['Title']), - publisher=clean_str(row['Publisher']), - raw_issn=row['ISSN'][:9], - custom_id=row.get('NA Pub Cat ID').strip() or None, - langs=[parse_lang(row['Pub Language'])], + name=clean_str(row["Title"]), + publisher=clean_str(row["Publisher"]), + raw_issn=row["ISSN"][:9], + custom_id=row.get("NA Pub Cat ID").strip() or None, + langs=[parse_lang(row["Pub Language"])], extra=extra, ) return info - diff --git a/chocula/directories/szczepanski.py b/chocula/directories/szczepanski.py index 0d1558a..3586acb 100644 --- a/chocula/directories/szczepanski.py +++ b/chocula/directories/szczepanski.py @@ -1,4 +1,3 @@ - from typing import Iterable, Optional import json @@ -16,7 +15,7 @@ class SzczepanskiLoader(DirectoryLoader): source_slug = "szczepanski" def open_file(self) -> Iterable: - return open(self.config.szczepanski.filepath, 'r') + return open(self.config.szczepanski.filepath, "r") def parse_record(self, row) -> Optional[DirectoryInfo]: @@ -27,21 +26,21 @@ class SzczepanskiLoader(DirectoryLoader): info = DirectoryInfo( directory_slug=self.source_slug, - issne=row.get('issne'), - issnp=row.get('issnp'), - raw_issn=row.get('issn'), - name=clean_str(row['title']), - publisher=clean_str(row.get('ed')), + issne=row.get("issne"), + issnp=row.get("issnp"), + raw_issn=row.get("issn"), + name=clean_str(row["title"]), + publisher=clean_str(row.get("ed")), ) - info.extra['szczepanski'] = dict(as_of=self.config.szczepanski.date) - if row.get('extra'): - info.extra['szczepanski']['notes'] = row.get('extra') - for k in ('other_titles', 'year_spans', 'ed'): + info.extra["szczepanski"] = dict(as_of=self.config.szczepanski.date) + if row.get("extra"): + info.extra["szczepanski"]["notes"] = row.get("extra") + for k in ("other_titles", "year_spans", "ed"): if row.get(k): - info.extra['szczepanski'][k] = row[k] + info.extra["szczepanski"][k] = row[k] - url = HomepageUrl.from_url(row.get('url')) + url = HomepageUrl.from_url(row.get("url")) if url: info.homepage_urls.append(url) diff --git a/chocula/directories/wikidata.py b/chocula/directories/wikidata.py index d16d8df..5ffe6fb 100644 --- a/chocula/directories/wikidata.py +++ b/chocula/directories/wikidata.py @@ -1,4 +1,3 @@ - from typing import Iterable, Optional import csv @@ -16,27 +15,31 @@ class WikidataLoader(DirectoryLoader): source_slug = "wikidata" def open_file(self) -> Iterable: - return csv.DictReader(open(self.config.wikidata.filepath), delimiter='\t') + return csv.DictReader(open(self.config.wikidata.filepath), delimiter="\t") def parse_record(self, row) -> Optional[DirectoryInfo]: - if not (row.get('issn') and row.get('title')): + if not (row.get("issn") and row.get("title")): return None - wikidata_qid = row['item'].strip().split('/')[-1] - publisher = row['publisher_name'] - if (publisher.startswith('Q') and publisher[1].isdigit()) or publisher.startswith('t1') or not publisher: + wikidata_qid = row["item"].strip().split("/")[-1] + publisher = row["publisher_name"] + if ( + (publisher.startswith("Q") and 
publisher[1].isdigit()) + or publisher.startswith("t1") + or not publisher + ): publisher = None - info =DirectoryInfo( + info = DirectoryInfo( directory_slug=self.source_slug, - raw_issn=row['issn'], + raw_issn=row["issn"], custom_id=wikidata_qid, - name=clean_str(row['title']), + name=clean_str(row["title"]), publisher=clean_str(publisher), ) - if row.get('start_year'): - info.extra['start_year'] = row['start_year'] + if row.get("start_year"): + info.extra["start_year"] = row["start_year"] - url = HomepageUrl.from_url(row.get('websiteurl')) + url = HomepageUrl.from_url(row.get("websiteurl")) if url: info.homepage_urls.append(url) diff --git a/chocula/kbart.py b/chocula/kbart.py index 6c1f580..e8094e3 100644 --- a/chocula/kbart.py +++ b/chocula/kbart.py @@ -1,4 +1,3 @@ - from typing import List, Any from chocula.common import KbartLoader @@ -25,7 +24,7 @@ class PorticoKbartLoader(KbartLoader): def file_path(self) -> str: return self.config.portico.filepath - + class JstorKbartLoader(KbartLoader): diff --git a/chocula/util.py b/chocula/util.py index 2cb771d..11303b8 100644 --- a/chocula/util.py +++ b/chocula/util.py @@ -1,4 +1,3 @@ - import sys from dataclasses import dataclass from typing import Dict, Optional @@ -11,119 +10,120 @@ import pycountry # NOTE: this is a partial list, focusing on non-publisher hosted platforms and # software frameworks PLATFORM_MAP = { - 'OJS': 'ojs', - 'OJS SEER': 'ojs', - 'Open Journal System/OJS': 'ojs', - 'BMC': 'bmc', - 'SciELO Brazil': 'scielo', - 'SciELO Argentina': 'scielo', - 'SciELO': 'scielo', - 'SciELO Mexico': 'scielo', - 'SciELO Spain': 'scielo', - 'SciELO Portugal': 'scielo', - 'WordPress': 'wordpress', - 'Sciendo': 'sciendo', - 'Drupal': 'drupal', - 'revues.org': 'openedition', + "OJS": "ojs", + "OJS SEER": "ojs", + "Open Journal System/OJS": "ojs", + "BMC": "bmc", + "SciELO Brazil": "scielo", + "SciELO Argentina": "scielo", + "SciELO": "scielo", + "SciELO Mexico": "scielo", + "SciELO Spain": "scielo", + "SciELO Portugal": "scielo", + "WordPress": "wordpress", + "Sciendo": "sciendo", + "Drupal": "drupal", + "revues.org": "openedition", } MIMETYPE_MAP = { - 'PDF': 'application/pdf', - 'HTML': 'text/html', - 'XML': 'application/xml', + "PDF": "application/pdf", + "HTML": "text/html", + "XML": "application/xml", } BIG5_PUBLISHERS = [ - 'Elsevier', - 'Informa UK (Taylor & Francis)', - 'Springer-Verlag', - 'SAGE Publications', - 'Wiley (Blackwell Publishing)', - 'Wiley (John Wiley & Sons)', - 'Springer (Biomed Central Ltd.)', - 'Springer Nature', + "Elsevier", + "Informa UK (Taylor & Francis)", + "Springer-Verlag", + "SAGE Publications", + "Wiley (Blackwell Publishing)", + "Wiley (John Wiley & Sons)", + "Springer (Biomed Central Ltd.)", + "Springer Nature", ] COMMERCIAL_PUBLISHERS = [ - 'Peter Lang International Academic Publishers', - 'Walter de Gruyter GmbH', - 'Oldenbourg Wissenschaftsverlag', - 'Georg Thieme Verlag KG', # not springer - 'Emerald (MCB UP )', - 'Medknow Publications', - 'Inderscience Enterprises Ltd', - 'Bentham Science', - 'Ovid Technologies (Wolters Kluwer) - Lippincott Williams & Wilkins', - 'Scientific Research Publishing, Inc', - 'MDPI AG', - 'S. 
Karger AG', - 'Pleiades Publishing', - 'Science Publishing Group', - 'IGI Global', - 'The Economist Intelligence Unit', - 'Maney Publishing', - 'Diva Enterprises Private Limited', - 'World Scientific', - 'Mary Ann Liebert', - 'Trans Tech Publications', + "Peter Lang International Academic Publishers", + "Walter de Gruyter GmbH", + "Oldenbourg Wissenschaftsverlag", + "Georg Thieme Verlag KG", # not springer + "Emerald (MCB UP )", + "Medknow Publications", + "Inderscience Enterprises Ltd", + "Bentham Science", + "Ovid Technologies (Wolters Kluwer) - Lippincott Williams & Wilkins", + "Scientific Research Publishing, Inc", + "MDPI AG", + "S. Karger AG", + "Pleiades Publishing", + "Science Publishing Group", + "IGI Global", + "The Economist Intelligence Unit", + "Maney Publishing", + "Diva Enterprises Private Limited", + "World Scientific", + "Mary Ann Liebert", + "Trans Tech Publications", ] OA_PUBLISHERS = [ - 'Hindawi Limited', - 'OMICS Publishing Group', - 'De Gruyter Open Sp. z o.o.', - 'OpenEdition', - 'Hindawi (International Scholarly Research Network)', - 'Public Library of Science', - 'Frontiers Media SA', - 'eLife Sciences Publications, Ltd', - 'MDPI AG', - 'Hindawi (International Scholarly Research Network)', - 'Dove Medical Press', - 'Open Access Text', + "Hindawi Limited", + "OMICS Publishing Group", + "De Gruyter Open Sp. z o.o.", + "OpenEdition", + "Hindawi (International Scholarly Research Network)", + "Public Library of Science", + "Frontiers Media SA", + "eLife Sciences Publications, Ltd", + "MDPI AG", + "Hindawi (International Scholarly Research Network)", + "Dove Medical Press", + "Open Access Text", ] SOCIETY_PUBLISHERS = [ - 'Institute of Electrical and Electronics Engineers', - 'Institution of Electrical Engineers', - 'Association for Computing Machinery', - 'American Psychological Association', - 'IOS Press', - 'IOP Publishing', - 'American Chemical Society', - 'Royal Society of Chemistry (RSC)', - 'American Geophysical Union', - 'American College of Physicians', - 'New England Journal of Medicine', - 'BMJ', - 'RCN Publishing', - 'International Union of Crystallography', - 'Portland Press', - 'ASME International', + "Institute of Electrical and Electronics Engineers", + "Institution of Electrical Engineers", + "Association for Computing Machinery", + "American Psychological Association", + "IOS Press", + "IOP Publishing", + "American Chemical Society", + "Royal Society of Chemistry (RSC)", + "American Geophysical Union", + "American College of Physicians", + "New England Journal of Medicine", + "BMJ", + "RCN Publishing", + "International Union of Crystallography", + "Portland Press", + "ASME International", ] UNI_PRESS_PUBLISHERS = [ - 'Cambridge University Press', - 'Oxford University Press', - 'The University of Chicago Press', - 'MIT Press', + "Cambridge University Press", + "Oxford University Press", + "The University of Chicago Press", + "MIT Press", ] ARCHIVE_PUBLISHERS = [ - 'JSTOR', - 'Portico', + "JSTOR", + "Portico", ] REPOSITORY_PUBLISHERS = [ - 'PERSEE Program', - 'Social Science Electronic Publishing', - 'CAIRN', - 'CSIRO Publishing', + "PERSEE Program", + "Social Science Electronic Publishing", + "CAIRN", + "CSIRO Publishing", ] OTHER_PUBLISHERS = [ - 'African Journals Online', - 'Smithsonian Institution Biodiversity Heritage Library', - 'Canadian Science Publishing', - 'Philosophy Documentation Center', - 'Project MUSE', + "African Journals Online", + "Smithsonian Institution Biodiversity Heritage Library", + "Canadian Science Publishing", + "Philosophy 
Documentation Center", + "Project MUSE", ] + def parse_lang(s): - if not s or s in ('Not applicable', 'Multiple languages', 'Unknown'): + if not s or s in ("Not applicable", "Multiple languages", "Unknown"): return None try: if len(s) == 2: @@ -138,8 +138,9 @@ def parse_lang(s): except AttributeError: return None + def parse_country(s): - if not s or s in ('Unknown'): + if not s or s in ("Unknown"): return None try: if len(s) == 2: @@ -153,12 +154,13 @@ def parse_country(s): else: return None + def parse_mimetypes(val): # XXX: multiple mimetypes? if not val: return mimetype = None - if '/' in val: + if "/" in val: mimetype = val else: mimetype = MIMETYPE_MAP.get(val) @@ -166,13 +168,14 @@ def parse_mimetypes(val): return None return [mimetype] + def gaps_to_spans(first, last, gaps): if not gaps: return [[first, last]] if not (last >= first and max(gaps) < last and min(gaps) > first): # years seem mangled? will continue though print("mangled years: {}".format((first, last, gaps)), file=sys.stderr) - full = list(range(first, last+1)) + full = list(range(first, last + 1)) for missing in gaps: if missing in full: full.remove(missing) @@ -184,7 +187,7 @@ def gaps_to_spans(first, last, gaps): low = year last = year continue - if year != last+1: + if year != last + 1: spans.append([low, last]) low = year last = year @@ -193,15 +196,17 @@ def gaps_to_spans(first, last, gaps): spans.append([low, last]) return spans + def test_gaps(): - assert gaps_to_spans(1900, 1900, None) == \ - [[1900, 1900]] - assert gaps_to_spans(1900, 1903, None) == \ - [[1900, 1903]] - assert gaps_to_spans(1900, 1902, [1901]) == \ - [[1900, 1900], [1902, 1902]] - assert gaps_to_spans(1950, 1970, [1955, 1956, 1965]) == \ - [[1950, 1954], [1957, 1964], [1966, 1970]] + assert gaps_to_spans(1900, 1900, None) == [[1900, 1900]] + assert gaps_to_spans(1900, 1903, None) == [[1900, 1903]] + assert gaps_to_spans(1900, 1902, [1901]) == [[1900, 1900], [1902, 1902]] + assert gaps_to_spans(1950, 1970, [1955, 1956, 1965]) == [ + [1950, 1954], + [1957, 1964], + [1966, 1970], + ] + def merge_spans(old, new): if not new: @@ -211,7 +216,7 @@ def merge_spans(old, new): old.extend(new) years = set() for span in old: - for y in range(span[0], span[1]+1): + for y in range(span[0], span[1] + 1): years.add(y) if not years: return [] @@ -240,19 +245,14 @@ def merge_spans(old, new): spans.append([start, last]) return spans + def test_merge_spans(): - assert merge_spans([[5, 10]], [[10, 20]]) == \ - [[5, 20]] - assert merge_spans([[5, 9]], [[10, 20]]) == \ - [[5, 20]] - assert merge_spans([[5, 11]], [[10, 20]]) == \ - [[5, 20]] - assert merge_spans([], []) == \ - [] - assert merge_spans([[9, 11]], []) == \ - [[9,11]] - assert merge_spans([[2000, 2000]], [[1450, 1900]]) == \ - [[1450, 1900], [2000, 2000]] + assert merge_spans([[5, 10]], [[10, 20]]) == [[5, 20]] + assert merge_spans([[5, 9]], [[10, 20]]) == [[5, 20]] + assert merge_spans([[5, 11]], [[10, 20]]) == [[5, 20]] + assert merge_spans([], []) == [] + assert merge_spans([[9, 11]], []) == [[9, 11]] + assert merge_spans([[2000, 2000]], [[1450, 1900]]) == [[1450, 1900], [2000, 2000]] def unquote(s: str) -> str: @@ -260,7 +260,7 @@ def unquote(s: str) -> str: s = s[1:] if s.endswith('"') or s.endswith("'"): s = s[:-1] - if s.endswith('.'): + if s.endswith("."): s = s[:-1] return s.strip() @@ -283,6 +283,7 @@ def clean_str(s: Optional[str]) -> Optional[str]: s = unquote(ftfy.fix_text(s)) return s or None + def test_clean_str(): assert clean_str("") is None assert clean_str(" ") is None @@ -290,7 
+291,6 @@ def test_clean_str():
     assert clean_str(" Bloody work.") == "Bloody work"
 
 
-
 def clean_issn(s: str) -> Optional[str]:
     s = s.strip().upper()
     if len(s) == 8:
@@ -299,6 +299,7 @@ def clean_issn(s: str) -> Optional[str]:
         return None
     return s
 
+
 def test_clean_issn():
     assert clean_issn("1234-5678") == "1234-5678"
     assert clean_issn(" 12345678") == "1234-5678"
diff --git a/tests/test_database.py b/tests/test_database.py
index 3d41e79..dc75d23 100644
--- a/tests/test_database.py
+++ b/tests/test_database.py
@@ -1,4 +1,3 @@
-
 from chocula.database import IssnDatabase
 
 
@@ -6,10 +5,9 @@ def test_issn_database():
 
     issn_db = IssnDatabase(issn_issnl_file_path="tests/files/ISSN-to-ISSN-L.txt")
 
-    assert issn_db.issn2issnl('1234-5678') is None
-    assert issn_db.issn2issnl('0000-0000') is None
+    assert issn_db.issn2issnl("1234-5678") is None
+    assert issn_db.issn2issnl("0000-0000") is None
     # "The Lancet"
-    assert issn_db.issn2issnl('0140-6736') == '0140-6736'
-    assert issn_db.issn2issnl('1474-547X') == '0140-6736'
-
+    assert issn_db.issn2issnl("0140-6736") == "0140-6736"
+    assert issn_db.issn2issnl("1474-547X") == "0140-6736"
 
diff --git a/tests/test_directories.py b/tests/test_directories.py
index 90856bc..b366192 100644
--- a/tests/test_directories.py
+++ b/tests/test_directories.py
@@ -1,26 +1,29 @@
-
 import pytest
 
 from chocula import *
 
+
 @pytest.fixture
 def config():
     config = ChoculaConfig.from_file(sources_dir="tests/files/")
     return config
 
+
 @pytest.fixture
 def issn_db():
     return IssnDatabase(issn_issnl_file_path="tests/files/ISSN-to-ISSN-L.txt")
 
+
 @pytest.fixture
 def database(issn_db):
     db = ChoculaDatabase(db_file=":memory:", issn_db=issn_db)
     db.init_db()
     return db
 
+
 def test_all(config, database):
     for cls in ALL_CHOCULA_DIR_CLASSES:
         loader = cls(config)
         counts = loader.index_file(database)
-        assert counts['total'] >= 20
-        assert counts['inserted'] > 5
+        assert counts["total"] >= 20
+        assert counts["inserted"] > 5
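
Most of the files touched above are directory loaders that share a single interface: a source_slug, an open_file() that yields raw records, and a parse_record() that maps each record to a DirectoryInfo. As a reading aid only, here is a minimal sketch of a hypothetical loader in that style; the ExampleLoader class, the "example" config entry, and the CSV column names are illustrative assumptions and are not part of this changeset:

from typing import Iterable, Optional
import csv

from chocula.util import clean_str
from chocula.common import DirectoryLoader
from chocula.database import DirectoryInfo, HomepageUrl


class ExampleLoader(DirectoryLoader):
    """Hypothetical loader sketch; config key and column names are assumed."""

    source_slug = "example"

    def open_file(self) -> Iterable:
        # assumes a config entry named `example` with a `filepath`, like the loaders above
        return csv.DictReader(open(self.config.example.filepath))

    def parse_record(self, row) -> Optional[DirectoryInfo]:
        # skip rows without any ISSN, mirroring the other loaders
        if not row.get("ISSN"):
            return None
        info = DirectoryInfo(
            directory_slug=self.source_slug,
            raw_issn=row["ISSN"],
            name=clean_str(row.get("Title")),
        )
        homepage = HomepageUrl.from_url(row.get("URL"))
        if homepage:
            info.homepage_urls.append(homepage)
        return info

As in tests/test_directories.py above, such a loader would be constructed with a ChoculaConfig and run via loader.index_file(database), which returns per-source insert counts.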