import sys
import json
import sqlite3
import argparse
from dataclasses import dataclass
from typing import List, Dict, Tuple, Optional, Any, Sequence

import fatcat_openapi_client
import elasticsearch
from elasticsearch_dsl import Search, Q


@dataclass
class SimPubRow:
    """
    One row of the `sim_pub` table: a publication-level record, optionally
    linked to a fatcat container (see `IssueDB.load_pubs`).
    """

    sim_pubid: str
    pub_collection: str
    title: str
    issn: Optional[str]
    pub_type: Optional[str]
    publisher: Optional[str]
    container_issnl: Optional[str]
    container_ident: Optional[str]
    wikidata_qid: Optional[str]

    def tuple(self):
        """
        Returns the fields as a 9-tuple, in declaration order, as bound to
        the `sim_pub` INSERT statement.
        """
        return (
            self.sim_pubid,
            self.pub_collection,
            self.title,
            self.issn,
            self.pub_type,
            self.publisher,
            self.container_issnl,
            self.container_ident,
            self.wikidata_qid,
        )


@dataclass
class SimIssueRow:
    """
    One row of the `sim_issue` table: a single issue item.

    TODO:
    - distinguish between release count that can do full link with pages, or just in this year/volume/issue?
    """

    issue_item: str
    sim_pubid: str
    year: Optional[int]
    volume: Optional[str]
    issue: Optional[str]
    first_page: Optional[int]
    last_page: Optional[int]
    release_count: Optional[int]

    def tuple(self):
        """
        Returns the fields as an 8-tuple, in declaration order, as bound to
        the `sim_issue` INSERT statement.
        """
        return (
            self.issue_item,
            self.sim_pubid,
            self.year,
            self.volume,
            self.issue,
            self.first_page,
            self.last_page,
            self.release_count,
        )

    @classmethod
    def from_tuple(cls, row: Any):
        """
        Inverse of `tuple()`: builds a row object from an indexable sequence
        (eg, a `SELECT *` result row) with fields in declaration order.
        """
        return cls(
            issue_item=row[0],
            sim_pubid=row[1],
            year=row[2],
            volume=row[3],
            issue=row[4],
            first_page=row[5],
            last_page=row[6],
            release_count=row[7],
        )


@dataclass
class ReleaseCountsRow:
    """
    One row of the `release_counts` table: number of releases for a given
    publication, year, and volume.
    """

    sim_pubid: str
    year_in_sim: bool
    release_count: int
    year: Optional[int]
    volume: Optional[str]

    def tuple(self):
        """
        Returns the fields as a 5-tuple for the `release_counts` INSERT.

        The order here intentionally differs from the field declaration
        order; presumably it matches the table's column order (the schema
        file is not part of this module -- TODO confirm).
        """
        return (
            self.sim_pubid,
            self.year,
            self.volume,
            self.year_in_sim,
            self.release_count,
        )


def es_issue_count(es_client: Any, container_id: str, year: int, volume: str, issue: str) -> int:
    """
    Counts documents in the "fatcat_release" index which match all of the
    given container, year, volume, and issue (exact "term" filters).
    """
    search = (
        Search(using=es_client, index="fatcat_release")
        .filter("term", container_id=container_id)
        .filter("term", year=year)
        .filter("term", volume=volume)
        .filter("term", issue=issue)
        .extra(request_cache=True)
    )
    return search.count()


def es_container_aggs(es_client: Any, container_id: str) -> List[Dict[str, Any]]:
    """
    What is being returned is a list of dicts, each with year, volume, count keys.
    """
    search = Search(using=es_client, index="fatcat_release")
    search = search.filter("term", container_id=container_id)
    # nested aggregation: bucket by year, then by volume within each year
    # NOTE(review): no explicit bucket `size` is passed, so the backend's
    # default bucket limit applies -- confirm that is sufficient
    search.aggs.bucket("years", "terms", field="year").bucket(
        "volumes", "terms", field="volume"
    )
    # only the aggregations are wanted, not any hits
    search = search[:0]
    res = search.execute()
    ret = []
    for year in res.aggregations.years.buckets:
        for volume in year.volumes.buckets:
            ret.append(dict(count=volume.doc_count, year=year.key, volume=volume.key))
    return ret


class IssueDB:
    """
    Wrapper around a sqlite3 database holding the `sim_pub`, `sim_issue`,
    and `release_counts` tables, with loaders and cached lookups.
    """

    def __init__(self, db_file: str):
        """
        To create a temporary database, pass ":memory:" as db_file
        """
        self.db = sqlite3.connect(db_file, isolation_level="EXCLUSIVE")
        # in-memory lookup caches; misses are cached as None
        self._pubid2container_map: Dict[str, Optional[str]] = dict()
        self._container2pubid_map: Dict[str, Optional[str]] = dict()

    def init_db(self):
        """
        Sets database PRAGMAs, then executes the schema script, which is read
        from 'schema/issue_db.sql' (relative to the current working
        directory).
        """
        self.db.executescript(
            """
            PRAGMA main.page_size = 4096;
            PRAGMA main.cache_size = 20000;
            PRAGMA main.locking_mode = EXCLUSIVE;
            PRAGMA main.synchronous = OFF;
            """
        )
        with open("schema/issue_db.sql", "r") as fschema:
            self.db.executescript(fschema.read())

    def insert_sim_pub(self, pub: SimPubRow, cur: Any = None) -> None:
        """
        Inserts (or replaces) a single `sim_pub` row. Uses the passed cursor
        if any, otherwise a fresh one. Does not commit.
        """
        if cur is None:
            cur = self.db.cursor()
        cur.execute("INSERT OR REPLACE INTO sim_pub VALUES (?,?,?,?,?,?,?,?,?)", pub.tuple())

    def insert_sim_issue(self, issue: SimIssueRow, cur: Any = None) -> None:
        """
        Inserts (or replaces) a single `sim_issue` row. Uses the passed
        cursor if any, otherwise a fresh one. Does not commit.
        """
        if cur is None:
            cur = self.db.cursor()
        cur.execute("INSERT OR REPLACE INTO sim_issue VALUES (?,?,?,?,?,?,?,?)", issue.tuple())

    def insert_release_counts(self, counts: ReleaseCountsRow, cur: Any = None) -> None:
        """
        Inserts (or replaces) a single `release_counts` row. Uses the passed
        cursor if any, otherwise a fresh one. Does not commit.
        """
        if cur is None:
            cur = self.db.cursor()
        cur.execute("INSERT OR REPLACE INTO release_counts VALUES (?,?,?,?,?)", counts.tuple())

    def pubid2container(self, sim_pubid: str) -> Optional[str]:
        """
        Returns the `container_ident` stored for the given `sim_pubid`, or
        None if there is no such `sim_pub` row. Results (including misses)
        are cached in memory.
        """
        if sim_pubid in self._pubid2container_map:
            return self._pubid2container_map[sim_pubid]
        rows = list(
            self.db.execute(
                "SELECT container_ident FROM sim_pub WHERE sim_pubid = ?;", [sim_pubid]
            )
        )
        container_ident = rows[0][0] if rows else None
        self._pubid2container_map[sim_pubid] = container_ident
        return container_ident

    def container2pubid(self, container_ident: str) -> Optional[str]:
        """
        Returns the `sim_pubid` stored for the given `container_ident` (first
        matching row), or None if there is no such `sim_pub` row. Results
        (including misses) are cached in memory.
        """
        if container_ident in self._container2pubid_map:
            return self._container2pubid_map[container_ident]
        rows = list(
            self.db.execute(
                "SELECT sim_pubid FROM sim_pub WHERE container_ident = ?;", [container_ident]
            )
        )
        sim_pubid = rows[0][0] if rows else None
        # misses must go in *this* direction's cache; writing them to
        # _pubid2container_map would leave misses un-cached here and pollute
        # the other lookup with container idents as keys
        self._container2pubid_map[container_ident] = sim_pubid
        return sim_pubid

    def lookup_issue(self, sim_pubid: str, volume: str, issue: str) -> Optional[SimIssueRow]:
        """
        Returns the first `sim_issue` row matching the publication, volume,
        and issue exactly, or None if there is no match.
        """
        rows = list(
            self.db.execute(
                "SELECT * FROM sim_issue WHERE sim_pubid = ? AND volume = ? AND issue = ?;",
                [sim_pubid, volume, issue],
            )
        )
        if not rows:
            return None
        return SimIssueRow.from_tuple(rows[0])

    def load_pubs(self, json_lines: Sequence[str], api: Any):
        """
        Reads a file (or some other iterator) of JSON lines, parses them into a
        dict, then inserts rows.

        For records with an ISSN, the matching container is looked up via
        `api.lookup_container()`; a 404 response means "no container", any
        other API error is re-raised. All rows are committed at the end.
        """
        cur = self.db.cursor()
        try:
            for line in json_lines:
                if not line:
                    continue
                obj = json.loads(line)
                meta = obj["metadata"]
                assert "periodicals" in meta["collection"]
                container: Optional[fatcat_openapi_client.ContainerEntity] = None
                if meta.get("issn"):
                    try:
                        container = api.lookup_container(issnl=meta["issn"])
                    except fatcat_openapi_client.ApiException as ae:
                        # 404 just means no container with this ISSN-L
                        if ae.status != 404:
                            raise
                row = SimPubRow(
                    sim_pubid=meta["sim_pubid"],
                    pub_collection=meta["identifier"],
                    title=meta["title"],
                    issn=meta.get("issn"),
                    pub_type=meta.get("pub_type"),
                    publisher=meta.get("publisher"),
                    container_issnl=container and container.issnl,
                    container_ident=container and container.ident,
                    wikidata_qid=container and container.wikidata_qid,
                )
                self.insert_sim_pub(row, cur)
        finally:
            # don't leak the cursor if parsing or a lookup fails part way
            cur.close()
        self.db.commit()

    def load_issues(self, json_lines: Sequence[str], es_client: Any):
        """
        Reads a file (or some other iterator) of JSON lines, parses them into a
        dict, then inserts rows.

        When year, volume, and issue are all known, and the publication maps
        to a container, the release count for that issue is fetched from
        elasticsearch. All rows are committed at the end.
        """
        cur = self.db.cursor()
        try:
            for line in json_lines:
                if not line:
                    continue
                obj = json.loads(line)
                meta = obj["metadata"]
                assert "periodicals" in meta["collection"]
                issue_item = meta["identifier"]

                # don't index meta items
                # TODO: handle more weird suffixes like "1-2", "_part_1", "_index-contents"
                if issue_item.endswith(("_index", "_contents")):
                    continue

                sim_pubid = meta["sim_pubid"]

                # year is the leading 4 characters of the date, if numeric
                year: Optional[int] = None
                if meta.get("date") and meta["date"][:4].isdigit():
                    year = int(meta["date"][:4])
                volume = meta.get("volume")
                issue = meta.get("issue")

                # page range is the min/max over purely-numeric page numbers
                first_page: Optional[int] = None
                last_page: Optional[int] = None
                if obj.get("page_numbers"):
                    pages = [
                        p["pageNumber"]
                        for p in obj["page_numbers"]["pages"]
                        if p["pageNumber"]
                    ]
                    pages = [int(p) for p in pages if p.isdigit()]
                    if pages:
                        first_page = min(pages)
                        last_page = max(pages)

                release_count: Optional[int] = None
                if year and volume and issue:
                    container_id = self.pubid2container(sim_pubid)
                    if container_id:
                        release_count = es_issue_count(
                            es_client, container_id, year, volume, issue
                        )

                row = SimIssueRow(
                    issue_item=issue_item,
                    sim_pubid=sim_pubid,
                    year=year,
                    volume=volume,
                    issue=issue,
                    first_page=first_page,
                    last_page=last_page,
                    release_count=release_count,
                )
                self.insert_sim_issue(row, cur)
        finally:
            # don't leak the cursor if parsing or a lookup fails part way
            cur.close()
        self.db.commit()

    def load_counts(self, es_client: Any):
        """
        For every `sim_pub` row with a container, fetches per-year/volume
        release counts from elasticsearch and inserts them as
        `release_counts` rows. All rows are committed at the end.
        """
        # materialize the query first, as rows are inserted while iterating
        all_pub_containers = list(
            self.db.execute(
                "SELECT sim_pubid, container_ident FROM sim_pub WHERE container_ident IS NOT NULL;"
            )
        )
        cur: Any = self.db.cursor()
        try:
            for (sim_pubid, container_ident) in all_pub_containers:
                for agg in es_container_aggs(es_client, container_ident):
                    row = ReleaseCountsRow(
                        sim_pubid=sim_pubid,
                        year_in_sim=False,  # TODO
                        release_count=agg["count"],
                        year=agg["year"],
                        volume=agg["volume"],
                    )
                    self.insert_release_counts(row, cur)
        finally:
            # don't leak the cursor if an elasticsearch query fails part way
            cur.close()
        self.db.commit()


def main():
    """
    Run this command like:

        python -m fatcat_scholar.issue_db
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    subparsers = parser.add_subparsers()

    parser.add_argument("--db-file",
        help="sqlite3 database file to open",
        default='data/issue_db.sqlite',
        type=str)

    sub = subparsers.add_parser('init_db',
        help="create sqlite3 output file and tables")
    sub.set_defaults(func='init_db')

    sub = subparsers.add_parser('load_pubs',
        help="update container-level stats from JSON file")
    sub.set_defaults(func='load_pubs')
    sub.add_argument("json_file",
        help="collection-level metadata, as JSON-lines",
        nargs='?', default=sys.stdin, type=argparse.FileType('r'))

    sub = subparsers.add_parser('load_issues',
        help="update item-level stats from JSON file")
    sub.set_defaults(func='load_issues')
    sub.add_argument("json_file",
        help="item-level metadata, as JSON-lines",
        nargs='?', default=sys.stdin, type=argparse.FileType('r'))

    sub = subparsers.add_parser('load_counts',
        help="update volume-level stats from elasticsearch endpoint")
    sub.set_defaults(func='load_counts')

    args = parser.parse_args()
    # `func` is only set when a subcommand was given
    if not getattr(args, "func", None):
        print("tell me what to do! (try --help)")
        sys.exit(-1)

    idb = IssueDB(args.db_file)
    # both clients are constructed regardless of which subcommand runs
    api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient())
    es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki")

    if args.func == 'load_pubs':
        idb.load_pubs(args.json_file, api)
    elif args.func == 'load_issues':
        idb.load_issues(args.json_file, es_client)
    elif args.func == 'load_counts':
        idb.load_counts(es_client)
    else:
        # remaining subcommands (eg, init_db) are zero-argument methods
        func = getattr(idb, args.func)
        func()


if __name__ == "__main__":
    main()