diff options
Diffstat (limited to 'fatcat_scholar/issue_db.py')
-rw-r--r-- | fatcat_scholar/issue_db.py | 250 |
1 files changed, 164 insertions, 86 deletions
diff --git a/fatcat_scholar/issue_db.py b/fatcat_scholar/issue_db.py index 4f5ff53..12ffa32 100644 --- a/fatcat_scholar/issue_db.py +++ b/fatcat_scholar/issue_db.py @@ -1,4 +1,3 @@ - import sys import json import sqlite3 @@ -9,6 +8,7 @@ import fatcat_openapi_client import elasticsearch from elasticsearch_dsl import Search, Q + @dataclass class SimPubRow: sim_pubid: str @@ -23,7 +23,17 @@ class SimPubRow: wikidata_qid: Optional[str] def tuple(self): - return (self.sim_pubid, self.pub_collection, self.title, self.issn, self.pub_type, self.publisher, self.container_issnl, self.container_ident, self.wikidata_qid) + return ( + self.sim_pubid, + self.pub_collection, + self.title, + self.issn, + self.pub_type, + self.publisher, + self.container_issnl, + self.container_ident, + self.wikidata_qid, + ) @classmethod def from_tuple(cls, row: Any) -> "SimPubRow": @@ -39,6 +49,7 @@ class SimPubRow: wikidata_qid=row[8], ) + @dataclass class SimIssueRow: """ @@ -46,6 +57,7 @@ class SimIssueRow: - distinguish between release count that can do full link with pages, or just in this year/volume/issue? """ + issue_item: str sim_pubid: str year: Optional[int] @@ -56,7 +68,16 @@ class SimIssueRow: release_count: Optional[int] def tuple(self): - return (self.issue_item, self.sim_pubid, self.year, self.volume, self.issue, self.first_page, self.last_page, self.release_count) + return ( + self.issue_item, + self.sim_pubid, + self.year, + self.volume, + self.issue, + self.first_page, + self.last_page, + self.release_count, + ) @classmethod def from_tuple(cls, row: Any) -> "SimIssueRow": @@ -71,6 +92,7 @@ class SimIssueRow: release_count=row[7], ) + @dataclass class ReleaseCountsRow: sim_pubid: str @@ -80,82 +102,100 @@ class ReleaseCountsRow: volume: Optional[str] def tuple(self): - return (self.sim_pubid, self.year, self.volume, self.year_in_sim, self.release_count) + return ( + self.sim_pubid, + self.year, + self.volume, + self.year_in_sim, + self.release_count, + ) -def es_issue_count(es_client: Any, container_id: str, year: int, volume: str, issue: str) -> int: +def es_issue_count( + es_client: Any, container_id: str, year: int, volume: str, issue: str +) -> int: search = Search(using=es_client, index="fatcat_release") - search = search\ - .filter("term", container_id=container_id)\ - .filter("term", year=year)\ - .filter("term", volume=volume)\ - .filter("term", issue=issue)\ + search = ( + search.filter("term", container_id=container_id) + .filter("term", year=year) + .filter("term", volume=volume) + .filter("term", issue=issue) .extra(request_cache=True) + ) return search.count() + def es_container_aggs(es_client: Any, container_id: str) -> List[Dict[str, Any]]: """ What is being returned is a list of dicts, each with year, volume, count keys. """ search = Search(using=es_client, index="fatcat_release") - search = search\ - .filter("term", container_id=container_id) - search.aggs\ - .bucket('years', 'terms', field="year")\ - .bucket('volumes', 'terms', field="volume") + search = search.filter("term", container_id=container_id) + search.aggs.bucket("years", "terms", field="year").bucket( + "volumes", "terms", field="volume" + ) search = search[:0] res = search.execute() ret = [] for year in res.aggregations.years.buckets: for volume in year.volumes.buckets: ret.append(dict(count=volume.doc_count, year=year.key, volume=volume.key)) - #print(ret[-1]) + # print(ret[-1]) return ret -class IssueDB(): +class IssueDB: def __init__(self, db_file): """ To create a temporary database, pass ":memory:" as db_file """ - self.db = sqlite3.connect(db_file, isolation_level='EXCLUSIVE') + self.db = sqlite3.connect(db_file, isolation_level="EXCLUSIVE") self._pubid2container_map: Dict[str, Optional[str]] = dict() self._container2pubid_map: Dict[str, Optional[str]] = dict() def init_db(self): - self.db.executescript(""" + self.db.executescript( + """ PRAGMA main.page_size = 4096; PRAGMA main.cache_size = 20000; PRAGMA main.locking_mode = EXCLUSIVE; PRAGMA main.synchronous = OFF; - """) - with open('schema/issue_db.sql', 'r') as fschema: + """ + ) + with open("schema/issue_db.sql", "r") as fschema: self.db.executescript(fschema.read()) def insert_sim_pub(self, pub: SimPubRow, cur: Any = None) -> None: if not cur: cur = self.db.cursor() - cur.execute("INSERT OR REPLACE INTO sim_pub VALUES (?,?,?,?,?,?,?,?,?)", - pub.tuple()) + cur.execute( + "INSERT OR REPLACE INTO sim_pub VALUES (?,?,?,?,?,?,?,?,?)", pub.tuple() + ) def insert_sim_issue(self, issue: SimIssueRow, cur: Any = None) -> None: if not cur: cur = self.db.cursor() - cur.execute("INSERT OR REPLACE INTO sim_issue VALUES (?,?,?,?,?,?,?,?)", - issue.tuple()) + cur.execute( + "INSERT OR REPLACE INTO sim_issue VALUES (?,?,?,?,?,?,?,?)", issue.tuple() + ) def insert_release_counts(self, counts: ReleaseCountsRow, cur: Any = None) -> None: if not cur: cur = self.db.cursor() - cur.execute("INSERT OR REPLACE INTO release_counts VALUES (?,?,?,?,?)", - counts.tuple()) + cur.execute( + "INSERT OR REPLACE INTO release_counts VALUES (?,?,?,?,?)", counts.tuple() + ) def pubid2container(self, sim_pubid: str) -> Optional[str]: if sim_pubid in self._pubid2container_map: return self._pubid2container_map[sim_pubid] - row = list(self.db.execute("SELECT container_ident FROM sim_pub WHERE sim_pubid = ?;", [sim_pubid])) + row = list( + self.db.execute( + "SELECT container_ident FROM sim_pub WHERE sim_pubid = ?;", [sim_pubid] + ) + ) if row: self._pubid2container_map[sim_pubid] = row[0][0] return row[0][0] @@ -166,7 +206,12 @@ class IssueDB(): def container2pubid(self, container_ident: str) -> Optional[str]: if container_ident in self._container2pubid_map: return self._container2pubid_map[container_ident] - row = list(self.db.execute("SELECT sim_pubid FROM sim_pub WHERE container_ident = ?;", [container_ident])) + row = list( + self.db.execute( + "SELECT sim_pubid FROM sim_pub WHERE container_ident = ?;", + [container_ident], + ) + ) if row: self._container2pubid_map[container_ident] = row[0][0] return row[0][0] @@ -174,14 +219,23 @@ class IssueDB(): self._pubid2container_map[container_ident] = None return None - def lookup_issue(self, sim_pubid: str, volume: str, issue: str) -> Optional[SimIssueRow]: - row = list(self.db.execute("SELECT * FROM sim_issue WHERE sim_pubid = ? AND volume = ? AND issue = ?;", [sim_pubid, volume, issue])) + def lookup_issue( + self, sim_pubid: str, volume: str, issue: str + ) -> Optional[SimIssueRow]: + row = list( + self.db.execute( + "SELECT * FROM sim_issue WHERE sim_pubid = ? AND volume = ? AND issue = ?;", + [sim_pubid, volume, issue], + ) + ) if not row: return None return SimIssueRow.from_tuple(row[0]) def lookup_pub(self, sim_pubid: str) -> Optional[SimPubRow]: - row = list(self.db.execute("SELECT * FROM sim_pub WHERE sim_pubid = ?;", [sim_pubid])) + row = list( + self.db.execute("SELECT * FROM sim_pub WHERE sim_pubid = ?;", [sim_pubid]) + ) if not row: return None return SimPubRow.from_tuple(row[0]) @@ -196,22 +250,22 @@ class IssueDB(): if not line: continue obj = json.loads(line) - meta = obj['metadata'] - assert "periodicals" in meta['collection'] + meta = obj["metadata"] + assert "periodicals" in meta["collection"] container: Optional[fatcat_openapi_client.ContainerEntity] = None - if meta.get('issn'): + if meta.get("issn"): try: - container = api.lookup_container(issnl=meta['issn']) + container = api.lookup_container(issnl=meta["issn"]) except fatcat_openapi_client.ApiException as ae: if ae.status != 404: raise ae row = SimPubRow( - sim_pubid=meta['sim_pubid'], - pub_collection=meta['identifier'], - title=meta['title'], - issn=meta.get('issn'), - pub_type=meta.get('pub_type'), - publisher=meta.get('publisher'), + sim_pubid=meta["sim_pubid"], + pub_collection=meta["identifier"], + title=meta["title"], + issn=meta.get("issn"), + pub_type=meta.get("pub_type"), + publisher=meta.get("publisher"), container_issnl=container and container.issnl, container_ident=container and container.ident, wikidata_qid=container and container.wikidata_qid, @@ -230,28 +284,32 @@ class IssueDB(): if not line: continue obj = json.loads(line) - meta = obj['metadata'] - assert "periodicals" in meta['collection'] - #pub_collection = [c for c in meta['collection'] if c.startswith("pub_")][0] - issue_item = meta['identifier'] + meta = obj["metadata"] + assert "periodicals" in meta["collection"] + # pub_collection = [c for c in meta['collection'] if c.startswith("pub_")][0] + issue_item = meta["identifier"] # don't index meta items # TODO: handle more weird suffixes like "1-2", "_part_1", "_index-contents" if issue_item.endswith("_index") or issue_item.endswith("_contents"): continue - sim_pubid=meta['sim_pubid'] + sim_pubid = meta["sim_pubid"] year: Optional[int] = None - if meta.get('date') and meta['date'][:4].isdigit(): - year = int(meta['date'][:4]) - volume = meta.get('volume') - issue = meta.get('issue') + if meta.get("date") and meta["date"][:4].isdigit(): + year = int(meta["date"][:4]) + volume = meta.get("volume") + issue = meta.get("issue") first_page: Optional[int] = None last_page: Optional[int] = None - if obj.get('page_numbers'): - pages = [p['pageNumber'] for p in obj['page_numbers']['pages'] if p['pageNumber']] + if obj.get("page_numbers"): + pages = [ + p["pageNumber"] + for p in obj["page_numbers"]["pages"] + if p["pageNumber"] + ] pages = [int(p) for p in pages if p.isdigit()] if len(pages): first_page = min(pages) @@ -261,7 +319,9 @@ class IssueDB(): if year and volume and issue: container_id = self.pubid2container(sim_pubid) if container_id: - release_count = es_issue_count(es_client, container_id, year, volume, issue) + release_count = es_issue_count( + es_client, container_id, year, volume, issue + ) row = SimIssueRow( issue_item=issue_item, @@ -278,17 +338,21 @@ class IssueDB(): self.db.commit() def load_counts(self, es_client: Any): - all_pub_containers = list(self.db.execute('SELECT sim_pubid, container_ident FROM sim_pub WHERE container_ident IS NOT NULL;')) + all_pub_containers = list( + self.db.execute( + "SELECT sim_pubid, container_ident FROM sim_pub WHERE container_ident IS NOT NULL;" + ) + ) cur: Any = self.db.cursor() for (sim_pubid, container_ident) in all_pub_containers: aggs = es_container_aggs(es_client, container_ident) for agg in aggs: row = ReleaseCountsRow( sim_pubid=sim_pubid, - year_in_sim=False, # TODO - release_count=agg['count'], - year=agg['year'], - volume=agg['volume'], + year_in_sim=False, # TODO + release_count=agg["count"], + year=agg["year"], + volume=agg["volume"], ) self.insert_release_counts(row, cur) cur.close() @@ -303,35 +367,48 @@ def main(): """ parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) subparsers = parser.add_subparsers() - parser.add_argument("--db-file", + parser.add_argument( + "--db-file", help="sqlite3 database file to open", - default='data/issue_db.sqlite', - type=str) - - sub = subparsers.add_parser('init_db', - help="create sqlite3 output file and tables") - sub.set_defaults(func='init_db') - - sub = subparsers.add_parser('load_pubs', - help="update container-level stats from JSON file") - sub.set_defaults(func='load_pubs') - sub.add_argument("json_file", + default="data/issue_db.sqlite", + type=str, + ) + + sub = subparsers.add_parser("init_db", help="create sqlite3 output file and tables") + sub.set_defaults(func="init_db") + + sub = subparsers.add_parser( + "load_pubs", help="update container-level stats from JSON file" + ) + sub.set_defaults(func="load_pubs") + sub.add_argument( + "json_file", help="collection-level metadata, as JSON-lines", - nargs='?', default=sys.stdin, type=argparse.FileType('r')) - - sub = subparsers.add_parser('load_issues', - help="update item-level stats from JSON file") - sub.set_defaults(func='load_issues') - sub.add_argument("json_file", + nargs="?", + default=sys.stdin, + type=argparse.FileType("r"), + ) + + sub = subparsers.add_parser( + "load_issues", help="update item-level stats from JSON file" + ) + sub.set_defaults(func="load_issues") + sub.add_argument( + "json_file", help="item-level metadata, as JSON-lines", - nargs='?', default=sys.stdin, type=argparse.FileType('r')) + nargs="?", + default=sys.stdin, + type=argparse.FileType("r"), + ) - sub = subparsers.add_parser('load_counts', - help="update volume-level stats from elasticsearch endpoint") - sub.set_defaults(func='load_counts') + sub = subparsers.add_parser( + "load_counts", help="update volume-level stats from elasticsearch endpoint" + ) + sub.set_defaults(func="load_counts") args = parser.parse_args() if not args.__dict__.get("func"): @@ -342,15 +419,16 @@ def main(): api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient()) es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki") - if args.func == 'load_pubs': + if args.func == "load_pubs": idb.load_pubs(args.json_file, api) - elif args.func == 'load_issues': + elif args.func == "load_issues": idb.load_issues(args.json_file, es_client) - elif args.func == 'load_counts': + elif args.func == "load_counts": idb.load_counts(es_client) else: func = getattr(idb, args.func) func() -if __name__=="__main__": + +if __name__ == "__main__": main() |