1 files changed, 164 insertions, 86 deletions
diff --git a/fatcat_scholar/issue_db.py b/fatcat_scholar/issue_db.py
index 4f5ff53..12ffa32 100644
--- a/fatcat_scholar/issue_db.py
+++ b/fatcat_scholar/issue_db.py
@@ -1,4 +1,3 @@
-
 import sys
 import json
 import sqlite3
@@ -9,6 +8,7 @@ import fatcat_openapi_client
 import elasticsearch
 from elasticsearch_dsl import Search, Q
 
+
 @dataclass
 class SimPubRow:
     sim_pubid: str
@@ -23,7 +23,17 @@ class SimPubRow:
     wikidata_qid: Optional[str]
 
     def tuple(self):
-        return (self.sim_pubid, self.pub_collection, self.title, self.issn, self.pub_type, self.publisher, self.container_issnl, self.container_ident, self.wikidata_qid)
+        return (
+            self.sim_pubid,
+            self.pub_collection,
+            self.title,
+            self.issn,
+            self.pub_type,
+            self.publisher,
+            self.container_issnl,
+            self.container_ident,
+            self.wikidata_qid,
+        )
 
     @classmethod
     def from_tuple(cls, row: Any) -> "SimPubRow":
@@ -39,6 +49,7 @@ class SimPubRow:
             wikidata_qid=row[8],
         )
 
+
 @dataclass
 class SimIssueRow:
     """
@@ -46,6 +57,7 @@ class SimIssueRow:
     - distinguish between release count that can do full link with pages, or
       just in this year/volume/issue?
     """
+
     issue_item: str
     sim_pubid: str
     year: Optional[int]
@@ -56,7 +68,16 @@ class SimIssueRow:
     release_count: Optional[int]
 
     def tuple(self):
-        return (self.issue_item, self.sim_pubid, self.year, self.volume, self.issue, self.first_page, self.last_page, self.release_count)
+        return (
+            self.issue_item,
+            self.sim_pubid,
+            self.year,
+            self.volume,
+            self.issue,
+            self.first_page,
+            self.last_page,
+            self.release_count,
+        )
 
     @classmethod
     def from_tuple(cls, row: Any) -> "SimIssueRow":
@@ -71,6 +92,7 @@ class SimIssueRow:
             release_count=row[7],
         )
 
+
 @dataclass
 class ReleaseCountsRow:
     sim_pubid: str
@@ -80,82 +102,100 @@ class ReleaseCountsRow:
     volume: Optional[str]
 
     def tuple(self):
-        return (self.sim_pubid, self.year, self.volume, self.year_in_sim, self.release_count)
+        return (
+            self.sim_pubid,
+            self.year,
+            self.volume,
+            self.year_in_sim,
+            self.release_count,
+        )
 
 
-def es_issue_count(es_client: Any, container_id: str, year: int, volume: str, issue: str) -> int:
+def es_issue_count(
+    es_client: Any, container_id: str, year: int, volume: str, issue: str
+) -> int:
     search = Search(using=es_client, index="fatcat_release")
-    search = search\
-        .filter("term", container_id=container_id)\
-        .filter("term", year=year)\
-        .filter("term", volume=volume)\
-        .filter("term", issue=issue)\
+    search = (
+        search.filter("term", container_id=container_id)
+        .filter("term", year=year)
+        .filter("term", volume=volume)
+        .filter("term", issue=issue)
         .extra(request_cache=True)
+    )
 
     return search.count()
 
+
 def es_container_aggs(es_client: Any, container_id: str) -> List[Dict[str, Any]]:
     """
     What is being returned is a list of dicts, each with year, volume, count
     keys.
     """
     search = Search(using=es_client, index="fatcat_release")
-    search = search\
-        .filter("term", container_id=container_id)
-    search.aggs\
-        .bucket('years', 'terms', field="year")\
-        .bucket('volumes', 'terms', field="volume")
+    search = search.filter("term", container_id=container_id)
+    search.aggs.bucket("years", "terms", field="year").bucket(
+        "volumes", "terms", field="volume"
+    )
     search = search[:0]
     res = search.execute()
     ret = []
     for year in res.aggregations.years.buckets:
         for volume in year.volumes.buckets:
             ret.append(dict(count=volume.doc_count, year=year.key, volume=volume.key))
-            #print(ret[-1])
+            # print(ret[-1])
     return ret
 
-class IssueDB():
 
+class IssueDB:
     def __init__(self, db_file):
         """
         To create a temporary database, pass ":memory:" as db_file
         """
-        self.db = sqlite3.connect(db_file, isolation_level='EXCLUSIVE')
+        self.db = sqlite3.connect(db_file, isolation_level="EXCLUSIVE")
         self._pubid2container_map: Dict[str, Optional[str]] = dict()
         self._container2pubid_map: Dict[str, Optional[str]] = dict()
 
     def init_db(self):
-        self.db.executescript("""
+        self.db.executescript(
+            """
             PRAGMA main.page_size = 4096;
             PRAGMA main.cache_size = 20000;
             PRAGMA main.locking_mode = EXCLUSIVE;
             PRAGMA main.synchronous = OFF;
-        """)
-        with open('schema/issue_db.sql', 'r') as fschema:
+        """
+        )
+        with open("schema/issue_db.sql", "r") as fschema:
             self.db.executescript(fschema.read())
 
     def insert_sim_pub(self, pub: SimPubRow, cur: Any = None) -> None:
         if not cur:
             cur = self.db.cursor()
-        cur.execute("INSERT OR REPLACE INTO sim_pub VALUES (?,?,?,?,?,?,?,?,?)",
-            pub.tuple())
+        cur.execute(
+            "INSERT OR REPLACE INTO sim_pub VALUES (?,?,?,?,?,?,?,?,?)", pub.tuple()
+        )
 
     def insert_sim_issue(self, issue: SimIssueRow, cur: Any = None) -> None:
         if not cur:
             cur = self.db.cursor()
-        cur.execute("INSERT OR REPLACE INTO sim_issue VALUES (?,?,?,?,?,?,?,?)",
-            issue.tuple())
+        cur.execute(
+            "INSERT OR REPLACE INTO sim_issue VALUES (?,?,?,?,?,?,?,?)", issue.tuple()
+        )
 
     def insert_release_counts(self, counts: ReleaseCountsRow, cur: Any = None) -> None:
         if not cur:
             cur = self.db.cursor()
-        cur.execute("INSERT OR REPLACE INTO release_counts VALUES (?,?,?,?,?)",
-            counts.tuple())
+        cur.execute(
+            "INSERT OR REPLACE INTO release_counts VALUES (?,?,?,?,?)", counts.tuple()
+        )
 
     def pubid2container(self, sim_pubid: str) -> Optional[str]:
         if sim_pubid in self._pubid2container_map:
             return self._pubid2container_map[sim_pubid]
-        row = list(self.db.execute("SELECT container_ident FROM sim_pub WHERE sim_pubid = ?;", [sim_pubid]))
+        row = list(
+            self.db.execute(
+                "SELECT container_ident FROM sim_pub WHERE sim_pubid = ?;", [sim_pubid]
+            )
+        )
         if row:
             self._pubid2container_map[sim_pubid] = row[0][0]
             return row[0][0]
@@ -166,7 +206,12 @@ class IssueDB():
     def container2pubid(self, container_ident: str) -> Optional[str]:
         if container_ident in self._container2pubid_map:
             return self._container2pubid_map[container_ident]
-        row = list(self.db.execute("SELECT sim_pubid FROM sim_pub WHERE container_ident = ?;", [container_ident]))
+        row = list(
+            self.db.execute(
+                "SELECT sim_pubid FROM sim_pub WHERE container_ident = ?;",
+                [container_ident],
+            )
+        )
         if row:
             self._container2pubid_map[container_ident] = row[0][0]
             return row[0][0]
@@ -174,14 +219,23 @@ class IssueDB():
-            self._pubid2container_map[container_ident] = None
+            self._container2pubid_map[container_ident] = None
             return None
 
-    def lookup_issue(self, sim_pubid: str, volume: str, issue: str) -> Optional[SimIssueRow]:
-        row = list(self.db.execute("SELECT * FROM sim_issue WHERE sim_pubid = ? AND volume = ? AND issue = ?;", [sim_pubid, volume, issue]))
+    def lookup_issue(
+        self, sim_pubid: str, volume: str, issue: str
+    ) -> Optional[SimIssueRow]:
+        row = list(
+            self.db.execute(
+                "SELECT * FROM sim_issue WHERE sim_pubid = ? AND volume = ? AND issue = ?;",
+                [sim_pubid, volume, issue],
+            )
+        )
         if not row:
             return None
         return SimIssueRow.from_tuple(row[0])
 
     def lookup_pub(self, sim_pubid: str) -> Optional[SimPubRow]:
-        row = list(self.db.execute("SELECT * FROM sim_pub WHERE sim_pubid = ?;", [sim_pubid]))
+        row = list(
+            self.db.execute("SELECT * FROM sim_pub WHERE sim_pubid = ?;", [sim_pubid])
+        )
         if not row:
             return None
         return SimPubRow.from_tuple(row[0])
@@ -196,22 +250,22 @@ class IssueDB():
             if not line:
                 continue
             obj = json.loads(line)
-            meta = obj['metadata']
-            assert "periodicals" in meta['collection']
+            meta = obj["metadata"]
+            assert "periodicals" in meta["collection"]
             container: Optional[fatcat_openapi_client.ContainerEntity] = None
-            if meta.get('issn'):
+            if meta.get("issn"):
                 try:
-                    container = api.lookup_container(issnl=meta['issn'])
+                    container = api.lookup_container(issnl=meta["issn"])
                 except fatcat_openapi_client.ApiException as ae:
                     if ae.status != 404:
                         raise ae
             row = SimPubRow(
-                sim_pubid=meta['sim_pubid'],
-                pub_collection=meta['identifier'],
-                title=meta['title'],
-                issn=meta.get('issn'),
-                pub_type=meta.get('pub_type'),
-                publisher=meta.get('publisher'),
+                sim_pubid=meta["sim_pubid"],
+                pub_collection=meta["identifier"],
+                title=meta["title"],
+                issn=meta.get("issn"),
+                pub_type=meta.get("pub_type"),
+                publisher=meta.get("publisher"),
                 container_issnl=container and container.issnl,
                 container_ident=container and container.ident,
                 wikidata_qid=container and container.wikidata_qid,
@@ -230,28 +284,32 @@ class IssueDB():
             if not line:
                 continue
             obj = json.loads(line)
-            meta = obj['metadata']
-            assert "periodicals" in meta['collection']
-            #pub_collection = [c for c in meta['collection'] if c.startswith("pub_")][0]
-            issue_item = meta['identifier']
+            meta = obj["metadata"]
+            assert "periodicals" in meta["collection"]
+            # pub_collection = [c for c in meta['collection'] if c.startswith("pub_")][0]
+            issue_item = meta["identifier"]
 
             # don't index meta items
             # TODO: handle more weird suffixes like "1-2", "_part_1", "_index-contents"
             if issue_item.endswith("_index") or issue_item.endswith("_contents"):
                 continue
 
-            sim_pubid=meta['sim_pubid']
+            sim_pubid = meta["sim_pubid"]
 
             year: Optional[int] = None
-            if meta.get('date') and meta['date'][:4].isdigit():
-                year = int(meta['date'][:4])
-            volume = meta.get('volume')
-            issue = meta.get('issue')
+            if meta.get("date") and meta["date"][:4].isdigit():
+                year = int(meta["date"][:4])
+            volume = meta.get("volume")
+            issue = meta.get("issue")
 
             first_page: Optional[int] = None
             last_page: Optional[int] = None
-            if obj.get('page_numbers'):
-                pages = [p['pageNumber'] for p in obj['page_numbers']['pages'] if p['pageNumber']]
+            if obj.get("page_numbers"):
+                pages = [
+                    p["pageNumber"]
+                    for p in obj["page_numbers"]["pages"]
+                    if p["pageNumber"]
+                ]
                 pages = [int(p) for p in pages if p.isdigit()]
                 if len(pages):
                     first_page = min(pages)
@@ -261,7 +319,9 @@ class IssueDB():
             if year and volume and issue:
                 container_id = self.pubid2container(sim_pubid)
                 if container_id:
-                    release_count = es_issue_count(es_client, container_id, year, volume, issue)
+                    release_count = es_issue_count(
+                        es_client, container_id, year, volume, issue
+                    )
 
             row = SimIssueRow(
                 issue_item=issue_item,
@@ -278,17 +338,21 @@ class IssueDB():
         self.db.commit()
 
     def load_counts(self, es_client: Any):
-        all_pub_containers = list(self.db.execute('SELECT sim_pubid, container_ident FROM sim_pub WHERE container_ident IS NOT NULL;'))
+        all_pub_containers = list(
+            self.db.execute(
+                "SELECT sim_pubid, container_ident FROM sim_pub WHERE container_ident IS NOT NULL;"
+            )
+        )
         cur: Any = self.db.cursor()
         for (sim_pubid, container_ident) in all_pub_containers:
             aggs = es_container_aggs(es_client, container_ident)
             for agg in aggs:
                 row = ReleaseCountsRow(
                     sim_pubid=sim_pubid,
-                    year_in_sim=False, # TODO
-                    release_count=agg['count'],
-                    year=agg['year'],
-                    volume=agg['volume'],
+                    year_in_sim=False,  # TODO
+                    release_count=agg["count"],
+                    year=agg["year"],
+                    volume=agg["volume"],
                 )
                 self.insert_release_counts(row, cur)
         cur.close()
@@ -303,35 +367,48 @@ def main():
     """
 
     parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
     subparsers = parser.add_subparsers()
 
-    parser.add_argument("--db-file",
+    parser.add_argument(
+        "--db-file",
         help="sqlite3 database file to open",
-        default='data/issue_db.sqlite',
-        type=str)
-
-    sub = subparsers.add_parser('init_db',
-        help="create sqlite3 output file and tables")
-    sub.set_defaults(func='init_db')
-
-    sub = subparsers.add_parser('load_pubs',
-        help="update container-level stats from JSON file")
-    sub.set_defaults(func='load_pubs')
-    sub.add_argument("json_file",
+        default="data/issue_db.sqlite",
+        type=str,
+    )
+
+    sub = subparsers.add_parser("init_db", help="create sqlite3 output file and tables")
+    sub.set_defaults(func="init_db")
+
+    sub = subparsers.add_parser(
+        "load_pubs", help="update container-level stats from JSON file"
+    )
+    sub.set_defaults(func="load_pubs")
+    sub.add_argument(
+        "json_file",
         help="collection-level metadata, as JSON-lines",
-        nargs='?', default=sys.stdin, type=argparse.FileType('r'))
-
-    sub = subparsers.add_parser('load_issues',
-        help="update item-level stats from JSON file")
-    sub.set_defaults(func='load_issues')
-    sub.add_argument("json_file",
+        nargs="?",
+        default=sys.stdin,
+        type=argparse.FileType("r"),
+    )
+
+    sub = subparsers.add_parser(
+        "load_issues", help="update item-level stats from JSON file"
+    )
+    sub.set_defaults(func="load_issues")
+    sub.add_argument(
+        "json_file",
         help="item-level metadata, as JSON-lines",
-        nargs='?', default=sys.stdin, type=argparse.FileType('r'))
+        nargs="?",
+        default=sys.stdin,
+        type=argparse.FileType("r"),
+    )
 
-    sub = subparsers.add_parser('load_counts',
-        help="update volume-level stats from elasticsearch endpoint")
-    sub.set_defaults(func='load_counts')
+    sub = subparsers.add_parser(
+        "load_counts", help="update volume-level stats from elasticsearch endpoint"
+    )
+    sub.set_defaults(func="load_counts")
 
     args = parser.parse_args()
     if not args.__dict__.get("func"):
@@ -342,15 +419,16 @@ def main():
     api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient())
     es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki")
 
-    if args.func == 'load_pubs':
+    if args.func == "load_pubs":
         idb.load_pubs(args.json_file, api)
-    elif args.func == 'load_issues':
+    elif args.func == "load_issues":
         idb.load_issues(args.json_file, es_client)
-    elif args.func == 'load_counts':
+    elif args.func == "load_counts":
         idb.load_counts(es_client)
     else:
         func = getattr(idb, args.func)
         func()
 
-if __name__=="__main__":
+
+if __name__ == "__main__":
     main()