author | Bryan Newbold <bnewbold@archive.org> | 2020-05-15 18:06:55 -0700
committer | Bryan Newbold <bnewbold@archive.org> | 2020-05-15 18:06:55 -0700
commit | 3ed9fe3aac4b79d16ffa9d8008fefb0363a88468 (patch)
tree | bb65eaf6a815f2f5830da07df7ce556ffb7e7e2e
parent | 565d72d24dbdef3c549a2ff111cda4343281607d (diff)
download | fatcat-scholar-3ed9fe3aac4b79d16ffa9d8008fefb0363a88468.tar.gz, fatcat-scholar-3ed9fe3aac4b79d16ffa9d8008fefb0363a88468.zip
first pass at issue-db tool
-rw-r--r-- | fatcat_scholar/issue_db.py | 286
-rw-r--r-- | schema/issue_db.sql | 35
2 files changed, 321 insertions, 0 deletions
diff --git a/fatcat_scholar/issue_db.py b/fatcat_scholar/issue_db.py
new file mode 100644
index 0000000..7fdc4a9

```python
import sys
import json
import sqlite3
import argparse
from dataclasses import dataclass
from typing import List, Dict, Tuple, Optional, Any, Sequence

import fatcat_openapi_client
from fatcat_openapi_client import ContainerEntity
import elasticsearch
from elasticsearch_dsl import Search, Q


@dataclass
class SimPubRow:
    sim_pubid: str
    pub_collection: str
    title: str
    issn: Optional[str]
    pub_type: Optional[str]
    publisher: Optional[str]

    container_issnl: Optional[str]
    container_ident: Optional[str]
    wikidata_qid: Optional[str]

    def tuple(self):
        return (self.sim_pubid, self.pub_collection, self.title, self.issn,
                self.pub_type, self.publisher, self.container_issnl,
                self.container_ident, self.wikidata_qid)


@dataclass
class SimIssueRow:
    """
    TODO:
    - distinguish between a release count that can do a full link with pages,
      versus one that just matches this year/volume/issue?
    """
    issue_item: str
    sim_pubid: str
    year: Optional[int]
    volume: Optional[str]
    issue: Optional[str]
    first_page: Optional[int]
    last_page: Optional[int]
    release_count: Optional[int]

    def tuple(self):
        return (self.issue_item, self.sim_pubid, self.year, self.volume,
                self.issue, self.first_page, self.last_page, self.release_count)


@dataclass
class ReleaseCountsRow:
    sim_pubid: str
    year_in_sim: bool
    release_count: int
    year: Optional[int]
    volume: Optional[str]

    def tuple(self):
        return (self.sim_pubid, self.year, self.volume, self.year_in_sim,
                self.release_count)


def es_issue_count(es_client: Any, container_id: str, year: int, volume: str, issue: str) -> int:
    # count fatcat releases matching a specific container/year/volume/issue
    search = Search(using=es_client, index="fatcat_release") \
        .filter("term", container_id=container_id) \
        .filter("term", year=year) \
        .filter("term", volume=volume) \
        .filter("term", issue=issue)
    return search.count()


def es_container_aggs(es_client: Any, container_id: str) -> Dict[str, Any]:
    """
    Returns preservation status counts for all releases in a container.
    """
    query = {
        "size": 0,
        "query": {
            "term": {"container_id": container_id}
        },
        "aggs": {"container_stats": {"filters": {"filters": {
            "in_web": {"term": {"in_web": "true"}},
            "in_kbart": {"term": {"in_kbart": "true"}},
            "is_preserved": {"term": {"is_preserved": "true"}},
        }}}},
    }
    resp = es_client.search(index="fatcat_release", body=query, request_cache=True)
    buckets = resp['aggregations']['container_stats']['buckets']
    stats = {
        'ident': container_id,
        'total': resp['hits']['total'],
        'in_web': buckets['in_web']['doc_count'],
        'in_kbart': buckets['in_kbart']['doc_count'],
        'is_preserved': buckets['is_preserved']['doc_count'],
    }
    return stats
```
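As a quick sanity check, the two helpers above can be exercised directly against the production search endpoint named in the code. A minimal sketch, assuming the module is importable; the container ident here is a made-up placeholder, not a real fatcat identifier:

```python
import elasticsearch
from fatcat_scholar.issue_db import es_issue_count, es_container_aggs

es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki")

# "aaaaaaaaaaaaaaaaaaaaaaaa" is a placeholder, not a real container ident
count = es_issue_count(es_client, "aaaaaaaaaaaaaaaaaaaaaaaa",
                       year=1969, volume="12", issue="3")
print(count)

stats = es_container_aggs(es_client, "aaaaaaaaaaaaaaaaaaaaaaaa")
print(stats["total"], stats["in_kbart"], stats["is_preserved"])
```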
The `IssueDB` class wraps the sqlite3 database:

```python
class IssueDB():

    def __init__(self, db_file):
        """
        To create a temporary database, pass ":memory:" as db_file.
        """
        self.db = sqlite3.connect(db_file, isolation_level='EXCLUSIVE')
        self._pubid2container_map: Dict[str, Optional[str]] = dict()

    def init_db(self):
        self.db.executescript("""
            PRAGMA main.page_size = 4096;
            PRAGMA main.cache_size = 20000;
            PRAGMA main.locking_mode = EXCLUSIVE;
            PRAGMA main.synchronous = OFF;
        """)
        with open('schema/issue_db.sql', 'r') as fschema:
            self.db.executescript(fschema.read())

    def insert_sim_pub(self, pub: SimPubRow, cur: Any = None) -> None:
        if not cur:
            cur = self.db.cursor()
        cur.execute("INSERT OR REPLACE INTO sim_pub VALUES (?,?,?,?,?,?,?,?,?)",
            pub.tuple())

    def insert_sim_issue(self, issue: SimIssueRow, cur: Any = None) -> None:
        if not cur:
            cur = self.db.cursor()
        cur.execute("INSERT OR REPLACE INTO sim_issue VALUES (?,?,?,?,?,?,?,?)",
            issue.tuple())

    def insert_release_counts(self, counts: ReleaseCountsRow, cur: Any = None) -> None:
        if not cur:
            cur = self.db.cursor()
        # five columns in release_counts, matching ReleaseCountsRow.tuple()
        cur.execute("INSERT OR REPLACE INTO release_counts VALUES (?,?,?,?,?)",
            counts.tuple())

    def pubid2container(self, sim_pubid: str) -> Optional[str]:
        # memoized lookup of container_ident by SIM pubid
        if sim_pubid in self._pubid2container_map:
            return self._pubid2container_map[sim_pubid]
        row = list(self.db.execute("SELECT container_ident FROM sim_pub WHERE sim_pubid = ?;", [sim_pubid]))
        if row:
            self._pubid2container_map[sim_pubid] = row[0][0]
            return row[0][0]
        else:
            self._pubid2container_map[sim_pubid] = None
            return None

    def load_pubs(self, json_lines: Sequence[str], api: Any):
        """
        Reads a file (or some other iterator) of JSON lines, parses them into
        dicts, then inserts rows.
        """
        cur = self.db.cursor()
        for line in json_lines:
            if not line:
                continue
            obj = json.loads(line)
            meta = obj['metadata']
            assert "periodicals" in meta['collection']
            container: Optional[ContainerEntity] = None
            if meta.get('issn'):
                try:
                    container = api.lookup_container(issnl=meta['issn'])
                except fatcat_openapi_client.ApiException as ae:
                    if ae.status != 404:
                        raise ae
            row = SimPubRow(
                sim_pubid=meta['sim_pubid'],
                pub_collection=meta['identifier'],
                title=meta['title'],
                issn=meta.get('issn'),
                pub_type=meta.get('pub_type'),
                publisher=meta.get('publisher'),
                container_issnl=container and container.issnl,
                container_ident=container and container.ident,
                wikidata_qid=container and container.wikidata_qid,
            )
            self.insert_sim_pub(row, cur)
        cur.close()
        self.db.commit()
```
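`load_pubs` expects collection-level metadata as JSON lines. A minimal sketch of feeding it one record; the metadata line is entirely hypothetical, with field names matching what the code reads. Omitting `issn` means no fatcat container lookup is attempted, so `api` can be None:

```python
import json
from fatcat_scholar.issue_db import IssueDB

# Hypothetical metadata line: field names match what load_pubs() reads, but
# every value here is invented. With no 'issn' key, no fatcat container
# lookup happens, so api can be None.
example_line = json.dumps({
    "metadata": {
        "collection": ["periodicals"],
        "identifier": "pub_example-journal",
        "sim_pubid": "0001",
        "title": "Example Journal of Examples",
    }
})

# assumes the working directory is the repo root, so init_db() can read
# schema/issue_db.sql
idb = IssueDB(":memory:")
idb.init_db()
idb.load_pubs([example_line], api=None)
print(idb.pubid2container("0001"))  # -> None (no container matched)
```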
+ """ + cur = self.db.cursor() + for line in json_lines: + if not line: + continue + obj = json.loads(line) + meta = obj['metadata'] + assert "periodicals" in meta['collection'] + #pub_collection = [c for c in meta['collection'] if c.startswith("pub_")][0] + issue_item = meta['identifier'] + + # don't index meta items + # TODO: handle more weird suffixes like "1-2", "_part_1", "_index-contents" + if issue_item.endswith("_index") or issue_item.endswith("_contents"): + continue + + sim_pubid=meta['sim_pubid'] + + year: Optional[int] = None + if meta.get('date'): + year = int(meta['date'][:4]) + volume = meta.get('volume') + issue = meta.get('issue') + + first_page: Optional[int] = None + last_page: Optional[int] = None + if obj.get('page_numbers'): + pages = [p['pageNumber'] for p in obj['page_numbers']['pages'] if p['pageNumber']] + pages = [int(p) for p in pages if p.isdigit()] + if len(pages): + first_page = min(pages) + last_page = max(pages) + + release_count: Optional[int] = None + if year and volume and issue: + container_id = self.pubid2container(sim_pubid) + if container_id: + release_count = es_issue_count(es_client, container_id, year, volume, issue) + + row = SimIssueRow( + issue_item=issue_item, + sim_pubid=sim_pubid, + year=year, + volume=volume, + issue=issue, + first_page=first_page, + last_page=last_page, + release_count=release_count, + ) + self.insert_sim_issue(row, cur) + cur.close() + self.db.commit() + + +def main(): + """ + Run this command like: + + python -m fatcat_scholar.issue_db + """ + + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + subparsers = parser.add_subparsers() + + parser.add_argument("--db-file", + help="sqlite3 database file to open", + default='issue_db.sqlite', + type=str) + + sub = subparsers.add_parser('init_db', + help="create sqlite3 output file and tables") + sub.set_defaults(func='init_db') + + sub = subparsers.add_parser('load_pubs', + help="update container-level stats from JSON file") + sub.set_defaults(func='load_pubs') + sub.add_argument("json_file", + help="collection-level metadata, as JSON-lines", + nargs='?', default=sys.stdin, type=argparse.FileType('r')) + + sub = subparsers.add_parser('load_issues', + help="update item-level stats from JSON file") + sub.set_defaults(func='load_issues') + sub.add_argument("json_file", + help="item-level metadata, as JSON-lines", + nargs='?', default=sys.stdin, type=argparse.FileType('r')) + + args = parser.parse_args() + if not args.__dict__.get("func"): + print("tell me what to do! 
Finally, the command-line entrypoint:

```python
def main():
    """
    Run this command like:

        python -m fatcat_scholar.issue_db
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    subparsers = parser.add_subparsers()

    parser.add_argument("--db-file",
        help="sqlite3 database file to open",
        default='issue_db.sqlite',
        type=str)

    sub = subparsers.add_parser('init_db',
        help="create sqlite3 output file and tables")
    sub.set_defaults(func='init_db')

    sub = subparsers.add_parser('load_pubs',
        help="update container-level stats from JSON file")
    sub.set_defaults(func='load_pubs')
    sub.add_argument("json_file",
        help="collection-level metadata, as JSON-lines",
        nargs='?', default=sys.stdin, type=argparse.FileType('r'))

    sub = subparsers.add_parser('load_issues',
        help="update item-level stats from JSON file")
    sub.set_defaults(func='load_issues')
    sub.add_argument("json_file",
        help="item-level metadata, as JSON-lines",
        nargs='?', default=sys.stdin, type=argparse.FileType('r'))

    args = parser.parse_args()
    if not args.__dict__.get("func"):
        print("tell me what to do! (try --help)")
        sys.exit(-1)

    idb = IssueDB(args.db_file)
    api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient())
    es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki")

    if args.func == 'load_pubs':
        idb.load_pubs(args.json_file, api)
    elif args.func == 'load_issues':
        idb.load_issues(args.json_file, es_client)
    else:
        func = getattr(idb, args.func)
        func()


if __name__ == "__main__":
    main()
```

diff --git a/schema/issue_db.sql b/schema/issue_db.sql
new file mode 100644
index 0000000..e07e97e

```sql
CREATE TABLE IF NOT EXISTS sim_pub (
    sim_pubid TEXT NOT NULL PRIMARY KEY,
    pub_collection TEXT NOT NULL UNIQUE,
    title TEXT NOT NULL,
    issn TEXT,
    pub_type TEXT,
    publisher TEXT,
    container_issnl TEXT,
    container_ident TEXT,
    wikidata_qid TEXT
);

CREATE TABLE IF NOT EXISTS sim_issue (
    issue_item TEXT NOT NULL PRIMARY KEY,
    sim_pubid TEXT NOT NULL,
    year INTEGER,
    volume TEXT,
    issue TEXT,
    first_page INTEGER,
    last_page INTEGER,
    release_count INTEGER,
    FOREIGN KEY(sim_pubid) REFERENCES sim_pub(sim_pubid)
);

-- intent here is to capture how many releases are just not getting matched
-- due to missing issue metadata
CREATE TABLE IF NOT EXISTS release_counts (
    sim_pubid TEXT NOT NULL PRIMARY KEY,
    year TEXT,
    volume TEXT,
    year_in_sim BOOLEAN,
    release_count INTEGER,
    FOREIGN KEY(sim_pubid) REFERENCES sim_pub(sim_pubid)
);
```
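Once built, the database supports the kind of lookup this tool presumably exists to serve: resolving a fatcat container plus volume/issue to a SIM issue item. A sketch query, not part of this commit, with placeholder values and assuming `issue_db.sqlite` has been populated via `init_db`/`load_pubs`/`load_issues`:

```python
import sqlite3

# Resolve a fatcat container + volume/issue to a SIM issue item; the
# container ident below is a placeholder, not a real fatcat identifier.
db = sqlite3.connect("issue_db.sqlite")
rows = db.execute(
    """
    SELECT sim_issue.issue_item, sim_issue.first_page, sim_issue.last_page
    FROM sim_issue
    LEFT JOIN sim_pub ON sim_issue.sim_pubid = sim_pub.sim_pubid
    WHERE sim_pub.container_ident = ?
      AND sim_issue.volume = ?
      AND sim_issue.issue = ?;
    """,
    ("aaaaaaaaaaaaaaaaaaaaaaaa", "12", "3"),
).fetchall()
print(rows)
```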