author    Bryan Newbold <bnewbold@archive.org>  2020-05-15 18:06:55 -0700
committer Bryan Newbold <bnewbold@archive.org>  2020-05-15 18:06:55 -0700
commit    3ed9fe3aac4b79d16ffa9d8008fefb0363a88468 (patch)
tree      bb65eaf6a815f2f5830da07df7ce556ffb7e7e2e
parent    565d72d24dbdef3c549a2ff111cda4343281607d (diff)
download  fatcat-scholar-3ed9fe3aac4b79d16ffa9d8008fefb0363a88468.tar.gz
          fatcat-scholar-3ed9fe3aac4b79d16ffa9d8008fefb0363a88468.zip
first pass at issue-db tool
-rw-r--r--  fatcat_scholar/issue_db.py  286
-rw-r--r--  schema/issue_db.sql          35
2 files changed, 321 insertions(+), 0 deletions(-)
diff --git a/fatcat_scholar/issue_db.py b/fatcat_scholar/issue_db.py
new file mode 100644
index 0000000..7fdc4a9
--- /dev/null
+++ b/fatcat_scholar/issue_db.py
@@ -0,0 +1,286 @@
+
+import sys
+import json
+import sqlite3
+import argparse
+from dataclasses import dataclass
+from typing import List, Dict, Tuple, Optional, Any, Sequence
+import fatcat_openapi_client
+from fatcat_openapi_client import ContainerEntity
+import elasticsearch
+from elasticsearch_dsl import Search, Q
+
+@dataclass
+class SimPubRow:
+ sim_pubid: str
+ pub_collection: str
+ title: str
+ issn: Optional[str]
+ pub_type: Optional[str]
+ publisher: Optional[str]
+
+ container_issnl: Optional[str]
+ container_ident: Optional[str]
+ wikidata_qid: Optional[str]
+
+ def tuple(self):
+ return (self.sim_pubid, self.pub_collection, self.title, self.issn, self.pub_type, self.publisher, self.container_issnl, self.container_ident, self.wikidata_qid)
+
+@dataclass
+class SimIssueRow:
+ """
+ TODO:
+ - distinguish between release count that can do full link with pages, or just in this year/volume/issue?
+ """
+ issue_item: str
+ sim_pubid: str
+ year: Optional[int]
+ volume: Optional[str]
+ issue: Optional[str]
+ first_page: Optional[int]
+ last_page: Optional[int]
+ release_count: Optional[int]
+
+ def tuple(self):
+ return (self.issue_item, self.sim_pubid, self.year, self.volume, self.issue, self.first_page, self.last_page, self.release_count)
+
+@dataclass
+class ReleaseCountsRow:
+ sim_pubid: str
+ year_in_sim: bool
+ release_count: int
+ year: Optional[int]
+ volume: Optional[str]
+
+ def tuple(self):
+ return (self.sim_pubid, self.year, self.volume, self.year_in_sim, self.release_count)
+
+
+def es_issue_count(es_client: Any, container_id: str, year: int, volume: str, issue: str) -> int:
+ search = Search(using=es_client, index="fatcat_release")
+ search = search\
+ .filter("term", container_id=container_id)\
+ .filter("term", year=year)\
+ .filter("term", volume=volume)\
+ .filter("term", issue=issue)
+
+ return search.count()
+
+def es_container_aggs(es_client: Any, container_id: str) -> Dict[str, Any]:
+    """
+    Runs a filtered aggregation over all releases in a single container and
+    returns preservation coverage counts (in_web, in_kbart, is_preserved).
+    """
+    query = {
+        "size": 0,
+        "query": {
+            "term": { "container_id": container_id }
+        },
+        "aggs": { "container_stats": { "filters": { "filters": {
+            "in_web": { "term": { "in_web": "true" } },
+            "in_kbart": { "term": { "in_kbart": "true" } },
+            "is_preserved": { "term": { "is_preserved": "true" } },
+        }}}}
+    }
+    resp = es_client.search(index="fatcat_release", body=query, params=dict(request_cache="true"))
+    buckets = resp['aggregations']['container_stats']['buckets']
+    stats = {
+        'ident': container_id,
+        'total': resp['hits']['total'],
+        'in_web': buckets['in_web']['doc_count'],
+        'in_kbart': buckets['in_kbart']['doc_count'],
+        'is_preserved': buckets['is_preserved']['doc_count'],
+    }
+    return stats
+
+class IssueDB():
+
+ def __init__(self, db_file):
+ """
+ To create a temporary database, pass ":memory:" as db_file
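+
+        A minimal usage sketch (the file name is illustrative; `api` is a
+        fatcat_openapi_client.DefaultApi instance):
+
+            idb = IssueDB(":memory:")
+            idb.init_db()
+            with open("sim_pubs.json") as f:
+                idb.load_pubs(f, api)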
+ """
+ self.db = sqlite3.connect(db_file, isolation_level='EXCLUSIVE')
+ self._pubid2container_map: Dict[str, Optional[str]] = dict()
+
+ def init_db(self):
+ self.db.executescript("""
+ PRAGMA main.page_size = 4096;
+ PRAGMA main.cache_size = 20000;
+ PRAGMA main.locking_mode = EXCLUSIVE;
+ PRAGMA main.synchronous = OFF;
+ """)
+ with open('schema/issue_db.sql', 'r') as fschema:
+ self.db.executescript(fschema.read())
+
+ def insert_sim_pub(self, pub: SimPubRow, cur: Any = None) -> None:
+ if not cur:
+ cur = self.db.cursor()
+ cur.execute("INSERT OR REPLACE INTO sim_pub VALUES (?,?,?,?,?,?,?,?,?)",
+ pub.tuple())
+
+ def insert_sim_issue(self, issue: SimIssueRow, cur: Any = None) -> None:
+ if not cur:
+ cur = self.db.cursor()
+ cur.execute("INSERT OR REPLACE INTO sim_issue VALUES (?,?,?,?,?,?,?,?)",
+ issue.tuple())
+
+ def insert_release_counts(self, counts: ReleaseCountsRow, cur: Any = None) -> None:
+ if not cur:
+ cur = self.db.cursor()
+ cur.execute("INSERT OR REPLACE INTO release_counts VALUES (?,?,?,?,?,?,?,?,?)",
+ counts.tuple())
+
+ def pubid2container(self, sim_pubid: str) -> Optional[str]:
+ if sim_pubid in self._pubid2container_map:
+ return self._pubid2container_map[sim_pubid]
+ row = list(self.db.execute("SELECT container_ident FROM sim_pub WHERE sim_pubid = ?;", [sim_pubid]))
+ if row:
+ self._pubid2container_map[sim_pubid] = row[0][0]
+ return row[0][0]
+ else:
+ self._pubid2container_map[sim_pubid] = None
+ return None
+
+ def load_pubs(self, json_lines: Sequence[str], api: Any):
+ """
+ Reads a file (or some other iterator) of JSON lines, parses them into a
+ dict, then inserts rows.
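+
+        Each line is expected to look roughly like this (values are
+        illustrative, not a real record):
+
+            {"metadata": {"sim_pubid": "0001-0001", "identifier": "pub_example",
+                          "title": "Example Journal", "issn": "0000-0000",
+                          "collection": ["periodicals"]}}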
+ """
+ cur = self.db.cursor()
+ for line in json_lines:
+ if not line:
+ continue
+ obj = json.loads(line)
+ meta = obj['metadata']
+ assert "periodicals" in meta['collection']
+ container: Optional[ContainerEntity] = None
+ if meta.get('issn'):
+ try:
+ container = api.lookup_container(issnl=meta['issn'])
+ except fatcat_openapi_client.ApiException as ae:
+ if ae.status != 404:
+ raise ae
+ row = SimPubRow(
+ sim_pubid=meta['sim_pubid'],
+ pub_collection=meta['identifier'],
+ title=meta['title'],
+ issn=meta.get('issn'),
+ pub_type=meta.get('pub_type'),
+ publisher=meta.get('publisher'),
+ container_issnl=container and container.issnl,
+ container_ident=container and container.ident,
+ wikidata_qid=container and container.wikidata_qid,
+ )
+ self.insert_sim_pub(row, cur)
+ cur.close()
+ self.db.commit()
+
+ def load_issues(self, json_lines: Sequence[str], es_client: Any):
+ """
+ Reads a file (or some other iterator) of JSON lines, parses them into a
+ dict, then inserts rows.
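+
+        Each line is expected to look roughly like this (values are
+        illustrative, not a real record):
+
+            {"metadata": {"identifier": "sim_example-issue_1-1",
+                          "sim_pubid": "0001-0001", "date": "1950-01-01",
+                          "volume": "1", "issue": "1",
+                          "collection": ["periodicals"]},
+             "page_numbers": {"pages": [{"pageNumber": "1"}]}}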
+ """
+ cur = self.db.cursor()
+ for line in json_lines:
+ if not line:
+ continue
+ obj = json.loads(line)
+ meta = obj['metadata']
+ assert "periodicals" in meta['collection']
+ #pub_collection = [c for c in meta['collection'] if c.startswith("pub_")][0]
+ issue_item = meta['identifier']
+
+ # don't index meta items
+ # TODO: handle more weird suffixes like "1-2", "_part_1", "_index-contents"
+ if issue_item.endswith("_index") or issue_item.endswith("_contents"):
+ continue
+
+            sim_pubid = meta['sim_pubid']
+
+ year: Optional[int] = None
+ if meta.get('date'):
+ year = int(meta['date'][:4])
+ volume = meta.get('volume')
+ issue = meta.get('issue')
+
+ first_page: Optional[int] = None
+ last_page: Optional[int] = None
+ if obj.get('page_numbers'):
+ pages = [p['pageNumber'] for p in obj['page_numbers']['pages'] if p['pageNumber']]
+ pages = [int(p) for p in pages if p.isdigit()]
+ if len(pages):
+ first_page = min(pages)
+ last_page = max(pages)
+
+ release_count: Optional[int] = None
+ if year and volume and issue:
+ container_id = self.pubid2container(sim_pubid)
+ if container_id:
+ release_count = es_issue_count(es_client, container_id, year, volume, issue)
+
+ row = SimIssueRow(
+ issue_item=issue_item,
+ sim_pubid=sim_pubid,
+ year=year,
+ volume=volume,
+ issue=issue,
+ first_page=first_page,
+ last_page=last_page,
+ release_count=release_count,
+ )
+ self.insert_sim_issue(row, cur)
+ cur.close()
+ self.db.commit()
+
+
+def main():
+ """
+ Run this command like:
+
+ python -m fatcat_scholar.issue_db
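+
+    Sub-commands (input file names below are illustrative; both load commands
+    read from stdin by default):
+
+    python -m fatcat_scholar.issue_db init_db
+    python -m fatcat_scholar.issue_db load_pubs sim_pubs.json
+    python -m fatcat_scholar.issue_db load_issues sim_issues.json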
+ """
+
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ subparsers = parser.add_subparsers()
+
+ parser.add_argument("--db-file",
+ help="sqlite3 database file to open",
+ default='issue_db.sqlite',
+ type=str)
+
+ sub = subparsers.add_parser('init_db',
+ help="create sqlite3 output file and tables")
+ sub.set_defaults(func='init_db')
+
+ sub = subparsers.add_parser('load_pubs',
+ help="update container-level stats from JSON file")
+ sub.set_defaults(func='load_pubs')
+ sub.add_argument("json_file",
+ help="collection-level metadata, as JSON-lines",
+ nargs='?', default=sys.stdin, type=argparse.FileType('r'))
+
+ sub = subparsers.add_parser('load_issues',
+ help="update item-level stats from JSON file")
+ sub.set_defaults(func='load_issues')
+ sub.add_argument("json_file",
+ help="item-level metadata, as JSON-lines",
+ nargs='?', default=sys.stdin, type=argparse.FileType('r'))
+
+ args = parser.parse_args()
+ if not args.__dict__.get("func"):
+ print("tell me what to do! (try --help)")
+ sys.exit(-1)
+
+ idb = IssueDB(args.db_file)
+ api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient())
+ es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki")
+
+ if args.func == 'load_pubs':
+ idb.load_pubs(args.json_file, api)
+ elif args.func == 'load_issues':
+ idb.load_issues(args.json_file, es_client)
+ else:
+ func = getattr(idb, args.func)
+ func()
+
+if __name__ == "__main__":
+ main()
diff --git a/schema/issue_db.sql b/schema/issue_db.sql
new file mode 100644
index 0000000..e07e97e
--- /dev/null
+++ b/schema/issue_db.sql
@@ -0,0 +1,35 @@
+
+CREATE TABLE IF NOT EXISTS sim_pub (
+ sim_pubid TEXT NOT NULL PRIMARY KEY,
+ pub_collection TEXT NOT NULL UNIQUE,
+ title TEXT NOT NULL,
+ issn TEXT,
+ pub_type TEXT,
+ publisher TEXT,
+ container_issnl TEXT,
+ container_ident TEXT,
+ wikidata_qid TEXT
+);
+
+CREATE TABLE IF NOT EXISTS sim_issue (
+ issue_item TEXT NOT NULL PRIMARY KEY,
+    sim_pubid TEXT NOT NULL,
+ year INTEGER,
+ volume TEXT,
+ issue TEXT,
+ first_page INTEGER,
+ last_page INTEGER,
+ release_count INTEGER,
+ FOREIGN KEY(sim_pubid) REFERENCES sim_pub(sim_pubid)
+);
+
+-- intent here is to capture how many releases are just not getting matched due
+-- to missing issue metadata
+CREATE TABLE IF NOT EXISTS release_counts (
+ sim_pubid TEXT NOT NULL PRIMARY KEY,
+    year INTEGER,
+ volume TEXT,
+ year_in_sim BOOLEAN,
+ release_count INTEGER,
+ FOREIGN KEY(sim_pubid) REFERENCES sim_pub(sim_pubid)
+);
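+
+-- Example queries (publication id is illustrative):
+--
+--   SELECT issue_item, year, volume, issue FROM sim_issue
+--   WHERE sim_pubid = '0001-0001' AND release_count = 0;
+--
+--   SELECT year, volume, year_in_sim, release_count FROM release_counts
+--   WHERE sim_pubid = '0001-0001' ORDER BY year;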