Diffstat (limited to 'fatcat_scholar/issue_db.py')
-rw-r--r-- fatcat_scholar/issue_db.py | 250
1 file changed, 164 insertions(+), 86 deletions(-)
diff --git a/fatcat_scholar/issue_db.py b/fatcat_scholar/issue_db.py
index 4f5ff53..12ffa32 100644
--- a/fatcat_scholar/issue_db.py
+++ b/fatcat_scholar/issue_db.py
@@ -1,4 +1,3 @@
-
import sys
import json
import sqlite3
@@ -9,6 +8,7 @@ import fatcat_openapi_client
import elasticsearch
from elasticsearch_dsl import Search, Q
+
@dataclass
class SimPubRow:
sim_pubid: str
@@ -23,7 +23,17 @@ class SimPubRow:
wikidata_qid: Optional[str]
def tuple(self):
- return (self.sim_pubid, self.pub_collection, self.title, self.issn, self.pub_type, self.publisher, self.container_issnl, self.container_ident, self.wikidata_qid)
+ return (
+ self.sim_pubid,
+ self.pub_collection,
+ self.title,
+ self.issn,
+ self.pub_type,
+ self.publisher,
+ self.container_issnl,
+ self.container_ident,
+ self.wikidata_qid,
+ )
@classmethod
def from_tuple(cls, row: Any) -> "SimPubRow":
@@ -39,6 +49,7 @@ class SimPubRow:
wikidata_qid=row[8],
)
+
@dataclass
class SimIssueRow:
"""
@@ -46,6 +57,7 @@ class SimIssueRow:
- distinguish between release count that can do full link with pages, or
just in this year/volume/issue?
"""
+
issue_item: str
sim_pubid: str
year: Optional[int]
@@ -56,7 +68,16 @@ class SimIssueRow:
release_count: Optional[int]
def tuple(self):
- return (self.issue_item, self.sim_pubid, self.year, self.volume, self.issue, self.first_page, self.last_page, self.release_count)
+ return (
+ self.issue_item,
+ self.sim_pubid,
+ self.year,
+ self.volume,
+ self.issue,
+ self.first_page,
+ self.last_page,
+ self.release_count,
+ )
@classmethod
def from_tuple(cls, row: Any) -> "SimIssueRow":
@@ -71,6 +92,7 @@ class SimIssueRow:
release_count=row[7],
)
+
@dataclass
class ReleaseCountsRow:
sim_pubid: str
@@ -80,82 +102,100 @@ class ReleaseCountsRow:
volume: Optional[str]
def tuple(self):
- return (self.sim_pubid, self.year, self.volume, self.year_in_sim, self.release_count)
+ return (
+ self.sim_pubid,
+ self.year,
+ self.volume,
+ self.year_in_sim,
+ self.release_count,
+ )
-def es_issue_count(es_client: Any, container_id: str, year: int, volume: str, issue: str) -> int:
+def es_issue_count(
+ es_client: Any, container_id: str, year: int, volume: str, issue: str
+) -> int:
search = Search(using=es_client, index="fatcat_release")
- search = search\
- .filter("term", container_id=container_id)\
- .filter("term", year=year)\
- .filter("term", volume=volume)\
- .filter("term", issue=issue)\
+ search = (
+ search.filter("term", container_id=container_id)
+ .filter("term", year=year)
+ .filter("term", volume=volume)
+ .filter("term", issue=issue)
.extra(request_cache=True)
+ )
return search.count()
+
def es_container_aggs(es_client: Any, container_id: str) -> List[Dict[str, Any]]:
"""
What is being returned is a list of dicts, each with year, volume, count
keys.
"""
search = Search(using=es_client, index="fatcat_release")
- search = search\
- .filter("term", container_id=container_id)
- search.aggs\
- .bucket('years', 'terms', field="year")\
- .bucket('volumes', 'terms', field="volume")
+ search = search.filter("term", container_id=container_id)
+ search.aggs.bucket("years", "terms", field="year").bucket(
+ "volumes", "terms", field="volume"
+ )
search = search[:0]
res = search.execute()
ret = []
for year in res.aggregations.years.buckets:
for volume in year.volumes.buckets:
ret.append(dict(count=volume.doc_count, year=year.key, volume=volume.key))
- #print(ret[-1])
+ # print(ret[-1])
return ret
-class IssueDB():
+class IssueDB:
def __init__(self, db_file):
"""
To create a temporary database, pass ":memory:" as db_file
"""
- self.db = sqlite3.connect(db_file, isolation_level='EXCLUSIVE')
+ self.db = sqlite3.connect(db_file, isolation_level="EXCLUSIVE")
self._pubid2container_map: Dict[str, Optional[str]] = dict()
self._container2pubid_map: Dict[str, Optional[str]] = dict()
def init_db(self):
- self.db.executescript("""
+ self.db.executescript(
+ """
PRAGMA main.page_size = 4096;
PRAGMA main.cache_size = 20000;
PRAGMA main.locking_mode = EXCLUSIVE;
PRAGMA main.synchronous = OFF;
- """)
- with open('schema/issue_db.sql', 'r') as fschema:
+ """
+ )
+ with open("schema/issue_db.sql", "r") as fschema:
self.db.executescript(fschema.read())
def insert_sim_pub(self, pub: SimPubRow, cur: Any = None) -> None:
if not cur:
cur = self.db.cursor()
- cur.execute("INSERT OR REPLACE INTO sim_pub VALUES (?,?,?,?,?,?,?,?,?)",
- pub.tuple())
+ cur.execute(
+ "INSERT OR REPLACE INTO sim_pub VALUES (?,?,?,?,?,?,?,?,?)", pub.tuple()
+ )
def insert_sim_issue(self, issue: SimIssueRow, cur: Any = None) -> None:
if not cur:
cur = self.db.cursor()
- cur.execute("INSERT OR REPLACE INTO sim_issue VALUES (?,?,?,?,?,?,?,?)",
- issue.tuple())
+ cur.execute(
+ "INSERT OR REPLACE INTO sim_issue VALUES (?,?,?,?,?,?,?,?)", issue.tuple()
+ )
def insert_release_counts(self, counts: ReleaseCountsRow, cur: Any = None) -> None:
if not cur:
cur = self.db.cursor()
- cur.execute("INSERT OR REPLACE INTO release_counts VALUES (?,?,?,?,?)",
- counts.tuple())
+ cur.execute(
+ "INSERT OR REPLACE INTO release_counts VALUES (?,?,?,?,?)", counts.tuple()
+ )
def pubid2container(self, sim_pubid: str) -> Optional[str]:
if sim_pubid in self._pubid2container_map:
return self._pubid2container_map[sim_pubid]
- row = list(self.db.execute("SELECT container_ident FROM sim_pub WHERE sim_pubid = ?;", [sim_pubid]))
+ row = list(
+ self.db.execute(
+ "SELECT container_ident FROM sim_pub WHERE sim_pubid = ?;", [sim_pubid]
+ )
+ )
if row:
self._pubid2container_map[sim_pubid] = row[0][0]
return row[0][0]
@@ -166,7 +206,12 @@ class IssueDB():
def container2pubid(self, container_ident: str) -> Optional[str]:
if container_ident in self._container2pubid_map:
return self._container2pubid_map[container_ident]
- row = list(self.db.execute("SELECT sim_pubid FROM sim_pub WHERE container_ident = ?;", [container_ident]))
+ row = list(
+ self.db.execute(
+ "SELECT sim_pubid FROM sim_pub WHERE container_ident = ?;",
+ [container_ident],
+ )
+ )
if row:
self._container2pubid_map[container_ident] = row[0][0]
return row[0][0]
@@ -174,14 +219,23 @@ class IssueDB():
self._pubid2container_map[container_ident] = None
return None
- def lookup_issue(self, sim_pubid: str, volume: str, issue: str) -> Optional[SimIssueRow]:
- row = list(self.db.execute("SELECT * FROM sim_issue WHERE sim_pubid = ? AND volume = ? AND issue = ?;", [sim_pubid, volume, issue]))
+ def lookup_issue(
+ self, sim_pubid: str, volume: str, issue: str
+ ) -> Optional[SimIssueRow]:
+ row = list(
+ self.db.execute(
+ "SELECT * FROM sim_issue WHERE sim_pubid = ? AND volume = ? AND issue = ?;",
+ [sim_pubid, volume, issue],
+ )
+ )
if not row:
return None
return SimIssueRow.from_tuple(row[0])
def lookup_pub(self, sim_pubid: str) -> Optional[SimPubRow]:
- row = list(self.db.execute("SELECT * FROM sim_pub WHERE sim_pubid = ?;", [sim_pubid]))
+ row = list(
+ self.db.execute("SELECT * FROM sim_pub WHERE sim_pubid = ?;", [sim_pubid])
+ )
if not row:
return None
return SimPubRow.from_tuple(row[0])
@@ -196,22 +250,22 @@ class IssueDB():
if not line:
continue
obj = json.loads(line)
- meta = obj['metadata']
- assert "periodicals" in meta['collection']
+ meta = obj["metadata"]
+ assert "periodicals" in meta["collection"]
container: Optional[fatcat_openapi_client.ContainerEntity] = None
- if meta.get('issn'):
+ if meta.get("issn"):
try:
- container = api.lookup_container(issnl=meta['issn'])
+ container = api.lookup_container(issnl=meta["issn"])
except fatcat_openapi_client.ApiException as ae:
if ae.status != 404:
raise ae
row = SimPubRow(
- sim_pubid=meta['sim_pubid'],
- pub_collection=meta['identifier'],
- title=meta['title'],
- issn=meta.get('issn'),
- pub_type=meta.get('pub_type'),
- publisher=meta.get('publisher'),
+ sim_pubid=meta["sim_pubid"],
+ pub_collection=meta["identifier"],
+ title=meta["title"],
+ issn=meta.get("issn"),
+ pub_type=meta.get("pub_type"),
+ publisher=meta.get("publisher"),
container_issnl=container and container.issnl,
container_ident=container and container.ident,
wikidata_qid=container and container.wikidata_qid,
@@ -230,28 +284,32 @@ class IssueDB():
if not line:
continue
obj = json.loads(line)
- meta = obj['metadata']
- assert "periodicals" in meta['collection']
- #pub_collection = [c for c in meta['collection'] if c.startswith("pub_")][0]
- issue_item = meta['identifier']
+ meta = obj["metadata"]
+ assert "periodicals" in meta["collection"]
+ # pub_collection = [c for c in meta['collection'] if c.startswith("pub_")][0]
+ issue_item = meta["identifier"]
# don't index meta items
# TODO: handle more weird suffixes like "1-2", "_part_1", "_index-contents"
if issue_item.endswith("_index") or issue_item.endswith("_contents"):
continue
- sim_pubid=meta['sim_pubid']
+ sim_pubid = meta["sim_pubid"]
year: Optional[int] = None
- if meta.get('date') and meta['date'][:4].isdigit():
- year = int(meta['date'][:4])
- volume = meta.get('volume')
- issue = meta.get('issue')
+ if meta.get("date") and meta["date"][:4].isdigit():
+ year = int(meta["date"][:4])
+ volume = meta.get("volume")
+ issue = meta.get("issue")
first_page: Optional[int] = None
last_page: Optional[int] = None
- if obj.get('page_numbers'):
- pages = [p['pageNumber'] for p in obj['page_numbers']['pages'] if p['pageNumber']]
+ if obj.get("page_numbers"):
+ pages = [
+ p["pageNumber"]
+ for p in obj["page_numbers"]["pages"]
+ if p["pageNumber"]
+ ]
pages = [int(p) for p in pages if p.isdigit()]
if len(pages):
first_page = min(pages)
@@ -261,7 +319,9 @@ class IssueDB():
if year and volume and issue:
container_id = self.pubid2container(sim_pubid)
if container_id:
- release_count = es_issue_count(es_client, container_id, year, volume, issue)
+ release_count = es_issue_count(
+ es_client, container_id, year, volume, issue
+ )
row = SimIssueRow(
issue_item=issue_item,
@@ -278,17 +338,21 @@ class IssueDB():
self.db.commit()
def load_counts(self, es_client: Any):
- all_pub_containers = list(self.db.execute('SELECT sim_pubid, container_ident FROM sim_pub WHERE container_ident IS NOT NULL;'))
+ all_pub_containers = list(
+ self.db.execute(
+ "SELECT sim_pubid, container_ident FROM sim_pub WHERE container_ident IS NOT NULL;"
+ )
+ )
cur: Any = self.db.cursor()
for (sim_pubid, container_ident) in all_pub_containers:
aggs = es_container_aggs(es_client, container_ident)
for agg in aggs:
row = ReleaseCountsRow(
sim_pubid=sim_pubid,
- year_in_sim=False, # TODO
- release_count=agg['count'],
- year=agg['year'],
- volume=agg['volume'],
+ year_in_sim=False, # TODO
+ release_count=agg["count"],
+ year=agg["year"],
+ volume=agg["volume"],
)
self.insert_release_counts(row, cur)
cur.close()
@@ -303,35 +367,48 @@ def main():
"""
parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter
+ )
subparsers = parser.add_subparsers()
- parser.add_argument("--db-file",
+ parser.add_argument(
+ "--db-file",
help="sqlite3 database file to open",
- default='data/issue_db.sqlite',
- type=str)
-
- sub = subparsers.add_parser('init_db',
- help="create sqlite3 output file and tables")
- sub.set_defaults(func='init_db')
-
- sub = subparsers.add_parser('load_pubs',
- help="update container-level stats from JSON file")
- sub.set_defaults(func='load_pubs')
- sub.add_argument("json_file",
+ default="data/issue_db.sqlite",
+ type=str,
+ )
+
+ sub = subparsers.add_parser("init_db", help="create sqlite3 output file and tables")
+ sub.set_defaults(func="init_db")
+
+ sub = subparsers.add_parser(
+ "load_pubs", help="update container-level stats from JSON file"
+ )
+ sub.set_defaults(func="load_pubs")
+ sub.add_argument(
+ "json_file",
help="collection-level metadata, as JSON-lines",
- nargs='?', default=sys.stdin, type=argparse.FileType('r'))
-
- sub = subparsers.add_parser('load_issues',
- help="update item-level stats from JSON file")
- sub.set_defaults(func='load_issues')
- sub.add_argument("json_file",
+ nargs="?",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+
+ sub = subparsers.add_parser(
+ "load_issues", help="update item-level stats from JSON file"
+ )
+ sub.set_defaults(func="load_issues")
+ sub.add_argument(
+ "json_file",
help="item-level metadata, as JSON-lines",
- nargs='?', default=sys.stdin, type=argparse.FileType('r'))
+ nargs="?",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
- sub = subparsers.add_parser('load_counts',
- help="update volume-level stats from elasticsearch endpoint")
- sub.set_defaults(func='load_counts')
+ sub = subparsers.add_parser(
+ "load_counts", help="update volume-level stats from elasticsearch endpoint"
+ )
+ sub.set_defaults(func="load_counts")
args = parser.parse_args()
if not args.__dict__.get("func"):
@@ -342,15 +419,16 @@ def main():
api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient())
es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki")
- if args.func == 'load_pubs':
+ if args.func == "load_pubs":
idb.load_pubs(args.json_file, api)
- elif args.func == 'load_issues':
+ elif args.func == "load_issues":
idb.load_issues(args.json_file, es_client)
- elif args.func == 'load_counts':
+ elif args.func == "load_counts":
idb.load_counts(es_client)
else:
func = getattr(idb, args.func)
func()
-if __name__=="__main__":
+
+if __name__ == "__main__":
main()
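
A minimal usage sketch of the module as it stands after this diff. The ":memory:" database path, the schema/issue_db.sql file, the search endpoint, and all class and method names appear in the code above; the sim_pubid value is a hypothetical placeholder, and init_db() assumes schema/issue_db.sql exists relative to the working directory.

import elasticsearch
import fatcat_openapi_client

from fatcat_scholar.issue_db import IssueDB

# pass ":memory:" for a throwaway database, per the IssueDB docstring
idb = IssueDB(":memory:")
idb.init_db()  # sets PRAGMAs, then executes schema/issue_db.sql

api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient())
es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki")

# load_pubs()/load_issues()/load_counts() populate the tables; after that,
# lookups return dataclass rows or None:
pub = idb.lookup_pub("0001-0001")  # hypothetical sim_pubid, for illustration
if pub:
    print(pub.title, pub.container_ident)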