From 2d3d0274a23f0e52bff8b786aa7a930cb5b74c99 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sat, 16 May 2020 13:37:02 -0700 Subject: more progress on issue_db --- fatcat_scholar/issue_db.py | 75 +++++++++++++++++++++++++++++----------------- notes/issue_db.md | 13 ++++++++ 2 files changed, 60 insertions(+), 28 deletions(-) create mode 100644 notes/issue_db.md diff --git a/fatcat_scholar/issue_db.py b/fatcat_scholar/issue_db.py index 7fdc4a9..0d33e17 100644 --- a/fatcat_scholar/issue_db.py +++ b/fatcat_scholar/issue_db.py @@ -29,7 +29,8 @@ class SimPubRow: class SimIssueRow: """ TODO: - - distinguish between release count that can do full link with pages, or just in this year/volume/issue? + - distinguish between release count that can do full link with pages, or + just in this year/volume/issue? """ issue_item: str sim_pubid: str @@ -61,35 +62,30 @@ def es_issue_count(es_client: Any, container_id: str, year: int, volume: str, is .filter("term", container_id=container_id)\ .filter("term", year=year)\ .filter("term", volume=volume)\ - .filter("term", issue=issue) + .filter("term", issue=issue)\ + .extra(request_cache=True) return search.count() def es_container_aggs(es_client: Any, container_id: str) -> List[Dict[str, Any]]: """ + What is being returned is a list of dicts, each with year, volume, count + keys. """ - query = { - "size": 0, - "query": { - "term": { "container_id": ident } - }, - "aggs": { "container_stats": { "filters": { "filters": { - "in_web": { "term": { "in_web": "true" } }, - "in_kbart": { "term": { "in_kbart": "true" } }, - "is_preserved": { "term": { "is_preserved": "true" } }, - }}}} - } - params=dict(request_cache="true") - buckets = resp['aggregations']['container_stats']['buckets'] - stats = { - 'ident': ident, - 'issnl': issnl, - 'total': resp['hits']['total'], - 'in_web': buckets['in_web']['doc_count'], - 'in_kbart': buckets['in_kbart']['doc_count'], - 'is_preserved': buckets['is_preserved']['doc_count'], - } - return stats + search = Search(using=es_client, index="fatcat_release") + search = search\ + .filter("term", container_id=container_id) + search.aggs\ + .bucket('years', 'terms', field="year")\ + .bucket('volumes', 'terms', field="volume") + search = search[:0] + res = search.execute() + ret = [] + for year in res.aggregations.years.buckets: + for volume in year.volumes.buckets: + ret.append(dict(count=volume.doc_count, year=year.key, volume=volume.key)) + #print(ret[-1]) + return ret class IssueDB(): @@ -125,7 +121,7 @@ class IssueDB(): def insert_release_counts(self, counts: ReleaseCountsRow, cur: Any = None) -> None: if not cur: cur = self.db.cursor() - cur.execute("INSERT OR REPLACE INTO release_counts VALUES (?,?,?,?,?,?,?,?,?)", + cur.execute("INSERT OR REPLACE INTO release_counts VALUES (?,?,?,?,?)", counts.tuple()) def pubid2container(self, sim_pubid: str) -> Optional[str]: @@ -151,7 +147,7 @@ class IssueDB(): obj = json.loads(line) meta = obj['metadata'] assert "periodicals" in meta['collection'] - container: Optional[ContainerEntity] = None + container: Optional[fatcat_openapi_client.ContainerEntity] = None if meta.get('issn'): try: container = api.lookup_container(issnl=meta['issn']) @@ -196,7 +192,7 @@ class IssueDB(): sim_pubid=meta['sim_pubid'] year: Optional[int] = None - if meta.get('date'): + if meta.get('date') and meta['date'][:4].isdigit(): year = int(meta['date'][:4]) volume = meta.get('volume') issue = meta.get('issue') @@ -230,6 +226,23 @@ class IssueDB(): cur.close() self.db.commit() + def load_counts(self, es_client: Any): + all_pub_containers = list(self.db.execute('SELECT sim_pubid, container_ident FROM sim_pub WHERE container_ident IS NOT NULL;')) + cur: Any = self.db.cursor() + for (sim_pubid, container_ident) in all_pub_containers: + aggs = es_container_aggs(es_client, container_ident) + for agg in aggs: + row = ReleaseCountsRow( + sim_pubid=sim_pubid, + year_in_sim=False, # TODO + release_count=agg['count'], + year=agg['year'], + volume=agg['volume'], + ) + self.insert_release_counts(row, cur) + cur.close() + self.db.commit() + def main(): """ @@ -244,7 +257,7 @@ def main(): parser.add_argument("--db-file", help="sqlite3 database file to open", - default='issue_db.sqlite', + default='data/issue_db.sqlite', type=str) sub = subparsers.add_parser('init_db', @@ -265,6 +278,10 @@ def main(): help="item-level metadata, as JSON-lines", nargs='?', default=sys.stdin, type=argparse.FileType('r')) + sub = subparsers.add_parser('load_counts', + help="update volume-level stats from elasticsearch endpoint") + sub.set_defaults(func='load_counts') + args = parser.parse_args() if not args.__dict__.get("func"): print("tell me what to do! (try --help)") @@ -278,6 +295,8 @@ def main(): idb.load_pubs(args.json_file, api) elif args.func == 'load_issues': idb.load_issues(args.json_file, es_client) + elif args.func == 'load_counts': + idb.load_counts(es_client) else: func = getattr(idb, args.func) func() diff --git a/notes/issue_db.md b/notes/issue_db.md new file mode 100644 index 0000000..26f98d2 --- /dev/null +++ b/notes/issue_db.md @@ -0,0 +1,13 @@ + +## Commands + + mkdir -p data + ia search "collection:periodicals collection:sim_microfilm mediatype:collection" --itemlist | rg "^pub_" > data/sim_collections.tsv + ia search "collection:periodicals collection:sim_microfilm mediatype:texts" --itemlist | rg "^sim_" > data/sim_items.tsv + + cat data/sim_collections.tsv | parallel -j4 ia metadata {} | jq . -c | pv -l > data/sim_collections.json + cat data/sim_items.tsv | parallel -j8 ia metadata {} | jq . -c | pv -l > data/sim_items.json + + cat data/sim_collections.2020-05-15.json | pv -l | python -m fatcat_scholar.issue_db load_pubs + cat data/sim_items.2020-05-15.json | pv -l | python -m fatcat_scholar.issue_db load_issues + python -m fatcat_scholar.issue_db load_counts -- cgit v1.2.3