aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-05-16 13:37:02 -0700
committerBryan Newbold <bnewbold@archive.org>2020-05-16 13:37:02 -0700
commit2d3d0274a23f0e52bff8b786aa7a930cb5b74c99 (patch)
treea884037b549c81e38b317153b8eb1f4913b3cd67
parent202c6fc3cd1a5b1ba2bb005219e271dc7d4977f5 (diff)
downloadfatcat-scholar-2d3d0274a23f0e52bff8b786aa7a930cb5b74c99.tar.gz
fatcat-scholar-2d3d0274a23f0e52bff8b786aa7a930cb5b74c99.zip
more progress on issue_db
-rw-r--r--fatcat_scholar/issue_db.py75
-rw-r--r--notes/issue_db.md13
2 files changed, 60 insertions, 28 deletions
diff --git a/fatcat_scholar/issue_db.py b/fatcat_scholar/issue_db.py
index 7fdc4a9..0d33e17 100644
--- a/fatcat_scholar/issue_db.py
+++ b/fatcat_scholar/issue_db.py
@@ -29,7 +29,8 @@ class SimPubRow:
class SimIssueRow:
"""
TODO:
- - distinguish between release count that can do full link with pages, or just in this year/volume/issue?
+ - distinguish between release count that can do full link with pages, or
+ just in this year/volume/issue?
"""
issue_item: str
sim_pubid: str
@@ -61,35 +62,30 @@ def es_issue_count(es_client: Any, container_id: str, year: int, volume: str, is
.filter("term", container_id=container_id)\
.filter("term", year=year)\
.filter("term", volume=volume)\
- .filter("term", issue=issue)
+ .filter("term", issue=issue)\
+ .extra(request_cache=True)
return search.count()
def es_container_aggs(es_client: Any, container_id: str) -> List[Dict[str, Any]]:
"""
+ Returns a list of dicts, one per (year, volume) bucket, each with year,
+ volume, and count keys.
"""
- query = {
- "size": 0,
- "query": {
- "term": { "container_id": ident }
- },
- "aggs": { "container_stats": { "filters": { "filters": {
- "in_web": { "term": { "in_web": "true" } },
- "in_kbart": { "term": { "in_kbart": "true" } },
- "is_preserved": { "term": { "is_preserved": "true" } },
- }}}}
- }
- params=dict(request_cache="true")
- buckets = resp['aggregations']['container_stats']['buckets']
- stats = {
- 'ident': ident,
- 'issnl': issnl,
- 'total': resp['hits']['total'],
- 'in_web': buckets['in_web']['doc_count'],
- 'in_kbart': buckets['in_kbart']['doc_count'],
- 'is_preserved': buckets['is_preserved']['doc_count'],
- }
- return stats
+ search = Search(using=es_client, index="fatcat_release")
+ search = search\
+ .filter("term", container_id=container_id)
+ search.aggs\
+ .bucket('years', 'terms', field="year")\
+ .bucket('volumes', 'terms', field="volume")
+ search = search[:0]
+ res = search.execute()
+ ret = []
+ for year in res.aggregations.years.buckets:
+ for volume in year.volumes.buckets:
+ ret.append(dict(count=volume.doc_count, year=year.key, volume=volume.key))
+ #print(ret[-1])
+ return ret
class IssueDB():
@@ -125,7 +121,7 @@ class IssueDB():
def insert_release_counts(self, counts: ReleaseCountsRow, cur: Any = None) -> None:
if not cur:
cur = self.db.cursor()
- cur.execute("INSERT OR REPLACE INTO release_counts VALUES (?,?,?,?,?,?,?,?,?)",
+ cur.execute("INSERT OR REPLACE INTO release_counts VALUES (?,?,?,?,?)",
counts.tuple())
def pubid2container(self, sim_pubid: str) -> Optional[str]:
@@ -151,7 +147,7 @@ class IssueDB():
obj = json.loads(line)
meta = obj['metadata']
assert "periodicals" in meta['collection']
- container: Optional[ContainerEntity] = None
+ container: Optional[fatcat_openapi_client.ContainerEntity] = None
if meta.get('issn'):
try:
container = api.lookup_container(issnl=meta['issn'])
@@ -196,7 +192,7 @@ class IssueDB():
sim_pubid=meta['sim_pubid']
year: Optional[int] = None
- if meta.get('date'):
+ if meta.get('date') and meta['date'][:4].isdigit():
year = int(meta['date'][:4])
volume = meta.get('volume')
issue = meta.get('issue')
@@ -230,6 +226,23 @@ class IssueDB():
cur.close()
self.db.commit()
+ def load_counts(self, es_client: Any):
+ all_pub_containers = list(self.db.execute('SELECT sim_pubid, container_ident FROM sim_pub WHERE container_ident IS NOT NULL;'))
+ cur: Any = self.db.cursor()
+ for (sim_pubid, container_ident) in all_pub_containers:
+ aggs = es_container_aggs(es_client, container_ident)
+ for agg in aggs:
+ row = ReleaseCountsRow(
+ sim_pubid=sim_pubid,
+ year_in_sim=False, # TODO
+ release_count=agg['count'],
+ year=agg['year'],
+ volume=agg['volume'],
+ )
+ self.insert_release_counts(row, cur)
+ cur.close()
+ self.db.commit()
+
def main():
"""
@@ -244,7 +257,7 @@ def main():
parser.add_argument("--db-file",
help="sqlite3 database file to open",
- default='issue_db.sqlite',
+ default='data/issue_db.sqlite',
type=str)
sub = subparsers.add_parser('init_db',
@@ -265,6 +278,10 @@ def main():
help="item-level metadata, as JSON-lines",
nargs='?', default=sys.stdin, type=argparse.FileType('r'))
+ sub = subparsers.add_parser('load_counts',
+ help="update volume-level stats from elasticsearch endpoint")
+ sub.set_defaults(func='load_counts')
+
args = parser.parse_args()
if not args.__dict__.get("func"):
print("tell me what to do! (try --help)")
@@ -278,6 +295,8 @@ def main():
idb.load_pubs(args.json_file, api)
elif args.func == 'load_issues':
idb.load_issues(args.json_file, es_client)
+ elif args.func == 'load_counts':
+ idb.load_counts(es_client)
else:
func = getattr(idb, args.func)
func()
diff --git a/notes/issue_db.md b/notes/issue_db.md
new file mode 100644
index 0000000..26f98d2
--- /dev/null
+++ b/notes/issue_db.md
@@ -0,0 +1,13 @@
+
+## Commands
+
+ mkdir -p data
+ ia search "collection:periodicals collection:sim_microfilm mediatype:collection" --itemlist | rg "^pub_" > data/sim_collections.tsv
+ ia search "collection:periodicals collection:sim_microfilm mediatype:texts" --itemlist | rg "^sim_" > data/sim_items.tsv
+
+ cat data/sim_collections.tsv | parallel -j4 ia metadata {} | jq . -c | pv -l > data/sim_collections.json
+ cat data/sim_items.tsv | parallel -j8 ia metadata {} | jq . -c | pv -l > data/sim_items.json
+
+ cat data/sim_collections.json | pv -l | python -m fatcat_scholar.issue_db load_pubs
+ cat data/sim_items.json | pv -l | python -m fatcat_scholar.issue_db load_issues
+ python -m fatcat_scholar.issue_db load_counts