From 2e781738937efecbfc527a47ade6c3deaba64247 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 6 Apr 2021 20:04:03 -0700 Subject: container search schema: preservation stats, new fields Includes transform code updates and partial test coverage. --- python/tests/transform_elasticsearch.py | 47 +++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 5 deletions(-) (limited to 'python/tests') diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py index 9cf77d4a..ba2b7ea2 100644 --- a/python/tests/transform_elasticsearch.py +++ b/python/tests/transform_elasticsearch.py @@ -147,11 +147,48 @@ def test_elasticsearch_release_from_json(): def test_elasticsearch_container_transform(journal_metadata_importer): with open('tests/files/journal_metadata.sample.json', 'r') as f: - raw = json.loads(f.readline()) - c = journal_metadata_importer.parse_record(raw) - c.state = 'active' - es = container_to_elasticsearch(c) - assert es['publisher'] == c.publisher + raw1 = json.loads(f.readline()) + raw2 = json.loads(f.readline()) + c1 = journal_metadata_importer.parse_record(raw1) + c1.state = 'active' + c2 = journal_metadata_importer.parse_record(raw2) + c2.state = 'active' + + c1.extra['publisher_type'] = "big5" + c1.extra['discipline'] = "history" + es = container_to_elasticsearch(c1) + assert es['publisher'] == c1.publisher + assert es['discipline'] == c1.extra['discipline'] + assert es['publisher_type'] == c1.extra['publisher_type'] + assert es['keepers'] == [] + + stats = { + "ident": "en4qj5ijrbf5djxx7p5zzpjyoq", + "in_kbart": 11136, + "in_web": 9501, + "is_preserved": 11136, + "issnl": "2050-084X", + "preservation": { + "bright": 9501, + "dark": 1635, + "none": 0, + "shadows_only": 0, + "total": 11136 + }, + "release_type": { + "_unknown": 9, + "article-journal": 11124, + "editorial": 2, + "letter": 1 + }, + "total": 11136 + } + es = container_to_elasticsearch(c2, stats=stats) + assert es['name'] == c2.name + assert es['publisher'] == c2.publisher + assert es['keepers'] == list(c2.extra['kbart'].keys()) == ["portico"] + assert es['any_kbart'] == True + def test_elasticsearch_file_transform(matched_importer): f = entity_from_json(open('./tests/files/file_bcah4zp5tvdhjl5bqci2c2lgfa.json', 'r').read(), FileEntity) -- cgit v1.2.3 From 0e171b5aeb77690ead3bb896be196fdcc5c69a39 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 6 Apr 2021 20:05:05 -0700 Subject: search container stats: changes to be called from index code path Eg, allowing injection of more config values --- python/fatcat_web/search.py | 13 ++++++++++--- python/tests/web_search.py | 10 ++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) (limited to 'python/tests') diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py index 0cdb604a..2811b9a0 100644 --- a/python/fatcat_web/search.py +++ b/python/fatcat_web/search.py @@ -424,7 +424,7 @@ def get_elastic_search_coverage(query: ReleaseQuery) -> dict: return stats -def get_elastic_container_stats(ident, issnl=None): +def get_elastic_container_stats(ident, issnl=None, es_client=None, es_index=None, merge_shadows=None): """ Returns dict: ident @@ -435,7 +435,14 @@ def get_elastic_container_stats(ident, issnl=None): preserved """ - search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_RELEASE_INDEX']) + if not es_client: + es_client = app.es_client + if not es_index: + es_index = app.config['ELASTICSEARCH_RELEASE_INDEX'] + if merge_shadows is None: + merge_shadows = app.config['FATCAT_MERGE_SHADOW_PRESERVATION'] + + search = Search(using=es_client, index=es_index) search = search.query( 'term', container_id=ident, @@ -479,7 +486,7 @@ def get_elastic_container_stats(ident, issnl=None): for k in ('bright', 'dark', 'shadows_only', 'none'): if not k in preservation_bucket: preservation_bucket[k] = 0 - if app.config['FATCAT_MERGE_SHADOW_PRESERVATION']: + if merge_shadows: preservation_bucket['none'] += preservation_bucket['shadows_only'] preservation_bucket['shadows_only'] = 0 release_type_bucket = agg_to_dict(resp.aggregations.release_type) diff --git a/python/tests/web_search.py b/python/tests/web_search.py index a7bf7ec7..8df01466 100644 --- a/python/tests/web_search.py +++ b/python/tests/web_search.py @@ -165,6 +165,16 @@ def test_container_stats(app, mocker): ] rv = app.get('/container/issnl/1234-5678/stats.json') assert rv.status_code == 200 + stats = rv.json + assert isinstance(stats['total'], int) + assert isinstance(stats['release_type'], dict) + assert isinstance(stats['preservation']['total'], int) + assert isinstance(stats['preservation']['bright'], int) + assert isinstance(stats['preservation']['dark'], int) + assert isinstance(stats['preservation']['none'], int) rv = app.get('/container/aaaaaaaaaaaaaeiraaaaaaaaam/stats.json') assert rv.status_code == 200 + stats = rv.json + assert isinstance(stats['total'], int) + assert stats['ident'] == "aaaaaaaaaaaaaeiraaaaaaaaam" -- cgit v1.2.3 From b0c5db8a2bd2e389f99df1b44120c18fa5bc3e52 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 6 Apr 2021 20:27:05 -0700 Subject: transform tool: container transform stats lookup support --- python/fatcat_transform.py | 28 ++++++++++++++++++++-- .../container_jxqqgho7bncrvgfyfznramju3q.json | 1 + 2 files changed, 27 insertions(+), 2 deletions(-) create mode 100644 python/tests/files/container_jxqqgho7bncrvgfyfznramju3q.json (limited to 'python/tests') diff --git a/python/fatcat_transform.py b/python/fatcat_transform.py index 8e01c860..93c39e2f 100755 --- a/python/fatcat_transform.py +++ b/python/fatcat_transform.py @@ -9,11 +9,14 @@ import sys import json import argparse +import elasticsearch from fatcat_openapi_client import ReleaseEntity, ContainerEntity, FileEntity, ChangelogEntry + from fatcat_tools import entity_from_json, \ release_to_elasticsearch, container_to_elasticsearch, \ file_to_elasticsearch, changelog_to_elasticsearch, public_api, \ release_to_csl, citeproc_csl +from fatcat_web.search import get_elastic_container_stats def run_elasticsearch_releases(args): @@ -28,6 +31,8 @@ def run_elasticsearch_releases(args): json.dumps(release_to_elasticsearch(entity)) + '\n') def run_elasticsearch_containers(args): + es_client = elasticsearch.Elasticsearch(args.fatcat_elasticsearch_url) + es_release_index = "fatcat_release" for line in args.json_input: line = line.strip() if not line: @@ -35,8 +40,21 @@ def run_elasticsearch_containers(args): entity = entity_from_json(line, ContainerEntity, api_client=args.api.api_client) if entity.state != 'active': continue - args.json_output.write( - json.dumps(container_to_elasticsearch(entity)) + '\n') + + if args.query_stats: + es_doc = container_to_elasticsearch( + entity, + stats=get_elastic_container_stats( + entity.ident, + es_client=es_client, + es_index=es_release_index, + merge_shadows=True, + ), + ) + else: + es_doc = container_to_elasticsearch(entity) + + args.json_output.write(json.dumps(es_doc) + '\n') def run_elasticsearch_files(args): for line in args.json_input: @@ -77,6 +95,9 @@ def main(): parser.add_argument('--fatcat-api-url', default="http://localhost:9411/v0", help="connect to this host/port") + parser.add_argument('--fatcat-elasticsearch-url', + default="http://localhost:9200", + help="connect to this host/port") subparsers = parser.add_subparsers() sub_elasticsearch_releases = subparsers.add_parser('elasticsearch-releases', @@ -98,6 +119,9 @@ def main(): sub_elasticsearch_containers.add_argument('json_output', help="where to send output", default=sys.stdout, type=argparse.FileType('w')) + sub_elasticsearch_containers.add_argument('--query-stats', + action='store_true', + help="whether to query release search index for container stats") sub_elasticsearch_files = subparsers.add_parser('elasticsearch-files', help="convert fatcat file JSON schema to elasticsearch file schema") diff --git a/python/tests/files/container_jxqqgho7bncrvgfyfznramju3q.json b/python/tests/files/container_jxqqgho7bncrvgfyfznramju3q.json new file mode 100644 index 00000000..bb4d46f9 --- /dev/null +++ b/python/tests/files/container_jxqqgho7bncrvgfyfznramju3q.json @@ -0,0 +1 @@ +{"extra":{"abbrev":"Annu. Rev. Pharmacol. Toxicol.","country":"us","ezb":{"color":"red","ezb_id":"2460"},"ia":{"sim":{"peer_reviewed":true,"pub_type":"Scholarly Journals","scholarly_peer_reviewed":true,"sim_pubid":"5091","year_spans":[[1961,2009]]}},"issne":"1545-4304","issnp":"0362-1642","kbart":{"hathitrust":{"year_spans":[[1976,1992]]},"portico":{"year_spans":[[1961,1999],[2001,2001],[2003,2003],[2005,2006],[2008,2010],[2012,2019]]},"scholarsportal":{"year_spans":[[1961,2003],[2005,2019]]}},"languages":["en"],"sherpa_romeo":{"color":"yellow"},"urls":["https://www.annualreviews.org/journal/pharmtox","https://www.annualreviews.org/loi/pharmtox","http://arjournals.annualreviews.org/loi/pharmtox"]},"ident":"jxqqgho7bncrvgfyfznramju3q","issnl":"0362-1642","name":"Annual Review of Pharmacology and Toxicology","publisher":"Annual Reviews","revision":"ff56081b-9130-47a6-9e14-9901c2808502","state":"active"} -- cgit v1.2.3