From 8b3b3e5892a10bf6748c7824549641d20e2447d7 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 9 Apr 2020 17:49:08 -0700 Subject: add dedupe and query-fatcat commands --- covid19_tool.py | 30 ++++++++++++++++++ fatcat_covid19/dedupe.py | 27 ++++++++++++++++ fatcat_covid19/query.py | 80 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 137 insertions(+) create mode 100644 fatcat_covid19/dedupe.py create mode 100644 fatcat_covid19/query.py diff --git a/covid19_tool.py b/covid19_tool.py index 345aa6e..6b84f69 100755 --- a/covid19_tool.py +++ b/covid19_tool.py @@ -10,7 +10,9 @@ import sys import argparse from fatcat_covid19.parse import parse_cord19_file +from fatcat_covid19.query import query_fatcat from fatcat_covid19.enrich import enrich_fatcat_file +from fatcat_covid19.dedupe import dedupe_file from fatcat_covid19.derivatives import enrich_derivatives_file from fatcat_covid19.transform import transform_es_file @@ -36,6 +38,30 @@ def main(): type=argparse.FileType('w'), default=sys.stdout) + sub_query_fatcat = subparsers.add_parser('query-fatcat', + help="query fatcat search index for releases") + sub_query_fatcat.set_defaults( + action='query-fatcat', + ) + sub_query_fatcat.add_argument('--json-output', + help="file to write to", + type=argparse.FileType('w'), + default=sys.stdout) + + sub_dedupe = subparsers.add_parser('dedupe', + help="emit only one JSON line per fatcat release_id") + sub_dedupe.set_defaults( + action='dedupe', + ) + sub_dedupe.add_argument('--json-input', + help="input JSON rows file (eg, CORD-19 parsed JSON)", + type=argparse.FileType('r'), + default=sys.stdin) + sub_dedupe.add_argument('--json-output', + help="file to write to", + type=argparse.FileType('w'), + default=sys.stdout) + sub_enrich_fatcat = subparsers.add_parser('enrich-fatcat', help="lookup fatcat releases from JSON metadata") sub_enrich_fatcat.set_defaults( @@ -98,6 +124,10 @@ def main(): if args.action == 'parse-cord19': parse_cord19_file(args.csv_path, 
args.json_output) + elif args.action == 'query-fatcat': + query_fatcat(args.json_output) + elif args.action == 'dedupe': + dedupe_file(args.json_input, args.json_output) elif args.action == 'enrich-fatcat': enrich_fatcat_file(args.json_file, args.json_output) elif args.action == 'enrich-derivatives': diff --git a/fatcat_covid19/dedupe.py b/fatcat_covid19/dedupe.py new file mode 100644 index 0000000..86cf8fa --- /dev/null +++ b/fatcat_covid19/dedupe.py @@ -0,0 +1,27 @@ + +import sys +import json +import datetime + + +def dedupe_file(json_input, json_output): + """ + Takes JSON file of "fatcat enriched" content, and de-dupes based on the + fatcat identifier. + """ + rows = dict() + for l in json_input: + l = json.loads(l) + key = l.get('release_id') + if not key: + continue + if not key in rows: + rows[key] = l + continue + for other_info in ['cord19_paper', 'fatcat_hit',]: + if other_info in l: + rows[key][other_info] = l[other_info] + + for k in rows: + print(json.dumps(rows[k], sort_keys=True), file=json_output) + diff --git a/fatcat_covid19/query.py b/fatcat_covid19/query.py new file mode 100644 index 0000000..4397fc5 --- /dev/null +++ b/fatcat_covid19/query.py @@ -0,0 +1,80 @@ + +import os
import sys
import json
import datetime

import elasticsearch
from elasticsearch_dsl import Search, Q

from fatcat_covid19.common import requests_retry_session


def query_fatcat(json_output):
    """
    Queries fatcat search index (the full regular fatcat.wiki release index)
    for COVID-19 keywords and phrases, iterates over the result set (using
    scroll), and fetches full release entity (via api.fatcat.wiki) for each. 
+ """ + api_session = requests_retry_session() + + es_backend = os.environ.get( + "ELASTICSEARCH_BACKEND", + default="https://search.fatcat.wiki", + ) + es_index = "fatcat_release" + es_client = elasticsearch.Elasticsearch(es_backend) + + search = Search(using=es_client, index=es_index) + + search = search.exclude("terms", release_type=["stub", "component", "abstract"]) + + # "Emerald Expert Briefings" + search = search.exclude("terms", container_id=["fnllqvywjbec5eumrbavqipfym"]) + + # ResearchGate + search = search.exclude("terms", doi_prefix=["10.13140"]) + + # some industrial thing + search = search.exclude("query_string", query='"Report on SARS backfit evaluation"', fields=["title"]) + + # physic experiment + search = search.exclude("query_string", query='"TOF-SARS"', fields=["title"]) + + # species not related to SARS + # something based on excluding "lake" in title might be easier? + search = search.exclude("query_string", query='"G.O. Sars"', fields=["title"]) + search = search.exclude("query_string", query='"Gomphocythere Sars"', fields=["title"]) + search = search.exclude("query_string", query='"Australis Sars"', fields=["title"]) + search = search.exclude("query_string", query='"scutifer Sars"', fields=["title"]) + search = search.exclude("query_string", query='"lumholtzi Sars"', fields=["title"]) + + search = search.query( + Q("query_string", query='"COVID-19" coronavirus coronaviruses "sars-cov-2" "2019-nCoV" "SARS-CoV" "MERS-CoV" SARS', default_operator="OR", fields=["title", "original_title"]) | + Q("query_string", query='pandemic influenza', default_operator="AND", fields=["biblio"]) | + Q("query_string", query='epidemic influenza', default_operator="AND", fields=["biblio"]) | + Q("query_string", query='pandemic ventilator', default_operator="AND", fields=["biblio"]) + ) + + print("Expecting {} search hits".format(search.count()), file=sys.stderr) + + search = search.params(clear_scroll=False) + search = search.params(_source=False) + + results = 
search.scan() + for hit in results: + release_id = hit.meta.id + resp = api_session.get( + 'https://api.fatcat.wiki/v0/release/{}'.format(release_id), + params={ + 'expand': 'container,files,filesets,webcaptures', + 'hide': 'references', + }) + resp.raise_for_status() + row = dict( + fatcat_hit=hit.meta._d_, + release_id=release_id, + fatcat_release=resp.json(), + ) + print(json.dumps(row, sort_keys=True), file=json_output) + -- cgit v1.2.3