From 328892b0e571395fbbf8a22ca8a4216c6cf71074 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 3 Apr 2020 15:13:43 -0700 Subject: refactor enrich into fatcat_covid19 --- bin/cord19_fatcat_enrich.py | 106 -------------------------------------------- bin/parse_cord19_csv.py | 3 +- covid19_tool.py | 16 ++++++- fatcat_covid19/enrich.py | 67 ++++++++++++++++++++++++++++ 4 files changed, 83 insertions(+), 109 deletions(-) delete mode 100755 bin/cord19_fatcat_enrich.py create mode 100644 fatcat_covid19/enrich.py diff --git a/bin/cord19_fatcat_enrich.py b/bin/cord19_fatcat_enrich.py deleted file mode 100755 index 2478227..0000000 --- a/bin/cord19_fatcat_enrich.py +++ /dev/null @@ -1,106 +0,0 @@ -#!/usr/bin/env python3 - -""" -Takes a JSON-transformed CORD-19 *metadata* file and enriches it with fatcat -metadata. - -TODO: refactor into `fatcat_covid19` module and wrapper CLI script. -""" - -import sys -import json -import argparse -import datetime - -import requests -from requests.adapters import HTTPAdapter -from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error - - -def requests_retry_session(retries=10, backoff_factor=3, - status_forcelist=(500, 502, 504), session=None): - """ - From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests - """ - session = session or requests.Session() - retry = Retry( - total=retries, - read=retries, - connect=retries, - backoff_factor=backoff_factor, - status_forcelist=status_forcelist, - ) - adapter = HTTPAdapter(max_retries=retry) - session.mount('http://', adapter) - session.mount('https://', adapter) - return session - - -def do_line(row, args): - - pubmed_id = row.get('pubmed_id') or None - pmcid = row.get('pmcid') or None - doi = row.get('doi') or None - fatcat_release = None - - if doi == '0.1126/science.abb7331': - doi = '10.1126/science.abb7331' - - if not fatcat_release and pmcid: - resp = args.session.get('https://api.fatcat.wiki/v0/release/lookup', - params={ - 'pmcid': 
pmcid, - 'expand': 'container,files,filesets,webcaptures', - 'hide': 'abstracts,references', - }) - if resp.status_code == 200: - fatcat_release = resp.json() - if not fatcat_release and doi: - resp = args.session.get('https://api.fatcat.wiki/v0/release/lookup', - params={ - 'doi': doi, - 'expand': 'container,files,filesets,webcaptures', - 'hide': 'abstracts,references', - }) - if resp.status_code == 200: - fatcat_release = resp.json() - if not fatcat_release and pubmed_id: - resp = args.session.get('https://api.fatcat.wiki/v0/release/lookup', - params={ - 'pmid': pubmed_id, - 'expand': 'container,files,filesets,webcaptures', - 'hide': 'abstracts,references', - }) - if resp.status_code == 200: - fatcat_release = resp.json() - - obj = dict( - cord19_paper=row, - ) - if fatcat_release: - obj['fatcat_release'] = fatcat_release - obj['release_id'] = fatcat_release['ident'] - obj['fatcat_url'] = "https://fatcat.wiki/release/{}".format(obj['release_id']) - print(json.dumps(obj, sort_keys=True)) - -def run(args): - for l in args.json_file: - l = json.loads(l) - do_line(l, args) - -def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('json_file', - help="CORD-19 parsed JSON file", - type=argparse.FileType('r')) - subparsers = parser.add_subparsers() - - args = parser.parse_args() - args.session = requests_retry_session() - - run(args) - -if __name__ == '__main__': - main() - diff --git a/bin/parse_cord19_csv.py b/bin/parse_cord19_csv.py index 55cd81b..dbc6cc5 100755 --- a/bin/parse_cord19_csv.py +++ b/bin/parse_cord19_csv.py @@ -17,4 +17,5 @@ with open(CSVFILE, newline='') as csvfile: row = dict(row) row['mag_id'] = row.pop('Microsoft Academic Paper ID') row['who_covidence_id'] = row.pop('WHO #Covidence').replace('#', '') - print(json.dumps(row, sort_keys=True)) + obj = dict(cord19_paper=row) + print(json.dumps(obj, sort_keys=True)) diff --git a/covid19_tool.py b/covid19_tool.py index 
1cf8dce..e984c28 100755 --- a/covid19_tool.py +++ b/covid19_tool.py @@ -70,14 +70,26 @@ def main(): type=argparse.FileType('r'), default=sys.stdout) + sub_enrich_fatcat = subparsers.add_parser('enrich-fatcat', + help="lookup fatcat releases from JSON metadata") + sub_enrich_fatcat.add_argument('json_file', + help="input JSON rows file (eg, CORD-19 parsed JSON)", + type=argparse.FileType('r')) + sub_enrich_fatcat.add_argument('--json-output', + help="file to write to", + type=argparse.FileType('r'), + default=sys.stdout) + args = parser.parse_args() if args.action == 'webface': app.run(debug=args.debug, host=args.host, port=args.port) - if args.action == 'derivatives': + elif args.action == 'derivatives': enrich_derivatives_file(args.json_file, args.json_output, args.base_dir) - if args.action == 'transform-es': + elif args.action == 'transform-es': + transform_es_file(args.json_file, args.json_output) + elif args.action == 'enrich-fatcat': transform_es_file(args.json_file, args.json_output) else: print("tell me what to do!")
diff --git a/fatcat_covid19/enrich.py b/fatcat_covid19/enrich.py
new file mode 100644
index 0000000..245a357
--- /dev/null
+++ b/fatcat_covid19/enrich.py
@@ -0,0 +1,77 @@
+
+import sys
+import json
+import datetime
+
+from fatcat_covid19.common import requests_retry_session
+
+
+def enrich_fatcat_row(row, api_session):
+    """
+    Looks up the fatcat release for one row and attaches it to the row.
+
+    `row` is a dict. Its 'cord19_paper' dict supplies the 'pmcid', 'doi' and
+    'pubmed_id' identifiers, which are tried against the fatcat release
+    lookup endpoint in that order until one request returns HTTP 200 with a
+    non-empty body. `api_session` only needs a requests-style
+    `get(url, params=...)` method.
+
+    Returns the same `row` dict, with 'fatcat_release' and 'release_id' added
+    in place when a release was found. A row without 'cord19_paper' is
+    returned untouched. Emitting the row is left to the caller, so the
+    result always ends up in the caller's chosen output.
+    """
+    cord19_paper = row.get('cord19_paper')
+    if not cord19_paper:
+        return row
+
+    doi = cord19_paper.get('doi') or None
+    # one-off correction: this DOI arrives without its leading '1'
+    if doi == '0.1126/science.abb7331':
+        doi = '10.1126/science.abb7331'
+
+    # (lookup query parameter, identifier value), in priority order
+    lookups = [
+        ('pmcid', cord19_paper.get('pmcid') or None),
+        ('doi', doi),
+        ('pmid', cord19_paper.get('pubmed_id') or None),
+    ]
+
+    fatcat_release = None
+    for param, value in lookups:
+        if not value:
+            continue
+        resp = api_session.get('https://api.fatcat.wiki/v0/release/lookup',
+            params={
+                param: value,
+                'expand': 'container,files,filesets,webcaptures',
+                'hide': 'abstracts,references',
+            })
+        if resp.status_code == 200:
+            fatcat_release = resp.json()
+        if fatcat_release:
+            break
+
+    if fatcat_release:
+        row['fatcat_release'] = fatcat_release
+        row['release_id'] = fatcat_release['ident']
+    # hand the row back instead of printing it: the caller owns the output
+    return row
+
+
+def enrich_fatcat_file(json_input, json_output):
+    """
+    Takes a JSON-transformed CORD-19 *metadata* file and enriches it with
+    fatcat metadata.
+
+    `json_input` is iterated line by line, one JSON object per line; blank
+    lines are skipped. Each resulting row is written to `json_output` as a
+    single JSON line with sorted keys.
+    """
+    api_session = requests_retry_session()
+    for line in json_input:
+        if not line.strip():
+            continue
+        result = enrich_fatcat_row(json.loads(line), api_session)
+        if result:
+            print(json.dumps(result, sort_keys=True), file=json_output)
-- 
cgit v1.2.3