From a2d06feede4645ac4072d471150fc8d32b9636dc Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 24 Mar 2020 20:33:21 -0700
Subject: move and tweak scripts

---
 scripts/cord19_fatcat_enrich.py | 131 ++++++++++++++++++++++++++++++++++++++++
 scripts/deliver_file2disk.py    |  10 ---
 scripts/who_enrich.py           | 110 ---------------------------------
 3 files changed, 131 insertions(+), 120 deletions(-)
 create mode 100755 scripts/cord19_fatcat_enrich.py
 delete mode 100755 scripts/who_enrich.py

diff --git a/scripts/cord19_fatcat_enrich.py b/scripts/cord19_fatcat_enrich.py
new file mode 100755
index 0000000..5d3a554
--- /dev/null
+++ b/scripts/cord19_fatcat_enrich.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python3
+
+"""
+Takes a JSON-transformed CORD-19 *metadata* file and enriches it with fatcat
+metadata.
+
+Reads one JSON object per line from the input file ('-' for stdin) and prints
+one JSON object per line to stdout, with attributes:
+
+    cord19_paper
+    release_id (only if a fatcat release was found)
+    fatcat_release (only if a fatcat release was found)
+    fatcat_url (only if a fatcat release was found)
+"""
+
+import sys
+import json
+import argparse
+import datetime
+
+import requests
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
+
+
+FATCAT_LOOKUP_URL = 'https://api.fatcat.wiki/v0/release/lookup'
+
+
+def requests_retry_session(retries=10, backoff_factor=3,
+        status_forcelist=(500, 502, 504), session=None):
+    """
+    Returns a requests session which retries failed requests.
+
+    retries: limit applied to total, read, and connect retries
+    backoff_factor: passed through to urllib3 Retry
+    status_forcelist: HTTP status codes which trigger a retry
+    session: existing session to configure (a new one is created if not set)
+
+    From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
+    """
+    session = session or requests.Session()
+    retry = Retry(
+        total=retries,
+        read=retries,
+        connect=retries,
+        backoff_factor=backoff_factor,
+        status_forcelist=status_forcelist,
+    )
+    adapter = HTTPAdapter(max_retries=retry)
+    session.mount('http://', adapter)
+    session.mount('https://', adapter)
+    return session
+
+
+def do_line(row, args):
+    """
+    Looks up the fatcat release for a single CORD-19 row (dict) and prints the
+    combined record to stdout as one line of JSON.
+
+    Identifiers are tried in order (PMCID, DOI, PMID); the first lookup which
+    returns HTTP 200 (with a non-empty JSON body) wins. Rows with no match are
+    still printed, just without the fatcat fields.
+
+    args.session must be a requests session (see requests_retry_session()).
+    """
+
+    doi = row.get('doi') or None
+    # one-off fixup for a DOI missing its leading '1' (presumably a typo upstream)
+    if doi == '0.1126/science.abb7331':
+        doi = '10.1126/science.abb7331'
+
+    # (fatcat lookup parameter, value) pairs, in order of preference
+    lookups = [
+        ('pmcid', row.get('pmcid') or None),
+        ('doi', doi),
+        ('pmid', row.get('pubmed_id') or None),
+    ]
+
+    fatcat_release = None
+    for key, value in lookups:
+        if fatcat_release:
+            break
+        if not value:
+            continue
+        resp = args.session.get(FATCAT_LOOKUP_URL,
+            params={
+                key: value,
+                'expand': 'container,files,filesets,webcaptures',
+                'hide': 'abstracts,references',
+        })
+        if resp.status_code == 200:
+            fatcat_release = resp.json()
+
+    obj = dict(
+        cord19_paper=row,
+    )
+    if fatcat_release:
+        obj['fatcat_release'] = fatcat_release
+        obj['release_id'] = fatcat_release['ident']
+        obj['fatcat_url'] = "https://fatcat.wiki/release/{}".format(obj['release_id'])
+    print(json.dumps(obj, sort_keys=True))
+
+def run(args):
+    """
+    Enriches every JSON line read from args.json_file.
+    """
+    for line in args.json_file:
+        line = line.strip()
+        if not line:
+            # tolerate blank lines (eg, a trailing newline at end of file)
+            continue
+        do_line(json.loads(line), args)
+
+def main():
+    """
+    Command-line entry point: parses arguments, then enriches the input file.
+    """
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('json_file',
+        help="CORD-19 parsed JSON file ('-' for stdin)",
+        type=argparse.FileType('r'))
+
+    args = parser.parse_args()
+    args.session = requests_retry_session()
+
+    run(args)
+
+if __name__ == '__main__':
+    main()
+
diff --git a/scripts/deliver_file2disk.py b/scripts/deliver_file2disk.py
index d661acc..9ec234a 100755
--- a/scripts/deliver_file2disk.py
+++ b/scripts/deliver_file2disk.py
@@ -10,16 +10,6 @@ Behavior:
 - try downloading from any archive.org or web.archive.org URLs
 - verify SHA-1
 - write out to disk
-
-TODO:
-x blob_path(sha1hex) -> returns relative/local path file would be saved to
-x filter_files(files) -> list of files to try
-- fetch_release(release) -> tries to download PDF bytes
-- fetch_file(file) -> returns bytes of fetched file
-- fetch_content(url) -> tries to download PDF bytes
-
-LATER:
-- GRBOID XML as well, from minio?
 """
 
 # XXX: some broken MRO thing going on in here due to python3 object wrangling
diff --git a/scripts/who_enrich.py b/scripts/who_enrich.py
deleted file mode 100755
index b445927..0000000
--- a/scripts/who_enrich.py
+++ /dev/null
@@ -1,110 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-This script takes a "Paper" MAG TSV file which has been joined with (at most) a
-single "PaperExtendedAttributes", parses it into JSON, and does fatcat fetches
-to "enrich" the output. Outputs a single JSON object per line with attributes:
-
-    mag_id
-    mag_paper
-    release_id
-    fatcat_release
-
-Input columns:
-
-"""
-
-import sys
-import json
-import argparse
-import datetime
-
-import requests
-from requests.adapters import HTTPAdapter
-from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
-
-
-def requests_retry_session(retries=10, backoff_factor=3,
-        status_forcelist=(500, 502, 504), session=None):
-    """
-    From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
-    """
-    session = session or requests.Session()
-    retry = Retry(
-        total=retries,
-        read=retries,
-        connect=retries,
-        backoff_factor=backoff_factor,
-        status_forcelist=status_forcelist,
-    )
-    adapter = HTTPAdapter(max_retries=retry)
-    session.mount('http://', adapter)
-    session.mount('https://', adapter)
-    return session
-
-
-def do_line(row, args):
-
-    pubmed_id = row.get('pubmed_id') or None
-    pmcid = row.get('pmcid') or None
-    doi = row.get('doi') or None
-    fatcat_release = None
-
-    if not fatcat_release and pmcid:
-        resp = args.session.get('https://api.fatcat.wiki/v0/release/lookup',
-            params={
-                'pmcid': pmcid,
-                'expand': 'container,files,filesets,webcaptures',
-                'hide': 'abstracts,references',
-        })
-        if resp.status_code == 200:
-            fatcat_release = resp.json()
-    if not fatcat_release and doi:
-        resp = args.session.get('https://api.fatcat.wiki/v0/release/lookup',
-            params={
-                'doi': doi,
-                'expand': 'container,files,filesets,webcaptures',
-                'hide': 'abstracts,references',
-        })
-        if resp.status_code == 200:
-            fatcat_release = resp.json()
-    if not fatcat_release and pubmed_id:
-        resp = args.session.get('https://api.fatcat.wiki/v0/release/lookup',
-            params={
-                'pmid': pubmed_id,
-                'expand': 'container,files,filesets,webcaptures',
-                'hide': 'abstracts,references',
-        })
-        if resp.status_code == 200:
-            fatcat_release = resp.json()
-
-    obj = dict(
-        who_paper=row,
-    )
-    if fatcat_release:
-        obj['fatcat_release'] = fatcat_release
-        obj['release_id'] = fatcat_release['ident']
-        obj['fatcat_url'] = "https://fatcat.wiki/release/{}".format(obj['release_id'])
-    print(json.dumps(obj, sort_keys=True))
-
-def run(args):
-    for l in sys.stdin:
-        l = json.loads(l)
-        do_line(l, args)
-
-def main():
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('json_file',
-        help="WHO/S2 parsed JSON file",
-        type=argparse.FileType('r'))
-    subparsers = parser.add_subparsers()
-
-    args = parser.parse_args()
-    args.session = requests_retry_session()
-
-    run(args)
-
-if __name__ == '__main__':
-    main()
-
-- 
cgit v1.2.3