aboutsummaryrefslogtreecommitdiffstats
path: root/bin/cord19_fatcat_enrich.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-04-01 15:27:25 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-04-01 15:27:25 -0700
commit: 485c8b8432d839bb3cc0bd67152adda4bbf0df20 (patch)
tree: be8b0b11c3a9ec722366945bcd880245222cc6c7 /bin/cord19_fatcat_enrich.py
parent: 7f30c43cd4955d596ebc51a56aea7996b1005e47 (diff)
download: fatcat-covid19-485c8b8432d839bb3cc0bd67152adda4bbf0df20.tar.gz
fatcat-covid19-485c8b8432d839bb3cc0bd67152adda4bbf0df20.zip
move scripts/ to bin/
Diffstat (limited to 'bin/cord19_fatcat_enrich.py')
-rwxr-xr-x bin/cord19_fatcat_enrich.py 104
1 files changed, 104 insertions, 0 deletions
diff --git a/bin/cord19_fatcat_enrich.py b/bin/cord19_fatcat_enrich.py
new file mode 100755
index 0000000..a911007
--- /dev/null
+++ b/bin/cord19_fatcat_enrich.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python3
+
+"""
+Takes a JSON-transformed CORD-19 *metadata* file and enriches it with fatcat
+metadata.
+"""
+
+import sys
+import json
+import argparse
+import datetime
+
+import requests
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
+
+
def requests_retry_session(retries=10, backoff_factor=3,
                           status_forcelist=(500, 502, 504), session=None):
    """
    Return a requests session whose 'http://' and 'https://' adapters retry.

    Adapted from:
    https://www.peterbe.com/plog/best-practice-with-retries-with-requests

    `retries` is used for the total, read and connect retry counts alike;
    `backoff_factor` and `status_forcelist` are handed to `Retry` unchanged.
    If a truthy `session` is passed, it is configured and returned instead
    of a freshly created `requests.Session`.
    """
    if not session:
        session = requests.Session()
    # One adapter instance (and so one retry policy) serves both schemes.
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(
            total=retries,
            read=retries,
            connect=retries,
            backoff_factor=backoff_factor,
            status_forcelist=status_forcelist,
        ),
    )
    for scheme in ('http://', 'https://'):
        session.mount(scheme, retrying_adapter)
    return session
+
+
# fatcat API endpoint that resolves an external identifier to a release.
_FATCAT_LOOKUP_URL = 'https://api.fatcat.wiki/v0/release/lookup'

# Seconds to wait on a single lookup request. Without a timeout a stalled
# connection would block the whole batch indefinitely.
_FATCAT_LOOKUP_TIMEOUT = 60.0

# Known-bad DOIs seen in the input, mapped to their corrected form.
_DOI_FIXUPS = {
    '0.1126/science.abb7331': '10.1126/science.abb7331',
}


def _fatcat_lookup(session, ext_id_type, ext_id):
    """Look up one external identifier; return the release JSON or None."""
    resp = session.get(
        _FATCAT_LOOKUP_URL,
        params={
            ext_id_type: ext_id,
            'expand': 'container,files,filesets,webcaptures',
            'hide': 'abstracts,references',
        },
        timeout=_FATCAT_LOOKUP_TIMEOUT,
    )
    # Any non-200 response is treated as "no match" for this identifier.
    if resp.status_code != 200:
        return None
    return resp.json()


def do_line(row, args):
    """
    Enrich one CORD-19 metadata row with its fatcat release and print it.

    Args:
        row: dict for one paper; the 'pmcid', 'doi' and 'pubmed_id' keys are
            read, and missing or empty values are skipped.
        args: object with a `session` attribute providing a requests-style
            `get(url, params=..., timeout=...)` method.

    Identifiers are tried in the order PMCID, DOI, PMID, stopping at the
    first lookup that returns a release. One JSON object is printed to
    stdout with keys sorted: it always contains 'cord19_paper' (the row,
    unmodified) and, when a release was found, 'fatcat_release',
    'release_id' and 'fatcat_url'.
    """
    doi = row.get('doi') or None
    # Only the DOI used for the lookup is corrected; the row is left as-is.
    if isinstance(doi, str):
        doi = _DOI_FIXUPS.get(doi, doi)

    # (fatcat lookup parameter, identifier value), in priority order.
    candidates = (
        ('pmcid', row.get('pmcid') or None),
        ('doi', doi),
        ('pmid', row.get('pubmed_id') or None),
    )

    fatcat_release = None
    for ext_id_type, ext_id in candidates:
        if not ext_id:
            continue
        fatcat_release = _fatcat_lookup(args.session, ext_id_type, ext_id)
        if fatcat_release:
            break

    obj = {'cord19_paper': row}
    if fatcat_release:
        obj['fatcat_release'] = fatcat_release
        obj['release_id'] = fatcat_release['ident']
        obj['fatcat_url'] = "https://fatcat.wiki/release/{}".format(obj['release_id'])
    print(json.dumps(obj, sort_keys=True))
+
def run(args):
    """
    Enrich every row of the input file, one JSON document per line.

    Args:
        args: object whose `json_file` attribute is an iterable of text
            lines; it is also passed through unchanged to `do_line`.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    for line in args.json_file:
        # A blank line (e.g. a trailing newline at end of file) is not a
        # JSON document; skip it instead of aborting the whole run.
        if not line.strip():
            continue
        do_line(json.loads(line), args)
+
def main():
    """
    Command-line entry point.

    Parses the single positional `json_file` argument (opened for reading
    by argparse), attaches a retrying HTTP session to the parsed arguments
    as `args.session`, and processes the file with `run`.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('json_file',
        help="CORD-19 parsed JSON file",
        type=argparse.FileType('r'))

    args = parser.parse_args()
    args.session = requests_retry_session()

    try:
        run(args)
    finally:
        # argparse.FileType opens the file but never closes it for us.
        args.json_file.close()
+
+if __name__ == '__main__':  # Call main() only when run as a script, not on import.
+ main()
+