about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rwxr-xr-x  bin/cord19_fatcat_enrich.py  106
-rwxr-xr-x  bin/parse_cord19_csv.py  3
-rwxr-xr-x  covid19_tool.py  16
-rw-r--r--  fatcat_covid19/enrich.py  67
4 files changed, 83 insertions, 109 deletions
diff --git a/bin/cord19_fatcat_enrich.py b/bin/cord19_fatcat_enrich.py
deleted file mode 100755
index 2478227..0000000
--- a/bin/cord19_fatcat_enrich.py
+++ /dev/null
@@ -1,106 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-Takes a JSON-transformed CORD-19 *metadata* file and enriches it with fatcat
-metadata.
-
-TODO: refactor into `fatcat_covid19` module and wrapper CLI script.
-"""
-
-import sys
-import json
-import argparse
-import datetime
-
-import requests
-from requests.adapters import HTTPAdapter
-from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
-
-
-def requests_retry_session(retries=10, backoff_factor=3,
- status_forcelist=(500, 502, 504), session=None):
- """
- From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
- """
- session = session or requests.Session()
- retry = Retry(
- total=retries,
- read=retries,
- connect=retries,
- backoff_factor=backoff_factor,
- status_forcelist=status_forcelist,
- )
- adapter = HTTPAdapter(max_retries=retry)
- session.mount('http://', adapter)
- session.mount('https://', adapter)
- return session
-
-
-def do_line(row, args):
-
- pubmed_id = row.get('pubmed_id') or None
- pmcid = row.get('pmcid') or None
- doi = row.get('doi') or None
- fatcat_release = None
-
- if doi == '0.1126/science.abb7331':
- doi = '10.1126/science.abb7331'
-
- if not fatcat_release and pmcid:
- resp = args.session.get('https://api.fatcat.wiki/v0/release/lookup',
- params={
- 'pmcid': pmcid,
- 'expand': 'container,files,filesets,webcaptures',
- 'hide': 'abstracts,references',
- })
- if resp.status_code == 200:
- fatcat_release = resp.json()
- if not fatcat_release and doi:
- resp = args.session.get('https://api.fatcat.wiki/v0/release/lookup',
- params={
- 'doi': doi,
- 'expand': 'container,files,filesets,webcaptures',
- 'hide': 'abstracts,references',
- })
- if resp.status_code == 200:
- fatcat_release = resp.json()
- if not fatcat_release and pubmed_id:
- resp = args.session.get('https://api.fatcat.wiki/v0/release/lookup',
- params={
- 'pmid': pubmed_id,
- 'expand': 'container,files,filesets,webcaptures',
- 'hide': 'abstracts,references',
- })
- if resp.status_code == 200:
- fatcat_release = resp.json()
-
- obj = dict(
- cord19_paper=row,
- )
- if fatcat_release:
- obj['fatcat_release'] = fatcat_release
- obj['release_id'] = fatcat_release['ident']
- obj['fatcat_url'] = "https://fatcat.wiki/release/{}".format(obj['release_id'])
- print(json.dumps(obj, sort_keys=True))
-
-def run(args):
- for l in args.json_file:
- l = json.loads(l)
- do_line(l, args)
-
-def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('json_file',
- help="CORD-19 parsed JSON file",
- type=argparse.FileType('r'))
- subparsers = parser.add_subparsers()
-
- args = parser.parse_args()
- args.session = requests_retry_session()
-
- run(args)
-
-if __name__ == '__main__':
- main()
-
diff --git a/bin/parse_cord19_csv.py b/bin/parse_cord19_csv.py
index 55cd81b..dbc6cc5 100755
--- a/bin/parse_cord19_csv.py
+++ b/bin/parse_cord19_csv.py
@@ -17,4 +17,5 @@ with open(CSVFILE, newline='') as csvfile:
row = dict(row)
row['mag_id'] = row.pop('Microsoft Academic Paper ID')
row['who_covidence_id'] = row.pop('WHO #Covidence').replace('#', '')
- print(json.dumps(row, sort_keys=True))
+ obj = dict(cord19_paper=row)
+ print(json.dumps(obj, sort_keys=True))
diff --git a/covid19_tool.py b/covid19_tool.py
index 1cf8dce..e984c28 100755
--- a/covid19_tool.py
+++ b/covid19_tool.py
@@ -70,14 +70,26 @@ def main():
type=argparse.FileType('r'),
default=sys.stdout)
+    sub_enrich_fatcat = subparsers.add_parser('enrich-fatcat',
+        help="lookup fatcat releases from JSON metadata")
+    sub_enrich_fatcat.add_argument('json_file',
+        help="input JSON rows file (eg, CORD-19 parsed JSON)",
+        type=argparse.FileType('r'))
+    sub_enrich_fatcat.add_argument('--json-output',
+        help="file to write to",
+        type=argparse.FileType('w'), default=sys.stdout)
+
     args = parser.parse_args()
     if args.action == 'webface':
         app.run(debug=args.debug, host=args.host, port=args.port)
-    if args.action == 'derivatives':
+    elif args.action == 'derivatives':
         enrich_derivatives_file(args.json_file, args.json_output,
             args.base_dir)
-    if args.action == 'transform-es':
+    elif args.action == 'transform-es':
         transform_es_file(args.json_file, args.json_output)
+    elif args.action == 'enrich-fatcat':
+        from fatcat_covid19.enrich import enrich_fatcat_file  # local: no module-level import in this change
+        enrich_fatcat_file(args.json_file, args.json_output)
else:
print("tell me what to do!")
diff --git a/fatcat_covid19/enrich.py b/fatcat_covid19/enrich.py
new file mode 100644
index 0000000..245a357
--- /dev/null
+++ b/fatcat_covid19/enrich.py
@@ -0,0 +1,67 @@
+
+import sys
+import json
+import datetime
+
+from fatcat_covid19.common import requests_retry_session
+
+
+def enrich_fatcat_row(row, api_session):
+    """
+    Looks up the fatcat release for a single CORD-19 row and attaches it.
+
+    `row` is a dict; when it has a non-empty 'cord19_paper' dict, the paper's
+    'pmcid', 'doi' and 'pubmed_id' are tried, in that order, against the
+    fatcat release lookup API until one request returns HTTP 200.
+    `api_session` is a requests-style session (only `.get()` is used).
+
+    On a match, 'fatcat_release' and 'release_id' are set on `row` in place.
+    Always returns `row`, matched or not, so the caller decides where it is
+    written; nothing is printed here.
+    """
+
+    cord19_paper = row.get('cord19_paper')
+    if not cord19_paper:
+        return row
+
+    doi = cord19_paper.get('doi') or None
+    # patch one DOI that shows up missing its leading '1'
+    if doi == '0.1126/science.abb7331':
+        doi = '10.1126/science.abb7331'
+
+    # (API lookup parameter, identifier value), in priority order
+    lookups = (
+        ('pmcid', cord19_paper.get('pmcid') or None),
+        ('doi', doi),
+        ('pmid', cord19_paper.get('pubmed_id') or None),
+    )
+    fatcat_release = None
+    for param, ident in lookups:
+        if fatcat_release or not ident:
+            continue
+        resp = api_session.get('https://api.fatcat.wiki/v0/release/lookup',
+            params={
+                param: ident,
+                'expand': 'container,files,filesets,webcaptures',
+                'hide': 'abstracts,references',
+            })
+        if resp.status_code == 200:
+            fatcat_release = resp.json()
+
+    if fatcat_release:
+        row['fatcat_release'] = fatcat_release
+        row['release_id'] = fatcat_release['ident']
+    return row
+
+
+def enrich_fatcat_file(json_input, json_output):
+    """
+    Takes a JSON-transformed CORD-19 *metadata* file (one JSON object per
+    line), enriches each row via `enrich_fatcat_row`, and writes every row it
+    returns to `json_output` as a JSON line. Blank input lines are skipped.
+    """
+    api_session = requests_retry_session()
+    for line in filter(str.strip, json_input):  # drop blank/whitespace lines
+        result = enrich_fatcat_row(json.loads(line), api_session)
+        if result:
+            print(json.dumps(result, sort_keys=True), file=json_output)