diff options
Diffstat (limited to 'scripts')
-rwxr-xr-x | scripts/cord19_fatcat_enrich.py (renamed from scripts/who_enrich.py) | 20 |
-rwxr-xr-x | scripts/deliver_file2disk.py | 10 |
2 files changed, 7 insertions, 23 deletions
diff --git a/scripts/who_enrich.py b/scripts/cord19_fatcat_enrich.py
index b445927..5d3a554 100755
--- a/scripts/who_enrich.py
+++ b/scripts/cord19_fatcat_enrich.py
@@ -1,17 +1,8 @@
 #!/usr/bin/env python3
 
 """
-This script takes a "Paper" MAG TSV file which has been joined with (at most) a
-single "PaperExtendedAttributes", parses it into JSON, and does fatcat fetches
-to "enrich" the output. Outputs a single JSON object per line with attributes:
-
-    mag_id
-    mag_paper
-    release_id
-    fatcat_release
-
-Input columns:
-
+Takes a JSON-transformed CORD-19 *metadata* file and enriches it with fatcat
+metadata.
 """
 
 import sys
@@ -50,6 +41,9 @@ def do_line(row, args):
     doi = row.get('doi') or None
     fatcat_release = None
 
+    if doi == '0.1126/science.abb7331':
+        doi = '10.1126/science.abb7331'
+
     if not fatcat_release and pmcid:
         resp = args.session.get('https://api.fatcat.wiki/v0/release/lookup',
             params={
@@ -79,7 +73,7 @@ def do_line(row, args):
             fatcat_release = resp.json()
 
     obj = dict(
-        who_paper=row,
+        cord19_paper=row,
     )
     if fatcat_release:
         obj['fatcat_release'] = fatcat_release
@@ -96,7 +90,7 @@ def main():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument('json_file',
-        help="WHO/S2 parsed JSON file",
+        help="CORD-19 parsed JSON file",
         type=argparse.FileType('r'))
     subparsers = parser.add_subparsers()
 
diff --git a/scripts/deliver_file2disk.py b/scripts/deliver_file2disk.py
index d661acc..9ec234a 100755
--- a/scripts/deliver_file2disk.py
+++ b/scripts/deliver_file2disk.py
@@ -10,16 +10,6 @@ Behavior:
 - try downloading from any archive.org or web.archive.org URLs
 - verify SHA-1
 - write out to disk
-
-TODO:
-x blob_path(sha1hex) -> returns relative/local path file would be saved to
-x filter_files(files) -> list of files to try
-- fetch_release(release) -> tries to download PDF bytes
-- fetch_file(file) -> returns bytes of fetched file
-- fetch_content(url) -> tries to download PDF bytes
-
-LATER:
-- GRBOID XML as well, from minio?
 """
 
 # XXX: some broken MRO thing going on in here due to python3 object wrangling