about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-03-24 20:33:21 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-03-24 20:33:21 -0700
commit: a2d06feede4645ac4072d471150fc8d32b9636dc (patch)
tree: 28e64c2b4d4a5b82301fd4fd698fd488a7f0619e
parent: 19aaf791c645d651da4736d081f8a75cb67832f6 (diff)
download: fatcat-covid19-a2d06feede4645ac4072d471150fc8d32b9636dc.tar.gz
download: fatcat-covid19-a2d06feede4645ac4072d471150fc8d32b9636dc.zip
move and tweak scripts
-rwxr-xr-x  scripts/cord19_fatcat_enrich.py (renamed from scripts/who_enrich.py)  20
-rwxr-xr-x  scripts/deliver_file2disk.py  10
2 files changed, 7 insertions, 23 deletions
diff --git a/scripts/who_enrich.py b/scripts/cord19_fatcat_enrich.py
index b445927..5d3a554 100755
--- a/scripts/who_enrich.py
+++ b/scripts/cord19_fatcat_enrich.py
@@ -1,17 +1,8 @@
#!/usr/bin/env python3
"""
-This script takes a "Paper" MAG TSV file which has been joined with (at most) a
-single "PaperExtendedAttributes", parses it into JSON, and does fatcat fetches
-to "enrich" the output. Outputs a single JSON object per line with attributes:
-
- mag_id
- mag_paper
- release_id
- fatcat_release
-
-Input columns:
-
+Takes a JSON-transformed CORD-19 *metadata* file and enriches it with fatcat
+metadata.
"""
import sys
@@ -50,6 +41,9 @@ def do_line(row, args):
doi = row.get('doi') or None
fatcat_release = None
+ if doi == '0.1126/science.abb7331':
+ doi = '10.1126/science.abb7331'
+
if not fatcat_release and pmcid:
resp = args.session.get('https://api.fatcat.wiki/v0/release/lookup',
params={
@@ -79,7 +73,7 @@ def do_line(row, args):
fatcat_release = resp.json()
obj = dict(
- who_paper=row,
+ cord19_paper=row,
)
if fatcat_release:
obj['fatcat_release'] = fatcat_release
@@ -96,7 +90,7 @@ def main():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('json_file',
- help="WHO/S2 parsed JSON file",
+ help="CORD-19 parsed JSON file",
type=argparse.FileType('r'))
subparsers = parser.add_subparsers()
diff --git a/scripts/deliver_file2disk.py b/scripts/deliver_file2disk.py
index d661acc..9ec234a 100755
--- a/scripts/deliver_file2disk.py
+++ b/scripts/deliver_file2disk.py
@@ -10,16 +10,6 @@ Behavior:
- try downloading from any archive.org or web.archive.org URLs
- verify SHA-1
- write out to disk
-
-TODO:
-x blob_path(sha1hex) -> returns relative/local path file would be saved to
-x filter_files(files) -> list of files to try
-- fetch_release(release) -> tries to download PDF bytes
-- fetch_file(file) -> returns bytes of fetched file
-- fetch_content(url) -> tries to download PDF bytes
-
-LATER:
-- GRBOID XML as well, from minio?
"""
# XXX: some broken MRO thing going on in here due to python3 object wrangling