diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-11-13 16:46:09 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-11-13 16:46:09 -0800 |
commit | 47e0b699b2a350b0081a64ebbcaba991c53cfb52 (patch) | |
tree | e7967828a6c4284d1bd189ac973c5f0041965179 /notes/ingest | |
parent | 885bff50bbe57322ad32f4fbfab8d846e54671f2 (diff) | |
download | sandcrawler-47e0b699b2a350b0081a64ebbcaba991c53cfb52.tar.gz sandcrawler-47e0b699b2a350b0081a64ebbcaba991c53cfb52.zip |
ingest/backfill notes
Diffstat (limited to 'notes/ingest')
-rw-r--r-- | notes/ingest/.gitignore | 2 | ||||
-rw-r--r-- | notes/ingest/20191023_testing.md | 8 | ||||
-rwxr-xr-x | notes/ingest/es_csv_to_json.py | 37 |
3 files changed, 47 insertions, 0 deletions
```diff
diff --git a/notes/ingest/.gitignore b/notes/ingest/.gitignore
new file mode 100644
index 0000000..343a25c
--- /dev/null
+++ b/notes/ingest/.gitignore
@@ -0,0 +1,2 @@
+*.csv
+*.json
diff --git a/notes/ingest/20191023_testing.md b/notes/ingest/20191023_testing.md
new file mode 100644
index 0000000..481c4e2
--- /dev/null
+++ b/notes/ingest/20191023_testing.md
@@ -0,0 +1,8 @@
+
+exported not-archived DOIs for elife, as well as general list.
+
+    wc -l recent\ missing\ oa\ releases.csv
+    161828 recent missing oa releases.csv
+
+    wc -l missing\ elife\ DOIs.csv
+    1779 missing elife DOIs.csv
diff --git a/notes/ingest/es_csv_to_json.py b/notes/ingest/es_csv_to_json.py
new file mode 100755
index 0000000..4cd1811
--- /dev/null
+++ b/notes/ingest/es_csv_to_json.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+
+"""
+    input like:
+
+        doi,ident,"release_stage"
+        "10.7554/elife.38904",mxj534diw5gatc26rkif3io5xm,published
+        "10.7554/elife.41855",kag74qc6dfex7ftpfkf7iaus44,published
+        "10.7554/elife.41156",ienee5vxcbbbfhs2q54h4455hu,published
+        "10.7554/elife.43230",52rpllol2rcndjqs3xfwcldeka,published
+        "10.7554/elife.42591",fpz642gihrc3jd2vibg6gnjrxm,published
+
+    output like:
+
+        {
+            "base_url": "https://doi.org/10.7554/elife.38904",
+            "ext_ids": {
+                "doi": "10.7554/elife.38904"
+            },
+            "fatcat_release": "mxj534diw5gatc26rkif3io5xm",
+            "release_stage": "published"
+        }
+"""
+
+import csv, sys, json
+
+reader = csv.DictReader(sys.stdin)
+for row in reader:
+    d = {
+        "base_url": "https://doi.org/{}".format(row['doi']),
+        "ext_ids": {
+            "doi": row['doi'],
+        },
+        "fatcat_release": row['ident'],
+        "release_stage": row['release_stage'],
+    }
+    print(json.dumps(d))
```