#!/usr/bin/env python3
# Provenance: this script was added as notes/ingest/es_csv_to_json.py (mode
# 100755) by commit 47e0b699b2a350b0081a64ebbcaba991c53cfb52, "ingest/backfill
# notes" (Bryan Newbold, Wed, 13 Nov 2019 16:46:09 -0800).
#
# The same commit added two sibling files:
#
#   notes/ingest/.gitignore
#       *.csv
#       *.json
#
#   notes/ingest/20191023_testing.md
#       exported not-archived DOIs for elife, as well as general list.
#
#           wc -l recent\ missing\ oa\ releases.csv
#           161828 recent missing oa releases.csv
#
#           wc -l missing\ elife\ DOIs.csv
#           1779 missing elife DOIs.csv

"""
Convert CSV rows read from stdin into one JSON object per line on stdout.

input like:

    doi,ident,"release_stage"
    "10.7554/elife.38904",mxj534diw5gatc26rkif3io5xm,published
    "10.7554/elife.41855",kag74qc6dfex7ftpfkf7iaus44,published
    "10.7554/elife.41156",ienee5vxcbbbfhs2q54h4455hu,published
    "10.7554/elife.43230",52rpllol2rcndjqs3xfwcldeka,published
    "10.7554/elife.42591",fpz642gihrc3jd2vibg6gnjrxm,published

output like (shown pretty-printed here; each object is actually written
compactly on a single line):

    {
      "base_url": "https://doi.org/10.7554/elife.38904",
      "ext_ids": {
        "doi": "10.7554/elife.38904"
      },
      "fatcat_release": "mxj534diw5gatc26rkif3io5xm",
      "release_stage": "published"
    }

Rows whose ``doi`` column is empty or absent are skipped, because they would
otherwise yield a meaningless ``https://doi.org/`` base URL.
"""

import csv
import json
import sys


def row_to_request(row: dict) -> dict:
    """Build the output object for a single CSV row.

    Args:
        row: mapping with the keys ``doi``, ``ident`` and ``release_stage``
            (as produced by ``csv.DictReader`` for the input shown in the
            module docstring).

    Returns:
        A dict with ``base_url``, ``ext_ids``, ``fatcat_release`` and
        ``release_stage`` entries.

    Raises:
        KeyError: if one of the three expected columns is not in ``row``.
    """
    doi = row['doi']
    return {
        "base_url": "https://doi.org/{}".format(doi),
        "ext_ids": {
            "doi": doi,
        },
        "fatcat_release": row['ident'],
        "release_stage": row['release_stage'],
    }


def convert(in_file, out_file) -> tuple:
    """Stream CSV from ``in_file`` and write JSON lines to ``out_file``.

    Args:
        in_file: iterable of text lines whose first line is the CSV header.
        out_file: writable text stream receiving one JSON object per line.

    Returns:
        A ``(written, skipped)`` tuple: the number of JSON lines written and
        the number of rows skipped for lacking a DOI.

    Raises:
        KeyError: if the CSV header lacks a ``doi``, ``ident`` or
            ``release_stage`` column.
    """
    written = 0
    skipped = 0
    for row in csv.DictReader(in_file):
        # DictReader fills columns missing from a short row with None, and an
        # empty cell is ''; neither can form a usable DOI URL.
        if not row['doi']:
            skipped += 1
            continue
        print(json.dumps(row_to_request(row)), file=out_file)
        written += 1
    return written, skipped


def main() -> None:
    """Run the stdin -> stdout conversion and report skipped rows on stderr."""
    _, skipped = convert(sys.stdin, sys.stdout)
    if skipped:
        print(
            "skipped {} row(s) without a DOI".format(skipped),
            file=sys.stderr,
        )


if __name__ == "__main__":
    main()