aboutsummaryrefslogtreecommitdiffstats
path: root/notes
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-11-13 16:46:09 -0800
committer Bryan Newbold <bnewbold@archive.org> 2019-11-13 16:46:09 -0800
commit 47e0b699b2a350b0081a64ebbcaba991c53cfb52 (patch)
tree e7967828a6c4284d1bd189ac973c5f0041965179 /notes
parent 885bff50bbe57322ad32f4fbfab8d846e54671f2 (diff)
download sandcrawler-47e0b699b2a350b0081a64ebbcaba991c53cfb52.tar.gz
sandcrawler-47e0b699b2a350b0081a64ebbcaba991c53cfb52.zip
ingest/backfill notes
Diffstat (limited to 'notes')
-rw-r--r-- notes/ingest/.gitignore 2
-rw-r--r-- notes/ingest/20191023_testing.md 8
-rwxr-xr-x notes/ingest/es_csv_to_json.py 37
3 files changed, 47 insertions, 0 deletions
diff --git a/notes/ingest/.gitignore b/notes/ingest/.gitignore
new file mode 100644
index 0000000..343a25c
--- /dev/null
+++ b/notes/ingest/.gitignore
@@ -0,0 +1,2 @@
+*.csv
+*.json
diff --git a/notes/ingest/20191023_testing.md b/notes/ingest/20191023_testing.md
new file mode 100644
index 0000000..481c4e2
--- /dev/null
+++ b/notes/ingest/20191023_testing.md
@@ -0,0 +1,8 @@
+
+Exported the DOIs that are not yet archived for eLife, as well as a general list of recent missing OA releases.
+
+ wc -l recent\ missing\ oa\ releases.csv
+ 161828 recent missing oa releases.csv
+
+ wc -l missing\ elife\ DOIs.csv
+ 1779 missing elife DOIs.csv
diff --git a/notes/ingest/es_csv_to_json.py b/notes/ingest/es_csv_to_json.py
new file mode 100755
index 0000000..4cd1811
--- /dev/null
+++ b/notes/ingest/es_csv_to_json.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+
+"""
+Convert CSV rows read from stdin into JSON objects printed to stdout.
+
+Input like:
+
+    doi,ident,"release_stage"
+    "10.7554/elife.38904",mxj534diw5gatc26rkif3io5xm,published
+    "10.7554/elife.41855",kag74qc6dfex7ftpfkf7iaus44,published
+    "10.7554/elife.41156",ienee5vxcbbbfhs2q54h4455hu,published
+    "10.7554/elife.43230",52rpllol2rcndjqs3xfwcldeka,published
+    "10.7554/elife.42591",fpz642gihrc3jd2vibg6gnjrxm,published
+
+Output is one compact JSON object per input row, one object per line.
+Pretty-printed here for readability, each object looks like:
+
+    {
+      "base_url": "https://doi.org/10.7554/elife.38904",
+      "ext_ids": {
+        "doi": "10.7554/elife.38904"
+      },
+      "fatcat_release": "mxj534diw5gatc26rkif3io5xm",
+      "release_stage": "published"
+    }
+"""
+
+import csv
+import json
+import sys
+
+
+def row_to_request(row):
+    """Build the output dict for a single CSV row.
+
+    Args:
+        row: mapping with the keys 'doi', 'ident' and 'release_stage', as
+            produced by csv.DictReader for the input format shown in the
+            module docstring.
+
+    Returns:
+        A dict with the keys 'base_url', 'ext_ids', 'fatcat_release' and
+        'release_stage'.
+
+    Raises:
+        KeyError: if one of the expected columns is missing from the row.
+    """
+    doi = row['doi']
+    return {
+        "base_url": "https://doi.org/{}".format(doi),
+        "ext_ids": {
+            "doi": doi,
+        },
+        "fatcat_release": row['ident'],
+        "release_stage": row['release_stage'],
+    }
+
+
+def main():
+    """Read CSV (with a header line) from stdin; print one JSON object per row."""
+    for row in csv.DictReader(sys.stdin):
+        print(json.dumps(row_to_request(row)))
+
+
+# Guarded so that importing this module does not consume stdin.
+if __name__ == '__main__':
+    main()