ingest/backfill notes

author: Bryan Newbold <bnewbold@archive.org> 2019-11-13 16:46:09 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2019-11-13 16:46:09 -0800
commit: 47e0b699b2a350b0081a64ebbcaba991c53cfb52 (patch)
tree: e7967828a6c4284d1bd189ac973c5f0041965179
parent: 885bff50bbe57322ad32f4fbfab8d846e54671f2 (diff)
download: sandcrawler-47e0b699b2a350b0081a64ebbcaba991c53cfb52.tar.gz
sandcrawler-47e0b699b2a350b0081a64ebbcaba991c53cfb52.zip
3 files changed, 47 insertions, 0 deletions
diff --git a/notes/ingest/.gitignore b/notes/ingest/.gitignore
new file mode 100644
index 0000000..343a25c
--- /dev/null
+++ b/notes/ingest/.gitignore
@@ -0,0 +1,2 @@
+*.csv
+*.json
diff --git a/notes/ingest/20191023_testing.md b/notes/ingest/20191023_testing.md
new file mode 100644
index 0000000..481c4e2
--- /dev/null
+++ b/notes/ingest/20191023_testing.md
@@ -0,0 +1,8 @@
+
+Exported not-archived DOIs for eLife, as well as a general list (recent missing OA releases).
+
+    wc -l recent\ missing\ oa\ releases.csv
+    161828 recent missing oa releases.csv
+
+    wc -l missing\ elife\ DOIs.csv
+    1779 missing elife DOIs.csv
diff --git a/notes/ingest/es_csv_to_json.py b/notes/ingest/es_csv_to_json.py
new file mode 100755
index 0000000..4cd1811
--- /dev/null
+++ b/notes/ingest/es_csv_to_json.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+
+"""
+Reads CSV from stdin (header row required); prints one JSON object per row.
+
+    input like:
+        doi,ident,"release_stage"
+        "10.7554/elife.38904",mxj534diw5gatc26rkif3io5xm,published
+        "10.7554/elife.41855",kag74qc6dfex7ftpfkf7iaus44,published
+        "10.7554/elife.41156",ienee5vxcbbbfhs2q54h4455hu,published
+        "10.7554/elife.43230",52rpllol2rcndjqs3xfwcldeka,published
+        "10.7554/elife.42591",fpz642gihrc3jd2vibg6gnjrxm,published
+
+    output like (pretty-printed here; the script prints each object on one line):
+    {
+      "base_url": "https://doi.org/10.7554/elife.38904",
+      "ext_ids": {
+        "doi": "10.7554/elife.38904"
+      },
+      "fatcat_release": "mxj534diw5gatc26rkif3io5xm",
+      "release_stage": "published"
+    }
+"""
+
+import csv, sys, json
+
+# DictReader takes the column names from the first CSV line.
+for record in csv.DictReader(sys.stdin):
+    doi = record['doi']
+    request = dict(
+        base_url="https://doi.org/{}".format(doi),
+        ext_ids=dict(doi=doi),
+        fatcat_release=record['ident'],
+        release_stage=record['release_stage'],
+    )
+    # No indent argument: compact JSON, one object per output line.
+    print(json.dumps(request))
author	Bryan Newbold <bnewbold@archive.org>	2019-11-13 16:46:09 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2019-11-13 16:46:09 -0800
commit	47e0b699b2a350b0081a64ebbcaba991c53cfb52 (patch)
tree	e7967828a6c4284d1bd189ac973c5f0041965179
parent	885bff50bbe57322ad32f4fbfab8d846e54671f2 (diff)
download	sandcrawler-47e0b699b2a350b0081a64ebbcaba991c53cfb52.tar.gz sandcrawler-47e0b699b2a350b0081a64ebbcaba991c53cfb52.zip