aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-09-27 22:05:50 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-09-27 22:05:50 +0200
commit6b15378b95392a7b387c39d4ef126ad9a71ee4bb (patch)
tree7f2c591971476c9b16cf76ac205cedf11cb90209 /python
parentb0c0677ce81c5e98904898683b9c59ae37207404 (diff)
downloadrefcat-6b15378b95392a7b387c39d4ef126ad9a71ee4bb.tar.gz
refcat-6b15378b95392a7b387c39d4ef126ad9a71ee4bb.zip
report: notes on mag
Diffstat (limited to 'python')
-rw-r--r--python/refcat/report.py32
1 files changed, 32 insertions, 0 deletions
diff --git a/python/refcat/report.py b/python/refcat/report.py
index 8060aa3..2b4099d 100644
--- a/python/refcat/report.py
+++ b/python/refcat/report.py
@@ -55,5 +55,37 @@ class BrefDOIOnly(Refcat):
def output(self):
return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
+# TODO: DOAJ subset
+#
+# (1) find all release idents that have a DOAJ id
+# (2) filter refcat over these DOAJ ids
# TODO: MAG DOI
+#
+# MAG (Microsoft Academic Graph) has a Papers.txt.gz file containing the paper ID and DOI
+#
+# (1) create a mapping from MAG id to DOI (e.g. in sqlite3)
+# (2) turn id-to-id references into DOI-to-DOI references using the lookup table
+#
+# $ unpigz -c /magna/data/mag-2020-06-25/Papers.txt.gz | cut -f2,3 | pv -l
+# 238M 0:11:16 [ 353k/s]
+#
+# $ unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f2,3 | pv -l > /dev/null
+# 260M 0:10:32 [ 412k/s]
+#
+# $ unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f2,3 | awk '$2 != ""' | pv -l > /dev/null
+# 96.7M 0:11:05 [ 145k/s]
+#
+# $ time unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f2,3 | awk '$2 != ""' | mkocidb -o mag_id_doi.db
+#
+# 2021/09/27 17:08:45 [ok] initialized database -- /sandcrawler-db/tmp-refcat/mag_id_doi.db
+# written 2.9G -- 3.4M/s
+# 2021/09/27 17:23:11 import done
+# 2021/09/27 17:23:11 creating index
+# 2021/09/27 17:26:44 [ok] 1/2 created index -- /sandcrawler-db/tmp-refcat/mag_id_doi.db
+# 2021/09/27 17:31:53 [ok] 2/2 created index -- /sandcrawler-db/tmp-refcat/mag_id_doi.db
+#
+# real 23m7.744s
+# user 30m55.642s
+# sys 4m17.959s
+