aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r-- python/refcat/report.py 22
1 files changed, 21 insertions, 1 deletions
diff --git a/python/refcat/report.py b/python/refcat/report.py
index ae4844d..4a0c219 100644
--- a/python/refcat/report.py
+++ b/python/refcat/report.py
@@ -77,7 +77,7 @@ class BrefDOIOnly(Refcat):
# $ unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f1,3 | awk '$2 != ""' | pv -l > /dev/null
# 96.7M 0:11:05 [ 145k/s]
#
-# time unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f1,3 | awk '$2 != ""' | mkocidb -o mag_id_doi.db
+# $ time unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f1,3 | awk '$2 != ""' | mkocidb -o mag_id_doi.db
#
# 2021/09/27 17:08:45 [ok] initialized database -- /sandcrawler-db/tmp-refcat/mag_id_doi.db
# written 2.9G -- 3.4M/s
@@ -89,4 +89,24 @@ class BrefDOIOnly(Refcat):
# real 23m7.744s
# user 30m55.642s
# sys 4m17.959s
+#
+# Can use an in-memory map, too - sqlite3 lookups for 2B+ items take a while at
+# 30 Kqps; takes 30% of RAM on 48GB, sigh; just map[int]string from paper id to doi.
+#
+# Prepare the two-column TSV list (paper id, doi).
+#
+# $ time unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f1,3 | awk '$2 != ""' | pv -l > /magna/data/mag-2021-06-07/mapping.tsv
+#
+# real 15m27.348s
+# user 21m4.297s
+# sys 3m34.441s
+#
+# $ time zstdcat -T0 PaperReferences.txt.zst |
+# magrefs-mem -f /magna/data/mag-2021-06-07/mapping.tsv |
+# pv -l | zstd -c -T0 > doi_refs.tsv.zst
+#
+# $ zstdcat -T0 doi_refs.tsv.zst | pv -l | wc -l
+# 1.32G 0:06:33 [3.34M/s] [ <=> ]1315040677
+# 1315040677
+