about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- python/refcat/report.py 9
1 files changed, 5 insertions, 4 deletions
diff --git a/python/refcat/report.py b/python/refcat/report.py
index 2b4099d..ae4844d 100644
--- a/python/refcat/report.py
+++ b/python/refcat/report.py
@@ -62,21 +62,22 @@ class BrefDOIOnly(Refcat):
# TODO: Mag DOI
#
+# https://docs.microsoft.com/en-us/academic-services/graph/reference-data-schema
# MAG has a Papers.txt.gz file containing the ID and DOI
#
# (1) create a mapping from id to doi (e.g. in sqlite3)
# (2) turn id-to-id references into doi-to-doi with lookup table
#
-# $ unpigz -c /magna/data/mag-2020-06-25/Papers.txt.gz | cut -f2,3 | pv -l
+# $ unpigz -c /magna/data/mag-2020-06-25/Papers.txt.gz | cut -f1,3 | pv -l
# 238M 0:11:16 [ 353k/s]
#
-# $ unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f2,3 | pv -l > /dev/null
+# $ unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f1,3 | pv -l > /dev/null
# 260M 0:10:32 [ 412k/s]
#
-# $ unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f2,3 | awk '$2 != ""' | pv -l > /dev/null
+# $ unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f1,3 | awk '$2 != ""' | pv -l > /dev/null
# 96.7M 0:11:05 [ 145k/s]
#
-# time unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f2,3 | awk '$2 != ""' | mkocidb -o mag_id_doi.db
+# time unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f1,3 | awk '$2 != ""' | mkocidb -o mag_id_doi.db
#
# 2021/09/27 17:08:45 [ok] initialized database -- /sandcrawler-db/tmp-refcat/mag_id_doi.db
# written 2.9G -- 3.4M/s