diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-09-28 11:03:40 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-09-28 11:03:40 +0200 |
commit | 3af89868253a45f4e3fe912443276b31b7c72521 (patch) | |
tree | 7c0162f3667f0af3089e0017655defa6c8b6cde6 | |
parent | f75a3cd2683509cb0a090f669a911cb3155532bc (diff) | |
download | refcat-3af89868253a45f4e3fe912443276b31b7c72521.tar.gz refcat-3af89868253a45f4e3fe912443276b31b7c72521.zip |
report: fix command notes
-rw-r--r-- | python/refcat/report.py | 9 |
1 file changed, 5 insertions, 4 deletions
diff --git a/python/refcat/report.py b/python/refcat/report.py
index 2b4099d..ae4844d 100644
--- a/python/refcat/report.py
+++ b/python/refcat/report.py
@@ -62,21 +62,22 @@ class BrefDOIOnly(Refcat):
 
 # TODO: Mag DOI
 #
+# https://docs.microsoft.com/en-us/academic-services/graph/reference-data-schema
 # MAG has a Papers.txt.gz file containing the ID and DOI
 #
 # (1) create a mapping from id to doi (e.g. in sqlite3)
 # (2) turn id-to-id references into doi-to-doi with lookup table
 #
-# $ unpigz -c /magna/data/mag-2020-06-25/Papers.txt.gz | cut -f2,3 | pv -l
+# $ unpigz -c /magna/data/mag-2020-06-25/Papers.txt.gz | cut -f1,3 | pv -l
 # 238M 0:11:16 [ 353k/s]
 #
-# $ unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f2,3 | pv -l > /dev/null
+# $ unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f1,3 | pv -l > /dev/null
 # 260M 0:10:32 [ 412k/s]
 #
-# $ unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f2,3 | awk '$2 != ""' | pv -l > /dev/null
+# $ unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f1,3 | awk '$2 != ""' | pv -l > /dev/null
 # 96.7M 0:11:05 [ 145k/s]
 #
-# time unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f2,3 | awk '$2 != ""' | mkocidb -o mag_id_doi.db
+# time unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f1,3 | awk '$2 != ""' | mkocidb -o mag_id_doi.db
 #
 # 2021/09/27 17:08:45 [ok] initialized database -- /sandcrawler-db/tmp-refcat/mag_id_doi.db
 # written 2.9G -- 3.4M/s