diff options
-rw-r--r-- | python/refcat/report.py | 32 |
1 files changed, 32 insertions, 0 deletions
diff --git a/python/refcat/report.py b/python/refcat/report.py
index 8060aa3..2b4099d 100644
--- a/python/refcat/report.py
+++ b/python/refcat/report.py
@@ -55,5 +55,37 @@ class BrefDOIOnly(Refcat):
     def output(self):
         return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
 
+# TODO: DOAJ subset
+#
+# (1) find all release idents with doaj id
+# (2) filter refcat over doaj ids
 
 # TODO: Mag DOI
+#
+# MAG has a Papers.txt.gz file containing the ID and DOI
+#
+# (1) create a mapping from id to doi (e.g. in sqlite3)
+# (2) turn id-to-id references into doi-to-doi with lookup table
+#
+# $ unpigz -c /magna/data/mag-2020-06-25/Papers.txt.gz | cut -f2,3 | pv -l
+# 238M 0:11:16 [ 353k/s]
+#
+# $ unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f2,3 | pv -l > /dev/null
+# 260M 0:10:32 [ 412k/s]
+#
+# $ unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f2,3 | awk '$2 != ""' | pv -l > /dev/null
+# 96.7M 0:11:05 [ 145k/s]
+#
+# time unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f2,3 | awk '$2 != ""' | mkocidb -o mag_id_doi.db
+#
+# 2021/09/27 17:08:45 [ok] initialized database -- /sandcrawler-db/tmp-refcat/mag_id_doi.db
+# written 2.9G -- 3.4M/s
+# 2021/09/27 17:23:11 import done
+# 2021/09/27 17:23:11 creating index
+# 2021/09/27 17:26:44 [ok] 1/2 created index -- /sandcrawler-db/tmp-refcat/mag_id_doi.db
+# 2021/09/27 17:31:53 [ok] 2/2 created index -- /sandcrawler-db/tmp-refcat/mag_id_doi.db
+#
+# real 23m7.744s
+# user 30m55.642s
+# sys 4m17.959s
+
|