"""
Tasks for techreport.

Luigi tasks that reduce citation datasets to sorted DOI pairs, so that the
datasets can be compared with each other.
"""

import luigi

from refcat.base import Zstd, shellout
from refcat.tasks import BrefWithDOI, OpenCitations, Refcat


class COCIDOIOnly(Refcat):
    """
    Reduce the OpenCitations input to DOI-DOI pairs; the two DOIs of each pair
    are put into lexicographic order.
    """

    def requires(self):
        """ Depends on the OpenCitations task. """
        return OpenCitations()

    def run(self):
        """
        Run a shell pipeline over the input: skip the first line, keep fields
        two and three of every comma separated line, order the values within
        each line, sort all lines (C locale) and compress the result with zstd.

        Ordering fields within a line: https://unix.stackexchange.com/a/37470/376
        """
        source = self.input().path
        # The command text is passed to shellout verbatim; keep it on one line.
        command = r""" zstdcat -T0 {input} | tail -n +2 | cut -d , -f2,3 | perl -F, -lane 'printf qq[%s\n], join ",", sort @F' | LC_ALL=C sort -T {tmpdir} -S25% | zstd -c -T0 > {output} """
        staged_path = shellout(command, input=source, tmpdir=self.tmpdir)
        # The value returned by shellout is treated as a file path and moved to
        # the final output location of this task.
        staged = luigi.LocalTarget(staged_path)
        staged.move(self.output().path)

    def output(self):
        """ Local target with Zstd format; the path uses the "tsv.zst" extension. """
        target_path = self.path(ext="tsv.zst")
        return luigi.LocalTarget(path=target_path, format=Zstd)


class BrefDOIOnly(Refcat):
    """
    Bref, cut down to DOI pairs, so it can be compared with other datasets.
    """

    def requires(self):
        """ Depends on the BrefWithDOI task. """
        return BrefWithDOI()

    def run(self):
        """
        Run a shell pipeline over the input: extract source_doi and target_doi
        from each JSON line with jq (run through GNU parallel, lines that fail
        to parse are skipped by "fromjson?"), turn tabs into commas, order the
        values within each line, sort all lines (C locale) and compress the
        result with zstd.
        """
        source = self.input().path
        # The command text is passed to shellout verbatim; keep it on one line.
        command = r""" zstdcat -T0 {input} | parallel --pipe -j 24 --block 10M "jq -R -rc 'fromjson? | [.source_doi, .target_doi] | @tsv'" | tr $'\t' ',' | perl -F, -lane 'printf qq[%s\n], join ",", sort @F' | LC_ALL=C sort -T {tmpdir} -S25% | zstd -c -T0 > {output} """
        staged_path = shellout(command, input=source, tmpdir=self.tmpdir)
        # The value returned by shellout is treated as a file path and moved to
        # the final output location of this task.
        staged = luigi.LocalTarget(staged_path)
        staged.move(self.output().path)

    def output(self):
        """ Local target with Zstd format; the path uses the "tsv.zst" extension. """
        target_path = self.path(ext="tsv.zst")
        return luigi.LocalTarget(path=target_path, format=Zstd)


# TODO: DOAJ subset
#
# (1) find all release idents with doaj id
# (2) filter refcat over doaj ids

# TODO: Mag DOI
#
# https://docs.microsoft.com/en-us/academic-services/graph/reference-data-schema
# MAG has a Papers.txt.gz file containing the ID and DOI
#
# (1) create a mapping from id to doi (e.g. in sqlite3)
# (2) turn id-to-id references into doi-to-doi with lookup table
#
# $ unpigz -c /magna/data/mag-2020-06-25/Papers.txt.gz | cut -f1,3 | pv -l
# 238M 0:11:16 [ 353k/s]
#
# $ unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f1,3 | pv -l > /dev/null
# 260M 0:10:32 [ 412k/s]
#
# $ unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f1,3 | awk '$2 != ""' | pv -l > /dev/null
# 96.7M 0:11:05 [ 145k/s]
#
# $ time unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f1,3 | awk '$2 != ""' | mkocidb -o mag_id_doi.db
#
# 2021/09/27 17:08:45 [ok] initialized database -- /sandcrawler-db/tmp-refcat/mag_id_doi.db
# written 2.9G -- 3.4M/s
# 2021/09/27 17:23:11 import done
# 2021/09/27 17:23:11 creating index
# 2021/09/27 17:26:44 [ok] 1/2 created index -- /sandcrawler-db/tmp-refcat/mag_id_doi.db
# 2021/09/27 17:31:53 [ok] 2/2 created index -- /sandcrawler-db/tmp-refcat/mag_id_doi.db
#
# real 23m7.744s
# user 30m55.642s
# sys 4m17.959s
#
# Can use an in-memory map, too - sqlite3 lookups for 2B+ items take a while at
# 30 Kqps; takes 30% of RAM on 48GB, sigh; just map[int]string from paper id to doi.
#
# Prepare the two-column TSV list (paper id, doi).
#
# $ time unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f1,3 | awk '$2 != ""' | pv -l > /magna/data/mag-2021-06-07/mapping.tsv
#
# real 15m27.348s
# user 21m4.297s
# sys 3m34.441s
#
# $ time zstdcat -T0 PaperReferences.txt.zst |
#     magrefs-mem -f /magna/data/mag-2021-06-07/mapping.tsv |
#     pv -l | zstd -c -T0 > doi_refs.tsv.zst
#
# $ zstdcat -T0 doi_refs.tsv.zst| pv -l | wc -l
# 1.32G 0:06:33 [3.34M/s] [ <=> ]1315040677
# 1315040677