python/refcat/report.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111

"""
Tasks for techreport.
"""
import luigi

from refcat.base import Zstd, shellout
from refcat.tasks import BrefWithDOI, OpenCitations, Refcat


class COCIDOIOnly(Refcat):
    """
    Reduce the OpenCitations task output to sorted DOI-DOI pairs.

    Every output line carries the two values found in columns 2 and 3 of the
    comma-separated input, with the pair put into lexicographic order inside
    the line. Lines are then sorted under LC_ALL=C and the stream is
    compressed with zstd.

    Note: fields in the output are joined with a comma, although the target
    uses the "tsv.zst" extension.
    """
    def requires(self):
        """ Single upstream dependency: the OpenCitations task. """
        return OpenCitations()

    def output(self):
        """ Local target in Zstd format, path built via self.path(ext="tsv.zst"). """
        return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)

    def run(self):
        """
        Execute the extraction pipeline and move its result to the output path.

        The perl one-liner that sorts the fields within a line follows
        https://unix.stackexchange.com/a/37470/376
        """
        # `tail -n +2` drops the first input line (presumably a CSV header --
        # TODO confirm against the OpenCitations dump format).
        pipeline = r"""
                          zstdcat -T0 {input} |
                          tail -n +2 |
                          cut -d , -f2,3 |
                          perl -F, -lane 'printf qq[%s\n], join ",", sort @F' |
                          LC_ALL=C sort -T {tmpdir} -S25% |
                          zstd -c -T0 > {output}
                          """
        # The value returned by shellout is used as the path of the file the
        # pipeline wrote; it is moved to the final location afterwards.
        intermediate = shellout(pipeline,
                                input=self.input().path,
                                tmpdir=self.tmpdir)
        luigi.LocalTarget(intermediate).move(self.output().path)


class BrefDOIOnly(Refcat):
    """
    Reduce BrefWithDOI records to DOI pairs, for comparison with other sets.

    For each input line that parses as JSON, the `source_doi` and `target_doi`
    fields are emitted, the pair is put into lexicographic order inside the
    line and joined with a comma. Lines are then sorted under LC_ALL=C and
    the stream is compressed with zstd.

    Note: fields in the output are joined with a comma, although the target
    uses the "tsv.zst" extension.
    """
    def requires(self):
        """ Single upstream dependency: the BrefWithDOI task. """
        return BrefWithDOI()

    def output(self):
        """ Local target in Zstd format, path built via self.path(ext="tsv.zst"). """
        return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)

    def run(self):
        """
        Execute the extraction pipeline and move its result to the output path.
        """
        # jq runs under GNU parallel (24 jobs, 10M blocks); `fromjson?`
        # silently skips lines that fail to parse. jq emits TSV, which `tr`
        # turns into comma-separated fields so the perl step can split on ",".
        pipeline = r"""
                          zstdcat -T0 {input} |
                          parallel --pipe -j 24 --block 10M "jq -R -rc 'fromjson? | [.source_doi, .target_doi] | @tsv'" |
                          tr $'\t' ',' |
                          perl -F, -lane 'printf qq[%s\n], join ",", sort @F' |
                          LC_ALL=C sort -T {tmpdir} -S25% |
                          zstd -c -T0 > {output}
                          """
        # The value returned by shellout is used as the path of the file the
        # pipeline wrote; it is moved to the final location afterwards.
        intermediate = shellout(pipeline,
                                input=self.input().path,
                                tmpdir=self.tmpdir)
        luigi.LocalTarget(intermediate).move(self.output().path)


# TODO: DOAJ subset
#
# (1) find all release idents with doaj id
# (2) filter refcat over doaj ids

# TODO: Mag DOI
#
# https://docs.microsoft.com/en-us/academic-services/graph/reference-data-schema
# MAG has a Papers.txt.gz file containing the ID and DOI
#
# (1) create a mapping from id to doi (e.g. in sqlite3)
# (2) turn id-to-id references into doi-to-doi with lookup table
#
# $ unpigz -c /magna/data/mag-2020-06-25/Papers.txt.gz | cut -f1,3 | pv -l
# 238M 0:11:16 [ 353k/s]
#
# $ unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f1,3 | pv -l > /dev/null
# 260M 0:10:32 [ 412k/s]
#
# $ unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f1,3 | awk '$2 != ""' | pv -l > /dev/null
# 96.7M 0:11:05 [ 145k/s]
#
# $ time unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f1,3 | awk '$2 != ""' | mkocidb -o mag_id_doi.db
#
# 2021/09/27 17:08:45 [ok] initialized database -- /sandcrawler-db/tmp-refcat/mag_id_doi.db
# written 2.9G -- 3.4M/s
# 2021/09/27 17:23:11 import done
# 2021/09/27 17:23:11 creating index
# 2021/09/27 17:26:44 [ok] 1/2 created index -- /sandcrawler-db/tmp-refcat/mag_id_doi.db
# 2021/09/27 17:31:53 [ok] 2/2 created index -- /sandcrawler-db/tmp-refcat/mag_id_doi.db
#
# real    23m7.744s
# user    30m55.642s
# sys     4m17.959s
#
# Can use an in-memory map, too - sqlite3 lookups for 2B+ items take a while at
# 30 Kqps; takes 30% of RAM on 48GB, sigh; just map[int]string from paper id to doi.
#
# Prepare the 2-TSV list.
#
# $ time unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f1,3 | awk '$2 != ""' | pv -l > /magna/data/mag-2021-06-07/mapping.tsv
#
# real    15m27.348s
# user    21m4.297s
# sys     3m34.441s
#
# $ time zstdcat -T0 PaperReferences.txt.zst |
#     magrefs-mem -f /magna/data/mag-2021-06-07/mapping.tsv |
#     pv -l | zstd -c -T0 > doi_refs.tsv.zst
#
# $ zstdcat -T0 doi_refs.tsv.zst| pv -l | wc -l
# 1.32G 0:06:33 [3.34M/s] [ <=> ]
# 1315040677