aboutsummaryrefslogtreecommitdiffstats
path: root/python/refcat/techreport.py
blob: cc35b7a197766578e4489b5f61a3be0983405a16 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
"""
Tasks for techreport.
"""
import luigi
from refcat.tasks import Refcat, OpenCitations, BrefWithDOI
from refcat.base import shellout, Zstd


class COCIDOIOnly(Refcat):
    """
    Reduce the OpenCitations input to bare DOI pairs, one pair per line.

    The first line of the input is skipped, columns 2 and 3 of the
    comma-separated data are kept, and the two values on each line are
    sorted lexicographically (the sort is within a line, not across lines).
    The result is written zstd-compressed.

    NOTE(review): fields are joined with a comma although the output
    extension is "tsv.zst" -- confirm consumers expect commas.
    """
    def requires(self):
        return OpenCitations()

    def output(self):
        target_path = self.path(ext="tsv.zst")
        return luigi.LocalTarget(path=target_path, format=Zstd)

    def run(self):
        # The perl one-liner for sorting the fields of each line follows
        # https://unix.stackexchange.com/a/37470/376
        pipeline = r"""
                          zstdcat -T0 {input} |
                          tail -n +2 |
                          cut -d , -f2,3 |
                          perl -F, -lane 'printf qq[%s\n], join ",", sort @F' |
                          zstd -c -T0 > {output}
                          """
        source_path = self.input().path
        tmp_path = shellout(pipeline, input=source_path)
        # Put the finished file at the location announced by output().
        luigi.LocalTarget(tmp_path).move(self.output().path)


class BrefDOIOnly(Refcat):
    """
    Bref, reduced to doi, so we can compare with others.

    For every input document the `source_doi` and `target_doi` values are
    extracted with jq, joined by a comma and sorted lexicographically within
    the line. The result is written zstd-compressed.

    NOTE(review): fields are joined with a comma although the output
    extension is "tsv.zst" -- confirm consumers expect commas.
    """
    def requires(self):
        return BrefWithDOI()

    def run(self):
        """
        Run the extraction pipeline and move its result to the task output.
        """
        output = shellout(r"""
                          zstdcat -T0 {input} |
                          parallel --pipe -j 16 --block 10M "jq -rc '[.source_doi, .target_doi] | @tsv'" |
                          tr $'\t' ',' |
                          perl -F, -lane 'printf qq[%s\n], join ",", sort @F' |
                          zstd -c -T0 > {output}
                          """,
                          input=self.input().path)
        # Move the pipeline result to the declared target; otherwise the file
        # announced by output() is never created and the task stays incomplete.
        luigi.LocalTarget(output).move(self.output().path)

    def output(self):
        return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)