1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
|
"""
Tasks for techreport.
"""
import luigi
from refcat.tasks import Refcat, OpenCitations, BrefWithDOI
from refcat.base import shellout, Zstd
class COCIDOIOnly(Refcat):
    """
    Reduce the OpenCitations data to one DOI pair per line, with the two DOIs
    of every pair put into sorted order.

    Note: the emitted lines are comma separated, although the target path
    carries a "tsv.zst" extension.
    """

    def requires(self):
        """Declare OpenCitations as the single upstream task."""
        return OpenCitations()

    def output(self):
        """Return the zstd-formatted local target this task produces."""
        target_path = self.path(ext="tsv.zst")
        return luigi.LocalTarget(path=target_path, format=Zstd)

    def run(self):
        """
        Run the extraction pipeline and move its result to the output path.

        Pipeline stages: decompress the input, skip the first line
        (presumably a CSV header -- verify against the upstream file), keep
        comma-separated columns 2 and 3, sort the fields of each line and
        re-join them with a comma, then compress again.

        The per-line field sort follows
        https://unix.stackexchange.com/a/37470/376
        """
        # The template is kept verbatim, including the absence of leading
        # whitespace, so the command handed to shellout is unchanged.
        pipeline = r"""
zstdcat -T0 {input} |
tail -n +2 |
cut -d , -f2,3 |
perl -F, -lane 'printf qq[%s\n], join ",", sort @F' |
zstd -c -T0 > {output}
"""
        upstream_path = self.input().path
        # The value returned by shellout is used as the path of the file
        # written via {output}.
        staged_path = shellout(pipeline, input=upstream_path)
        staged = luigi.LocalTarget(staged_path)
        staged.move(self.output().path)
class BrefDOIOnly(Refcat):
    """
    Bref, reduced to doi, so we can compare with others.

    Each output line holds the source and target DOI of one record, in sorted
    order and joined by a comma. Note: the target path carries a "tsv.zst"
    extension although the lines are comma separated.
    """

    def requires(self):
        """Declare BrefWithDOI as the single upstream task."""
        return BrefWithDOI()

    def run(self):
        """
        Run the extraction pipeline and move its result to the output path.

        Pipeline stages: decompress the input, let GNU parallel feed blocks of
        stdin to jq processes that emit `.source_doi` and `.target_doi` as a
        tab-separated line, turn tabs into commas, sort the fields of each
        line and re-join them with a comma, then compress again.
        """
        # NOTE(review): $'\t' is bash ANSI-C quoting; this assumes shellout
        # executes the command with bash -- confirm.
        output = shellout(r"""
zstdcat -T0 {input} |
parallel --pipe -j 16 --block 10M "jq -rc '[.source_doi, .target_doi] | @tsv'" |
tr $'\t' ',' |
perl -F, -lane 'printf qq[%s\n], join ",", sort @F' |
zstd -c -T0 > {output}
""",
                          input=self.input().path)
        # Fix: the pipeline result was previously left at the path returned by
        # shellout and never placed at the task's declared output, so the
        # target of this task was never created.
        luigi.LocalTarget(output).move(self.output().path)

    def output(self):
        """Return the zstd-formatted local target this task produces."""
        return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
|