about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
author: Martin Czygan <martin.czygan@gmail.com> 2021-04-15 21:43:59 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2021-04-19 20:29:17 +0200
commit: 3fe4ef7b4c35b86161c61637d752017721166b47 (patch)
tree: 85dc4a39cbaa4c60ff41ffda68be3e735fb1cb23 /python
parent: a38706f5eca295e36fb45963bca4931d611cb84f (diff)
download: refcat-3fe4ef7b4c35b86161c61637d752017721166b47.tar.gz
download: refcat-3fe4ef7b4c35b86161c61637d752017721166b47.zip
add BibliorefSortedIdent, RefSortedIdent
we'll need this to complete the unmatched refs
Diffstat (limited to 'python')
-rw-r--r-- python/refcat/tasks.py 58
1 files changed, 52 insertions, 6 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 8459608..5eaca50 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -177,6 +177,7 @@ class MAGPapers(luigi.ExternalTask, Refcat):
def output(self):
return luigi.LocalTarget(path=os.path.join(settings.MAG, "Papers.txt.gz"), format=Zstd)
+
class WikipediaCitationsMinimalDataset(luigi.ExternalTask, Refcat):
"""
From archive.org/details/wikipedia_citations_2020-07-14 (Wikipedia
@@ -193,6 +194,7 @@ class WikipediaCitationsMinimalDataset(luigi.ExternalTask, Refcat):
# ----8< Derivations
+
class RefsWithUnstructured(Refcat):
"""
Augment refs with data from unstructured. Do this first, so we can use it
@@ -1097,8 +1099,6 @@ class RefsFatcatClusters(Refcat):
return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
-
-
# ==== new style zippy biblioref generation
@@ -1228,7 +1228,8 @@ class BiblioRefV2(Refcat):
skate-bref-id |
zstd -T0 >> {output}
""",
- input=target.path, output=tmpf)
+ input=target.path,
+ output=tmpf)
luigi.LocalTarget(tmpf).move(self.output().path)
def output(self):
@@ -1317,8 +1318,10 @@ class RGSitemapFatcatSortedKeys(Refcat):
def output(self):
return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
+
# ==== MAG
+
class MAGDOI(Refcat):
"""
List of MAG DOI.
@@ -1339,13 +1342,14 @@ class MAGDOI(Refcat):
def output(self):
return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
+
# ==== WikipediaCitations
+
class BiblioRefWikiDOISortedKeys(Refcat):
"""
Sorted DOI keys from wikipedia.
"""
-
def requires(self):
return WikipediaCitationsMinimalDataset()
@@ -1355,14 +1359,15 @@ class BiblioRefWikiDOISortedKeys(Refcat):
skate-wikipedia-doi |
LC_ALL=C sort -S 10% -k2,2 |
zstd -T0 -c > {output}
- """, input=self.input().path)
+ """,
+ input=self.input().path)
luigi.LocalTarget(output).move(self.output().path)
def output(self):
return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
-class BiblioRefWiki(Refcat):
+class BiblioRefWiki(Refcat):
def requires(self):
return {
"wiki": BiblioRefWikiDOISortedKeys(),
@@ -1380,3 +1385,44 @@ class BiblioRefWiki(Refcat):
def output(self):
return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
+
+
+# ==== Prepare unmatched
+
+
+class BibliorefSortedIdent(Refcat):
+ def requires(self):
+ return BiblioRefV2()
+
+ def run(self):
+ output = shellout("""
+ zstdcat -T0 {input} |
+ skate-derive-key -b 50000 -verbose -F source_release_ident |
+ LC_ALL=C sort -k2,2 -S 35% --parallel 6 --compress-program pzstd -T {tmpdir} |
+ zstd -T0 -c > {output}
+ """,
+ tmpdir=self.tmpdir,
+ input=self.input().path)
+ luigi.LocalTarget(output).move(self.output().path)
+
+ def output(self):
+ return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
+
+
+class RefSortedIdent(Refcat):
+ def requires(self):
+ return RefsWithUnstructured()
+
+ def run(self):
+ output = shellout("""
+ zstdcat -T0 {input} |
+ skate-derive-key -b 50000 -verbose -F release_ident |
+ LC_ALL=C sort -k2,2 -S 35% --parallel 6 --compress-program pzstd -T {tmpdir} |
+ zstd -T0 -c > {output}
+ """,
+ tmpdir=self.tmpdir,
+ input=self.input().path)
+ luigi.LocalTarget(output).move(self.output().path)
+
+ def output(self):
+ return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)