aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-05-29 02:01:11 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-05-29 02:01:11 +0200
commitd4662b1984cc749ba5986418541dd53952f48732 (patch)
treeb22b7907c9818917e301497f0ca9960f6aa1e75b /python
parent8e473663cd695bebea35105c7ac2201b82d09ae5 (diff)
downloadrefcat-d4662b1984cc749ba5986418541dd53952f48732.tar.gz
refcat-d4662b1984cc749ba5986418541dd53952f48732.zip
tasks: update docs
Diffstat (limited to 'python')
-rw-r--r--python/refcat/tasks.py210
1 files changed, 126 insertions, 84 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index c76e67c..1e18e8b 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -40,60 +40,6 @@ Set of luigi tasks to derive a citation graph.
Config (e.g. raw input data) taken from $HOME/.config/refcat/settings.ini.
-
-=================
-
-TODO and report notes
-
-* [ ] how many of these links point to DOI?
-* [ ] how many DOI refs do we have, which DOI do we miss?
-* [ ] exact title matches?
-* [ ] lowercase title matches?
-* [ ] fuzzy title matches?
-* [ ] lowercase DOI lists; common dois, dois only in references
-
-Derive release entity schema from refs, join with release export then run
-clustering and verify.
-
-Task BiblioRef is a current artifact:
-
- \_ BiblioRef(dataset=full, date=2021-02-20)
- \_ BiblioRefFromJoin(dataset=full, date=2021-02-20)
- \_ RefsFatcatGroupJoin(dataset=full, date=2021-02-20)
- \_ RefsFatcatPMIDJoin(dataset=full, date=2021-02-20)
- \_ RefsPMID(dataset=full, date=2021-02-20)
- \_ Refs(dataset=full, date=2021-02-20)
- \_ FatcatPMID(dataset=full, date=2021-02-20)
- \_ ReleaseExportReduced(dataset=full, date=2021-02-20)
- \_ ReleaseExportExpanded(dataset=full, date=2021-02-20)
- \_ RefsFatcatDOIJoin(dataset=full, date=2021-02-20)
- \_ RefsDOI(dataset=full, date=2021-02-20)
- \_ Refs(dataset=full, date=2021-02-20)
- \_ FatcatDOI(dataset=full, date=2021-02-20)
- \_ ReleaseExportReduced(dataset=full, date=2021-02-20)
- \_ ReleaseExportExpanded(dataset=full, date=2021-02-20)
- \_ RefsFatcatArxivJoin(dataset=full, date=2021-02-20)
- \_ FatcatArxiv(dataset=full, date=2021-02-20)
- \_ ReleaseExportReduced(dataset=full, date=2021-02-20)
- \_ ReleaseExportExpanded(dataset=full, date=2021-02-20)
- \_ RefsArxiv(dataset=full, date=2021-02-20)
- \_ Refs(dataset=full, date=2021-02-20)
- \_ RefsFatcatPMCIDJoin(dataset=full, date=2021-02-20)
- \_ FatcatPMCID(dataset=full, date=2021-02-20)
- \_ ReleaseExportReduced(dataset=full, date=2021-02-20)
- \_ ReleaseExportExpanded(dataset=full, date=2021-02-20)
- \_ RefsPMCID(dataset=full, date=2021-02-20)
- \_ Refs(dataset=full, date=2021-02-20)
- \_ BiblioRefFuzzy(dataset=full, date=2021-02-20)
- \_ RefsFatcatClusterVerify(dataset=full, date=2021-02-20)
- \_ RefsFatcatClusters(dataset=full, date=2021-02-20)
- \_ RefsFatcatSortedKeys(dataset=full, date=2021-02-20)
- \_ RefsReleasesMerged(dataset=full, date=2021-02-20)
- \_ RefsToRelease(dataset=full, date=2021-02-20)
- \_ Refs(dataset=full, date=2021-02-20)
- \_ ReleaseExportReduced(dataset=full, date=2021-02-20)
- \_ ReleaseExportExpanded(dataset=full, date=2021-02-20)
-
"""
import argparse
@@ -196,14 +142,27 @@ class OpenLibraryEditions(luigi.ExternalTask, Refcat):
class OpenLibraryWorks(luigi.ExternalTask, Refcat):
"""
- Works file.
+ Works dump, from https://openlibrary.org/developers/dumps.
"""
def output(self):
return luigi.LocalTarget(path=settings.OL_DUMP_WORKS, format=Zstd)
+class OpenLibraryAuthors(luigi.ExternalTask, Refcat):
+ """
+ Authors dump, from https://openlibrary.org/developers/dumps.
+ """
+ def output(self):
+ return luigi.LocalTarget(path=settings.OL_DUMP_AUTHORS, format=Zstd)
+
+
# ----8< Derivations
+#
+# Augmentation and reductions of raw data
+# ---------------------------------------
+#
+
class RefsWithUnstructured(Refcat):
"""
@@ -273,6 +232,12 @@ class UnmatchedRefs(Refcat):
return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
+#
+# Generate URL list for CDX lookup
+# --------------------------------
+#
+
+
class URLTabs(Refcat):
"""
Extract (work ident, release ident, url, doc). 519m45.710s (about 55k docs/s).
@@ -325,6 +290,12 @@ class URLList(Refcat):
return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
+#
+# Generate (key, doc) from refs
+# -----------------------------
+#
+
+
class RefsDOI(Refcat):
"""
Sorted (doi, doc) tuples from refs. 225m48.755s
@@ -420,24 +391,10 @@ class RefsArxiv(Refcat):
return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
-class RefsToRelease(Refcat):
- """
- Convert refs to release.
- """
- def requires(self):
- return RefsWithUnstructured()
-
- def run(self):
- output = shellout("""
- zstdcat -T0 {input} |
- skate-conv -f ref -w 24 -b 100000 |
- zstd -T0 -c > {output}
- """,
- input=self.input().path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
+#
+# Generate (key, doc) from fatcat
+# -------------------------------
+#
class FatcatDOI(Refcat):
@@ -533,6 +490,12 @@ class FatcatArxiv(Refcat):
return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
+#
+# Key extraction for fuzzy matching
+# ---------------------------------
+#
+
+
class FatcatMapped(Refcat):
"""
Fatcat mapped "tsand".
@@ -559,6 +522,27 @@ class FatcatMapped(Refcat):
return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
+class RefsToRelease(Refcat):
+ """
+ Convert refs to release, since fuzzy verification works on release entities
+ currently.
+ """
+ def requires(self):
+ return RefsWithUnstructured()
+
+ def run(self):
+ output = shellout("""
+ zstdcat -T0 {input} |
+ skate-conv -f ref -w 24 -b 100000 |
+ zstd -T0 -c > {output}
+ """,
+ input=self.input().path)
+ luigi.LocalTarget(output).move(self.output().path)
+
+ def output(self):
+ return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
+
+
class RefsMapped(Refcat):
"""
Apply mapper on refs. 281min (about 100k/s).
@@ -585,6 +569,12 @@ class RefsMapped(Refcat):
return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
+#
+# Biblioref generation from identifier matches
+# --------------------------------------------
+#
+
+
class BrefZipDOI(Refcat):
"""
Run skate-reduce from two files.
@@ -677,6 +667,12 @@ class BrefZipArxiv(Refcat):
return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
+#
+# Biblioref generation from fuzzy matching
+# ----------------------------------------
+#
+
+
class BrefZipFuzzy(Refcat):
"""
Run skate-reduce from two files, fuzzy mode; 1039m55.350s, skate-reduce not
@@ -703,9 +699,15 @@ class BrefZipFuzzy(Refcat):
return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
+#
+# WIP: Open Library
+# -----------------
+#
+
+
class OpenLibraryEditionsByWork(Refcat):
"""
- Have editions keyed by work id, 9m5.037s.
+ DEPRECATED. Have editions keyed by work id, 9m5.037s.
"""
def requires(self):
return OpenLibraryEditions()
@@ -728,7 +730,7 @@ class OpenLibraryEditionsByWork(Refcat):
class OpenLibraryWorksSorted(Refcat):
"""
- Sorted by work id.
+ DEPRECATED. Sorted by work id.
"""
def requires(self):
return OpenLibraryWorks()
@@ -748,23 +750,59 @@ class OpenLibraryWorksSorted(Refcat):
def output(self):
return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
-class OpenLibraryRelease(Refcat):
+
+#
+# Open Library Fuzzy matching (OL editions -> release, key extraction)
+# --------------------------------------------------------------------
+#
+
+
+class OpenLibraryAuthorMapping(Refcat):
"""
- Turn Open Library into Release Entities with author mapping.
+ Create an OL author id to author name TSV mapping.
"""
+ def requires(self):
+ return OpenLibraryAuthors()
+
def run(self):
- # TODO: remove hardcoded values.
output = shellout("""
- zstdcat -T0 /magna/data/ol_dump_editions_latest.txt.zst |
+ zstdcat -T0 {input} |
+ LC_ALL=C cut -f 5 |
+ jq -rc '[.key, .name]|@tsv' |
+ zstd -T0 > {output}
+ """,
+ input=self.input().path)
+ luigi.LocalTarget(output).move(self.output().path)
+
+ def output(self):
+ return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
+
+
+class OpenLibraryEditionsToRelease(Refcat):
+ """
+ Turn Open Library editions into release entities with author mapping.
+ """
+ def requires(self):
+ return {
+ "oled": OpenLibraryEditions(),
+ "map": OpenLibraryAuthorMapping(),
+ }
+
+ def run(self):
+ output = shellout("""
+ zstdcat -T0 {input} |
cut -f5 |
- skate-conv -B -f oled -Xa <(zstdcat -T0 /magna/data/ol_author_mapping.tsv.zst) |
+ skate-conv -B -f oled -Xa <(zstdcat -T0 {map}) |
zstd -T0 -c > {output}
- """)
+ """,
+ input=self.input().get("oled").path,
+ map=self.input().get("map").path)
luigi.LocalTarget(output).move(self.output().path)
def output(self):
return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
+
class OpenLibraryMapped(Refcat):
"""
A mapped open library editions set.
@@ -772,7 +810,7 @@ class OpenLibraryMapped(Refcat):
mapper = luigi.Parameter(default="ts", description="mapper short name")
def requires(self):
- return OpenLibraryRelease()
+ return OpenLibraryEditionsToRelease()
def run(self):
output = shellout("""
@@ -791,12 +829,17 @@ class OpenLibraryMapped(Refcat):
return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
+#
+# Open Library additional matches
+# -------------------------------
+#
+
+
class UnmatchedMapped(Refcat):
"""
Map unmatched refs (in release schema) to titles to do approximate title
matches with OL; 35m14.801s.
"""
-
def requires(self):
return UnmatchedRefs()
@@ -814,4 +857,3 @@ class UnmatchedMapped(Refcat):
def output(self):
return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
-