From d4662b1984cc749ba5986418541dd53952f48732 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Sat, 29 May 2021 02:01:11 +0200 Subject: tasks: update docs --- python/refcat/tasks.py | 210 +++++++++++++++++++++++++++++-------------------- 1 file changed, 126 insertions(+), 84 deletions(-) diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index c76e67c..1e18e8b 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -40,60 +40,6 @@ Set of luigi tasks to derive a citation graph. Config (e.g. raw input data) taken from $HOME/.config/refcat/settings.ini. - -================= - -TODO and report notes - -* [ ] how many of these links point to DOI? -* [ ] how many DOI refs do we have, which DOI do we miss? -* [ ] exact title matches? -* [ ] lowercase title matches? -* [ ] fuzzy title matches? -* [ ] lowercase DOI lists; common dois, dois only in references - -Derive release entity schema from refs, join with release export then run -clustering and verify. - -Task BiblioRef is a current artifact: - - \_ BiblioRef(dataset=full, date=2021-02-20) - \_ BiblioRefFromJoin(dataset=full, date=2021-02-20) - \_ RefsFatcatGroupJoin(dataset=full, date=2021-02-20) - \_ RefsFatcatPMIDJoin(dataset=full, date=2021-02-20) - \_ RefsPMID(dataset=full, date=2021-02-20) - \_ Refs(dataset=full, date=2021-02-20) - \_ FatcatPMID(dataset=full, date=2021-02-20) - \_ ReleaseExportReduced(dataset=full, date=2021-02-20) - \_ ReleaseExportExpanded(dataset=full, date=2021-02-20) - \_ RefsFatcatDOIJoin(dataset=full, date=2021-02-20) - \_ RefsDOI(dataset=full, date=2021-02-20) - \_ Refs(dataset=full, date=2021-02-20) - \_ FatcatDOI(dataset=full, date=2021-02-20) - \_ ReleaseExportReduced(dataset=full, date=2021-02-20) - \_ ReleaseExportExpanded(dataset=full, date=2021-02-20) - \_ RefsFatcatArxivJoin(dataset=full, date=2021-02-20) - \_ FatcatArxiv(dataset=full, date=2021-02-20) - \_ ReleaseExportReduced(dataset=full, date=2021-02-20) - \_ ReleaseExportExpanded(dataset=full, 
date=2021-02-20) - \_ RefsArxiv(dataset=full, date=2021-02-20) - \_ Refs(dataset=full, date=2021-02-20) - \_ RefsFatcatPMCIDJoin(dataset=full, date=2021-02-20) - \_ FatcatPMCID(dataset=full, date=2021-02-20) - \_ ReleaseExportReduced(dataset=full, date=2021-02-20) - \_ ReleaseExportExpanded(dataset=full, date=2021-02-20) - \_ RefsPMCID(dataset=full, date=2021-02-20) - \_ Refs(dataset=full, date=2021-02-20) - \_ BiblioRefFuzzy(dataset=full, date=2021-02-20) - \_ RefsFatcatClusterVerify(dataset=full, date=2021-02-20) - \_ RefsFatcatClusters(dataset=full, date=2021-02-20) - \_ RefsFatcatSortedKeys(dataset=full, date=2021-02-20) - \_ RefsReleasesMerged(dataset=full, date=2021-02-20) - \_ RefsToRelease(dataset=full, date=2021-02-20) - \_ Refs(dataset=full, date=2021-02-20) - \_ ReleaseExportReduced(dataset=full, date=2021-02-20) - \_ ReleaseExportExpanded(dataset=full, date=2021-02-20) - """ import argparse @@ -196,14 +142,27 @@ class OpenLibraryEditions(luigi.ExternalTask, Refcat): class OpenLibraryWorks(luigi.ExternalTask, Refcat): """ - Works file. + Works dump, from https://openlibrary.org/developers/dumps. """ def output(self): return luigi.LocalTarget(path=settings.OL_DUMP_WORKS, format=Zstd) +class OpenLibraryAuthors(luigi.ExternalTask, Refcat): + """ + Authors dump, from https://openlibrary.org/developers/dumps. + """ + def output(self): + return luigi.LocalTarget(path=settings.OL_DUMP_AUTHORS, format=Zstd) + + # ----8< Derivations +# +# Augmentation and reductions of raw data +# --------------------------------------- +# + class RefsWithUnstructured(Refcat): """ @@ -273,6 +232,12 @@ class UnmatchedRefs(Refcat): return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd) +# +# Generate URL list for CDX lookup +# -------------------------------- +# + + class URLTabs(Refcat): """ Extract (work ident, release ident, url, doc). 519m45.710s (about 55k docs/s). 
@@ -325,6 +290,12 @@ class URLList(Refcat): return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd) +# +# Generate (key, doc) from refs +# ----------------------------- +# + + class RefsDOI(Refcat): """ Sorted (doi, doc) tuples from refs. 225m48.755s @@ -420,24 +391,10 @@ class RefsArxiv(Refcat): return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd) -class RefsToRelease(Refcat): - """ - Convert refs to release. - """ - def requires(self): - return RefsWithUnstructured() - - def run(self): - output = shellout(""" - zstdcat -T0 {input} | - skate-conv -f ref -w 24 -b 100000 | - zstd -T0 -c > {output} - """, - input=self.input().path) - luigi.LocalTarget(output).move(self.output().path) - - def output(self): - return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd) +# +# Generate (key, doc) from fatcat +# ------------------------------- +# class FatcatDOI(Refcat): @@ -533,6 +490,12 @@ class FatcatArxiv(Refcat): return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd) +# +# Key extraction for fuzzy matching +# --------------------------------- +# + + class FatcatMapped(Refcat): """ Fatcat mapped "tsand". @@ -559,6 +522,27 @@ class FatcatMapped(Refcat): return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd) +class RefsToRelease(Refcat): + """ + Convert refs to release, since fuzzy verification works on release entities + currently. + """ + def requires(self): + return RefsWithUnstructured() + + def run(self): + output = shellout(""" + zstdcat -T0 {input} | + skate-conv -f ref -w 24 -b 100000 | + zstd -T0 -c > {output} + """, + input=self.input().path) + luigi.LocalTarget(output).move(self.output().path) + + def output(self): + return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd) + + class RefsMapped(Refcat): """ Apply mapper on refs. 281min (about 100k/s). 
@@ -585,6 +569,12 @@ class RefsMapped(Refcat): return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd) +# +# Biblioref generation from identifier matches +# -------------------------------------------- +# + + class BrefZipDOI(Refcat): """ Run skate-reduce from two files. @@ -677,6 +667,12 @@ class BrefZipArxiv(Refcat): return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd) +# +# Biblioref generation from fuzzy matching +# ---------------------------------------- +# + + class BrefZipFuzzy(Refcat): """ Run skate-reduce from two files, fuzzy mode; 1039m55.350s, skate-reduce not @@ -703,9 +699,15 @@ class BrefZipFuzzy(Refcat): return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd) +# +# WIP: Open Library +# ----------------- +# + + class OpenLibraryEditionsByWork(Refcat): """ - Have editions keyed by work id, 9m5.037s. + DEPRECATED. Have editions keyed by work id, 9m5.037s. """ def requires(self): return OpenLibraryEditions() @@ -728,7 +730,7 @@ class OpenLibraryEditionsByWork(Refcat): class OpenLibraryWorksSorted(Refcat): """ - Sorted by work id. + DEPRECATED. Sorted by work id. """ def requires(self): return OpenLibraryWorks() @@ -748,23 +750,59 @@ class OpenLibraryWorksSorted(Refcat): def output(self): return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd) -class OpenLibraryRelease(Refcat): + +# +# Open Library Fuzzy matching (OL editions -> release, key extraction) +# -------------------------------------------------------------------- +# + + +class OpenLibraryAuthorMapping(Refcat): """ - Turn Open Library into Release Entities with author mapping. + Create an OL author id to author name TSV mapping. """ + def requires(self): + return OpenLibraryAuthors() + def run(self): - # TODO: remove hardcoded values. 
output = shellout(""" - zstdcat -T0 /magna/data/ol_dump_editions_latest.txt.zst | + zstdcat -T0 {input} | + LC_ALL=C cut -f 5 | + jq -rc '[.key, .name]|@tsv' | + zstd -T0 > {output} + """, + input=self.input().path) + luigi.LocalTarget(output).move(self.output().path) + + def output(self): + return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd) + + +class OpenLibraryEditionsToRelease(Refcat): + """ + Turn Open Library editions into release entities with author mapping. + """ + def requires(self): + return { + "oled": OpenLibraryEditions(), + "map": OpenLibraryAuthorMapping(), + } + + def run(self): + output = shellout(""" + zstdcat -T0 {input} | cut -f5 | - skate-conv -B -f oled -Xa <(zstdcat -T0 /magna/data/ol_author_mapping.tsv.zst) | + skate-conv -B -f oled -Xa <(zstdcat -T0 {map}) | zstd -T0 -c > {output} - """) + """, + input=self.input().get("oled").path, + map=self.input().get("map").path) luigi.LocalTarget(output).move(self.output().path) def output(self): return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd) + class OpenLibraryMapped(Refcat): """ A mapped open library editions set. @@ -772,7 +810,7 @@ class OpenLibraryMapped(Refcat): mapper = luigi.Parameter(default="ts", description="mapper short name") def requires(self): - return OpenLibraryRelease() + return OpenLibraryEditionsToRelease() def run(self): output = shellout(""" @@ -791,12 +829,17 @@ class OpenLibraryMapped(Refcat): return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd) +# +# Open Library additional matches +# ------------------------------- +# + + class UnmatchedMapped(Refcat): """ Map unmatched refs (in release schema) to titles to do approximate title matches with OL; 35m14.801s. """ - def requires(self): return UnmatchedRefs() @@ -814,4 +857,3 @@ class UnmatchedMapped(Refcat): def output(self): return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd) - -- cgit v1.2.3