From 73dc8ce8c0b1500ea5cb1fa177ca2ece961726fc Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Thu, 3 Jun 2021 01:47:46 +0200 Subject: add RefsWithoutIdentifiers --- python/refcat/tasks.py | 121 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 90 insertions(+), 31 deletions(-) diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index 1f56e84..bc2fba1 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -51,7 +51,8 @@ build. The most common pattern is map-reduce, e.g. derive a key from docs, combine the results from e.g. two such key extractions and apply some reduction, e.g. output schema generation. -# Various schema +Various schema +-------------- * release (fatcat database export) * ref (one document per reference) @@ -64,6 +65,34 @@ Some operations, e.g. "fuzzy verification" require both compared documents to be release entities. This means, that we need to convert different formats into the release format. +Mappers +------- + +For catalog (fatcat) and refs, we extract ids: + +* doi +* pmid +* pmcid +* arxiv + +We run fuzzy title matching and verification. Here, we need to convert refs to +releases to be able to run verify (could implement a verification for various +schemas, too -- but release seems complete enough). + +For OL we need to fuse authors into the editions dataset first. + +Reducers +-------- + +Exact mode for ids: + +* doi +* pmid +* pmcid +* arxiv + +For fuzzy matching, we use "fuzzy" mode (and keep only exact and strong matches). + Config ------ @@ -209,6 +238,7 @@ class OpenLibraryAuthors(luigi.ExternalTask, Refcat): # --------------------------------------- # + class RefsWithUnstructured(Refcat): """ Augment refs with data from biblio.unstructured - do this first, so we can use it @@ -257,7 +287,7 @@ class UnmatchedRefs(Refcat): 260,749,705. Note, that this data contains refs, which have more information, just - hidden in unstructured. + hidden in "unstructured" field. XXX: We'll come back to this later. """ def requires(self): return RefsWithUnstructured() @@ -280,11 +310,33 @@ class UnmatchedRefs(Refcat): return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd) +class RefsWithoutIdentifiers(Refcat): + def requires(self): + return RefsWithUnstructured() + + def run(self): + output = shellout(""" + zstdcat -T0 {input} | + parallel -j {n} --block 10M --pipe + "jq -rc 'select(.biblio.doi == null and + .biblio.pmid == null and + .biblio.pmcid == null and + .biblio.arxiv_id == null)'" | + zstd -T0 -c > {output}""", + n=self.n, + input=self.input().path) + luigi.LocalTarget(output).move(self.output().path) + + def output(self): + return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd) + + # # Generate URL list for CDX lookup # -------------------------------- # + class URLTabs(Refcat): """ Extract (work ident, release ident, url, doc). 519m45.710s (about 55k docs/s). @@ -877,15 +929,15 @@ class OpenLibraryEditionsMapped(Refcat): # -# Extra -# ----- +# Open Library Fuzzy +# ------------------ # class UnmatchedMapped(Refcat): """ - Map unmatched refs (converted to release schema on the fly) to titles to do - approximate title matches with OL; 35m14.801s. + Map unmatched refs (converted to release schema on the fly) to container + names to do approximate title matches with OL; 35m14.801s. """ def requires(self): return UnmatchedRefs() @@ -906,6 +958,38 @@ class UnmatchedMapped(Refcat): return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd) +class UnmatchedOpenLibraryMatchTable(Refcat): + """ + Run matching and write tabular results to file. About 50M rows. + + """ + def requires(self): + return { + "unmatched": UnmatchedMapped(), # We could include a bit more here, namely records with titles. + "ol": OpenLibraryEditionsMapped(), + } + + def run(self): + output = shellout(""" + skate-reduce -m oled + -O <(zstdcat -T0 {ol}) + -F <(zstdcat -T0 {unmatched}) | + zstd -c > {output} + """, + ol=self.input().get("ol").path, + unmatched=self.input().get("unmatched").path) + luigi.LocalTarget(output).move(self.output().path) + + def output(self): + return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd) + + +# +# Extra +# ----- +# + + class UnmatchedRefsToRelease(Refcat): """ Convert unmatched refs to releases. @@ -969,28 +1053,3 @@ class UnmatchedResolveJournalNamesMapped(Refcat): def output(self): return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd) - - -class UnmatchedOpenLibraryMatchTable(Refcat): - """ - Run matching and write tabular results to file. About 50M rows. - """ - def requires(self): - return { - "unmatched": UnmatchedMapped(), - "ol": OpenLibraryEditionsMapped(), - } - - def run(self): - output = shellout(""" - skate-reduce -m oled - -O <(zstdcat -T0 {ol}) - -F <(zstdcat -T0 {unmatched}) | - zstd -c > {output} - """, - ol=self.input().get("ol").path, - unmatched=self.input().get("unmatched").path) - luigi.LocalTarget(output).move(self.output().path) - - def output(self): - return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd) -- cgit v1.2.3