about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Martin Czygan <martin.czygan@gmail.com> 2021-06-03 01:47:46 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2021-06-03 01:47:46 +0200
commit: 73dc8ce8c0b1500ea5cb1fa177ca2ece961726fc (patch)
tree: f6afbe1550b61491c7ef991bd960217dddb4de9e
parent: f3631ec35f772085959c9d58a998486d897ca95b (diff)
download: refcat-73dc8ce8c0b1500ea5cb1fa177ca2ece961726fc.tar.gz
download: refcat-73dc8ce8c0b1500ea5cb1fa177ca2ece961726fc.zip
add RefsWithoutIdentifiers
-rw-r--r-- python/refcat/tasks.py 121
1 files changed, 90 insertions, 31 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 1f56e84..bc2fba1 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -51,7 +51,8 @@ build. The most common pattern is map-reduce, e.g. derive a key from docs,
combine the results from e.g. two such key extractions and apply some
reduction, e.g. output schema generation.
-# Various schema
+Various schemas
+---------------
* release (fatcat database export)
* ref (one document per reference)
@@ -64,6 +65,34 @@ Some operations, e.g. "fuzzy verification" require both compared documents to
be release entities. This means, that we need to convert different formats into
the release format.
+Mappers
+-------
+
+For catalog (fatcat) and refs, we extract ids:
+
+* doi
+* pmid
+* pmcid
+* arxiv
+
+We run fuzzy title matching and verification. For this, we need to convert refs
+to releases in order to run verification (we could implement verification for
+other schemas, too -- but the release schema seems complete enough).
+
+For Open Library (OL), we need to fuse authors into the editions dataset first.
+
+Reducers
+--------
+
+Exact mode for ids:
+
+* doi
+* pmid
+* pmcid
+* arxiv
+
+For fuzzy matching, we use "fuzzy" mode (and keep only exact and strong matches).
+
Config
------
@@ -209,6 +238,7 @@ class OpenLibraryAuthors(luigi.ExternalTask, Refcat):
# ---------------------------------------
#
+
class RefsWithUnstructured(Refcat):
"""
Augment refs with data from biblio.unstructured - do this first, so we can use it
@@ -257,7 +287,7 @@ class UnmatchedRefs(Refcat):
260,749,705.
Note, that this data contains refs, which have more information, just
- hidden in unstructured.
+ hidden in the "unstructured" field. XXX: We'll come back to this later.
"""
def requires(self):
return RefsWithUnstructured()
@@ -280,11 +310,33 @@ class UnmatchedRefs(Refcat):
return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
+class RefsWithoutIdentifiers(Refcat):
+ def requires(self):
+ return RefsWithUnstructured()
+
+ def run(self):
+ output = shellout("""
+ zstdcat -T0 {input} |
+ parallel -j {n} --block 10M --pipe
+ "jq -rc 'select(.biblio.doi == null and
+ .biblio.pmid == null and
+ .biblio.pmcid == null and
+ .biblio.arxiv_id == null)'" |
+ zstd -T0 -c > {output}""",
+ n=self.n,
+ input=self.input().path)
+ luigi.LocalTarget(output).move(self.output().path)
+
+ def output(self):
+ return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
+
+
#
# Generate URL list for CDX lookup
# --------------------------------
#
+
class URLTabs(Refcat):
"""
Extract (work ident, release ident, url, doc). 519m45.710s (about 55k docs/s).
@@ -877,15 +929,15 @@ class OpenLibraryEditionsMapped(Refcat):
#
-# Extra
-# -----
+# Open Library Fuzzy
+# ------------------
#
class UnmatchedMapped(Refcat):
"""
- Map unmatched refs (converted to release schema on the fly) to titles to do
- approximate title matches with OL; 35m14.801s.
+ Map unmatched refs (converted to release schema on the fly) to container
+ names to do approximate title matches with OL; 35m14.801s.
"""
def requires(self):
return UnmatchedRefs()
@@ -906,6 +958,38 @@ class UnmatchedMapped(Refcat):
return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
+class UnmatchedOpenLibraryMatchTable(Refcat):
+ """
+ Run matching and write tabular results to file. About 50M rows.
+
+ """
+ def requires(self):
+ return {
+ "unmatched": UnmatchedMapped(), # We could include a bit more here, namely records with titles.
+ "ol": OpenLibraryEditionsMapped(),
+ }
+
+ def run(self):
+ output = shellout("""
+ skate-reduce -m oled
+ -O <(zstdcat -T0 {ol})
+ -F <(zstdcat -T0 {unmatched}) |
+ zstd -c > {output}
+ """,
+ ol=self.input().get("ol").path,
+ unmatched=self.input().get("unmatched").path)
+ luigi.LocalTarget(output).move(self.output().path)
+
+ def output(self):
+ return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
+
+
+#
+# Extra
+# -----
+#
+
+
class UnmatchedRefsToRelease(Refcat):
"""
Convert unmatched refs to releases.
@@ -969,28 +1053,3 @@ class UnmatchedResolveJournalNamesMapped(Refcat):
def output(self):
return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
-
-
-class UnmatchedOpenLibraryMatchTable(Refcat):
- """
- Run matching and write tabular results to file. About 50M rows.
- """
- def requires(self):
- return {
- "unmatched": UnmatchedMapped(),
- "ol": OpenLibraryEditionsMapped(),
- }
-
- def run(self):
- output = shellout("""
- skate-reduce -m oled
- -O <(zstdcat -T0 {ol})
- -F <(zstdcat -T0 {unmatched}) |
- zstd -c > {output}
- """,
- ol=self.input().get("ol").path,
- unmatched=self.input().get("unmatched").path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)