aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r--python/refcat/tasks.py32
1 files changed, 30 insertions, 2 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index c4e5b56..c6bd7e7 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -121,7 +121,7 @@ candidate ref find most likely match in the releases of a journal.
Also, many partial records do have more information in unstructured; parse this
out first.
-* [ ] OL fuzzy
+* [x] OL fuzzy
Besides 200K links via ISBN, about 10M links via title. Many "year" mismatches,
which might indicate different editions (debug this later).
@@ -140,7 +140,7 @@ TODO: Unmatched
---------------
* raw refs may contain duplicates (e.g. "crossref" and "grobid")
-* refs should appear in order as they are found in the paper
+* refs should appear in order as they are found in the paper; can we guarantee that?
Idea was that "source release ident + ref index" should allow completeness and
order. "crossref" and "grobid" order may vary.
@@ -1066,6 +1066,34 @@ class BrefZipOpenLibrary(Refcat):
#
+# Combined Bref File
+#
class Bref(Refcat):
    """
    Combine bref files from various sources.

    The outputs of all required tasks are appended, one after another, into
    a single temporary file, which is then moved to this task's output path.
    No decompression or recompression happens here; the inputs are
    concatenated byte-wise with `cat`.

    NOTE(review): the output target is declared with the Zstd format, so this
    presumably relies on every input being zstd-compressed and on
    concatenated zstd frames forming a valid stream -- confirm against the
    BrefZip* tasks.
    """
    def requires(self):
        """
        Return the tasks whose outputs get combined, keyed by a short label
        for the match type (the label is only used for logging in `run`).
        """
        return {
            "doi": BrefZipDOI(),
            "pmid": BrefZipPMID(),
            "pmcid": BrefZipPMCID(),
            "arxiv": BrefZipArxiv(),
            "fuzzy": BrefZipFuzzy(),
            "ol-fuzzy": BrefZipOpenLibrary(),
        }

    def run(self):
        """
        Concatenate all inputs into a temporary file and move it in place.
        """
        # Only the path of the temporary file is needed, since the shell
        # appends to it. Create it via a context manager so the handle is
        # closed immediately; a bare mkstemp() would leave its file
        # descriptor open for the lifetime of the process.
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            tmpf = tmp.name
        for k, v in self.input().items():
            self.logger.debug("adding {}".format(k))
            # Pass the filesystem path explicitly instead of relying on the
            # string conversion of the target object; quote both paths so
            # whitespace in either does not split the shell command.
            shellout("""cat "{}" >> "{}" """.format(v.path, tmpf))
        luigi.LocalTarget(tmpf).move(self.output().path)

    def output(self):
        """
        Return the local target for the combined file (extension "json.zst").
        """
        return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
+
+
+#
# Extra
# -----
#