aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-06-07 16:57:53 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-06-07 16:57:53 +0200
commitd44bc88a37d3adf70b10990bdf731cdfac119536 (patch)
tree9d55c6ac69806135c63211ad07b250cfc5f27f6d /python
parent76421daea5ace318ccb66a21fd881f895b69c3c1 (diff)
downloadrefcat-d44bc88a37d3adf70b10990bdf731cdfac119536.tar.gz
refcat-d44bc88a37d3adf70b10990bdf731cdfac119536.zip
add Bref task
Diffstat (limited to 'python')
-rw-r--r--python/refcat/tasks.py32
1 files changed, 30 insertions, 2 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index c4e5b56..c6bd7e7 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -121,7 +121,7 @@ candidate ref find most likely match in the releases of a journal.
Also, many partial records do have more information in unstructured; parse this
out first.
-* [ ] OL fuzzy
+* [x] OL fuzzy
Beside 200K links via ISBN, about 10M links via title. Many "year" mismatches,
which might indicate different editions (debug this later).
@@ -140,7 +140,7 @@ TODO: Unmatched
---------------
* raw refs may contain duplicates (e.g. "crossref" and "grobid")
-* refs should appear in order as they are found in the paper
+* refs should appear in order as they are found in the paper; can we guarantee that?
Idea was that "source release ident + ref index" should allow completeness and
order. "crossref" and "grobid" order may vary.
@@ -1066,6 +1066,34 @@ class BrefZipOpenLibrary(Refcat):
#
+# Combined Bref File
+#
+class Bref(Refcat):
+    """
+    Combine bref files from various sources.
+
+    The outputs of all required tasks are appended byte-wise (via ``cat``) to
+    a scratch file, in the iteration order of ``self.input()``; the scratch
+    file is then moved to the final output path.
+
+    NOTE(review): byte-wise concatenation presumes every input is already
+    compressed in the same format as the output (Zstd) -- confirm against the
+    BrefZip* tasks.
+    """
+    def requires(self):
+        return {
+            "doi": BrefZipDOI(),
+            "pmid": BrefZipPMID(),
+            "pmcid": BrefZipPMCID(),
+            "arxiv": BrefZipArxiv(),
+            "fuzzy": BrefZipFuzzy(),
+            "ol-fuzzy": BrefZipOpenLibrary(),
+        }
+
+    def run(self):
+        # Only the name of the scratch file is needed. Leaving the with-block
+        # closes the handle; a bare mkstemp() would hand back an open file
+        # descriptor that is never closed.
+        with tempfile.NamedTemporaryFile(delete=False) as tf:
+            tmpf = tf.name
+        for k, v in self.input().items():
+            self.logger.debug("adding {}".format(k))
+            # Use the target's path explicitly instead of relying on how the
+            # target object happens to render as a string; quote both paths
+            # for the shell.
+            # NOTE(review): assumes shellout fills {input}/{output} from its
+            # keyword arguments (gluish-style) -- confirm against the import.
+            shellout('cat "{input}" >> "{output}"', input=v.path, output=tmpf)
+        luigi.LocalTarget(tmpf).move(self.output().path)
+
+    def output(self):
+        return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
+
+
+#
# Extra
# -----
#