From d44bc88a37d3adf70b10990bdf731cdfac119536 Mon Sep 17 00:00:00 2001
From: Martin Czygan
Date: Mon, 7 Jun 2021 16:57:53 +0200
Subject: add Bref task

---
 python/refcat/tasks.py | 50 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 48 insertions(+), 2 deletions(-)

(limited to 'python')

diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index c4e5b56..c6bd7e7 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -121,7 +121,7 @@ candidate ref find most likely match in the releases of a journal. Also, many
 partial records do have more information in unstructured; parse this out first.
 
 
-* [ ] OL fuzzy
+* [x] OL fuzzy
 
 Beside 200K links via ISBN, about 10M links via title. Many "year" mismatches,
 which might indicate different editions (debug this later).
@@ -140,7 +140,7 @@ TODO: Unmatched
 ---------------
 
 * raw refs may contain duplicates (e.g. "crossref" and "grobid")
-* refs should appear in order as they are found in the paper
+* refs should appear in order as they are found in the paper; can we guarantee that?
 
 Idea was that "source release ident + ref index" should allow completeness and
 order. "crossref" and "grobid" order may vary.
@@ -1065,6 +1065,52 @@ class BrefZipOpenLibrary(Refcat):
         return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
 
 
+#
+# Combined Bref File
+#
+class Bref(Refcat):
+    """
+    Combine bref files from various sources.
+
+    The outputs of all required tasks are appended, byte for byte, to a
+    single output file.
+    """
+    def requires(self):
+        """
+        Return the per-source bref tasks, keyed by a short source label.
+        """
+        return {
+            "doi": BrefZipDOI(),
+            "pmid": BrefZipPMID(),
+            "pmcid": BrefZipPMCID(),
+            "arxiv": BrefZipArxiv(),
+            "fuzzy": BrefZipFuzzy(),
+            "ol-fuzzy": BrefZipOpenLibrary(),
+        }
+
+    def run(self):
+        """
+        Append every input to a temporary file, then move it to the output path.
+        """
+        # The with-block closes the handle immediately but keeps the file
+        # (delete=False); a bare mkstemp() returns an open file descriptor
+        # that would otherwise never be closed.
+        with tempfile.NamedTemporaryFile(delete=False) as tf:
+            tmpf = tf.name
+        for k, v in self.input().items():
+            self.logger.debug("adding {}".format(k))
+            # Use the input target's path explicitly instead of depending on
+            # how the target object is rendered by str.format.
+            shellout("""cat "{}" >> {}""".format(v.path, tmpf))
+        luigi.LocalTarget(tmpf).move(self.output().path)
+
+    def output(self):
+        """
+        Return the target for the combined file.
+        """
+        return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
+
+
 #
 # Extra
 # -----
-- 
cgit v1.2.3