diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-06-07 16:57:53 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-06-07 16:57:53 +0200 |
commit | d44bc88a37d3adf70b10990bdf731cdfac119536 (patch) | |
tree | 9d55c6ac69806135c63211ad07b250cfc5f27f6d /python | |
parent | 76421daea5ace318ccb66a21fd881f895b69c3c1 (diff) | |
download | refcat-d44bc88a37d3adf70b10990bdf731cdfac119536.tar.gz refcat-d44bc88a37d3adf70b10990bdf731cdfac119536.zip |
add Bref task
Diffstat (limited to 'python')
-rw-r--r-- | python/refcat/tasks.py | 42 |
1 files changed, 40 insertions, 2 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index c4e5b56..c6bd7e7 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -121,7 +121,7 @@ candidate ref find most likely match in the releases of a journal.
 Also, many partial records do have more information in unstructured; parse
 this out first.
 
-* [ ] OL fuzzy
+* [x] OL fuzzy
 
 Beside 200K links via ISBN, about 10M links via title. Many "year" mismatches,
 which might indicate different editions (debug this later).
@@ -140,7 +140,7 @@ TODO: Unmatched
 ---------------
 
 * raw refs may contain duplicates (e.g. "crossref" and "grobid")
-* refs should appear in order as they are found in the paper
+* refs should appear in order as they are found in the paper; can we guarantee that?
 
 Idea was that "source release ident + ref index" should allow completeness and
 order. "crossref" and "grobid" order may vary.
@@ -1066,6 +1066,44 @@ class BrefZipOpenLibrary(Refcat):
 
 
 #
+# Combined Bref File
+#
+class Bref(Refcat):
+    """
+    Combine bref files from various sources into a single file.
+
+    The outputs of all required tasks are appended, one after another, to a
+    temporary file, which is then moved to the output path of this task.
+    """
+    def requires(self):
+        # The keys only show up in the debug log of run().
+        return {
+            "doi": BrefZipDOI(),
+            "pmid": BrefZipPMID(),
+            "pmcid": BrefZipPMCID(),
+            "arxiv": BrefZipArxiv(),
+            "fuzzy": BrefZipFuzzy(),
+            "ol-fuzzy": BrefZipOpenLibrary(),
+        }
+
+    def run(self):
+        # mkstemp returns an open file descriptor that was never closed here;
+        # NamedTemporaryFile closes its handle at the end of the with block
+        # and, with delete=False, leaves the (empty) file in place.
+        with tempfile.NamedTemporaryFile(delete=False) as tf:
+            tmpf = tf.name
+        for k, v in self.input().items():
+            self.logger.debug("adding {}".format(k))
+            # Use the path of the target explicitly (as done for the output
+            # below) and quote both paths for the shell.
+            shellout('cat "{}" >> "{}"'.format(v.path, tmpf))
+        luigi.LocalTarget(tmpf).move(self.output().path)
+
+    def output(self):
+        return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
+
+
+#
 # Extra
 # -----
 #