diff options
 python/refcat/attic.py     | 11
 python/refcat/cli.py       |  4
 python/refcat/tasks.py     | 52
 python/tests/test_utils.py | 23
 4 files changed, 67 insertions(+), 23 deletions(-)
diff --git a/python/refcat/attic.py b/python/refcat/attic.py index 147380c..7633bab 100644 --- a/python/refcat/attic.py +++ b/python/refcat/attic.py @@ -625,7 +625,8 @@ class RefsCounter(Refcat): counts["has_" + k] += 1 if biblio.get('doi') or biblio.get('pmcid') or biblio.get('pmid') or biblio.get('arxiv_id'): counts['has_any_extid'] += 1 - if biblio.get('container_name') and biblio.get('volume') and biblio.get('issue') and biblio.get('pages'): + if biblio.get('container_name') and biblio.get('volume') and biblio.get('issue') and biblio.get( + 'pages'): counts['has_container_volume_issue_pages'] += 1 if biblio.get('title') and biblio.get('contrib_raw_names') and biblio.get('year'): counts['has_title_contrib_year'] += 1 @@ -941,7 +942,13 @@ class BiblioRefV2(Refcat): A v1 set of biblioref schema docs. """ def requires(self): - return [BiblioRefZippyDOI(), BiblioRefZippyArxiv(), BiblioRefZippyPMID(), BiblioRefZippyPMCID(), BiblioRefFromFuzzyClusters()] + return [ + BiblioRefZippyDOI(), + BiblioRefZippyArxiv(), + BiblioRefZippyPMID(), + BiblioRefZippyPMCID(), + BiblioRefFromFuzzyClusters() + ] def run(self): _, tmpf = tempfile.mkstemp(prefix="refcat-") diff --git a/python/refcat/cli.py b/python/refcat/cli.py index 076c71f..102c996 100644 --- a/python/refcat/cli.py +++ b/python/refcat/cli.py @@ -236,7 +236,9 @@ def main(): print("BASE {}".format(settings.BASE)) print("TMPDIR {}".format(settings.TMPDIR)) print() - names = [name for name in sorted(Register.task_names()) if name not in suppress_task_names and not name.islower()] + names = [ + name for name in sorted(Register.task_names()) if name not in suppress_task_names and not name.islower() + ] print(columnize(names)) sys.exit(0) diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index b43c729..48ec180 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -4,7 +4,6 @@ Set of luigi tasks to derive a citation graph. 
$ refcat.pyz - ____ __ ________ / __/________ _/ /_ / ___/ _ \/ /_/ ___/ __ `/ __/ @@ -38,13 +37,36 @@ Set of luigi tasks to derive a citation graph. Refs UnmatchedRefs RefsArxiv WikipediaCitationsMinimalDataset +------------------------------------------------------------------------ + +Overview +-------- + +* raw input "tasks" as luigi.ExternalTask +* derivation + +# Various schema + +* release (fatcat database export) +* ref (one document per reference) +* OL editions (open library editions) +* OL authors (open library authors) +* wiki (a particular wikipedia reference export) + +Some operations, e.g. "fuzzy verification" require both compared documents to +be release entities. This means, that we need to convert different formats into +the release format. + +Config +------ Config (e.g. raw input data) taken from $HOME/.config/refcat/settings.ini. -> TODO +TODO +---- * [ ] partial (hold) -* [ ] unmatched +* [ ] unmatched (in a final pass) We can match by id and key, e.g. extract id and key, sort and merge (id, key) from graph, and if not available use raw input. @@ -52,7 +74,6 @@ from graph, and if not available use raw input. * [ ] QA Find duplicates and clean them up. Generate stats on match types. 
- """ import argparse @@ -66,13 +87,11 @@ import sys import tempfile import luigi -from fuzzycat.cluster import Cluster, release_key_title_sandcrawler from gluish.format import Zstd from gluish.task import BaseTask from gluish.utils import shellout from refcat.settings import settings -from refcat.utils import extract_dois, extract_urls, ref_to_release class Refcat(BaseTask): @@ -82,7 +101,8 @@ class Refcat(BaseTask): BASE = settings.BASE TAG = '2021-05-06' - date = luigi.DateParameter(default=datetime.date(2021, 5, 6), description="a versioning help, will be part of filename, change this manually") + date = luigi.DateParameter(default=datetime.date(2021, 5, 6), + description="a versioning help, will be part of filename, change this manually") tmpdir = luigi.Parameter(default=settings.TMPDIR, description="set tempdir", significant=False) n = luigi.IntParameter(default=multiprocessing.cpu_count(), significant=False) @@ -100,7 +120,8 @@ class Refcat(BaseTask): class Refs(luigi.ExternalTask, Refcat): """ Compressed (zstd) references, as of 01/2021 containing ~1.8B docs; this - might increase in a next version. + might increase in a next version. This comes from a custom derivation from + an "heavy intermediate" format in a scholar pipeline. """ def output(self): return luigi.LocalTarget(path=settings.REFS_FILE, format=Zstd) @@ -108,7 +129,8 @@ class Refs(luigi.ExternalTask, Refcat): class ReleaseExportExpanded(luigi.ExternalTask, Refcat): """ - Fatcat release export, zstd version, from e.g. https://archive.org/details/fatcat_snapshots_and_exports + Fatcat database release export, zstd version, from e.g. + https://archive.org/details/fatcat_snapshots_and_exports """ def output(self): return luigi.LocalTarget(path=settings.RELEASE_EXPORT_EXPANDED_FILE, format=Zstd) @@ -116,7 +138,9 @@ class ReleaseExportExpanded(luigi.ExternalTask, Refcat): class MAGPapers(luigi.ExternalTask, Refcat): """ - Microsoft Academic dump as archived, e.g. 
https://archive.org/details/mag-2020-06-25 + Microsoft Academic dump as archived, e.g. + https://archive.org/details/mag-2020-06-25 - we want this mainly for + comparisons. """ def output(self): return luigi.LocalTarget(path=os.path.join(settings.MAG, "Papers.txt.gz"), format=Zstd) @@ -147,7 +171,7 @@ class OpenLibraryDump(luigi.ExternalTask, Refcat): class OpenLibraryEditions(luigi.ExternalTask, Refcat): """ - Editions file. + Editions file (converted to zstd) https://openlibrary.org/developers/dumps. """ def output(self): return luigi.LocalTarget(path=settings.OL_DUMP_EDITIONS, format=Zstd) @@ -155,7 +179,8 @@ class OpenLibraryEditions(luigi.ExternalTask, Refcat): class OpenLibraryWorks(luigi.ExternalTask, Refcat): """ - Works dump, from https://openlibrary.org/developers/dumps. + Works dump (converted to zstd), from + https://openlibrary.org/developers/dumps. """ def output(self): return luigi.LocalTarget(path=settings.OL_DUMP_WORKS, format=Zstd) @@ -163,7 +188,8 @@ class OpenLibraryWorks(luigi.ExternalTask, Refcat): class OpenLibraryAuthors(luigi.ExternalTask, Refcat): """ - Works dump, from https://openlibrary.org/developers/dumps. + Author dump (converted to zstd), from + https://openlibrary.org/developers/dumps. 
""" def output(self): return luigi.LocalTarget(path=settings.OL_DUMP_AUTHORS, format=Zstd) diff --git a/python/tests/test_utils.py b/python/tests/test_utils.py index 79c8919..acc1888 100644 --- a/python/tests/test_utils.py +++ b/python/tests/test_utils.py @@ -10,19 +10,24 @@ def test_extract_urls(): assert extract_urls("http://a.com/b") == ["http://a.com/b"] assert extract_urls("https://a.com/b") == ["https://a.com/b"] assert extract_urls("http=://a.com/b") == ["a.com/"] - assert extract_urls("http://www.bioinformatics.babraham.ac.uk/projects/fastqc/") == ["http://www.bioinformatics.babraham.ac.uk/projects/fastqc/"] + assert extract_urls("http://www.bioinformatics.babraham.ac.uk/projects/fastqc/") == [ + "http://www.bioinformatics.babraham.ac.uk/projects/fastqc/" + ] + assert extract_urls( + "CertificaçãoDigitalNº1311532/CA40/005129/2012Apensadoao40/006810/2011-1ºTermoAditivonº52/2012aoContratonº282/2011-Celebradoem08/08/2012" + ) == [] assert extract_urls( - "CertificaçãoDigitalNº1311532/CA40/005129/2012Apensadoao40/006810/2011-1ºTermoAditivonº52/2012aoContratonº282/2011-Celebradoem08/08/2012") == [] - assert extract_urls("http://www.brookings.edu/~/media/Research/Files/Papers/2015/04/global-drug-policy/Caulkinsfinal.pdf?la=en") == [ "http://www.brookings.edu/~/media/Research/Files/Papers/2015/04/global-drug-policy/Caulkinsfinal.pdf?la=en" - ] + ) == ["http://www.brookings.edu/~/media/Research/Files/Papers/2015/04/global-drug-policy/Caulkinsfinal.pdf?la=en"] assert extract_urls("DOI:10.1093/forestry/cpr048") == [] assert extract_urls("www.dtic.mil/cgi-bin/GetTRDoc?Location=U2&doc=GetTRDoc.pdf&AD=ADA475228") == [ "www.dtic.mil/cgi-bin/GetTRDoc?Location=U2&doc=GetTRDoc.pdf&AD=ADA475228" ] assert extract_urls("http://bit.ly/cJbkv") == ["http://bit.ly/cJbkv"] assert extract_urls("hello http://bit.ly/cJbkv") == ["http://bit.ly/cJbkv"] - assert extract_urls("hello http://bit.ly/cJbkv http://bit.ly/cJbkv") == ["http://bit.ly/cJbkv", "http://bit.ly/cJbkv"] + assert 
extract_urls("hello http://bit.ly/cJbkv http://bit.ly/cJbkv") == [ + "http://bit.ly/cJbkv", "http://bit.ly/cJbkv" + ] assert extract_urls("jul./set.de") == ["set.de"] @@ -35,7 +40,11 @@ def test_extract_doi(): assert extract_dois("!!10.1080/00335630.2012.714899") == ["10.1080/00335630.2012.714899"] assert extract_dois("!!10.1177/1075547007306508.!") == ["10.1177/1075547007306508"] assert extract_dois("!!445!!10.3390/nu6114822") == ["10.3390/nu6114822"] - assert extract_dois("!0141-9889,!pp.!448-464!doi:!10.1111/j.1467J9566.2010.01286.!") == ["10.1111/j.1467J9566.2010.01286"] - assert extract_dois("!10.1002/(SICI)1097-4679(200004)56:4<519::AID-JCLP6>3.0.CO") == ["10.1002/(SICI)1097-4679(200004)56:4<519::AID-JCLP6>3.0.CO"] + assert extract_dois("!0141-9889,!pp.!448-464!doi:!10.1111/j.1467J9566.2010.01286.!") == [ + "10.1111/j.1467J9566.2010.01286" + ] + assert extract_dois("!10.1002/(SICI)1097-4679(200004)56:4<519::AID-JCLP6>3.0.CO") == [ + "10.1002/(SICI)1097-4679(200004)56:4<519::AID-JCLP6>3.0.CO" + ] assert extract_dois("!10.1002/ajpa.20674.!") == ["10.1002/ajpa.20674"] assert extract_dois("!10.1002/chem.201700953.!") == ["10.1002/chem.201700953"] |