aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--python/refcat/attic.py11
-rw-r--r--python/refcat/cli.py4
-rw-r--r--python/refcat/tasks.py52
-rw-r--r--python/tests/test_utils.py23
4 files changed, 67 insertions, 23 deletions
diff --git a/python/refcat/attic.py b/python/refcat/attic.py
index 147380c..7633bab 100644
--- a/python/refcat/attic.py
+++ b/python/refcat/attic.py
@@ -625,7 +625,8 @@ class RefsCounter(Refcat):
counts["has_" + k] += 1
if biblio.get('doi') or biblio.get('pmcid') or biblio.get('pmid') or biblio.get('arxiv_id'):
counts['has_any_extid'] += 1
- if biblio.get('container_name') and biblio.get('volume') and biblio.get('issue') and biblio.get('pages'):
+ if biblio.get('container_name') and biblio.get('volume') and biblio.get('issue') and biblio.get(
+ 'pages'):
counts['has_container_volume_issue_pages'] += 1
if biblio.get('title') and biblio.get('contrib_raw_names') and biblio.get('year'):
counts['has_title_contrib_year'] += 1
@@ -941,7 +942,13 @@ class BiblioRefV2(Refcat):
A v1 set of biblioref schema docs.
"""
def requires(self):
- return [BiblioRefZippyDOI(), BiblioRefZippyArxiv(), BiblioRefZippyPMID(), BiblioRefZippyPMCID(), BiblioRefFromFuzzyClusters()]
+ return [
+ BiblioRefZippyDOI(),
+ BiblioRefZippyArxiv(),
+ BiblioRefZippyPMID(),
+ BiblioRefZippyPMCID(),
+ BiblioRefFromFuzzyClusters()
+ ]
def run(self):
_, tmpf = tempfile.mkstemp(prefix="refcat-")
diff --git a/python/refcat/cli.py b/python/refcat/cli.py
index 076c71f..102c996 100644
--- a/python/refcat/cli.py
+++ b/python/refcat/cli.py
@@ -236,7 +236,9 @@ def main():
print("BASE {}".format(settings.BASE))
print("TMPDIR {}".format(settings.TMPDIR))
print()
- names = [name for name in sorted(Register.task_names()) if name not in suppress_task_names and not name.islower()]
+ names = [
+ name for name in sorted(Register.task_names()) if name not in suppress_task_names and not name.islower()
+ ]
print(columnize(names))
sys.exit(0)
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index b43c729..48ec180 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -4,7 +4,6 @@ Set of luigi tasks to derive a citation graph.
$ refcat.pyz
-
____ __
________ / __/________ _/ /_
/ ___/ _ \/ /_/ ___/ __ `/ __/
@@ -38,13 +37,36 @@ Set of luigi tasks to derive a citation graph.
Refs UnmatchedRefs
RefsArxiv WikipediaCitationsMinimalDataset
+------------------------------------------------------------------------
+
+Overview
+--------
+
+* raw input "tasks" as luigi.ExternalTask
+* derivation
+
+# Various schema
+
+* release (fatcat database export)
+* ref (one document per reference)
+* OL editions (open library editions)
+* OL authors (open library authors)
+* wiki (a particular wikipedia reference export)
+
+Some operations, e.g. "fuzzy verification" require both compared documents to
+be release entities. This means that we need to convert different formats into
+the release format.
+
+Config
+------
Config (e.g. raw input data) taken from $HOME/.config/refcat/settings.ini.
-> TODO
+TODO
+----
* [ ] partial (hold)
-* [ ] unmatched
+* [ ] unmatched (in a final pass)
We can match by id and key, e.g. extract id and key, sort and merge (id, key)
from graph, and if not available use raw input.
@@ -52,7 +74,6 @@ from graph, and if not available use raw input.
* [ ] QA
Find duplicates and clean them up. Generate stats on match types.
-
"""
import argparse
@@ -66,13 +87,11 @@ import sys
import tempfile
import luigi
-from fuzzycat.cluster import Cluster, release_key_title_sandcrawler
from gluish.format import Zstd
from gluish.task import BaseTask
from gluish.utils import shellout
from refcat.settings import settings
-from refcat.utils import extract_dois, extract_urls, ref_to_release
class Refcat(BaseTask):
@@ -82,7 +101,8 @@ class Refcat(BaseTask):
BASE = settings.BASE
TAG = '2021-05-06'
- date = luigi.DateParameter(default=datetime.date(2021, 5, 6), description="a versioning help, will be part of filename, change this manually")
+ date = luigi.DateParameter(default=datetime.date(2021, 5, 6),
+ description="a versioning help, will be part of filename, change this manually")
tmpdir = luigi.Parameter(default=settings.TMPDIR, description="set tempdir", significant=False)
n = luigi.IntParameter(default=multiprocessing.cpu_count(), significant=False)
@@ -100,7 +120,8 @@ class Refcat(BaseTask):
class Refs(luigi.ExternalTask, Refcat):
"""
Compressed (zstd) references, as of 01/2021 containing ~1.8B docs; this
- might increase in a next version.
+ might increase in a next version. This comes from a custom derivation from
+    a "heavy intermediate" format in a scholar pipeline.
"""
def output(self):
return luigi.LocalTarget(path=settings.REFS_FILE, format=Zstd)
@@ -108,7 +129,8 @@ class Refs(luigi.ExternalTask, Refcat):
class ReleaseExportExpanded(luigi.ExternalTask, Refcat):
"""
- Fatcat release export, zstd version, from e.g. https://archive.org/details/fatcat_snapshots_and_exports
+ Fatcat database release export, zstd version, from e.g.
+ https://archive.org/details/fatcat_snapshots_and_exports
"""
def output(self):
return luigi.LocalTarget(path=settings.RELEASE_EXPORT_EXPANDED_FILE, format=Zstd)
@@ -116,7 +138,9 @@ class ReleaseExportExpanded(luigi.ExternalTask, Refcat):
class MAGPapers(luigi.ExternalTask, Refcat):
"""
- Microsoft Academic dump as archived, e.g. https://archive.org/details/mag-2020-06-25
+ Microsoft Academic dump as archived, e.g.
+ https://archive.org/details/mag-2020-06-25 - we want this mainly for
+ comparisons.
"""
def output(self):
return luigi.LocalTarget(path=os.path.join(settings.MAG, "Papers.txt.gz"), format=Zstd)
@@ -147,7 +171,7 @@ class OpenLibraryDump(luigi.ExternalTask, Refcat):
class OpenLibraryEditions(luigi.ExternalTask, Refcat):
"""
- Editions file.
+ Editions file (converted to zstd) https://openlibrary.org/developers/dumps.
"""
def output(self):
return luigi.LocalTarget(path=settings.OL_DUMP_EDITIONS, format=Zstd)
@@ -155,7 +179,8 @@ class OpenLibraryEditions(luigi.ExternalTask, Refcat):
class OpenLibraryWorks(luigi.ExternalTask, Refcat):
"""
- Works dump, from https://openlibrary.org/developers/dumps.
+ Works dump (converted to zstd), from
+ https://openlibrary.org/developers/dumps.
"""
def output(self):
return luigi.LocalTarget(path=settings.OL_DUMP_WORKS, format=Zstd)
@@ -163,7 +188,8 @@ class OpenLibraryWorks(luigi.ExternalTask, Refcat):
class OpenLibraryAuthors(luigi.ExternalTask, Refcat):
"""
- Works dump, from https://openlibrary.org/developers/dumps.
+ Author dump (converted to zstd), from
+ https://openlibrary.org/developers/dumps.
"""
def output(self):
return luigi.LocalTarget(path=settings.OL_DUMP_AUTHORS, format=Zstd)
diff --git a/python/tests/test_utils.py b/python/tests/test_utils.py
index 79c8919..acc1888 100644
--- a/python/tests/test_utils.py
+++ b/python/tests/test_utils.py
@@ -10,19 +10,24 @@ def test_extract_urls():
assert extract_urls("http://a.com/b") == ["http://a.com/b"]
assert extract_urls("https://a.com/b") == ["https://a.com/b"]
assert extract_urls("http=://a.com/b") == ["a.com/"]
- assert extract_urls("http://www.bioinformatics.babraham.ac.uk/projects/fastqc/") == ["http://www.bioinformatics.babraham.ac.uk/projects/fastqc/"]
+ assert extract_urls("http://www.bioinformatics.babraham.ac.uk/projects/fastqc/") == [
+ "http://www.bioinformatics.babraham.ac.uk/projects/fastqc/"
+ ]
+ assert extract_urls(
+ "CertificaçãoDigitalNº1311532/CA40/005129/2012Apensadoao40/006810/2011-1ºTermoAditivonº52/2012aoContratonº282/2011-Celebradoem08/08/2012"
+ ) == []
assert extract_urls(
- "CertificaçãoDigitalNº1311532/CA40/005129/2012Apensadoao40/006810/2011-1ºTermoAditivonº52/2012aoContratonº282/2011-Celebradoem08/08/2012") == []
- assert extract_urls("http://www.brookings.edu/~/media/Research/Files/Papers/2015/04/global-drug-policy/Caulkinsfinal.pdf?la=en") == [
"http://www.brookings.edu/~/media/Research/Files/Papers/2015/04/global-drug-policy/Caulkinsfinal.pdf?la=en"
- ]
+ ) == ["http://www.brookings.edu/~/media/Research/Files/Papers/2015/04/global-drug-policy/Caulkinsfinal.pdf?la=en"]
assert extract_urls("DOI:10.1093/forestry/cpr048") == []
assert extract_urls("www.dtic.mil/cgi-bin/GetTRDoc?Location=U2&doc=GetTRDoc.pdf&AD=ADA475228") == [
"www.dtic.mil/cgi-bin/GetTRDoc?Location=U2&doc=GetTRDoc.pdf&AD=ADA475228"
]
assert extract_urls("http://bit.ly/cJbkv") == ["http://bit.ly/cJbkv"]
assert extract_urls("hello http://bit.ly/cJbkv") == ["http://bit.ly/cJbkv"]
- assert extract_urls("hello http://bit.ly/cJbkv http://bit.ly/cJbkv") == ["http://bit.ly/cJbkv", "http://bit.ly/cJbkv"]
+ assert extract_urls("hello http://bit.ly/cJbkv http://bit.ly/cJbkv") == [
+ "http://bit.ly/cJbkv", "http://bit.ly/cJbkv"
+ ]
assert extract_urls("jul./set.de") == ["set.de"]
@@ -35,7 +40,11 @@ def test_extract_doi():
assert extract_dois("!!10.1080/00335630.2012.714899") == ["10.1080/00335630.2012.714899"]
assert extract_dois("!!10.1177/1075547007306508.!") == ["10.1177/1075547007306508"]
assert extract_dois("!!445!!10.3390/nu6114822") == ["10.3390/nu6114822"]
- assert extract_dois("!0141-9889,!pp.!448-464!doi:!10.1111/j.1467J9566.2010.01286.!") == ["10.1111/j.1467J9566.2010.01286"]
- assert extract_dois("!10.1002/(SICI)1097-4679(200004)56:4<519::AID-JCLP6>3.0.CO") == ["10.1002/(SICI)1097-4679(200004)56:4<519::AID-JCLP6>3.0.CO"]
+ assert extract_dois("!0141-9889,!pp.!448-464!doi:!10.1111/j.1467J9566.2010.01286.!") == [
+ "10.1111/j.1467J9566.2010.01286"
+ ]
+ assert extract_dois("!10.1002/(SICI)1097-4679(200004)56:4<519::AID-JCLP6>3.0.CO") == [
+ "10.1002/(SICI)1097-4679(200004)56:4<519::AID-JCLP6>3.0.CO"
+ ]
assert extract_dois("!10.1002/ajpa.20674.!") == ["10.1002/ajpa.20674"]
assert extract_dois("!10.1002/chem.201700953.!") == ["10.1002/chem.201700953"]