diff options
 python/refcat/attic.py     | 11
 python/refcat/cli.py       |  4
 python/refcat/tasks.py     | 52
 python/tests/test_utils.py | 23
 4 files changed, 67 insertions(+), 23 deletions(-)
diff --git a/python/refcat/attic.py b/python/refcat/attic.py index 147380c..7633bab 100644 --- a/python/refcat/attic.py +++ b/python/refcat/attic.py @@ -625,7 +625,8 @@ class RefsCounter(Refcat): counts["has_" + k] += 1 if biblio.get('doi') or biblio.get('pmcid') or biblio.get('pmid') or biblio.get('arxiv_id'): counts['has_any_extid'] += 1 - if biblio.get('container_name') and biblio.get('volume') and biblio.get('issue') and biblio.get('pages'): + if biblio.get('container_name') and biblio.get('volume') and biblio.get('issue') and biblio.get( + 'pages'): counts['has_container_volume_issue_pages'] += 1 if biblio.get('title') and biblio.get('contrib_raw_names') and biblio.get('year'): counts['has_title_contrib_year'] += 1 @@ -941,7 +942,13 @@ class BiblioRefV2(Refcat): A v1 set of biblioref schema docs. """ def requires(self): - return [BiblioRefZippyDOI(), BiblioRefZippyArxiv(), BiblioRefZippyPMID(), BiblioRefZippyPMCID(), BiblioRefFromFuzzyClusters()] + return [ + BiblioRefZippyDOI(), + BiblioRefZippyArxiv(), + BiblioRefZippyPMID(), + BiblioRefZippyPMCID(), + BiblioRefFromFuzzyClusters() + ] def run(self): _, tmpf = tempfile.mkstemp(prefix="refcat-") diff --git a/python/refcat/cli.py b/python/refcat/cli.py index 076c71f..102c996 100644 --- a/python/refcat/cli.py +++ b/python/refcat/cli.py @@ -236,7 +236,9 @@ def main(): print("BASE {}".format(settings.BASE)) print("TMPDIR {}".format(settings.TMPDIR)) print() - names = [name for name in sorted(Register.task_names()) if name not in suppress_task_names and not name.islower()] + names = [ + name for name in sorted(Register.task_names()) if name not in suppress_task_names and not name.islower() + ] print(columnize(names)) sys.exit(0) diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index b43c729..48ec180 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -4,7 +4,6 @@ Set of luigi tasks to derive a citation graph. 
$ refcat.pyz - ____ __ ________ / __/________ _/ /_ / ___/ _ \/ /_/ ___/ __ `/ __/ @@ -38,13 +37,36 @@ Set of luigi tasks to derive a citation graph. Refs UnmatchedRefs RefsArxiv WikipediaCitationsMinimalDataset +------------------------------------------------------------------------ + +Overview +-------- + +* raw input "tasks" as luigi.ExternalTask +* derivation + +# Various schema + +* release (fatcat database export) +* ref (one document per reference) +* OL editions (open library editions) +* OL authors (open library authors) +* wiki (a particular wikipedia reference export) + +Some operations, e.g. "fuzzy verification" require both compared documents to +be release entities. This means, that we need to convert different formats into +the release format. + +Config +------ Config (e.g. raw input data) taken from $HOME/.config/refcat/settings.ini. -> TODO +TODO +---- * [ ] partial (hold) -* [ ] unmatched +* [ ] unmatched (in a final pass) We can match by id and key, e.g. extract id and key, sort and merge (id, key) from graph, and if not available use raw input. @@ -52,7 +74,6 @@ from graph, and if not available use raw input. * [ ] QA Find duplicates and clean them up. Generate stats on match types. 
- """ import argparse @@ -66,13 +87,11 @@ import sys import tempfile import luigi -from fuzzycat.cluster import Cluster, release_key_title_sandcrawler from gluish.format import Zstd from gluish.task import BaseTask from gluish.utils import shellout from refcat.settings import settings -from refcat.utils import extract_dois, extract_urls, ref_to_release class Refcat(BaseTask): @@ -82,7 +101,8 @@ class Refcat(BaseTask): BASE = settings.BASE TAG = '2021-05-06' - date = luigi.DateParameter(default=datetime.date(2021, 5, 6), description="a versioning help, will be part of filename, change this manually") + date = luigi.DateParameter(default=datetime.date(2021, 5, 6), + description="a versioning help, will be part of filename, change this manually") tmpdir = luigi.Parameter(default=settings.TMPDIR, description="set tempdir", significant=False) n = luigi.IntParameter(default=multiprocessing.cpu_count(), significant=False) @@ -100,7 +120,8 @@ class Refcat(BaseTask): class Refs(luigi.ExternalTask, Refcat): """ Compressed (zstd) references, as of 01/2021 containing ~1.8B docs; this - might increase in a next version. + might increase in a next version. This comes from a custom derivation from + an "heavy intermediate" format in a scholar pipeline. """ def output(self): return luigi.LocalTarget(path=settings.REFS_FILE, format=Zstd) @@ -108,7 +129,8 @@ class Refs(luigi.ExternalTask, Refcat): class ReleaseExportExpanded(luigi.ExternalTask, Refcat): """ - Fatcat release export, zstd version, from e.g. https://archive.org/details/fatcat_snapshots_and_exports + Fatcat database release export, zstd version, from e.g. + https://archive.org/details/fatcat_snapshots_and_exports """ def output(self): return luigi.LocalTarget(path=settings.RELEASE_EXPORT_EXPANDED_FILE, format=Zstd) @@ -116,7 +138,9 @@ class ReleaseExportExpanded(luigi.ExternalTask, Refcat): class MAGPapers(luigi.ExternalTask, Refcat): """ - Microsoft Academic dump as archived, e.g. 
https://archive.org/details/mag-2020-06-25 + Microsoft Academic dump as archived, e.g. + https://archive.org/details/mag-2020-06-25 - we want this mainly for + comparisons. """ def output(self): return luigi.LocalTarget(path=os.path.join(settings.MAG, "Papers.txt.gz"), format=Zstd) @@ -147,7 +171,7 @@ class OpenLibraryDump(luigi.ExternalTask, Refcat): class OpenLibraryEditions(luigi.ExternalTask, Refcat): """ - Editions file. + Editions file (converted to zstd) https://openlibrary.org/developers/dumps. """ def output(self): return luigi.LocalTarget(path=settings.OL_DUMP_EDITIONS, format=Zstd) @@ -155,7 +179,8 @@ class OpenLibraryEditions(luigi.ExternalTask, Refcat): class OpenLibraryWorks(luigi.ExternalTask, Refcat): """ - Works dump, from https://openlibrary.org/developers/dumps. + Works dump (converted to zstd), from + https://openlibrary.org/developers/dumps. """ def output(self): return luigi.LocalTarget(path=settings.OL_DUMP_WORKS, format=Zstd) @@ -163,7 +188,8 @@ class OpenLibraryWorks(luigi.ExternalTask, Refcat): class OpenLibraryAuthors(luigi.ExternalTask, Refcat): """ - Works dump, from https://openlibrary.org/developers/dumps. + Author dump (converted to zstd), from + https://openlibrary.org/developers/dumps. 
""" def output(self): return luigi.LocalTarget(path=settings.OL_DUMP_AUTHORS, format=Zstd) diff --git a/python/tests/test_utils.py b/python/tests/test_utils.py index 79c8919..acc1888 100644 --- a/python/tests/test_utils.py +++ b/python/tests/test_utils.py @@ -10,19 +10,24 @@ def test_extract_urls(): assert extract_urls("http://a.com/b") == ["http://a.com/b"] assert extract_urls("https://a.com/b") == ["https://a.com/b"] assert extract_urls("http=://a.com/b") == ["a.com/"] - assert extract_urls("http://www.bioinformatics.babraham.ac.uk/projects/fastqc/") == ["http://www.bioinformatics.babraham.ac.uk/projects/fastqc/"] + assert extract_urls("http://www.bioinformatics.babraham.ac.uk/projects/fastqc/") == [ + "http://www.bioinformatics.babraham.ac.uk/projects/fastqc/" + ] + assert extract_urls( + "CertificaçãoDigitalNº1311532/CA40/005129/2012Apensadoao40/006810/2011-1ºTermoAditivonº52/2012aoContratonº282/2011-Celebradoem08/08/2012" + ) == [] assert extract_urls( - "CertificaçãoDigitalNº1311532/CA40/005129/2012Apensadoao40/006810/2011-1ºTermoAditivonº52/2012aoContratonº282/2011-Celebradoem08/08/2012") == [] - assert extract_urls("http://www.brookings.edu/~/media/Research/Files/Papers/2015/04/global-drug-policy/Caulkinsfinal.pdf?la=en") == [ "http://www.brookings.edu/~/media/Research/Files/Papers/2015/04/global-drug-policy/Caulkinsfinal.pdf?la=en" - ] + ) == ["http://www.brookings.edu/~/media/Research/Files/Papers/2015/04/global-drug-policy/Caulkinsfinal.pdf?la=en"] assert extract_urls("DOI:10.1093/forestry/cpr048") == [] assert extract_urls("www.dtic.mil/cgi-bin/GetTRDoc?Location=U2&doc=GetTRDoc.pdf&AD=ADA475228") == [ "www.dtic.mil/cgi-bin/GetTRDoc?Location=U2&doc=GetTRDoc.pdf&AD=ADA475228" ] assert extract_urls("http://bit.ly/cJbkv") == ["http://bit.ly/cJbkv"] assert extract_urls("hello http://bit.ly/cJbkv") == ["http://bit.ly/cJbkv"] - assert extract_urls("hello http://bit.ly/cJbkv http://bit.ly/cJbkv") == ["http://bit.ly/cJbkv", "http://bit.ly/cJbkv"] + assert 
extract_urls("hello http://bit.ly/cJbkv http://bit.ly/cJbkv") == [ + "http://bit.ly/cJbkv", "http://bit.ly/cJbkv" + ] assert extract_urls("jul./set.de") == ["set.de"] @@ -35,7 +40,11 @@ def test_extract_doi(): assert extract_dois("!!10.1080/00335630.2012.714899") == ["10.1080/00335630.2012.714899"] assert extract_dois("!!10.1177/1075547007306508.!") == ["10.1177/1075547007306508"] assert extract_dois("!!445!!10.3390/nu6114822") == ["10.3390/nu6114822"] - assert extract_dois("!0141-9889,!pp.!448-464!doi:!10.1111/j.1467J9566.2010.01286.!") == ["10.1111/j.1467J9566.2010.01286"] - assert extract_dois("!10.1002/(SICI)1097-4679(200004)56:4<519::AID-JCLP6>3.0.CO") == ["10.1002/(SICI)1097-4679(200004)56:4<519::AID-JCLP6>3.0.CO"] + assert extract_dois("!0141-9889,!pp.!448-464!doi:!10.1111/j.1467J9566.2010.01286.!") == [ + "10.1111/j.1467J9566.2010.01286" + ] + assert extract_dois("!10.1002/(SICI)1097-4679(200004)56:4<519::AID-JCLP6>3.0.CO") == [ + "10.1002/(SICI)1097-4679(200004)56:4<519::AID-JCLP6>3.0.CO" + ] assert extract_dois("!10.1002/ajpa.20674.!") == ["10.1002/ajpa.20674"] assert extract_dois("!10.1002/chem.201700953.!") == ["10.1002/chem.201700953"] |