add link to blog on suffix arrays

author: Martin Czygan <martin.czygan@gmail.com> 2021-06-04 22:15:35 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2021-06-04 22:15:35 +0200
commit: 3ef6b66923aa97c539640085bd88cccac7ede95e (patch)
tree: 0d1711112ed91f54c82c9078b1ec3325cb5be8d7
parent: a591f434db90efd127b33ac3f1889b786e58e27e (diff)
download: refcat-3ef6b66923aa97c539640085bd88cccac7ede95e.tar.gz
refcat-3ef6b66923aa97c539640085bd88cccac7ede95e.zip
2 files changed, 44 insertions, 5 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 6421007..eae4c6b 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -102,15 +102,53 @@ TODO
 ----
 
 * [ ] partial (hold)
+
+Prepared resolver for journal abbreviations; most entries have some journal
+name, so use journal name or issn (extra step) to group candidates per journal.
+Journals may on average have 1K publications (few have 100K+); then for each
+candidate ref find most likely match in the releases of a journal.
+
+Also, many partial records do have more information in unstructured; parse this
+out first.
+
 * [ ] OL fuzzy
+
+Besides 200K links via ISBN, about 10M links via title. Many "year" mismatches,
+which might indicate different editions (debug this later).
+
 * [ ] unmatched (in a final pass)
 
 We can match by id and key, e.g. extract id and key, sort and merge (id, key)
 from graph, and if not available use raw input.
 
-* [ ] QA
+> QA things
+
+* [ ] find duplicates and clean them up
+* [ ] generate stats on match types
+
+TODO: Unmatched
+---------------
 
-Find duplicates and clean them up. Generate stats on match types.
+* raw refs may contain duplicates (e.g. "crossref" and "grobid")
+* refs should appear in order as they are found in the paper
+
+Idea was that "source release ident + ref index" should allow completeness and
+order. "crossref" and "grobid" order may vary.
+
+In any case, we want the raw ref blob sorted by release ident - it's already
+sorted by work ident. We do have a work ident for all brefs as well, so we need
+to sort the combined bref blob by work id.
+
+    bref blob        raw ref blob
+       work_id           work_id
+
+    For each work_id we want to know for which entries we found some ID
+    somewhere. For all others, we want to include them from the raw ref; need
+    to convert from ref to bref on the fly.
+
+    Comparison by e.g. identifiers or title. Make sure it's kind of unique.
+
+We should end up with 1.
 """
 
 import argparse
@@ -231,7 +269,6 @@ class OpenLibraryAuthors(luigi.ExternalTask, Refcat):
     def output(self):
         return luigi.LocalTarget(path=settings.OL_DUMP_AUTHORS, format=Zstd)
 
-
 # ----8< Derivations
 
 #
@@ -938,7 +975,7 @@ class OpenLibraryEditionsMapped(Refcat):
 class UnmatchedMapped(Refcat):
     """
     Map unmatched refs (converted to release schema on the fly) to container
-    names to do approximate matches with OL. 217m53.989s.
+    names to do approximate matches with OL. 221m55.746s.
     """
     def requires(self):
         return RefsWithoutIdentifiers()
diff --git a/skate/matchset.go b/skate/matchset.go
index 271bad6..3ad1047 100644
--- a/skate/matchset.go
+++ b/skate/matchset.go
@@ -11,7 +11,9 @@ import (
 // this going: We build a suffix array out of "1F<s>1F<s>..." and prepend "1F"
 // to the string to lookup to check for a match. This results in a behaviour
 // similar to strings.HasPrefix. Multiple results must be handled by the user
-// (or just work with the cases, where you get exactly one match).
+// (or just work with the cases, where you get exactly one match). A related
+// blog post on a similar technique:
+// https://eli.thegreenplace.net/2016/suffix-arrays-in-the-go-standard-library/.
 const matchSetSep = "\u001F"
 
 // MatchSet allows to match a string against multiple strings at once. Rough
author	Martin Czygan <martin.czygan@gmail.com>	2021-06-04 22:15:35 +0200
committer	Martin Czygan <martin.czygan@gmail.com>	2021-06-04 22:15:35 +0200
commit	3ef6b66923aa97c539640085bd88cccac7ede95e (patch)
tree	0d1711112ed91f54c82c9078b1ec3325cb5be8d7
parent	a591f434db90efd127b33ac3f1889b786e58e27e (diff)
download	refcat-3ef6b66923aa97c539640085bd88cccac7ede95e.tar.gz refcat-3ef6b66923aa97c539640085bd88cccac7ede95e.zip