aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-06-04 22:15:35 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-06-04 22:15:35 +0200
commit3ef6b66923aa97c539640085bd88cccac7ede95e (patch)
tree0d1711112ed91f54c82c9078b1ec3325cb5be8d7
parenta591f434db90efd127b33ac3f1889b786e58e27e (diff)
downloadrefcat-3ef6b66923aa97c539640085bd88cccac7ede95e.tar.gz
refcat-3ef6b66923aa97c539640085bd88cccac7ede95e.zip
add link to blog on suffix arrays
-rw-r--r--python/refcat/tasks.py45
-rw-r--r--skate/matchset.go4
2 files changed, 44 insertions, 5 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 6421007..eae4c6b 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -102,15 +102,53 @@ TODO
----
* [ ] partial (hold)
+
+Prepared resolver for journal abbreviations; most entries have some journal
+name, so use journal name or issn (extra step) to group candidates per journal.
+Journals may on average have 1K publications (few have 100K+); then for each
+candidate ref find most likely match in the releases of a journal.
+
+Also, many partial records do have more information in unstructured; parse this
+out first.
+
* [ ] OL fuzzy
+
+Besides 200K links via ISBN, about 10M links via title. Many "year" mismatches,
+which might indicate different editions (debug this later).
+
* [ ] unmatched (in a final pass)
We can match by id and key, e.g. extract id and key, sort and merge (id, key)
from graph, and if not available use raw input.
-* [ ] QA
+> QA things
+
+* [ ] find duplicates and clean them up
+* [ ] generate stats on match types
+
+TODO: Unmatched
+---------------
-Find duplicates and clean them up. Generate stats on match types.
+* raw refs may contain duplicates (e.g. "crossref" and "grobid")
+* refs should appear in order as they are found in the paper
+
+Idea was that "source release ident + ref index" should allow completeness and
+order. "crossref" and "grobid" order may vary.
+
+In any case, we want the raw ref blob sorted by release ident - it's already
+sorted by work ident. We do have a work ident for all brefs as well, so we need
+to sort the combined bref blob by work id.
+
+ bref blob raw ref blob
+ work_id work_id
+
+ For each work_id we want to know for which entries we found some ID
+ somewhere. For all others, we want to include them from the raw ref; need
+ to convert from ref to bref on the fly.
+
+ Comparison by e.g. identifiers or title. Make sure it's kind of unique.
+
+We should end up with 1.
"""
import argparse
@@ -231,7 +269,6 @@ class OpenLibraryAuthors(luigi.ExternalTask, Refcat):
def output(self):
return luigi.LocalTarget(path=settings.OL_DUMP_AUTHORS, format=Zstd)
-
# ----8< Derivations
#
@@ -938,7 +975,7 @@ class OpenLibraryEditionsMapped(Refcat):
class UnmatchedMapped(Refcat):
"""
Map unmatched refs (converted to release schema on the fly) to container
- names to do approximate matches with OL. 217m53.989s.
+ names to do approximate matches with OL. 221m55.746s.
"""
def requires(self):
return RefsWithoutIdentifiers()
diff --git a/skate/matchset.go b/skate/matchset.go
index 271bad6..3ad1047 100644
--- a/skate/matchset.go
+++ b/skate/matchset.go
@@ -11,7 +11,9 @@ import (
// this going: We build a suffix array out of "1F<s>1F<s>..." and prepend "1F"
// to the string to lookup to check for a match. This results in a behaviour
// similar to strings.HasPrefix. Multiple results must be handled by the user
-// (or just work with the cases, where you get exactly one match).
+// (or just work with the cases, where you get exactly one match). A related
+// blog post on a similar technique:
+// https://eli.thegreenplace.net/2016/suffix-arrays-in-the-go-standard-library/
const matchSetSep = "\u001F"
// MatchSet allows matching a string against multiple strings at once. Rough