aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2021-06-04 22:15:35 +0200
committer Martin Czygan <martin.czygan@gmail.com> 2021-06-04 22:15:35 +0200
commit 3ef6b66923aa97c539640085bd88cccac7ede95e (patch)
tree 0d1711112ed91f54c82c9078b1ec3325cb5be8d7 /python
parent a591f434db90efd127b33ac3f1889b786e58e27e (diff)
download refcat-3ef6b66923aa97c539640085bd88cccac7ede95e.tar.gz
refcat-3ef6b66923aa97c539640085bd88cccac7ede95e.zip
add link to blog on suffix arrays
Diffstat (limited to 'python')
-rw-r--r-- python/refcat/tasks.py 45
1 files changed, 41 insertions, 4 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 6421007..eae4c6b 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -102,15 +102,53 @@ TODO
----
* [ ] partial (hold)
+
+Prepared resolver for journal abbreviations; most entries have some journal
+name, so use journal name or issn (extra step) to group candidates per journal.
+Journals may on average have 1K publications (few have 100K+); then for each
+candidate ref find most likely match in the releases of a journal.
+
+Also, many partial records do have more information in unstructured; parse this
+out first.
+
* [ ] OL fuzzy
+
+Besides 200K links via ISBN, there are about 10M links via title. Many "year"
+mismatches, which might indicate different editions (debug this later).
+
* [ ] unmatched (in a final pass)
We can match by id and key, e.g. extract id and key, sort and merge (id, key)
from graph, and if not available use raw input.
-* [ ] QA
+> QA things
+
+* [ ] find duplicates and clean them up
+* [ ] generate stats on match types
+
+TODO: Unmatched
+---------------
-Find duplicates and clean them up. Generate stats on match types.
+* raw refs may contain duplicates (e.g. "crossref" and "grobid")
+* refs should appear in order as they are found in the paper
+
+The idea was that "source release ident + ref index" should allow completeness and
+order. "crossref" and "grobid" order may vary.
+
+In any case, we want the raw ref blob sorted by release ident - it's already
+sorted by work ident. We do have a work ident for all brefs as well, so we need
+to sort the combined bref blob by work id.
+
+ bref blob raw ref blob
+ work_id work_id
+
+ For each work_id we want to know for which entries we found some ID
+ somewhere. For all others, we want to include them from the raw ref; need
+ to convert from ref to bref on the fly.
+
+ Comparison by e.g. identifiers or title. Make sure it's kind of unique.
+
+We should end up with 1.
"""
import argparse
@@ -231,7 +269,6 @@ class OpenLibraryAuthors(luigi.ExternalTask, Refcat):
def output(self):
return luigi.LocalTarget(path=settings.OL_DUMP_AUTHORS, format=Zstd)
-
# ----8< Derivations
#
@@ -938,7 +975,7 @@ class OpenLibraryEditionsMapped(Refcat):
class UnmatchedMapped(Refcat):
"""
Map unmatched refs (converted to release schema on the fly) to container
- names to do approximate matches with OL. 217m53.989s.
+ names to do approximate matches with OL. 221m55.746s.
"""
def requires(self):
return RefsWithoutIdentifiers()