From 6d5811693c36b9e73dedf0205c40f2aed63e2870 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 16 Dec 2020 19:56:01 -0800
Subject: add fuzzy match filtering to DOAJ importer

In this default configuration, any entities with a fuzzy match (even
"ambiguous") will be skipped at import time, to prevent creating
duplicates.

This is conservative towards not creating new/duplicate entities.

In the future, as we get more confidence in fuzzy match/verification, we
can start to ignore AMBIGUOUS, handle EXACT as same release, and merge
STRONG (and WEAK?) matches under the same work entity.
---
 python/fatcat_tools/importers/doaj_article.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'python/fatcat_tools')

diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py
index 03752484..191a65d8 100644
--- a/python/fatcat_tools/importers/doaj_article.py
+++ b/python/fatcat_tools/importers/doaj_article.py
@@ -217,9 +217,16 @@ class DoajArticleImporter(EntityImporter):
                     return False
                 break
 
-        # TODO: in the future could do fuzzy match here, eg using elasticsearch
+        if not existing and self.do_fuzzy_match:
+            fuzzy_result = self.match_existing_release_fuzzy(re)
+            # TODO: in the future, could assign work_id for clustering, or for
+            # "EXACT" match, set existing and allow (optional) update code path
+            # to run
+            if fuzzy_result is not None:
+                self.counts["exists-fuzzy"] += 1
+                return False
 
-        # create entity
+        # if no fuzzy existing match, create entity
         if not existing:
             return True
 
-- 
cgit v1.2.3