diff options
author | bnewbold <bnewbold@archive.org> | 2020-12-18 02:13:47 +0000 |
---|---|---|
committer | bnewbold <bnewbold@archive.org> | 2020-12-18 02:13:47 +0000 |
commit | 443243e8cccba3e779b7c56d0cdb6dcd992a3100 (patch) | |
tree | b279887d9038daa19b72e53509658f016eaec452 /python/fatcat_tools | |
parent | 7d90a0404e3ecb44c7d0ca93b2c32a0f66b8d88a (diff) | |
parent | 5eeb7a9d61beb8cb40fd89bd91fcd9dd820035aa (diff) | |
download | fatcat-443243e8cccba3e779b7c56d0cdb6dcd992a3100.tar.gz fatcat-443243e8cccba3e779b7c56d0cdb6dcd992a3100.zip |
Merge branch 'bnewbold-doaj-fuzzy' into 'master'
DOAJ import fuzzy match filter
See merge request webgroup/fatcat!92
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- | python/fatcat_tools/importers/common.py | 64 | ||||
-rw-r--r-- | python/fatcat_tools/importers/doaj_article.py | 11 |
2 files changed, 71 insertions, 4 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 3c810391..6dc2ab9e 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -7,16 +7,23 @@ import sqlite3 import datetime import subprocess from collections import Counter -from confluent_kafka import Consumer, KafkaException +from typing import Optional, Tuple import xml.etree.ElementTree as ET +import elasticsearch from bs4 import BeautifulSoup +from confluent_kafka import Consumer, KafkaException import fatcat_openapi_client +from fatcat_openapi_client import ReleaseEntity from fatcat_openapi_client.rest import ApiException +from fuzzycat.matching import match_release_fuzzy +import fuzzycat.common +import fuzzycat.verify # TODO: refactor so remove need for this (re-imports for backwards compatibility) from fatcat_tools.normal import (clean_str as clean, is_cjk, b32_hex, LANG_MAP_MARC) # noqa: F401 +from fatcat_tools.transforms import entity_to_dict DATE_FMT = "%Y-%m-%d" SANE_MAX_RELEASES = 200 @@ -145,12 +152,16 @@ class EntityImporter: self.api = api self.do_updates = bool(kwargs.get('do_updates', True)) + self.do_fuzzy_match = kwargs.get('do_fuzzy_match', True) self.bezerk_mode = kwargs.get('bezerk_mode', False) self.submit_mode = kwargs.get('submit_mode', False) self.edit_batch_size = kwargs.get('edit_batch_size', 100) self.editgroup_description = kwargs.get('editgroup_description') self.editgroup_extra = eg_extra - self.reset() + + self.es_client = kwargs.get('es_client') + if not self.es_client: + self.es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki") self._issnl_id_map = dict() self._orcid_id_map = dict() @@ -158,6 +169,8 @@ class EntityImporter: self._doi_id_map = dict() self._pmid_id_map = dict() + self.reset() + def reset(self): self.counts = Counter({'total': 0, 'skip': 0, 'insert': 0, 'update': 0, 'exists': 0}) self._edit_count = 0 @@ -433,6 +446,53 @@ class EntityImporter: existing.urls 
= [u for u in existing.urls if u.url not in redundant_urls] return existing + def match_existing_release_fuzzy(self, release: ReleaseEntity) -> Optional[Tuple[str, str, ReleaseEntity]]: + """ + This helper function uses fuzzycat (and elasticsearch) to look for + existing release entities with similar metadata. + + Returns None if there was no match of any kind, or a single tuple + (status: str, reason: str, existing: ReleaseEntity) if there was a match. + + Status string is one of the fuzzycat.common.Status, with "strongest + match" in this sorted order: + + - EXACT + - STRONG + - WEAK + - AMBIGUOUS + + Eg, if there is any EXACT match that is always returned; an AMBIGUOUS + result is only returned if all the candidate matches were ambiguous. + """ + + # this map used to establish priority order of verified matches + STATUS_SORT = { + fuzzycat.common.Status.TODO: 0, + fuzzycat.common.Status.EXACT: 10, + fuzzycat.common.Status.STRONG: 20, + fuzzycat.common.Status.WEAK: 30, + fuzzycat.common.Status.AMBIGUOUS: 40, + fuzzycat.common.Status.DIFFERENT: 60, + } + + # TODO: the size here is a first guess; what should it really be? 
+ candidates = match_release_fuzzy(release, size=10, es=self.es_client) + if not candidates: + return None + + release_dict = entity_to_dict(release, api_client=self.api.api_client) + verified = [(fuzzycat.verify.verify(release_dict, entity_to_dict(c, api_client=self.api.api_client)), c) for c in candidates] + + # chose the "closest" match + closest = sorted(verified, key=lambda v: STATUS_SORT[v[0].status])[0] + if closest[0].status == fuzzycat.common.Status.DIFFERENT: + return None + elif closest[0].status == fuzzycat.common.Status.TODO: + raise NotImplementedError("fuzzycat verify hit a Status.TODO") + else: + return (closest[0].status.name, closest[0].reason.value, closest[1]) + class RecordPusher: """ diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py index 03752484..191a65d8 100644 --- a/python/fatcat_tools/importers/doaj_article.py +++ b/python/fatcat_tools/importers/doaj_article.py @@ -217,9 +217,16 @@ class DoajArticleImporter(EntityImporter): return False break - # TODO: in the future could do fuzzy match here, eg using elasticsearch + if not existing and self.do_fuzzy_match: + fuzzy_result = self.match_existing_release_fuzzy(re) + # TODO: in the future, could assign work_id for clustering, or for + # "EXACT" match, set existing and allow (optional) update code path + # to run + if fuzzy_result is not None: + self.counts["exists-fuzzy"] += 1 + return False - # create entity + # if no fuzzy existing match, create entity if not existing: return True |