diff options
| author | bnewbold <bnewbold@archive.org> | 2020-12-18 02:13:47 +0000 | 
|---|---|---|
| committer | bnewbold <bnewbold@archive.org> | 2020-12-18 02:13:47 +0000 | 
| commit | 443243e8cccba3e779b7c56d0cdb6dcd992a3100 (patch) | |
| tree | b279887d9038daa19b72e53509658f016eaec452 /python/fatcat_tools/importers | |
| parent | 7d90a0404e3ecb44c7d0ca93b2c32a0f66b8d88a (diff) | |
| parent | 5eeb7a9d61beb8cb40fd89bd91fcd9dd820035aa (diff) | |
| download | fatcat-443243e8cccba3e779b7c56d0cdb6dcd992a3100.tar.gz fatcat-443243e8cccba3e779b7c56d0cdb6dcd992a3100.zip  | |
Merge branch 'bnewbold-doaj-fuzzy' into 'master'
DOAJ import fuzzy match filter
See merge request webgroup/fatcat!92
Diffstat (limited to 'python/fatcat_tools/importers')
| -rw-r--r-- | python/fatcat_tools/importers/common.py | 64 | ||||
| -rw-r--r-- | python/fatcat_tools/importers/doaj_article.py | 11 | 
2 files changed, 71 insertions, 4 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 3c810391..6dc2ab9e 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -7,16 +7,23 @@ import sqlite3  import datetime  import subprocess  from collections import Counter -from confluent_kafka import Consumer, KafkaException +from typing import Optional, Tuple  import xml.etree.ElementTree as ET +import elasticsearch  from bs4 import BeautifulSoup +from confluent_kafka import Consumer, KafkaException  import fatcat_openapi_client +from fatcat_openapi_client import ReleaseEntity  from fatcat_openapi_client.rest import ApiException +from fuzzycat.matching import match_release_fuzzy +import fuzzycat.common +import fuzzycat.verify  # TODO: refactor so remove need for this (re-imports for backwards compatibility)  from fatcat_tools.normal import (clean_str as clean, is_cjk, b32_hex, LANG_MAP_MARC) # noqa: F401 +from fatcat_tools.transforms import entity_to_dict  DATE_FMT = "%Y-%m-%d"  SANE_MAX_RELEASES = 200 @@ -145,12 +152,16 @@ class EntityImporter:          self.api = api          self.do_updates = bool(kwargs.get('do_updates', True)) +        self.do_fuzzy_match = kwargs.get('do_fuzzy_match', True)          self.bezerk_mode = kwargs.get('bezerk_mode', False)          self.submit_mode = kwargs.get('submit_mode', False)          self.edit_batch_size = kwargs.get('edit_batch_size', 100)          self.editgroup_description = kwargs.get('editgroup_description')          self.editgroup_extra = eg_extra -        self.reset() + +        self.es_client = kwargs.get('es_client') +        if not self.es_client: +            self.es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki")          self._issnl_id_map = dict()          self._orcid_id_map = dict() @@ -158,6 +169,8 @@ class EntityImporter:          self._doi_id_map = dict()          self._pmid_id_map = dict() +        self.reset() +      def reset(self):     
     self.counts = Counter({'total': 0, 'skip': 0, 'insert': 0, 'update': 0, 'exists': 0})          self._edit_count = 0 @@ -433,6 +446,53 @@ class EntityImporter:          existing.urls = [u for u in existing.urls if u.url not in redundant_urls]          return existing +    def match_existing_release_fuzzy(self, release: ReleaseEntity) -> Optional[Tuple[str, str, ReleaseEntity]]: +        """ +        This helper function uses fuzzycat (and elasticsearch) to look for +        existing release entities with similar metadata. + +        Returns None if there was no match of any kind, or a single tuple +        (status: str, reason: str, existing: ReleaseEntity) if there was a match. + +        Status string is one of the fuzzycat.common.Status, with "strongest +        match" in this sorted order: + +        - EXACT +        - STRONG +        - WEAK +        - AMBIGUOUS + +        Eg, if there is any EXACT match that is always returned; an AMBIGUOUS +        result is only returned if all the candidate matches were ambiguous. +        """ + +        # this map is used to establish priority order of verified matches +        STATUS_SORT = { +            fuzzycat.common.Status.TODO: 0, +            fuzzycat.common.Status.EXACT: 10, +            fuzzycat.common.Status.STRONG: 20, +            fuzzycat.common.Status.WEAK: 30, +            fuzzycat.common.Status.AMBIGUOUS: 40, +            fuzzycat.common.Status.DIFFERENT: 60, +        } + +        # TODO: the size here is a first guess; what should it really be? 
+        candidates = match_release_fuzzy(release, size=10, es=self.es_client) +        if not candidates: +            return None + +        release_dict = entity_to_dict(release, api_client=self.api.api_client) +        verified = [(fuzzycat.verify.verify(release_dict, entity_to_dict(c, api_client=self.api.api_client)), c) for c in candidates] + +        # choose the "closest" match +        closest = sorted(verified, key=lambda v: STATUS_SORT[v[0].status])[0] +        if closest[0].status == fuzzycat.common.Status.DIFFERENT: +            return None +        elif closest[0].status == fuzzycat.common.Status.TODO: +            raise NotImplementedError("fuzzycat verify hit a Status.TODO") +        else: +            return (closest[0].status.name, closest[0].reason.value, closest[1]) +  class RecordPusher:      """ diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py index 03752484..191a65d8 100644 --- a/python/fatcat_tools/importers/doaj_article.py +++ b/python/fatcat_tools/importers/doaj_article.py @@ -217,9 +217,16 @@ class DoajArticleImporter(EntityImporter):                          return False                      break -        # TODO: in the future could do fuzzy match here, eg using elasticsearch +        if not existing and self.do_fuzzy_match: +            fuzzy_result = self.match_existing_release_fuzzy(re) +            # TODO: in the future, could assign work_id for clustering, or for +            # "EXACT" match, set existing and allow (optional) update code path +            # to run +            if fuzzy_result is not None: +                self.counts["exists-fuzzy"] += 1 +                return False -        # create entity +        # if no fuzzy existing match, create entity          if not existing:              return True  | 
