From 478d7d06ad9e56145cb94f3461c355b1ba9eb491 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Fri, 24 Sep 2021 13:58:51 +0200
Subject: start larger refactoring: remove cluster

background: verifying hundreds of millions of documents turned out to be
a bit slow; anecdata: running clustering and verification over 1.8B
inputs took over 50h; cf. the Go port (skate), which required about 2-4h for
those operations. Also: with Go we do not need the extra GNU parallel
wrapping.

In any case, we aim for fuzzycat refactoring to provide:

* better, more configurable verification and small scale matching
* removal of batch clustering code (and improved refcat docs)
* a place for a bit more generic, similarity based utils

The most important piece in fuzzycat is a CSV file containing hand
picked test examples for verification - and the code that is able to
fulfill that test suite. We want to make this part more robust.
---
 fuzzycat/verify.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'fuzzycat/verify.py')

diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 5b90c47..9eb808b 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -91,7 +91,7 @@ from fuzzycat.data import (CONTAINER_NAME_BLACKLIST, PUBLISHER_BLACKLIST, TITLE_
                            TITLE_FRAGMENT_BLACKLIST)
 from fuzzycat.entities import entity_to_dict
 from fuzzycat.utils import (author_similarity_score, clean_doi, contains_chemical_formula,
-                            dict_key_exists, doi_prefix, has_doi_prefix, jaccard, num_project,
+                            dict_has_key, doi_prefix, has_doi_prefix, jaccard, num_project,
                             parse_page_string, slugify_string)
 
 Verify = collections.namedtuple("Verify", "status reason")
@@ -233,10 +233,10 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]:
         if has_doi_prefix(a_doi, "10.3403") and has_doi_prefix(b_doi, "10.3403"):
             if a_doi + "u" == b_doi or b_doi + "u" == a_doi:
                 return Verify(Status.STRONG, Reason.CUSTOM_BSI_UNDATED)
-            if a_title == b_title and ((dict_key_exists(a, "extra.subtitle")
-                                        and not dict_key_exists(b, "extra.subtitle")) or
-                                       (dict_key_exists(b, "extra.subtitle")
-                                        and not dict_key_exists(a, "extra.subtitle"))):
+            if a_title == b_title and ((dict_has_key(a, "extra.subtitle")
+                                        and not dict_has_key(b, "extra.subtitle")) or
+                                       (dict_has_key(b, "extra.subtitle")
+                                        and not dict_has_key(a, "extra.subtitle"))):
                 return Verify(Status.STRONG, Reason.CUSTOM_BSI_SUBDOC)
     except PathAccessError:
         pass
@@ -301,7 +301,7 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]:
     # beware: we have versions and "isPartOf", e.g.
     # https://api.fatcat.wiki/v0/release/ybxygpeypbaq5pfrztu3z2itw4
     # Datacite md schema: https://doi.org/10.14454/7xq3-zf69
-    if dict_key_exists(a, "extra.datacite") and dict_key_exists(b, "extra.datacite"):
+    if dict_has_key(a, "extra.datacite") and dict_has_key(b, "extra.datacite"):
         whitelist = set([
             "HasPart",
             "HasVersion",
@@ -511,8 +511,8 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]:
     # if any([a_authors, b_authors]) and not (a_authors and b_authors):
     # Does not cover case, where both authors are empty.
     if a_release_year == b_release_year and a_title_lower == b_title_lower:
-        if ((dict_key_exists(a, "ext_ids.pmid") and dict_key_exists(b, "ext_ids.doi"))
-                or (dict_key_exists(b, "ext_ids.pmid") and dict_key_exists(a, "ext_ids.doi"))):
+        if ((dict_has_key(a, "ext_ids.pmid") and dict_has_key(b, "ext_ids.doi"))
+                or (dict_has_key(b, "ext_ids.pmid") and dict_has_key(a, "ext_ids.doi"))):
             return Verify(Status.STRONG, Reason.PMID_DOI_PAIR)
 
     # Two JSTOR items will probably be different.
-- 
cgit v1.2.3