start larger refactoring: remove cluster

background: verifying hundreds of millions of documents turned out to be a bit slow; anecdata: running clustering and verification over 1.8B inputs took over 50h, whereas the Go port (skate) required about 2-4h for those operations. Also: with Go we do not need the extra GNU parallel wrapping. In any case, we aim for the fuzzycat refactoring to provide: * better, more configurable verification and small-scale matching * removal of batch clustering code (and improved refcat docs) * a place for slightly more generic, similarity-based utils. The most important piece in fuzzycat is a CSV file containing hand-picked test examples for verification - and the code that is able to fulfill that test suite. We want to make this part more robust.
author: Martin Czygan <martin.czygan@gmail.com> 2021-09-24 13:58:51 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2021-09-24 13:58:51 +0200
commit: 478d7d06ad9e56145cb94f3461c355b1ba9eb491 (patch)
tree: fa467290e8c8df41a1e97a6de751d0f7e790c9de /fuzzycat/utils.py
parent: 86cc3191ce03042ef4a0c6c8a44f4094a140b802 (diff)
download: fuzzycat-478d7d06ad9e56145cb94f3461c355b1ba9eb491.tar.gz
fuzzycat-478d7d06ad9e56145cb94f3461c355b1ba9eb491.zip
1 files changed, 10 insertions, 5 deletions
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index 303daf6..24e103a 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -26,12 +26,12 @@ def es_compat_hits_total(resp):
     https://www.elastic.co/guide/en/elasticsearch/reference/current/breaking-changes-7.0.html
 
     It is responsibility of the call site to set `track_total_hits` in ES7 to
-    get an exact number.
+    get an exact number (https://www.elastic.co/guide/en/elasticsearch/reference/master/search-your-data.html#track-total-hits).
     """
     try:
-        return resp["hits"]["total"]["value"]
+        return resp["hits"]["total"]["value"]  # ES7
     except TypeError:
-        return resp["hits"]["total"]
+        return resp["hits"]["total"]  # ES6
 
 
 def parse_page_string(s):
@@ -44,6 +44,8 @@ def parse_page_string(s):
 
     Does not handle lists of page numbers, roman numerals, and several other
     patterns.
+
+    Returns a named tuple with start, end and count fields.
     """
     if not s:
         raise ValueError('page parsing: empty string')
@@ -69,7 +71,7 @@ def parse_page_string(s):
     return ParsedPages(start=a, end=b, count=count)
 
 
-def dict_key_exists(doc, path):
+def dict_has_key(doc, path):
     """
     Return true, if key in a dictionary at a given path exists. XXX: probably
     already in glom.
@@ -101,7 +103,10 @@ def doi_prefix(v):
     """
     Return the prefix of a DOI.
     """
-    return v.split("/")[0]
+    parts = v.split("/")
+    if len(parts) == 1:
+        raise ValueError("invalid doi: {}".format(v))
+    return parts[0]
 
 
 def has_doi_prefix(v, prefix="10.1234"):
author	Martin Czygan <martin.czygan@gmail.com>	2021-09-24 13:58:51 +0200
committer	Martin Czygan <martin.czygan@gmail.com>	2021-09-24 13:58:51 +0200
commit	478d7d06ad9e56145cb94f3461c355b1ba9eb491 (patch)
tree	fa467290e8c8df41a1e97a6de751d0f7e790c9de /fuzzycat/utils.py
parent	86cc3191ce03042ef4a0c6c8a44f4094a140b802 (diff)
download	fuzzycat-478d7d06ad9e56145cb94f3461c355b1ba9eb491.tar.gz fuzzycat-478d7d06ad9e56145cb94f3461c355b1ba9eb491.zip