matching: cleanup and documentation

author: Martin Czygan <martin.czygan@gmail.com> 2021-12-07 20:09:09 +0100
committer: Martin Czygan <martin.czygan@gmail.com> 2021-12-07 20:09:09 +0100
commit: ac6af6be9a4dae7349472b0d6a4b96958641c1ae (patch)
tree: d86194626b4051ef6d16105a457756ac7a9a0396
parent: 73f54c0ab791f2850850256cfa6065028553940f (diff)
download: fuzzycat-ac6af6be9a4dae7349472b0d6a4b96958641c1ae.tar.gz
fuzzycat-ac6af6be9a4dae7349472b0d6a4b96958641c1ae.zip
1 files changed, 29 insertions, 47 deletions
diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py
index cf872a6..12a524d 100644
--- a/fuzzycat/matching.py
+++ b/fuzzycat/matching.py
@@ -32,7 +32,30 @@ class FuzzyReleaseMatcher:
     completeness of the input document, e.g. if the input has contrib and
     title, then use both, if it only has a title, then use just that, etc.
 
-    This class is currently tested against the live fatcat search instance.
+    Note that while we only run a single elasticsearch query, we currently
+    still need to go to the fatcat API to get the full release document for
+    each matched release identifier; as a result, a call to `match` may take a
+    second or more. Even with a parallel thread pool, this takes some time.
+
+    A possible workaround would be to build "good enough" release entities from
+    the information held in elasticsearch only.
+
+    For really high performance matching of millions of entities (e.g.
+    20M+ researchgate titles, catalog self-match, citation document matching,
+    ...) this approach is, and will remain, too slow.
+
+    Anecdata: An early 2020 test run matching 23M "title strings" took
+    literally a couple of weeks to complete.
+
+    This class is currently tested against the live fatcat search instance. A
+    usage example:
+
+    >>> from fuzzycat.matching import FuzzyReleaseMatcher
+    >>> from fatcat_openapi_client import ReleaseEntity
+    >>> from fuzzycat.entities import entity_from_dict
+    >>> matcher = FuzzyReleaseMatcher()
+    >>> re = entity_from_dict({"ext_ids": {}, "title": "Internet Archive Scholar Citation"}, ReleaseEntity)
+    >>> list_of_candidates = matcher.match(re)
     """
     def __init__(self,
                  es="https://search.fatcat.wiki",
@@ -354,7 +377,8 @@ def test_release_tokens():
 
 def fetch_release(ident, api=None):
     """
-    Return release entity of None.
+    Returns the release entity for a given `ident`, or `None` if `ident` does
+    not exist or the API failed.
     """
     if api is None:
         api = public_api(FATCAT_API_URL)
@@ -373,16 +397,17 @@ def retrieve_entity_list(
     ids: List[str],
     api: DefaultApi = None,
     entity_type: Union[Type[ReleaseEntity], Type[ContainerEntity]] = ReleaseEntity,
+    pool_size: int = 10,
 ) -> List[Union[Type[ReleaseEntity], Type[ContainerEntity]]]:
     """
-    Parallel requests.
+    Retrieve a list of entities from the fatcat API in parallel.
     """
     if api is None:
         api = public_api(FATCAT_API_URL)
 
     result = []
     if entity_type == ReleaseEntity:
-        with Pool(10) as p:
+        with Pool(pool_size) as p:
             result = p.map(fetch_release, ids)
         return [v for v in result if v is not None]
     else:
@@ -391,49 +416,6 @@ def retrieve_entity_list(
     return result
 
 
-def retrieve_entity_list_sequential(
-    ids: List[str],
-    api: DefaultApi = None,
-    entity_type: Union[Type[ReleaseEntity], Type[ContainerEntity]] = ReleaseEntity,
-) -> List[Union[Type[ReleaseEntity], Type[ContainerEntity]]]:
-    """
-    Retrieve a list of entities. Some entities might be missing. Return all
-    that are accessible.
-
-    TODO: parallelize API access.
-    """
-    if api is None:
-        api = public_api(FATCAT_API_URL)
-    result = []
-    if entity_type == ReleaseEntity:
-        for id in ids:
-            try:
-                re = api.get_release(id, hide="refs,abstracts", expand="container,contribs,files")
-                result.append(re)
-            except ApiException as exc:
-                if exc.status == 404:
-                    print("[err] failed to retrieve release entity: {}".format(id), file=sys.stderr)
-                else:
-                    print("[err] api failed with {}: {}".format(exc.status, exc.message),
-                          file=sys.stderr)
-    elif entity_type == ContainerEntity:
-        for id in ids:
-            try:
-                re = api.get_container(id)
-                result.append(re)
-            except ApiException as exc:
-                if exc.status == 404:
-                    print("[err] failed to retrieve container entity: {}".format(id),
-                          file=sys.stderr)
-                else:
-                    print("[err] api failed with {}: {}".format(exc.status, exc.message),
-                          file=sys.stderr)
-    else:
-        raise ValueError("[err] cannot retrieve ids {} of type {}".format(ids, entity_type))
-
-    return result
-
-
 def response_to_entity_list(response, size=5, entity_type=ReleaseEntity, api: DefaultApi = None):
     """
     Convert an elasticsearch result to a list of entities. Accepts both a
author	Martin Czygan <martin.czygan@gmail.com>	2021-12-07 20:09:09 +0100
committer	Martin Czygan <martin.czygan@gmail.com>	2021-12-07 20:09:09 +0100
commit	ac6af6be9a4dae7349472b0d6a4b96958641c1ae (patch)
tree	d86194626b4051ef6d16105a457756ac7a9a0396
parent	73f54c0ab791f2850850256cfa6065028553940f (diff)
download	fuzzycat-ac6af6be9a4dae7349472b0d6a4b96958641c1ae.tar.gz fuzzycat-ac6af6be9a4dae7349472b0d6a4b96958641c1ae.zip