about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2021-12-07 20:09:09 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2021-12-07 20:09:09 +0100
commit ac6af6be9a4dae7349472b0d6a4b96958641c1ae (patch)
tree d86194626b4051ef6d16105a457756ac7a9a0396
parent 73f54c0ab791f2850850256cfa6065028553940f (diff)
download fuzzycat-ac6af6be9a4dae7349472b0d6a4b96958641c1ae.tar.gz
fuzzycat-ac6af6be9a4dae7349472b0d6a4b96958641c1ae.zip
matching: cleanup and documentation
-rw-r--r-- fuzzycat/matching.py 76
1 files changed, 29 insertions, 47 deletions
diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py
index cf872a6..12a524d 100644
--- a/fuzzycat/matching.py
+++ b/fuzzycat/matching.py
@@ -32,7 +32,30 @@ class FuzzyReleaseMatcher:
completeness of the input document, e.g. if the input has contrib and
title, then use both, if it only has a title, then use just that, etc.
- This class is currently tested against the live fatcat search instance.
+ Note that while we're only doing a single elasticsearch query, we still
+ need to go to the fatcat API currently for getting the full release
+ document for matched release identifiers; e.g. a call to `match` may take a
+ second or more. Even with a parallel thread pool, this takes some time.
+
+ A possible workaround would be to build "good enough" release entities from
+ the information held in elasticsearch only.
+
+ For really high performance matching of millions of entities (e.g. like
+ 20M+ researchgate titles, catalog self-match, citation document matching,
+ ...) this is and will be too slow.
+
+ Anecdata: An early 2020 test run matching 23M "title strings" took
+ literally a couple of weeks to complete.
+
+ This class is currently tested against the live fatcat search instance. A
+ usage example:
+
+ >>> from fuzzycat.matching import FuzzyReleaseMatcher
+ >>> from fatcat_openapi_client import ReleaseEntity
+ >>> from fuzzycat.entities import entity_from_dict
+ >>> matcher = FuzzyReleaseMatcher()
+ >>> re = entity_from_dict({"ext_ids": {}, "title": "Internet Archive Scholar Citation"}, ReleaseEntity)
+ >>> list_of_candidates = matcher.match(re)
"""
def __init__(self,
es="https://search.fatcat.wiki",
@@ -354,7 +377,8 @@ def test_release_tokens():
def fetch_release(ident, api=None):
"""
- Return release entity of None.
+ Returns the release entity for a given `ident` or `None`, if ident does not
+ exist or the API failed.
"""
if api is None:
api = public_api(FATCAT_API_URL)
@@ -373,16 +397,17 @@ def retrieve_entity_list(
ids: List[str],
api: DefaultApi = None,
entity_type: Union[Type[ReleaseEntity], Type[ContainerEntity]] = ReleaseEntity,
+ pool_size: int = 10,
) -> List[Union[Type[ReleaseEntity], Type[ContainerEntity]]]:
"""
- Parallel requests.
+ Retrieve a list of entities from the fatcat API in parallel.
"""
if api is None:
api = public_api(FATCAT_API_URL)
result = []
if entity_type == ReleaseEntity:
- with Pool(10) as p:
+ with Pool(pool_size) as p:
result = p.map(fetch_release, ids)
return [v for v in result if v is not None]
else:
@@ -391,49 +416,6 @@ def retrieve_entity_list(
return result
-def retrieve_entity_list_sequential(
- ids: List[str],
- api: DefaultApi = None,
- entity_type: Union[Type[ReleaseEntity], Type[ContainerEntity]] = ReleaseEntity,
-) -> List[Union[Type[ReleaseEntity], Type[ContainerEntity]]]:
- """
- Retrieve a list of entities. Some entities might be missing. Return all
- that are accessible.
-
- TODO: parallelize API access.
- """
- if api is None:
- api = public_api(FATCAT_API_URL)
- result = []
- if entity_type == ReleaseEntity:
- for id in ids:
- try:
- re = api.get_release(id, hide="refs,abstracts", expand="container,contribs,files")
- result.append(re)
- except ApiException as exc:
- if exc.status == 404:
- print("[err] failed to retrieve release entity: {}".format(id), file=sys.stderr)
- else:
- print("[err] api failed with {}: {}".format(exc.status, exc.message),
- file=sys.stderr)
- elif entity_type == ContainerEntity:
- for id in ids:
- try:
- re = api.get_container(id)
- result.append(re)
- except ApiException as exc:
- if exc.status == 404:
- print("[err] failed to retrieve container entity: {}".format(id),
- file=sys.stderr)
- else:
- print("[err] api failed with {}: {}".format(exc.status, exc.message),
- file=sys.stderr)
- else:
- raise ValueError("[err] cannot retrieve ids {} of type {}".format(ids, entity_type))
-
- return result
-
-
def response_to_entity_list(response, size=5, entity_type=ReleaseEntity, api: DefaultApi = None):
"""
Convert an elasticsearch result to a list of entities. Accepts both a