diff options
-rw-r--r-- | fuzzycat/matching.py | 76 |
1 files changed, 29 insertions, 47 deletions
diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py index cf872a6..12a524d 100644 --- a/fuzzycat/matching.py +++ b/fuzzycat/matching.py @@ -32,7 +32,30 @@ class FuzzyReleaseMatcher: completeness of the input document, e.g. if the input has contrib and title, then use both, if it only has a title, then use just that, etc. - This class is currently tested against the live fatcat search instance. + Note that while we're only doing a single elasticsearch query, we still + need to go to the fatcat API currently for getting the full release + document for matched release identifiers; e.g. a call to `match` may take a + second or more. Even with a parallel thread pool, this takes some time. + + A possible workaround would be to build "good enough" release entities from + the information held in elasticsearch only. + + For really high performance matching of millions of entities (e.g. like + 20M+ researchgate titles, catalog self-match, citation document matching, + ...) this is and will be too slow. + + Anecdata: An early 2020 test run matching 23M "title strings" took + literally a couple of weeks to complete. + + This class is currently tested against the live fatcat search instance. A + usage example: + + >>> from fuzzycat.matching import FuzzyReleaseMatcher + >>> from fatcat_openapi_client import ReleaseEntity + >>> from fuzzycat.entities import entity_from_dict + >>> matcher = FuzzyReleaseMatcher() + >>> re = entity_from_dict({"ext_ids": {}, "title": "Internet Archive Scholar Citation"}, ReleaseEntity) + >>> list_of_candidates = matcher.match(re) """ def __init__(self, es="https://search.fatcat.wiki", @@ -354,7 +377,8 @@ def test_release_tokens(): def fetch_release(ident, api=None): """ - Return release entity of None. + Returns the release entity for a given `ident` or `None`, if ident does not + exist or the API failed. """ if api is None: api = public_api(FATCAT_API_URL) @@ -373,16 +397,17 @@ def retrieve_entity_list( ids: List[str], api: DefaultApi = None, entity_type: Union[Type[ReleaseEntity], Type[ContainerEntity]] = ReleaseEntity, + pool_size: int = 10, ) -> List[Union[Type[ReleaseEntity], Type[ContainerEntity]]]: """ - Parallel requests. + Retrieve a list of entities from the fatcat API in parallel. """ if api is None: api = public_api(FATCAT_API_URL) result = [] if entity_type == ReleaseEntity: - with Pool(10) as p: + with Pool(pool_size) as p: result = p.map(fetch_release, ids) return [v for v in result if v is not None] else: @@ -391,49 +416,6 @@ def retrieve_entity_list( return result -def retrieve_entity_list_sequential( - ids: List[str], - api: DefaultApi = None, - entity_type: Union[Type[ReleaseEntity], Type[ContainerEntity]] = ReleaseEntity, -) -> List[Union[Type[ReleaseEntity], Type[ContainerEntity]]]: - """ - Retrieve a list of entities. Some entities might be missing. Return all - that are accessible. - - TODO: parallelize API access. - """ - if api is None: - api = public_api(FATCAT_API_URL) - result = [] - if entity_type == ReleaseEntity: - for id in ids: - try: - re = api.get_release(id, hide="refs,abstracts", expand="container,contribs,files") - result.append(re) - except ApiException as exc: - if exc.status == 404: - print("[err] failed to retrieve release entity: {}".format(id), file=sys.stderr) - else: - print("[err] api failed with {}: {}".format(exc.status, exc.message), - file=sys.stderr) - elif entity_type == ContainerEntity: - for id in ids: - try: - re = api.get_container(id) - result.append(re) - except ApiException as exc: - if exc.status == 404: - print("[err] failed to retrieve container entity: {}".format(id), - file=sys.stderr) - else: - print("[err] api failed with {}: {}".format(exc.status, exc.message), - file=sys.stderr) - else: - raise ValueError("[err] cannot retrieve ids {} of type {}".format(ids, entity_type)) - - return result - - def response_to_entity_list(response, size=5, entity_type=ReleaseEntity, api: DefaultApi = None): """ Convert an elasticsearch result to a list of entities. Accepts both a |