diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-12-15 04:09:26 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-12-15 04:09:26 +0100 |
commit | 103f75a6e6af836f7a4afa5746847ef7ce63591d (patch) | |
tree | fe66f4ec4e4f2b6326a3b59acdc330e6c1cddac9 /fuzzycat/matching.py | |
parent | f4437c0274951aa2e9b5f54d960bae638dfcfe8b (diff) | |
download | fuzzycat-103f75a6e6af836f7a4afa5746847ef7ce63591d.tar.gz fuzzycat-103f75a6e6af836f7a4afa5746847ef7ce63591d.zip |
include matching (stub)
Diffstat (limited to 'fuzzycat/matching.py')
-rw-r--r-- | fuzzycat/matching.py | 91 |
1 files changed, 91 insertions, 0 deletions
# fuzzycat/matching.py
"""
Fuzzy matching of fatcat release entities via Elasticsearch (stub).
"""
import logging
from typing import List

import elasticsearch
import elasticsearch_dsl
from fatcat_openapi_client import ContainerEntity, ReleaseEntity

logger = logging.getLogger(__name__)

# Maps attribute names on `release.ext_ids` to the field queried in the
# "fatcat_release" index.
_EXT_ID_FIELDS = {
    "doi": "doi",
    "wikidata_qid": "wikidata_qid",
    "isbn13": "isbn13",
    "pmid": "pmid",
    "pmcid": "pmcid",
    # `core_id` follows the `<name>_id` pattern of the fields below; the
    # original `code_id` looked like a typo -- verify against the index mapping.
    "core": "core_id",
    "arxiv": "arxiv_id",
    "jstor": "jstor_id",
    "ark": "ark_id",
    "mag": "mag_id",
}


def match_release_fuzzy(release: ReleaseEntity, size=5, es=None) -> List[ReleaseEntity]:
    """
    Given a release entity, return a number of similar release entities from
    fatcat using Elasticsearch.

    Queries are tried in order and the first one that yields hits wins:

    1. an exact term query for each external identifier set on the release,
    2. a title match requiring all terms (operator AND),
    3. the same title match with fuzziness "AUTO".

    Args:
        release: the release entity to find matches for.
        size: maximum number of hits requested per query; None or 0 is
            replaced by a large number (10000).
        es: an Elasticsearch client, a host string to build a client from,
            or None to use a default-configured client.

    Returns:
        A list of entities; empty if no query matched, or if no external
        identifier matched and the release has no title.
    """
    assert isinstance(release, ReleaseEntity)

    if size is None or size == 0:
        size = 10000  # or any large number

    if isinstance(es, str):
        es = elasticsearch.Elasticsearch([es])
    elif es is None:
        es = elasticsearch.Elasticsearch()

    # Try to match by external identifier. The getattr default covers both a
    # missing `ext_ids` object (None) and an attribute the client lacks.
    ext_ids = release.ext_ids
    for attr, es_field in _EXT_ID_FIELDS.items():
        value = getattr(ext_ids, attr, None)
        if not value:
            continue
        s = (
            elasticsearch_dsl.Search(using=es, index="fatcat_release")
            .query("term", **{es_field: value})
            .extra(size=size)
        )
        logger.debug("external id lookup: %s=%s", es_field, value)
        resp = s.execute()
        if len(resp) > 0:
            return response_to_entity_list(resp, size=size, entity_type=ReleaseEntity)

    title = release.title
    if not title:
        # Nothing left to match on; a match query needs a query string.
        return []

    # First an exact-ish title match, then get fuzzy.
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
    for extra_options in ({}, {"fuzziness": "AUTO"}):
        title_match = {"query": title, "operator": "AND"}
        title_match.update(extra_options)
        body = {
            "query": {"match": {"title": title_match}},
            "size": size,
        }
        resp = es.search(body=body, index="fatcat_release")
        # Test the hit list rather than `hits.total`: the latter is a plain
        # integer on older servers but an object on Elasticsearch 7+, where
        # comparing it with 0 raises a TypeError.
        if resp["hits"]["hits"]:
            # Forward `size`, otherwise the helper's default of 5 would cap
            # the result regardless of what the caller asked for.
            return response_to_entity_list(resp, size=size, entity_type=ReleaseEntity)

    # TODO: perform more queries on other fields.
    return []


def response_to_entity_list(response, size=5, entity_type=ReleaseEntity):
    """
    Convert an elasticsearch result to a list of entities. Accepts both a
    dictionary and an elasticsearch_dsl.response.Response.

    We take the ids from elasticsearch and retrieve entities via API.

    Args:
        response: a raw search result dictionary or an
            elasticsearch_dsl.response.Response.
        size: maximum number of hits taken from a dictionary response. A
            Response object is converted in full.
            NOTE(review): this asymmetry looks accidental -- confirm before
            unifying, as callers may rely on it.
        entity_type: entity class handed on to `retrieve_entity_list`.

    Raises:
        ValueError: if `response` is neither of the supported types.
    """
    if isinstance(response, dict):
        ids = [hit["_source"]["ident"] for hit in response["hits"]["hits"][:size]]
    elif isinstance(response, elasticsearch_dsl.response.Response):
        ids = [hit.to_dict().get("ident") for hit in response]
    else:
        raise ValueError("cannot convert {}".format(response))
    # NOTE(review): `retrieve_entity_list` is neither defined nor imported in
    # this module as shown; it has to be provided before this can run.
    return retrieve_entity_list(ids, entity_type=entity_type)