about summary refs log tree commit diff stats
path: root/fuzzycat/matching.py
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2020-12-15 04:09:26 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2020-12-15 04:09:26 +0100
commit 103f75a6e6af836f7a4afa5746847ef7ce63591d (patch)
tree fe66f4ec4e4f2b6326a3b59acdc330e6c1cddac9 /fuzzycat/matching.py
parent f4437c0274951aa2e9b5f54d960bae638dfcfe8b (diff)
download fuzzycat-103f75a6e6af836f7a4afa5746847ef7ce63591d.tar.gz
fuzzycat-103f75a6e6af836f7a4afa5746847ef7ce63591d.zip
include matching (stub)
Diffstat (limited to 'fuzzycat/matching.py')
-rw-r--r--fuzzycat/matching.py91
1 files changed, 91 insertions, 0 deletions
diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py
new file mode 100644
index 0000000..518198a
--- /dev/null
+++ b/fuzzycat/matching.py
@@ -0,0 +1,91 @@
+from typing import List
+
+import elasticsearch
+from fatcat_openapi_client import ContainerEntity, ReleaseEntity
+
+def match_release_fuzzy(release: ReleaseEntity, size=5, es=None) -> List[ReleaseEntity]:
+    """
+    Given a release entity, return a number of similar release entities from
+    fatcat using Elasticsearch.
+
+    Lookups are tried in order and the first one that yields hits wins:
+
+    1. an exact term query for each external identifier set on the release,
+    2. a title match requiring all terms ("operator": "AND"),
+    3. the same title match with "fuzziness": "AUTO".
+
+    Args:
+        release: the release to find candidates for; must be a ReleaseEntity.
+        size: number of hits to request per query; None or 0 is replaced by
+            a large number (10000).
+        es: an Elasticsearch client, a host string to build a client from, or
+            None to build a client with default settings.
+
+    Returns:
+        The result of response_to_entity_list for the first non-empty
+        response, or an empty list if nothing matched.
+    """
+    assert isinstance(release, ReleaseEntity)
+
+    if size is None or size == 0:
+        size = 10000  # or any large number
+
+    if isinstance(es, str):
+        es = elasticsearch.Elasticsearch([es])
+    if es is None:
+        es = elasticsearch.Elasticsearch()
+
+    # Try to match by external identifier. Maps the attribute name on
+    # release.ext_ids to the field name queried in the release index.
+    attrs = {
+        "doi": "doi",
+        "wikidata_qid": "wikidata_qid",
+        "isbn13": "isbn13",
+        "pmid": "pmid",
+        "pmcid": "pmcid",
+        "core": "core_id",
+        "arxiv": "arxiv_id",
+        "jstor": "jstor_id",
+        "ark": "ark_id",
+        "mag": "mag_id",
+    }
+    ext_ids = release.ext_ids
+    if ext_ids is not None:
+        for attr, es_field in attrs.items():
+            value = getattr(ext_ids, attr, None)
+            if not value:
+                continue
+            # NOTE(review): elasticsearch_dsl has to be imported at module
+            # level for this to work.
+            s = (
+                elasticsearch_dsl.Search(using=es, index="fatcat_release")
+                .query("term", **{es_field: value})
+                .extra(size=size)
+            )
+            resp = s.execute()
+            if len(resp) > 0:
+                return response_to_entity_list(resp, size=size, entity_type=ReleaseEntity)
+
+    # Without a title there is nothing left to search by.
+    if not release.title:
+        return []
+
+    # First an exact title match, then get fuzzy.
+    # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
+    for fuzziness in (None, "AUTO"):
+        title_match = {"query": release.title, "operator": "AND"}
+        if fuzziness is not None:
+            title_match["fuzziness"] = fuzziness
+        body = {
+            "query": {"match": {"title": title_match}},
+            "size": size,
+        }
+        resp = es.search(body=body, index="fatcat_release")
+        # Test the returned hits rather than hits.total: the latter is an
+        # integer in older Elasticsearch versions but an object in newer ones.
+        if resp["hits"]["hits"]:
+            return response_to_entity_list(resp, size=size, entity_type=ReleaseEntity)
+
+    # TODO: perform more queries on other fields.
+    return []
+
+
+def response_to_entity_list(response, size=5, entity_type=ReleaseEntity):
+    """
+    Convert an elasticsearch result to a list of entities. Accepts both a
+    dictionary and an elasticsearch_dsl.response.Response.
+
+    We take the ids ("ident") from the elasticsearch hits and hand them to
+    retrieve_entity_list together with entity_type.
+
+    Args:
+        response: a raw search response dictionary or an
+            elasticsearch_dsl.response.Response.
+        size: maximum number of ids to use, applied to both kinds of
+            response; None or 0 means no limit.
+        entity_type: passed through to retrieve_entity_list.
+
+    Raises:
+        ValueError: if response is neither of the supported types.
+    """
+    if isinstance(response, dict):
+        ids = [hit["_source"]["ident"] for hit in response["hits"]["hits"]]
+    elif isinstance(response, elasticsearch_dsl.response.Response):
+        ids = [hit.to_dict().get("ident") for hit in response]
+    else:
+        raise ValueError("cannot convert {}".format(response))
+    # `size or None` turns 0 into an open-ended slice, i.e. no truncation.
+    return retrieve_entity_list(ids[: size or None], entity_type=entity_type)