diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-12-15 04:24:38 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-12-15 04:24:38 +0100 |
commit | 135675d544dab8629900bd9c8816b9a094118ad8 (patch) | |
tree | ed3e660816ecd43847a12467332c3bc48236554d /fuzzycat/matching.py | |
parent | 103f75a6e6af836f7a4afa5746847ef7ce63591d (diff) | |
download | fuzzycat-135675d544dab8629900bd9c8816b9a094118ad8.tar.gz fuzzycat-135675d544dab8629900bd9c8816b9a094118ad8.zip |
matching stub
Diffstat (limited to 'fuzzycat/matching.py')
-rw-r--r-- | fuzzycat/matching.py | 77 |
1 files changed, 71 insertions, 6 deletions
diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py index 518198a..b248024 100644 --- a/fuzzycat/matching.py +++ b/fuzzycat/matching.py @@ -1,6 +1,15 @@ +import os +import re +from typing import List, Union, Type + import elasticsearch +import elasticsearch_dsl +import requests from fatcat_openapi_client import ContainerEntity, ReleaseEntity +from fuzzycat.entities import entity_from_dict, entity_from_json + + def match_release_fuzzy(release: ReleaseEntity, size=5, es=None) -> List[ReleaseEntity]: """ Given a release entity, return a number similar release entities from @@ -34,18 +43,24 @@ def match_release_fuzzy(release: ReleaseEntity, size=5, es=None) -> List[Release value = getattr(ext_ids, attr) if not value: continue - s = ( - elasticsearch_dsl.Search(using=es, index="fatcat_release") - .query("term", **{es_field: value}) - .extra(size=size) - ) + s = (elasticsearch_dsl.Search(using=es, + index="fatcat_release").query("term", **{ + es_field: value + }).extra(size=size)) print(s) resp = s.execute() if len(resp) > 0: return response_to_entity_list(resp, entity_type=ReleaseEntity) body = { - "query": {"match": {"title": {"query": release.title, "operator": "AND"}}}, + "query": { + "match": { + "title": { + "query": release.title, + "operator": "AND" + } + } + }, "size": size, } resp = es.search(body=body, index="fatcat_release") @@ -89,3 +104,53 @@ def response_to_entity_list(response, size=5, entity_type=ReleaseEntity): return retrieve_entity_list(ids, entity_type=entity_type) else: raise ValueError("cannot convert {}".format(response)) + + +def anything_to_entity( + s: str, + entity_type: Union[Type[ContainerEntity], Type[ReleaseEntity]], + api_url: str = "https://api.fatcat.wiki/v0", + es_url: str = "https://search.fatcat.wiki", +) -> Union[ContainerEntity, ReleaseEntity]: + """ + Convert a string to a given entity type. This function may go out to the + fatcat API or elasticsearch and hence is expensive. + """ + names = { + ContainerEntity: "container", + ReleaseEntity: "release", + } + if not entity_type in names: + raise ValueError("cannot convert {} - only: {}".format(entity_type, names.keys())) + entity_name = names[entity_type] + + if s is None: + raise ValueError("no entity found") + + if os.path.exists(s): + with open(s) as f: + return entity_from_json(f.read(), entity_type) + + match = re.search("/?([a-z0-9]{26})$", s) + if match: + url = "{}/{}/{}".format(api_url, entity_name, match.group(1)) + resp = requests.get(url) + if resp.status_code == 200: + return entity_from_json(resp.text, entity_type) + if resp.status_code == 404: + raise ValueError("entity not found: {}".format(url)) + + if re.match("[0-9]{4}(-)?[0-9]{3,3}[0-9xx]", s): + url = "{}/fatcat_{}/_search?q=issns:{}".format(es_url, entity_name, s) + doc = requests.get(url).json() + if doc["hits"]["total"] == 1: + ident = doc["hits"]["hits"][0]["_source"]["ident"] + url = "{}/{}/{}".format(api_url, entity_name, ident) + return entity_from_json(requests.get(url).text, entity_type) + + if entity_name == "container": + return entity_from_dict({"name": s}, entity_type) + elif entity_name == "release": + return entity_from_dict({"title": s, "ext_ids": {}}, entity_type) + else: + raise ValueError("unhandled entity type: {}".format(entity_type)) |