From 135675d544dab8629900bd9c8816b9a094118ad8 Mon Sep 17 00:00:00 2001
From: Martin Czygan
Date: Tue, 15 Dec 2020 04:24:38 +0100
Subject: matching stub

---
Review revision of commit 135675d (the index blob ids below are still those
of the original commit; indentation was reconstructed):

 * anything_to_entity: probe with os.path.isfile() so that a directory is not
   opened; read files as UTF-8
 * anything_to_entity: the ISSN pattern must cover the whole string and
   accepts "x" or "X" as check character (was "[0-9xx]", prefix match only)
 * anything_to_entity: HTTP requests get a timeout (new optional parameter);
   unexpected HTTP errors are raised instead of being ignored or parsed
 * anything_to_entity: accept hits.total as a number or as an object
 * tests: the https endpoint uses port 443, not 80

 fuzzycat/entities.py   |  55 ++++++++++++++++++++++++++
 fuzzycat/matching.py   | 101 ++++++++++++++++++++++++++++++++++++++++++++----
 tests/test_matching.py |  19 +++++++++
 3 files changed, 169 insertions(+), 6 deletions(-)
 create mode 100644 fuzzycat/entities.py
 create mode 100644 tests/test_matching.py

diff --git a/fuzzycat/entities.py b/fuzzycat/entities.py
new file mode 100644
index 0000000..0b9ebc3
--- /dev/null
+++ b/fuzzycat/entities.py
@@ -0,0 +1,55 @@
+import collections
+import json
+
+import toml
+from fatcat_openapi_client import ApiClient
+
+
+def entity_to_dict(entity, api_client=None) -> dict:
+    """
+    Hack to take advantage of the code-generated serialization code.
+
+    Initializing/destroying ApiClient objects is surprisingly expensive
+    (because it involves a threadpool), so we allow passing an existing
+    instance. If you already have a full-on API connection `api`, you can
+    access the ApiClient object as `api.api_client`. This is such a speed-up
+    that this argument may become mandatory.
+    """
+    if not api_client:
+        api_client = ApiClient()
+    return api_client.sanitize_for_serialization(entity)
+
+
+def entity_from_json(json_str: str, entity_type, api_client=None):
+    """
+    Hack to take advantage of the code-generated deserialization code
+
+    See note on `entity_to_dict()` about api_client argument.
+    """
+    if not api_client:
+        api_client = ApiClient()
+    thing = collections.namedtuple('Thing', ['data'])
+    thing.data = json_str
+    return api_client.deserialize(thing, entity_type)
+
+
+def entity_from_dict(obj: dict, entity_type, api_client=None):
+    json_str = json.dumps(obj)
+    return entity_from_json(json_str, entity_type, api_client=api_client)
+
+
+def entity_to_toml(entity, api_client=None, pop_fields=None) -> str:
+    """
+    pop_fields parameter can be used to strip out some fields from the resulting
+    TOML. Eg, for fields which should not be edited, like the ident.
+    """
+    obj = entity_to_dict(entity, api_client=api_client)
+    pop_fields = pop_fields or []
+    for k in pop_fields:
+        obj.pop(k, None)
+    return toml.dumps(obj)
+
+
+def entity_from_toml(toml_str: str, entity_type, api_client=None):
+    obj = toml.loads(toml_str)
+    return entity_from_dict(obj, entity_type, api_client=api_client)
diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py
index 518198a..b248024 100644
--- a/fuzzycat/matching.py
+++ b/fuzzycat/matching.py
@@ -1,6 +1,15 @@
+import os
+import re
+from typing import List, Union, Type
+
 import elasticsearch
+import elasticsearch_dsl
+import requests
 from fatcat_openapi_client import ContainerEntity, ReleaseEntity
 
+from fuzzycat.entities import entity_from_dict, entity_from_json
+
+
 def match_release_fuzzy(release: ReleaseEntity, size=5, es=None) -> List[ReleaseEntity]:
     """
     Given a release entity, return a number similar release entities from
@@ -34,18 +43,24 @@ def match_release_fuzzy(release: ReleaseEntity, size=5, es=None) -> List[Release
         value = getattr(ext_ids, attr)
         if not value:
             continue
-        s = (
-            elasticsearch_dsl.Search(using=es, index="fatcat_release")
-            .query("term", **{es_field: value})
-            .extra(size=size)
-        )
+        s = (elasticsearch_dsl.Search(using=es,
+                                      index="fatcat_release").query("term", **{
+                                          es_field: value
+                                      }).extra(size=size))
         print(s)
         resp = s.execute()
         if len(resp) > 0:
             return response_to_entity_list(resp, entity_type=ReleaseEntity)
 
     body = {
-        "query": {"match": {"title": {"query": release.title, "operator": "AND"}}},
+        "query": {
+            "match": {
+                "title": {
+                    "query": release.title,
+                    "operator": "AND"
+                }
+            }
+        },
         "size": size,
     }
     resp = es.search(body=body, index="fatcat_release")
@@ -89,3 +104,77 @@ def response_to_entity_list(response, size=5, entity_type=ReleaseEntity):
         return retrieve_entity_list(ids, entity_type=entity_type)
     else:
         raise ValueError("cannot convert {}".format(response))
+
+
+def anything_to_entity(
+    s: str,
+    entity_type: Union[Type[ContainerEntity], Type[ReleaseEntity]],
+    api_url: str = "https://api.fatcat.wiki/v0",
+    es_url: str = "https://search.fatcat.wiki",
+    timeout: float = 30.0,
+) -> Union[ContainerEntity, ReleaseEntity]:
+    """
+    Convert a string to a given entity type. This function may go out to the
+    fatcat API or elasticsearch and hence is expensive.
+
+    The string is interpreted by the first rule that applies:
+
+    1. path of an existing file: the file content is parsed as entity JSON
+    2. ends with a 26 character identifier: fetched from the fatcat API
+    3. is an ISSN: resolved via the search index, if there is exactly one hit
+    4. otherwise: used as container name or release title of a stub entity
+
+    Raises ValueError for an unsupported entity type, a missing string or an
+    identifier unknown to the API; other HTTP error responses are raised as
+    requests.HTTPError. The timeout (in seconds) applies to each request.
+    """
+    names = {
+        ContainerEntity: "container",
+        ReleaseEntity: "release",
+    }
+    if entity_type not in names:
+        raise ValueError("cannot convert {} - only: {}".format(entity_type, names.keys()))
+    entity_name = names[entity_type]
+
+    if s is None:
+        raise ValueError("no entity found")
+
+    # A directory must not be opened; it falls through to the rules below.
+    if os.path.isfile(s):
+        with open(s, encoding="utf-8") as f:
+            return entity_from_json(f.read(), entity_type)
+
+    match = re.search(r"/?([a-z0-9]{26})$", s)
+    if match:
+        url = "{}/{}/{}".format(api_url, entity_name, match.group(1))
+        resp = requests.get(url, timeout=timeout)
+        if resp.status_code == 200:
+            return entity_from_json(resp.text, entity_type)
+        if resp.status_code == 404:
+            raise ValueError("entity not found: {}".format(url))
+        # Do not turn a server error into a stub entity named like the ident.
+        resp.raise_for_status()
+
+    # The whole string must be an ISSN; the check character is "x" or "X".
+    if re.fullmatch(r"[0-9]{4}-?[0-9]{3}[0-9xX]", s):
+        url = "{}/fatcat_{}/_search?q=issns:{}".format(es_url, entity_name, s)
+        resp = requests.get(url, timeout=timeout)
+        resp.raise_for_status()
+        doc = resp.json()
+        total = doc["hits"]["total"]
+        # Elasticsearch 7+ reports the total as an object, not a plain number.
+        if isinstance(total, dict):
+            total = total.get("value")
+        if total == 1:
+            ident = doc["hits"]["hits"][0]["_source"]["ident"]
+            url = "{}/{}/{}".format(api_url, entity_name, ident)
+            resp = requests.get(url, timeout=timeout)
+            resp.raise_for_status()
+            return entity_from_json(resp.text, entity_type)
+
+    if entity_name == "container":
+        return entity_from_dict({"name": s}, entity_type)
+    elif entity_name == "release":
+        return entity_from_dict({"title": s, "ext_ids": {}}, entity_type)
+    else:
+        raise ValueError("unhandled entity type: {}".format(entity_type))
diff --git a/tests/test_matching.py b/tests/test_matching.py
new file mode 100644
index 0000000..927b383
--- /dev/null
+++ b/tests/test_matching.py
@@ -0,0 +1,19 @@
+from fuzzycat.matching import anything_to_entity, match_release_fuzzy
+from fatcat_openapi_client import ReleaseEntity
+import pytest
+import elasticsearch
+
+@pytest.fixture
+def es_client():
+    return elasticsearch.Elasticsearch(["https://search.fatcat.wiki:443"])
+
+@pytest.mark.skip
+def test_match_release_fuzzy(es_client):
+    cases = (
+        ("wtv64ahbdzgwnan7rllwr3nurm", 2),
+    )
+    for case, count in cases:
+        entity = anything_to_entity(case, ReleaseEntity)
+
+        result = match_release_fuzzy(entity, es=es_client)
+        assert len(result) == count
-- 
cgit v1.2.3