aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2020-12-15 04:24:38 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2020-12-15 04:24:38 +0100
commit 135675d544dab8629900bd9c8816b9a094118ad8 (patch)
tree ed3e660816ecd43847a12467332c3bc48236554d
parent 103f75a6e6af836f7a4afa5746847ef7ce63591d (diff)
download fuzzycat-135675d544dab8629900bd9c8816b9a094118ad8.tar.gz
fuzzycat-135675d544dab8629900bd9c8816b9a094118ad8.zip
matching stub
-rw-r--r-- fuzzycat/entities.py 55
-rw-r--r-- fuzzycat/matching.py 77
-rw-r--r-- tests/test_matching.py 19
3 files changed, 145 insertions, 6 deletions
diff --git a/fuzzycat/entities.py b/fuzzycat/entities.py
new file mode 100644
index 0000000..0b9ebc3
--- /dev/null
+++ b/fuzzycat/entities.py
@@ -0,0 +1,55 @@
+import collections
+import json
+
+import toml
+from fatcat_openapi_client import ApiClient
+
+
+def entity_to_dict(entity, api_client=None) -> dict:
+    """
+    Serialize an entity into a plain dict via the code-generated client.
+
+    This is a hack that reuses `sanitize_for_serialization()` of the
+    generated ApiClient instead of hand-written conversion code.
+
+    Creating and tearing down ApiClient objects is surprisingly expensive
+    (because it involves a threadpool), so an existing instance may be
+    passed in. With a full-on API connection `api`, that instance is
+    available as `api.api_client`. The speed-up is large enough that this
+    argument may become mandatory.
+    """
+    client = api_client or ApiClient()
+    return client.sanitize_for_serialization(entity)
+
+
+def entity_from_json(json_str: str, entity_type, api_client=None):
+    """
+    Deserialize a JSON string into an entity of the given type.
+
+    Hack to take advantage of the code-generated deserialization code:
+    `api_client.deserialize()` is handed a minimal stand-in object whose
+    only field, `data`, holds the raw JSON string.
+
+    See note on `entity_to_dict()` about api_client argument.
+    """
+    if not api_client:
+        api_client = ApiClient()
+    # Create a real instance. Assigning `data` on the namedtuple class itself
+    # would overwrite the generated field descriptor and leave no object
+    # actually carrying the payload.
+    thing_cls = collections.namedtuple('Thing', ['data'])
+    thing = thing_cls(data=json_str)
+    return api_client.deserialize(thing, entity_type)
+
+
+def entity_from_dict(obj: dict, entity_type, api_client=None):
+    """
+    Build an entity of `entity_type` from a plain dict.
+
+    The dict is dumped to JSON and handed to `entity_from_json()`, together
+    with the optional `api_client`.
+    """
+    return entity_from_json(json.dumps(obj), entity_type, api_client=api_client)
+
+
+def entity_to_toml(entity, api_client=None, pop_fields=None) -> str:
+    """
+    Render an entity as a TOML string.
+
+    Keys listed in `pop_fields` are removed from the dict form of the entity
+    before dumping, e.g. fields which should not be edited, like the ident.
+    Keys that are not present are ignored.
+    """
+    doc = entity_to_dict(entity, api_client=api_client)
+    for field in pop_fields or []:
+        doc.pop(field, None)
+    return toml.dumps(doc)
+
+
+def entity_from_toml(toml_str: str, entity_type, api_client=None):
+    """
+    Parse a TOML string and convert the result into an entity of
+    `entity_type` by way of `entity_from_dict()`.
+    """
+    return entity_from_dict(toml.loads(toml_str), entity_type, api_client=api_client)
diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py
index 518198a..b248024 100644
--- a/fuzzycat/matching.py
+++ b/fuzzycat/matching.py
@@ -1,6 +1,15 @@
+import os
+import re
+from typing import List, Union, Type
+
import elasticsearch
+import elasticsearch_dsl
+import requests
from fatcat_openapi_client import ContainerEntity, ReleaseEntity
+from fuzzycat.entities import entity_from_dict, entity_from_json
+
+
def match_release_fuzzy(release: ReleaseEntity, size=5, es=None) -> List[ReleaseEntity]:
"""
Given a release entity, return a number similar release entities from
@@ -34,18 +43,24 @@ def match_release_fuzzy(release: ReleaseEntity, size=5, es=None) -> List[Release
value = getattr(ext_ids, attr)
if not value:
continue
- s = (
- elasticsearch_dsl.Search(using=es, index="fatcat_release")
- .query("term", **{es_field: value})
- .extra(size=size)
- )
+ s = (elasticsearch_dsl.Search(using=es,
+ index="fatcat_release").query("term", **{
+ es_field: value
+ }).extra(size=size))
print(s)
resp = s.execute()
if len(resp) > 0:
return response_to_entity_list(resp, entity_type=ReleaseEntity)
body = {
- "query": {"match": {"title": {"query": release.title, "operator": "AND"}}},
+ "query": {
+ "match": {
+ "title": {
+ "query": release.title,
+ "operator": "AND"
+ }
+ }
+ },
"size": size,
}
resp = es.search(body=body, index="fatcat_release")
@@ -89,3 +104,53 @@ def response_to_entity_list(response, size=5, entity_type=ReleaseEntity):
return retrieve_entity_list(ids, entity_type=entity_type)
else:
raise ValueError("cannot convert {}".format(response))
+
+
+def anything_to_entity(
+    s: str,
+    entity_type: Union[Type[ContainerEntity], Type[ReleaseEntity]],
+    api_url: str = "https://api.fatcat.wiki/v0",
+    es_url: str = "https://search.fatcat.wiki",
+) -> Union[ContainerEntity, ReleaseEntity]:
+    """
+    Convert a string to a given entity type. This function may go out to the
+    fatcat API or elasticsearch and hence is expensive.
+
+    The string is tried, in this order, as:
+
+    1. a path to an existing (non-directory) file, whose content is parsed
+       as entity JSON,
+    2. a value ending in a 26 character lowercase alphanumeric identifier,
+       optionally preceded by a slash, which is fetched from `api_url`,
+    3. an ISSN, which is searched for at `es_url`; the hit is used only if
+       the reported total is exactly one,
+    4. a plain container name or release title, yielding a stub entity.
+
+    Raises ValueError if `entity_type` is not supported, if `s` is None, or
+    if the API answers 404 for an identifier.
+    """
+    names = {
+        ContainerEntity: "container",
+        ReleaseEntity: "release",
+    }
+    if entity_type not in names:
+        raise ValueError("cannot convert {} - only: {}".format(entity_type, names.keys()))
+    entity_name = names[entity_type]
+
+    if s is None:
+        raise ValueError("no entity found")
+
+    # A directory that happens to be named like the input cannot be read as
+    # JSON; skip it so the remaining strategies still get a chance.
+    if os.path.exists(s) and not os.path.isdir(s):
+        with open(s) as f:
+            return entity_from_json(f.read(), entity_type)
+
+    match = re.search(r"/?([a-z0-9]{26})$", s)
+    if match:
+        url = "{}/{}/{}".format(api_url, entity_name, match.group(1))
+        resp = requests.get(url)
+        if resp.status_code == 200:
+            return entity_from_json(resp.text, entity_type)
+        if resp.status_code == 404:
+            raise ValueError("entity not found: {}".format(url))
+        # Any other status code falls through to the strategies below.
+
+    # The whole string must be an ISSN (the check character may be "x" or
+    # "X"); a prefix-only match would push arbitrary trailing text into the
+    # query URL.
+    if re.fullmatch(r"[0-9]{4}-?[0-9]{3}[0-9xX]", s):
+        url = "{}/fatcat_{}/_search?q=issns:{}".format(es_url, entity_name, s)
+        doc = requests.get(url).json()
+        if doc["hits"]["total"] == 1:
+            ident = doc["hits"]["hits"][0]["_source"]["ident"]
+            url = "{}/{}/{}".format(api_url, entity_name, ident)
+            return entity_from_json(requests.get(url).text, entity_type)
+
+    # Last resort: treat the string as a name or title of a stub entity.
+    if entity_name == "container":
+        return entity_from_dict({"name": s}, entity_type)
+    elif entity_name == "release":
+        return entity_from_dict({"title": s, "ext_ids": {}}, entity_type)
+    else:
+        raise ValueError("unhandled entity type: {}".format(entity_type))
diff --git a/tests/test_matching.py b/tests/test_matching.py
new file mode 100644
index 0000000..927b383
--- /dev/null
+++ b/tests/test_matching.py
@@ -0,0 +1,19 @@
+from fuzzycat.matching import anything_to_entity, match_release_fuzzy
+from fatcat_openapi_client import ReleaseEntity
+import pytest
+import elasticsearch
+
+@pytest.fixture
+def es_client():
+    """
+    Elasticsearch client for the host search.fatcat.wiki.
+    """
+    # NOTE(review): https scheme combined with port 80 looks unusual - confirm.
+    hosts = ["https://search.fatcat.wiki:80"]
+    return elasticsearch.Elasticsearch(hosts)
+
+@pytest.mark.skip
+def test_match_release_fuzzy(es_client):
+    """
+    For each (input, expected count) pair, turn the input into a release
+    entity and compare the number of fuzzy matches against the expected
+    count. The test is currently marked as skipped.
+    """
+    expectations = (("wtv64ahbdzgwnan7rllwr3nurm", 2), )
+    for ident, expected in expectations:
+        release = anything_to_entity(ident, ReleaseEntity)
+        matches = match_release_fuzzy(release, es=es_client)
+        assert len(matches) == expected