about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2021-09-21 15:55:52 +0200
committer Martin Czygan <martin.czygan@gmail.com> 2021-09-21 15:55:52 +0200
commit dccbaa5c1b0ba556449de6024540ba05d67ef6a0 (patch)
tree 60876376084fbeb1ec541079bc458d9f7858370d
parent 6a224c316869ba2651094ad47e1d92e102524f85 (diff)
download fuzzycat-dccbaa5c1b0ba556449de6024540ba05d67ef6a0.tar.gz
fuzzycat-dccbaa5c1b0ba556449de6024540ba05d67ef6a0.zip
matching: run an additional es query for fuzzy matching
-rw-r--r-- fuzzycat/matching.py 74
-rw-r--r-- tests/test_matching.py 22
2 files changed, 93 insertions, 3 deletions
diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py
index 33e130e..310dfc2 100644
--- a/fuzzycat/matching.py
+++ b/fuzzycat/matching.py
@@ -73,12 +73,83 @@ def match_release_fuzzy(
if r:
return [r]
+
+ if release.title is not None and release.contribs is not None:
+ names = " ".join([c.raw_name for c in release.contribs])
+ body = {
+ "track_total_hits": True,
+ "query": {
+ "bool": {
+ "must": [
+ {
+ "match": {
+ "title": {
+ "query": release.title,
+ "operator": "AND",
+ "fuzziness": "AUTO",
+ },
+ }
+ },
+ {
+ "match": {
+ "contrib_names": {
+ "query": names,
+ "operator": "AND",
+ "fuzziness": "AUTO",
+ }
+ }
+ },
+ ],
+ },
+ },
+ "size": size,
+ }
+ resp = es.search(body=body, index="fatcat_release")
+ if es_compat_hits_total(resp) > 0:
+ return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api)
+
+ body = {
+ "track_total_hits": True,
+ "query": {
+ "bool": {
+ "should": [
+ {
+ "match": {
+ "title": {
+ "query": release.title,
+ "operator": "AND",
+ "fuzziness": "AUTO",
+ },
+ }
+ },
+ {
+ "match": {
+ "contrib_names": {
+ "query": names,
+ "operator": "AND",
+ "fuzziness": "AUTO",
+ }
+ }
+ },
+ ],
+ },
+ },
+ "size": size,
+ }
+ resp = es.search(body=body, index="fatcat_release")
+ if es_compat_hits_total(resp) > 0:
+ return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api)
+
+ # Note: If the title is short, we will get lots of results here; do we
+ # need to check for the title length or the result set length
+ # here?
body = {
+ "track_total_hits": True,
"query": {
"match": {
"title": {
"query": release.title,
- "operator": "AND"
+ "operator": "AND",
}
}
},
@@ -91,6 +162,7 @@ def match_release_fuzzy(
# Get fuzzy.
# https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
body = {
+ "track_total_hits": True,
"query": {
"match": {
"title": {
diff --git a/tests/test_matching.py b/tests/test_matching.py
index 2122144..c2e26f3 100644
--- a/tests/test_matching.py
+++ b/tests/test_matching.py
@@ -72,6 +72,24 @@ def test_match_release_fuzzy(es_client, caplog):
"ext_ids": {}
}, 5),
({
+ "title": "unlikelytitle",
+ "ext_ids": {}
+ }, 0),
+ ({
+ "title": "Imminent dystopia",
+ "ext_ids": {}
+ }, 2),
+ ({
+ "title": "",
+ "contribs": [{"raw_name": "Aristoteles"}],
+ "ext_ids": {}
+ }, 5),
+ ({
+ "title": "Letter",
+ "contribs": [{"raw_name": "Claudel"}],
+ "ext_ids": {}
+ }, 1),
+ ({
"title": "The Future of Digital Scholarship",
"contribs": [{
"raw_name": "Costantino Thanos"
@@ -83,6 +101,6 @@ def test_match_release_fuzzy(es_client, caplog):
entity = entity_from_dict(doc, ReleaseEntity)
result = match_release_fuzzy(entity, es=es_client)
with caplog.at_level(logging.INFO):
- logging.info("[{}] given {}, found {}, {}".format(i, entity.title, len(result),
+ logging.info("[{}] given title '{}', found {}, {}".format(i, entity.title, len(result),
[v.title for v in result]))
- assert len(result) == count
+ assert len(result) == count, doc