aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2021-11-16 21:13:46 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2021-11-16 21:13:46 +0100
commit d104f8d0ba8eef5563555de82be66bbf17f961db (patch)
tree 47ebb2ee3816b5d65bea0c184b8f8d733ea91681
parent e90bc43b2052b70d86875f289115a0876be230cc (diff)
download fuzzycat-d104f8d0ba8eef5563555de82be66bbf17f961db.tar.gz
fuzzycat-d104f8d0ba8eef5563555de82be66bbf17f961db.zip
complete migration away from match_release_fuzzy
Instead, use `FuzzyReleaseMatcher.match`, which has approximately the same behavior.
-rw-r--r-- fuzzycat/__main__.py 5
-rw-r--r-- fuzzycat/matching.py 162
-rw-r--r-- fuzzycat/simple.py 5
-rw-r--r-- tests/test_matching.py 82
4 files changed, 7 insertions, 247 deletions
diff --git a/fuzzycat/__main__.py b/fuzzycat/__main__.py
index 7792df6..d616efc 100644
--- a/fuzzycat/__main__.py
+++ b/fuzzycat/__main__.py
@@ -60,7 +60,7 @@ from fatcat_openapi_client import ReleaseEntity
from fuzzycat.entities import entity_to_dict
from fuzzycat.grobid_unstructured import grobid_parse_unstructured
-from fuzzycat.matching import anything_to_entity, match_release_fuzzy
+from fuzzycat.matching import FuzzyReleaseMatcher, anything_to_entity
from fuzzycat.refs import RefsGroupVerifier
from fuzzycat.simple import closest_fuzzy_release_match
from fuzzycat.utils import random_idents_from_query, random_word
@@ -143,7 +143,8 @@ def run_release_match(args):
"""
try:
entity = anything_to_entity(args.value, ReleaseEntity)
- result = match_release_fuzzy(entity, size=args.size, es=args.es_url)
+ matcher = FuzzyReleaseMatcher(es=args.es_url, size=args.size)
+ result = matcher.match(entity)
except Exception as err:
print("fuzzy match failed: {}".format(err), file=sys.stderr)
else:
diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py
index c83e48c..2984d9a 100644
--- a/fuzzycat/matching.py
+++ b/fuzzycat/matching.py
@@ -460,168 +460,6 @@ class FuzzyReleaseMatcher:
self.match_release_generic_fuzzy_contrib)
-def match_release_fuzzy(
- release: ReleaseEntity,
- size: int = 5,
- es: Optional[Union[str, Type[elasticsearch.client.Elasticsearch]]] = None,
- api: DefaultApi = None,
- index: str = "fatcat_release",
-) -> List[ReleaseEntity]:
- """
- Given a release entity, return a number similar release entities from
- fatcat using Elasticsearch.
-
- TODO: rename "es" parameter to "es_client", which would be clearer
-
- This is deprecated, move to matcher class.
- """
- assert isinstance(release, ReleaseEntity)
-
- if size is None or size == 0:
- size = 10000 # or any large number
-
- if isinstance(es, str):
- es = elasticsearch.Elasticsearch([es])
- if es is None:
- es = elasticsearch.Elasticsearch()
- if api is None:
- api = public_api(FATCAT_API_URL)
-
- # > query cascade
- #
- # [x] 1 exact ids
- # [ ] 2 exact title and exact contrib
- # [ ] 3 exact title and fuzzy contrib
- # [ ] 4 exact title
- # [ ] 5 title w/o stopwords, fuzzy contrib
- # [ ] 6 title w/o stopwords
- # [ ] 7 fuzzy title and fuzzy contrib
- # [ ] 8 fuzzy whole document
-
- # Try to match by external identifier.
- # TODO: use api, ability to disable; benchmark
- ext_ids = release.ext_ids
- attrs = (
- "doi",
- "wikidata_qid",
- "isbn13",
- "pmid",
- "pmcid",
- "core",
- "arxiv",
- "jstor",
- "ark",
- "mag",
- "doaj",
- "dblp",
- "oai",
- )
- for attr in attrs:
- value = getattr(ext_ids, attr)
- if not value:
- continue
- try:
- r = api.lookup_release(**{attr: value})
- except fatcat_openapi_client.rest.ApiException as err:
- if err.status in [404, 400]:
- r = None
- else:
- raise err
- if r:
- return [r]
-
- if release.title is not None and release.contribs is not None:
- names = " ".join([c.raw_name for c in release.contribs])
- query = {
- "bool": {
- "must": [
- {
- "match": {
- "title": {
- "query": release.title,
- "operator": "AND",
- "fuzziness": "AUTO",
- },
- }
- },
- {
- "match": {
- "contrib_names": {
- "query": names,
- "operator": "AND",
- "fuzziness": "AUTO",
- }
- }
- },
- ],
- },
- }
- resp = es.search(index=index, body={"query": query, "size": size, "track_total_hits": True})
- if es_compat_hits_total(resp) > 0:
- return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api)
-
- query = {
- "bool": {
- "should": [
- {
- "match": {
- "title": {
- "query": release.title,
- "operator": "AND",
- "fuzziness": "AUTO",
- },
- }
- },
- {
- "match": {
- "contrib_names": {
- "query": names,
- "operator": "AND",
- "fuzziness": "AUTO",
- }
- }
- },
- ],
- },
- }
- resp = es.search(index=index, body={"query": query, "size": size, "track_total_hits": True})
- if es_compat_hits_total(resp) > 0:
- return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api)
-
- # Note: If the title is short, we will get lots of results here; do we need
- # to check for title length or result set length length or result set
- # length here?
- query = {
- "match": {
- "title": {
- "query": release.title,
- "operator": "AND",
- }
- }
- }
- resp = es.search(index=index, body={"query": query, "size": size, "track_total_hits": True})
- if es_compat_hits_total(resp) > 0:
- return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api)
-
- # Get fuzzy.
- # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
- query = {
- "match": {
- "title": {
- "query": release.title,
- "operator": "AND",
- "fuzziness": "AUTO",
- }
- }
- }
- resp = es.search(index=index, body={"query": query, "size": size, "track_total_hits": True})
- if es_compat_hits_total(resp) > 0:
- return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api)
-
- # TODO: perform more queries on other fields.
- return []
-
-
def public_api(host_uri):
"""
Note: unlike the authenticated variant, this helper might get called even
diff --git a/fuzzycat/simple.py b/fuzzycat/simple.py
index ff59ba2..c92b5ae 100644
--- a/fuzzycat/simple.py
+++ b/fuzzycat/simple.py
@@ -24,7 +24,7 @@ from fatcat_openapi_client import ReleaseContrib, ReleaseEntity, ReleaseExtIds
from fuzzycat.common import Reason, Status
from fuzzycat.entities import entity_to_dict
from fuzzycat.grobid_unstructured import grobid_parse_unstructured
-from fuzzycat.matching import match_release_fuzzy
+from fuzzycat.matching import FuzzyReleaseMatcher
from fuzzycat.utils import clean_doi
from fuzzycat.verify import verify
@@ -84,7 +84,8 @@ def close_fuzzy_release_matches(release: ReleaseEntity,
result is only returned if all the candidate matches were ambiguous.
"""
- candidates = match_release_fuzzy(release, size=match_limit, es=es_client)
+ matcher = FuzzyReleaseMatcher(es=es_client, size=match_limit)
+ candidates = matcher.match(release)
if not candidates:
return None
diff --git a/tests/test_matching.py b/tests/test_matching.py
index ca94c2a..a7754ee 100644
--- a/tests/test_matching.py
+++ b/tests/test_matching.py
@@ -8,12 +8,11 @@ import requests
from fatcat_openapi_client import ReleaseEntity, ReleaseContrib
from fuzzycat.entities import entity_from_dict, entity_from_json
-from fuzzycat.matching import anything_to_entity, match_release_fuzzy, FuzzyReleaseMatcher
+from fuzzycat.matching import anything_to_entity, FuzzyReleaseMatcher
warnings.filterwarnings(
"ignore") # InsecureRequestWarning: Unverified HTTPS request is being made to host ...
-from fuzzycat.matching import anything_to_entity, match_release_fuzzy
from fuzzycat.config import settings
from fatcat_openapi_client import ReleaseEntity
import pytest
@@ -30,20 +29,6 @@ logger.setLevel(logging.DEBUG)
FATCAT_SEARCH_URL = settings.get("FATCAT_SEARCH_URL", "https://search.fatcat.wiki:443")
-def is_not_reachable(url, timeout=3):
- return not is_reachable(url)
-
-
-def is_reachable(url, timeout=3):
- """
- Return true, if URL is reachable and returns HTTP 200.
- """
- try:
- return requests.get(url, verify=False, timeout=timeout).ok
- except Exception:
- return False
-
-
def yaml_to_cases(klass,
files="tests/files/fuzzy_release_match_release_exact_title_exact_contrib/*.yaml"):
"""
@@ -63,71 +48,6 @@ def es_client():
return elasticsearch.Elasticsearch([FATCAT_SEARCH_URL])
-# @pytest.mark.skipif(
-# is_not_reachable(FATCAT_SEARCH_URL),
-# reason="{} not reachable, use e.g. FUZZYCAT_FATCAT_SEARCH_URL=localhost:9200 to override".
-# format(FATCAT_SEARCH_URL))
-def test_match_release_fuzzy(es_client, caplog):
- """
- This test is tied to the current index contents, so if that changes, this
- test may fail as well.
-
- Note: Deprecated. We want to get rid of this.
- """
- cases = (
- ("wtv64ahbdzgwnan7rllwr3nurm", 1),
- ("eqcgtpav3na5jh56o5vjsvb4ei", 1),
- )
- for i, (ident, count) in enumerate(cases):
- entity = anything_to_entity(ident, ReleaseEntity)
-
- result = match_release_fuzzy(entity, es=es_client)
- logger.info("[{}] given {}, found {}".format(i, entity.title, len(result)))
- assert len(result) == count
-
- # Partial data.
- cases = (
- ({
- "title": "digital libraries",
- "ext_ids": {}
- }, 5),
- ({
- "title": "unlikelytitle",
- "ext_ids": {}
- }, 0),
- ({
- "title": "Imminent dystopia",
- "ext_ids": {}
- }, 2),
- ({
- "title": "",
- "contribs": [{
- "raw_name": "Aristoteles"
- }],
- "ext_ids": {}
- }, 5),
- # ({
- # "title": "Letter",
- # "contribs": [{"raw_name": "Claudel"}],
- # "ext_ids": {}
- # }, 1),
- # ({
- # "title": "The Future of Digital Scholarship",
- # "contribs": [{
- # "raw_name": "Costantino Thanos"
- # }],
- # "ext_ids": {}
- # }, 5),
- )
- for i, (doc, count) in enumerate(cases):
- entity = entity_from_dict(doc, ReleaseEntity)
- result = match_release_fuzzy(entity, es=es_client)
- with caplog.at_level(logging.INFO):
- logging.info("[{}] given title '{}', found {}, {}".format(i, entity.title, len(result),
- [v.title for v in result]))
- assert len(result) == count, doc
-
-
def test_matcher_match_release(es_client, caplog):
cases = (
("wtv64ahbdzgwnan7rllwr3nurm", 1),