diff options
Diffstat (limited to 'fuzzycat/fatcat')
-rw-r--r-- | fuzzycat/fatcat/api_auth.py | 45 | ||||
-rw-r--r-- | fuzzycat/fatcat/common.py | 164 | ||||
-rw-r--r-- | fuzzycat/fatcat/entities.py | 60 | ||||
-rw-r--r-- | fuzzycat/fatcat/main.py | 22 | ||||
-rw-r--r-- | fuzzycat/fatcat/matching.py | 273 |
5 files changed, 0 insertions, 564 deletions
diff --git a/fuzzycat/fatcat/api_auth.py b/fuzzycat/fatcat/api_auth.py deleted file mode 100644 index 0bad5e9..0000000 --- a/fuzzycat/fatcat/api_auth.py +++ /dev/null @@ -1,45 +0,0 @@ -# coding: utf-8 -""" -API helper, taken from fatcat_tools/api_auth.py -""" - -import os -import sys - -import fatcat_openapi_client - - -def public_api(host_uri): - """ - Note: unlike the authenticated variant, this helper might get called even - if the API isn't going to be used, so it's important that it doesn't try to - actually connect to the API host or something. - """ - conf = fatcat_openapi_client.Configuration() - conf.host = host_uri - return fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(conf)) - - -def authenticated_api(host_uri, token=None): - """ - Note: if this helper is called, it's implied that an actual API connection - is needed, so it does try to connect and verify credentials. - """ - - conf = fatcat_openapi_client.Configuration() - conf.host = host_uri - if not token: - token = os.environ['FATCAT_API_AUTH_TOKEN'] - if not token: - sys.stderr.write( - 'This client requires a fatcat API token (eg, in env var FATCAT_API_AUTH_TOKEN)\n') - sys.exit(-1) - - conf.api_key["Authorization"] = token - conf.api_key_prefix["Authorization"] = "Bearer" - api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(conf)) - - # verify up front that auth is working - api.auth_check() - - return api diff --git a/fuzzycat/fatcat/common.py b/fuzzycat/fatcat/common.py deleted file mode 100644 index 7499ce4..0000000 --- a/fuzzycat/fatcat/common.py +++ /dev/null @@ -1,164 +0,0 @@ -# coding: utf-8 -""" -Adapter for fatcat and fatcat entities. -""" - -import collections -from enum import Enum -from typing import Dict, List, Type, Union - -from fatcat_openapi_client import (ApiException, ContainerEntity, DefaultApi, ReleaseEntity, - ReleaseExtIds, WorkEntity) - -from fuzzycat.fatcat.api_auth import public_api -from fuzzycat.fatcat.entities import entity_from_dict, entity_from_json - - -class MatchStatus(Enum): - """ - When matching two entities, use these levels to express match strength. - When in doubt, use AMBIGIOUS. DIFFERENT should be used only, when it is - certain, that items do not match. - """ - - EXACT = 0 - STRONG = 1 - WEAK = 2 - AMBIGIOUS = 3 - DIFFERENT = 4 - - -def compare_ext_ids(a: ReleaseExtIds, b: ReleaseExtIds) -> Dict[str, int]: - """ - Returns a dictionary with number of existing, matching and differing - identifier between entity a and b. TODO(martin): It might be helpful to - have some mapping service, that would relate qid to doi, or a mag to a - jstor id, if this information is known. - """ - counter = collections.Counter({"a": 0, "b": 0, "both": 0, "hits": 0, "misses": 0}) - attrs = ( - "doi", - "wikidata_qid", - "isbn13", - "pmid", - "pmcid", - "core", - "arxiv", - "jstor", - "ark", - "mag", - ) - for attr in attrs: - v = getattr(a, attr) - w = getattr(b, attr) - if v: - counter["a"] += 1 - if w: - counter["b"] += 1 - if not v or not w: - continue - counter["both"] += 1 - if v == w: - counter["hits"] += 1 - else: - counter["misses"] += 1 - return counter - - -def fetch_container_list( - ids: List[str], - api: DefaultApi = None, -) -> List[ContainerEntity]: - """ - Fetch a list of containers from the API. - """ - if api is None: - api = public_api("https://api.fatcat.wiki/v0") - result = [] - for id in ids: - try: - ce = api.get_container(id) - result.append(ce) - except ApiException as exc: - if exc.status == 404: - print("[err] failed to fetch container: {}".format(id), file=sys.stderr) - continue - raise - return result - - -def fetch_release_list( - ids: List[str], - api: DefaultApi = None, -) -> List[ReleaseEntity]: - """ - Returns a list of entities. Some entities might be missing. Return all that - are accessible. - """ - if api is None: - api = public_api("https://api.fatcat.wiki/v0") - result = [] - for id in ids: - try: - re = api.get_release(id, hide="refs,abstracts", expand="container") - result.append(re) - except ApiException as exc: - if exc.status == 404: - print("[err] failed to fetch release: {}".format(id), file=sys.stderr) - continue - raise - return result - - -def entity_comparable_attrs( - a: Union[ContainerEntity, ReleaseEntity], - b: Union[ContainerEntity, ReleaseEntity], - entity_type: Union[Type[ContainerEntity], Type[ReleaseEntity]], -) -> List[str]: - """ - Return a list of top-level attributes, which are defined on both entities - (i.e. we could actually compare them). - """ - attrs = entity_type.attribute_map.keys() - comparable_attrs = [] - for attr in attrs: - if getattr(a, attr) is None: - continue - if getattr(b, attr) is None: - continue - comparable_attrs.append(attr) - return comparable_attrs - - -def response_to_entity_list(response, size=5, entity_type=ReleaseEntity, api=None): - """ - Convert an elasticsearch result to a list of entities. Accepts both a - dictionary and an elasticsearch_dsl.response.Response. - - We take the ids from elasticsearch and retrieve entities via API. - """ - if isinstance(response, dict): - ids = [hit["_source"]["ident"] for hit in response["hits"]["hits"]][:size] - elif isinstance(response, elasticsearch_dsl.response.Response): - ids = [hit.to_dict().get("ident") for hit in response] - - if entity_type == ReleaseEntity: - return fetch_release_list(ids, api=api) - if entity_type == ContainerEntity: - return fetch_container_list(ids, api=api) - - raise ValueError("invalid entity type: {}".format(entity_type)) - - -def exact_release_match(a: ReleaseEntity, b: ReleaseEntity) -> bool: - """ - Currently, entities implement comparison through object dictionaries. - """ - return a == b - - -def exact_work_match(a: WorkEntity, b: WorkEntity) -> bool: - """ - Currently, entities implement comparison through object dictionaries. - """ - return a == b diff --git a/fuzzycat/fatcat/entities.py b/fuzzycat/fatcat/entities.py deleted file mode 100644 index 351c2b8..0000000 --- a/fuzzycat/fatcat/entities.py +++ /dev/null @@ -1,60 +0,0 @@ -# coding: utf-8 -""" -This is taken from fatcat_tools/transforms/entities. -""" - -import collections -import json - -import toml -from fatcat_openapi_client import ApiClient - - -def entity_to_dict(entity, api_client=None) -> dict: - """ - Hack to take advantage of the code-generated serialization code. - - Initializing/destroying ApiClient objects is surprisingly expensive - (because it involves a threadpool), so we allow passing an existing - instance. If you already have a full-on API connection `api`, you can - access the ApiClient object as `api.api_client`. This is such a speed-up - that this argument may become mandatory. - """ - if not api_client: - api_client = ApiClient() - return api_client.sanitize_for_serialization(entity) - - -def entity_from_json(json_str: str, entity_type, api_client=None): - """ - Hack to take advantage of the code-generated deserialization code - - See note on `entity_to_dict()` about api_client argument. - """ - if not api_client: - api_client = ApiClient() - thing = collections.namedtuple('Thing', ['data']) - thing.data = json_str - return api_client.deserialize(thing, entity_type) - - -def entity_from_dict(obj: dict, entity_type, api_client=None): - json_str = json.dumps(obj) - return entity_from_json(json_str, entity_type, api_client=api_client) - - -def entity_to_toml(entity, api_client=None, pop_fields=None) -> str: - """ - pop_fields parameter can be used to strip out some fields from the resulting - TOML. Eg, for fields which should not be edited, like the ident. - """ - obj = entity_to_dict(entity, api_client=api_client) - pop_fields = pop_fields or [] - for k in pop_fields: - obj.pop(k, None) - return toml.dumps(obj) - - -def entity_from_toml(toml_str: str, entity_type, api_client=None): - obj = toml.loads(toml_str) - return entity_from_dict(obj, entity_type, api_client=api_client) diff --git a/fuzzycat/fatcat/main.py b/fuzzycat/fatcat/main.py deleted file mode 100644 index 07e4ad4..0000000 --- a/fuzzycat/fatcat/main.py +++ /dev/null @@ -1,22 +0,0 @@ -# coding: utf-8 -""" -Command line entry point for ad-hoc testing. -""" - -import argparse - -from fatcat_openapi_client import ReleaseEntity, ReleaseExtIds - -from fuzzycat.fatcat.matching import match_release_fuzzy - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("-R", "--release", help="match release", action="store_true") - parser.add_argument("-t", "--title", help="title") - - args = parser.parse_args() - - if args.release and args.title: - re = ReleaseEntity(title=args.title, ext_ids=ReleaseExtIds()) - print(match_release_fuzzy(re, es="https://search.fatcat.wiki")) diff --git a/fuzzycat/fatcat/matching.py b/fuzzycat/fatcat/matching.py deleted file mode 100644 index 04ec275..0000000 --- a/fuzzycat/fatcat/matching.py +++ /dev/null @@ -1,273 +0,0 @@ -# coding: utf-8 -""" -Public API for fuzzy matches for fatcat. - -Match methods return candidates, verify methods return a match status. - - match_containar_fuzzy -> List[ContainerEntity] - match_release_fuzzy -> List[ReleaseEntity] - - verify_serial_name -> MatchStatus - verify_container_name -> MatchStatus - verify_container_fuzzy -> MatchStatus - verify_release_fuzzy -> MatchStatus - -Candidate generation will use external data from search and hence is expensive. Verification is fast. -""" - -from typing import List, Optional, Set, Union - -import elasticsearch -from fatcat_openapi_client import (ApiException, ContainerEntity, DefaultApi, ReleaseEntity, - ReleaseExtIds, WorkEntity) -from fatcat_openapi_client.api.default_api import DefaultApi - -from fuzzycat import cleanups -from fuzzycat.fatcat.common import (MatchStatus, compare_ext_ids, response_to_entity_list) -from fuzzycat.serials import serialsdb - - -def match_container_fuzzy(container: ContainerEntity, - size: int = 5, - es: Optional[Union[str, elasticsearch.client.Elasticsearch]] = None, - api: Optional[DefaultApi] = None) -> List[ContainerEntity]: - """ - Given a container entity, which can be (very) partial, return a list of - candidate matches. Elasticsearch can be a hostport or the low level client - object. - - Random data point: with 20 parallel workers callind match_container_fuzzy, - we get around 40 req/s. - """ - assert isinstance(container, ContainerEntity) - - if size is None or size == 0: - size = 10000 # or any large number - - if isinstance(es, str): - es = elasticsearch.Elasticsearch([es]) - if es is None: - es = elasticsearch.Elasticsearch() - - # If we find any match by ISSN-L, we return only those. - if container.issnl: - s = (elasticsearch_dsl.Search(using=es, index="fatcat_container").query( - "term", issns=container.issnl).extra(size=size)) - resp = s.execute() - if len(resp) > 0: - return response_to_entity_list(resp, entity_type=ContainerEntity, api=api) - - # Do we have an exact QID match? - if container.wikidata_qid: - s = (elasticsearch_dsl.Search(using=es, index="fatcat_container").query( - "term", wikidata_qid=container.wikidata_qid).extra(size=size)) - resp = s.execute() - if len(resp) > 0: - return response_to_entity_list(resp, entity_type=ContainerEntity, api=api) - - # Start with exact name match. - # - # curl -s https://search.fatcat.wiki/fatcat_container/_mapping | jq . - # - # "name": { - # "type": "text", - # "copy_to": [ - # "biblio" - # ], - # "analyzer": "textIcu", - # "search_analyzer": "textIcuSearch" - # }, - # - body = { - "query": { - "match": { - "name": { - "query": container.name, - "operator": "AND" - } - } - }, - "size": size, - } - resp = es.search(body=body, index="fatcat_container") - if resp["hits"]["total"] > 0: - return response_to_entity_list(resp, entity_type=ContainerEntity, api=api) - - # Get fuzzy. - # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness - body = { - "query": { - "match": { - "name": { - "query": container.name, - "operator": "AND", - "fuzziness": "AUTO", - } - } - }, - "size": size, - } - resp = es.search(body=body, index="fatcat_container") - if resp["hits"]["total"] > 0: - return response_to_entity_list(resp, entity_type=ContainerEntity, api=api) - - return [] - - -def match_release_fuzzy(release: ReleaseEntity, - size: int = 5, - es: Optional[Union[str, elasticsearch.client.Elasticsearch]] = None, - api: Optional[DefaultApi] = None) -> List[ReleaseEntity]: - """ - Given a release entity, return a number similar release entities from - fatcat using Elasticsearch. - """ - assert isinstance(release, ReleaseEntity) - - if size is None or size == 0: - size = 10000 # or any large number - - if isinstance(es, str): - es = elasticsearch.Elasticsearch([es]) - if es is None: - es = elasticsearch.Elasticsearch() - - # Try to match by external identifier. - ext_ids = release.ext_ids - attrs = { - "doi": "doi", - "wikidata_qid": "wikidata_qid", - "isbn13": "isbn13", - "pmid": "pmid", - "pmcid": "pmcid", - "core": "code_id", - "arxiv": "arxiv_id", - "jstor": "jstor_id", - "ark": "ark_id", - "mag": "mag_id", - } - for attr, es_field in attrs.items(): - value = getattr(ext_ids, attr) - if not value: - continue - s = (elasticsearch_dsl.Search(using=es, - index="fatcat_release").query("term", **{ - es_field: value - }).extra(size=size)) - resp = s.execute() - if len(resp) > 0: - return response_to_entity_list(resp, entity_type=ReleaseEntity, api=api) - - body = { - "query": { - "match": { - "title": { - "query": release.title, - "operator": "AND" - } - } - }, - "size": size, - } - resp = es.search(body=body, index="fatcat_release") - if resp["hits"]["total"] > 0: - return response_to_entity_list(resp, entity_type=ReleaseEntity, api=api) - - # Get fuzzy. - # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness - body = { - "query": { - "match": { - "title": { - "query": release.title, - "operator": "AND", - "fuzziness": "AUTO", - } - } - }, - "size": size, - } - resp = es.search(body=body, index="fatcat_release") - if resp["hits"]["total"] > 0: - return response_to_entity_list(resp, entity_type=ReleaseEntity, api=api) - - return [] - - -def verify_serial_name(a: str, b: str) -> MatchStatus: - """ - Serial name verification. Serial names are a subset of container names. - There are about 2M serials. - """ - def verify(a: Set[str], b: Set[str]) -> MatchStatus: - - # If any name yields multiple ISSN-L, we cannot decide. - if len(a) > 1: - return MatchStatus.AMBIGIOUS - if len(b) > 1: - return MatchStatus.AMBIGIOUS - - # If both names point the same ISSN-L, it is an exact match. - if len(a) > 0 and len(a) == len(b): - if len(a & b) == len(a): - return MatchStatus.EXACT - else: - return MatchStatus.DIFFERENT - - # Multiple names possible, but there is overlap. - if len(a & b) > 0: - return MatchStatus.STRONG - - return MatchStatus.AMBIGIOUS - - # First, try values as given. - issnls_for_a = serialsdb.get(a, set()) - issnls_for_b = serialsdb.get(b, set()) - - status = verify(issnls_for_a, issnls_for_b) - if status != MatchStatus.AMBIGIOUS: - return status - - # Try to match slightly cleaned up values. - issnls_for_a = serialsdb.get(a, set(), cleanup_pipeline=cleanups.basic) - issnls_for_b = serialsdb.get(b, set(), cleanup_pipeline=cleanups.basic) - - return verify(issnls_for_a, issnls_for_b) - - -def verify_container_name(a: str, b: str) -> MatchStatus: - status = verify_serial_name(a, b) - if status != MatchStatus.AMBIGIOUS: - return status - - # TODO: add additional verification, string match and common patterns. - - -def verify_container_match(a: ContainerEntity, b: ContainerEntity) -> MatchStatus: - pass - - -def verify_release_match(a: ReleaseEntity, b: ReleaseEntity) -> MatchStatus: - assert isinstance(a, ReleaseEntity) - assert isinstance(b, ReleaseEntity) - - if a == b: - return MatchStatus.EXACT - - a_ext_ids, b_ext_ids = a.ext_ids, b.ext_ids - # Compare ext ids, result is a counter, we are interested in "hits" and - # "misses", only. - cmp_result = compare_ext_ids(a_ext_ids, b_ext_ids) - - # Assume that if more ids match than mismatch, it is a good signal, e.g. if - # only a DOI is defined and they match, it is an exact match. - if cmp_result["hits"] > 0 and cmp_result["misses"] == 0: - return MatchStatus.EXACT - if cmp_result["hits"] > cmp_result["misses"]: - return MatchStatus.STRONG - if cmp_result["hits"] == 0 and cmp_result["misses"] > 0: - return MatchStatus.DIFFERENT - if cmp_result["hits"] < cmp_result["misses"]: - return MatchStatus.AMBIGIOUS - - # TODO: do title verification, apply string cleanups, etc. |