From c134c0974d0fc8b57a0d3329d389ac72120a01bb Mon Sep 17 00:00:00 2001
From: Martin Czygan
Date: Wed, 21 Oct 2020 03:54:53 +0200
Subject: cleanup

---
 fuzzycat/cleanups.py        |  13 --
 fuzzycat/fatcat/api_auth.py |  45 -----
 fuzzycat/fatcat/common.py   | 164 ------------------
 fuzzycat/fatcat/entities.py |  60 -------
 fuzzycat/fatcat/main.py     |  22 ---
 fuzzycat/fatcat/matching.py | 273 ------------------------------
 fuzzycat/issn.py            | 401 --------------------------------------------
 fuzzycat/serials.py         |  43 -----
 fuzzycat/utils.py           | 249 ---------------------------
 setup.py                    |   2 -
 tests/test_matching.py      |   4 -
 tests/test_utils.py         | 128 --------------
 12 files changed, 1404 deletions(-)
 delete mode 100644 fuzzycat/cleanups.py
 delete mode 100644 fuzzycat/fatcat/api_auth.py
 delete mode 100644 fuzzycat/fatcat/common.py
 delete mode 100644 fuzzycat/fatcat/entities.py
 delete mode 100644 fuzzycat/fatcat/main.py
 delete mode 100644 fuzzycat/fatcat/matching.py
 delete mode 100644 fuzzycat/issn.py
 delete mode 100644 fuzzycat/serials.py
 delete mode 100644 fuzzycat/utils.py
 delete mode 100644 tests/test_matching.py
 delete mode 100644 tests/test_utils.py

diff --git a/fuzzycat/cleanups.py b/fuzzycat/cleanups.py
deleted file mode 100644
index c2e021d..0000000
--- a/fuzzycat/cleanups.py
+++ /dev/null
@@ -1,13 +0,0 @@
-"""
-Various shared cleanup approaches.
-"""
-
-from fuzzycat.utils import (StringPipeline, normalize_ampersand, normalize_whitespace)
-
-# These transformations should not affect the name or a journal.
-basic = StringPipeline([
-    str.lower,
-    normalize_whitespace,
-    normalize_ampersand,
-    lambda v: v.rstrip("."),
-])
diff --git a/fuzzycat/fatcat/api_auth.py b/fuzzycat/fatcat/api_auth.py
deleted file mode 100644
index 0bad5e9..0000000
--- a/fuzzycat/fatcat/api_auth.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# coding: utf-8
-"""
-API helper, taken from fatcat_tools/api_auth.py
-"""
-
-import os
-import sys
-
-import fatcat_openapi_client
-
-
-def public_api(host_uri):
-    """
-    Note: unlike the authenticated variant, this helper might get called even
-    if the API isn't going to be used, so it's important that it doesn't try to
-    actually connect to the API host or something.
-    """
-    conf = fatcat_openapi_client.Configuration()
-    conf.host = host_uri
-    return fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(conf))
-
-
-def authenticated_api(host_uri, token=None):
-    """
-    Note: if this helper is called, it's implied that an actual API connection
-    is needed, so it does try to connect and verify credentials.
-    """
-
-    conf = fatcat_openapi_client.Configuration()
-    conf.host = host_uri
-    if not token:
-        token = os.environ['FATCAT_API_AUTH_TOKEN']
-    if not token:
-        sys.stderr.write(
-            'This client requires a fatcat API token (eg, in env var FATCAT_API_AUTH_TOKEN)\n')
-        sys.exit(-1)
-
-    conf.api_key["Authorization"] = token
-    conf.api_key_prefix["Authorization"] = "Bearer"
-    api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(conf))
-
-    # verify up front that auth is working
-    api.auth_check()
-
-    return api
diff --git a/fuzzycat/fatcat/common.py b/fuzzycat/fatcat/common.py
deleted file mode 100644
index 7499ce4..0000000
--- a/fuzzycat/fatcat/common.py
+++ /dev/null
@@ -1,164 +0,0 @@
-# coding: utf-8
-"""
-Adapter for fatcat and fatcat entities.
-""" - -import collections -from enum import Enum -from typing import Dict, List, Type, Union - -from fatcat_openapi_client import (ApiException, ContainerEntity, DefaultApi, ReleaseEntity, - ReleaseExtIds, WorkEntity) - -from fuzzycat.fatcat.api_auth import public_api -from fuzzycat.fatcat.entities import entity_from_dict, entity_from_json - - -class MatchStatus(Enum): - """ - When matching two entities, use these levels to express match strength. - When in doubt, use AMBIGIOUS. DIFFERENT should be used only, when it is - certain, that items do not match. - """ - - EXACT = 0 - STRONG = 1 - WEAK = 2 - AMBIGIOUS = 3 - DIFFERENT = 4 - - -def compare_ext_ids(a: ReleaseExtIds, b: ReleaseExtIds) -> Dict[str, int]: - """ - Returns a dictionary with number of existing, matching and differing - identifier between entity a and b. TODO(martin): It might be helpful to - have some mapping service, that would relate qid to doi, or a mag to a - jstor id, if this information is known. - """ - counter = collections.Counter({"a": 0, "b": 0, "both": 0, "hits": 0, "misses": 0}) - attrs = ( - "doi", - "wikidata_qid", - "isbn13", - "pmid", - "pmcid", - "core", - "arxiv", - "jstor", - "ark", - "mag", - ) - for attr in attrs: - v = getattr(a, attr) - w = getattr(b, attr) - if v: - counter["a"] += 1 - if w: - counter["b"] += 1 - if not v or not w: - continue - counter["both"] += 1 - if v == w: - counter["hits"] += 1 - else: - counter["misses"] += 1 - return counter - - -def fetch_container_list( - ids: List[str], - api: DefaultApi = None, -) -> List[ContainerEntity]: - """ - Fetch a list of containers from the API. - """ - if api is None: - api = public_api("https://api.fatcat.wiki/v0") - result = [] - for id in ids: - try: - ce = api.get_container(id) - result.append(ce) - except ApiException as exc: - if exc.status == 404: - print("[err] failed to fetch container: {}".format(id), file=sys.stderr) - continue - raise - return result - - -def fetch_release_list( - ids: List[str], - api: DefaultApi = None, -) -> List[ReleaseEntity]: - """ - Returns a list of entities. Some entities might be missing. Return all that - are accessible. - """ - if api is None: - api = public_api("https://api.fatcat.wiki/v0") - result = [] - for id in ids: - try: - re = api.get_release(id, hide="refs,abstracts", expand="container") - result.append(re) - except ApiException as exc: - if exc.status == 404: - print("[err] failed to fetch release: {}".format(id), file=sys.stderr) - continue - raise - return result - - -def entity_comparable_attrs( - a: Union[ContainerEntity, ReleaseEntity], - b: Union[ContainerEntity, ReleaseEntity], - entity_type: Union[Type[ContainerEntity], Type[ReleaseEntity]], -) -> List[str]: - """ - Return a list of top-level attributes, which are defined on both entities - (i.e. we could actually compare them). - """ - attrs = entity_type.attribute_map.keys() - comparable_attrs = [] - for attr in attrs: - if getattr(a, attr) is None: - continue - if getattr(b, attr) is None: - continue - comparable_attrs.append(attr) - return comparable_attrs - - -def response_to_entity_list(response, size=5, entity_type=ReleaseEntity, api=None): - """ - Convert an elasticsearch result to a list of entities. Accepts both a - dictionary and an elasticsearch_dsl.response.Response. - - We take the ids from elasticsearch and retrieve entities via API. 
-    """
-    if isinstance(response, dict):
-        ids = [hit["_source"]["ident"] for hit in response["hits"]["hits"]][:size]
-    elif isinstance(response, elasticsearch_dsl.response.Response):
-        ids = [hit.to_dict().get("ident") for hit in response]
-
-    if entity_type == ReleaseEntity:
-        return fetch_release_list(ids, api=api)
-    if entity_type == ContainerEntity:
-        return fetch_container_list(ids, api=api)
-
-    raise ValueError("invalid entity type: {}".format(entity_type))
-
-
-def exact_release_match(a: ReleaseEntity, b: ReleaseEntity) -> bool:
-    """
-    Currently, entities implement comparison through object dictionaries.
-    """
-    return a == b
-
-
-def exact_work_match(a: WorkEntity, b: WorkEntity) -> bool:
-    """
-    Currently, entities implement comparison through object dictionaries.
-    """
-    return a == b
diff --git a/fuzzycat/fatcat/entities.py b/fuzzycat/fatcat/entities.py
deleted file mode 100644
index 351c2b8..0000000
--- a/fuzzycat/fatcat/entities.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# coding: utf-8
-"""
-This is taken from fatcat_tools/transforms/entities.
-"""
-
-import collections
-import json
-
-import toml
-from fatcat_openapi_client import ApiClient
-
-
-def entity_to_dict(entity, api_client=None) -> dict:
-    """
-    Hack to take advantage of the code-generated serialization code.
-
-    Initializing/destroying ApiClient objects is surprisingly expensive
-    (because it involves a threadpool), so we allow passing an existing
-    instance. If you already have a full-on API connection `api`, you can
-    access the ApiClient object as `api.api_client`. This is such a speed-up
-    that this argument may become mandatory.
-    """
-    if not api_client:
-        api_client = ApiClient()
-    return api_client.sanitize_for_serialization(entity)
-
-
-def entity_from_json(json_str: str, entity_type, api_client=None):
-    """
-    Hack to take advantage of the code-generated deserialization code
-
-    See note on `entity_to_dict()` about api_client argument.
-    """
-    if not api_client:
-        api_client = ApiClient()
-    thing = collections.namedtuple('Thing', ['data'])
-    thing.data = json_str
-    return api_client.deserialize(thing, entity_type)
-
-
-def entity_from_dict(obj: dict, entity_type, api_client=None):
-    json_str = json.dumps(obj)
-    return entity_from_json(json_str, entity_type, api_client=api_client)
-
-
-def entity_to_toml(entity, api_client=None, pop_fields=None) -> str:
-    """
-    pop_fields parameter can be used to strip out some fields from the resulting
-    TOML. Eg, for fields which should not be edited, like the ident.
-    """
-    obj = entity_to_dict(entity, api_client=api_client)
-    pop_fields = pop_fields or []
-    for k in pop_fields:
-        obj.pop(k, None)
-    return toml.dumps(obj)
-
-
-def entity_from_toml(toml_str: str, entity_type, api_client=None):
-    obj = toml.loads(toml_str)
-    return entity_from_dict(obj, entity_type, api_client=api_client)
diff --git a/fuzzycat/fatcat/main.py b/fuzzycat/fatcat/main.py
deleted file mode 100644
index 07e4ad4..0000000
--- a/fuzzycat/fatcat/main.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# coding: utf-8
-"""
-Command line entry point for ad-hoc testing.
-""" - -import argparse - -from fatcat_openapi_client import ReleaseEntity, ReleaseExtIds - -from fuzzycat.fatcat.matching import match_release_fuzzy - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("-R", "--release", help="match release", action="store_true") - parser.add_argument("-t", "--title", help="title") - - args = parser.parse_args() - - if args.release and args.title: - re = ReleaseEntity(title=args.title, ext_ids=ReleaseExtIds()) - print(match_release_fuzzy(re, es="https://search.fatcat.wiki")) diff --git a/fuzzycat/fatcat/matching.py b/fuzzycat/fatcat/matching.py deleted file mode 100644 index 04ec275..0000000 --- a/fuzzycat/fatcat/matching.py +++ /dev/null @@ -1,273 +0,0 @@ -# coding: utf-8 -""" -Public API for fuzzy matches for fatcat. - -Match methods return candidates, verify methods return a match status. - - match_containar_fuzzy -> List[ContainerEntity] - match_release_fuzzy -> List[ReleaseEntity] - - verify_serial_name -> MatchStatus - verify_container_name -> MatchStatus - verify_container_fuzzy -> MatchStatus - verify_release_fuzzy -> MatchStatus - -Candidate generation will use external data from search and hence is expensive. Verification is fast. -""" - -from typing import List, Optional, Set, Union - -import elasticsearch -from fatcat_openapi_client import (ApiException, ContainerEntity, DefaultApi, ReleaseEntity, - ReleaseExtIds, WorkEntity) -from fatcat_openapi_client.api.default_api import DefaultApi - -from fuzzycat import cleanups -from fuzzycat.fatcat.common import (MatchStatus, compare_ext_ids, response_to_entity_list) -from fuzzycat.serials import serialsdb - - -def match_container_fuzzy(container: ContainerEntity, - size: int = 5, - es: Optional[Union[str, elasticsearch.client.Elasticsearch]] = None, - api: Optional[DefaultApi] = None) -> List[ContainerEntity]: - """ - Given a container entity, which can be (very) partial, return a list of - candidate matches. Elasticsearch can be a hostport or the low level client - object. - - Random data point: with 20 parallel workers callind match_container_fuzzy, - we get around 40 req/s. - """ - assert isinstance(container, ContainerEntity) - - if size is None or size == 0: - size = 10000 # or any large number - - if isinstance(es, str): - es = elasticsearch.Elasticsearch([es]) - if es is None: - es = elasticsearch.Elasticsearch() - - # If we find any match by ISSN-L, we return only those. - if container.issnl: - s = (elasticsearch_dsl.Search(using=es, index="fatcat_container").query( - "term", issns=container.issnl).extra(size=size)) - resp = s.execute() - if len(resp) > 0: - return response_to_entity_list(resp, entity_type=ContainerEntity, api=api) - - # Do we have an exact QID match? - if container.wikidata_qid: - s = (elasticsearch_dsl.Search(using=es, index="fatcat_container").query( - "term", wikidata_qid=container.wikidata_qid).extra(size=size)) - resp = s.execute() - if len(resp) > 0: - return response_to_entity_list(resp, entity_type=ContainerEntity, api=api) - - # Start with exact name match. - # - # curl -s https://search.fatcat.wiki/fatcat_container/_mapping | jq . 
-    #
-    #   "name": {
-    #     "type": "text",
-    #     "copy_to": [
-    #       "biblio"
-    #     ],
-    #     "analyzer": "textIcu",
-    #     "search_analyzer": "textIcuSearch"
-    #   },
-    #
-    body = {
-        "query": {
-            "match": {
-                "name": {
-                    "query": container.name,
-                    "operator": "AND"
-                }
-            }
-        },
-        "size": size,
-    }
-    resp = es.search(body=body, index="fatcat_container")
-    if resp["hits"]["total"] > 0:
-        return response_to_entity_list(resp, entity_type=ContainerEntity, api=api)
-
-    # Get fuzzy.
-    # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
-    body = {
-        "query": {
-            "match": {
-                "name": {
-                    "query": container.name,
-                    "operator": "AND",
-                    "fuzziness": "AUTO",
-                }
-            }
-        },
-        "size": size,
-    }
-    resp = es.search(body=body, index="fatcat_container")
-    if resp["hits"]["total"] > 0:
-        return response_to_entity_list(resp, entity_type=ContainerEntity, api=api)
-
-    return []
-
-
-def match_release_fuzzy(release: ReleaseEntity,
-                        size: int = 5,
-                        es: Optional[Union[str, elasticsearch.client.Elasticsearch]] = None,
-                        api: Optional[DefaultApi] = None) -> List[ReleaseEntity]:
-    """
-    Given a release entity, return a number similar release entities from
-    fatcat using Elasticsearch.
-    """
-    assert isinstance(release, ReleaseEntity)
-
-    if size is None or size == 0:
-        size = 10000  # or any large number
-
-    if isinstance(es, str):
-        es = elasticsearch.Elasticsearch([es])
-    if es is None:
-        es = elasticsearch.Elasticsearch()
-
-    # Try to match by external identifier.
-    ext_ids = release.ext_ids
-    attrs = {
-        "doi": "doi",
-        "wikidata_qid": "wikidata_qid",
-        "isbn13": "isbn13",
-        "pmid": "pmid",
-        "pmcid": "pmcid",
-        "core": "code_id",
-        "arxiv": "arxiv_id",
-        "jstor": "jstor_id",
-        "ark": "ark_id",
-        "mag": "mag_id",
-    }
-    for attr, es_field in attrs.items():
-        value = getattr(ext_ids, attr)
-        if not value:
-            continue
-        s = (elasticsearch_dsl.Search(using=es,
-                                      index="fatcat_release").query("term", **{
-                                          es_field: value
-                                      }).extra(size=size))
-        resp = s.execute()
-        if len(resp) > 0:
-            return response_to_entity_list(resp, entity_type=ReleaseEntity, api=api)
-
-    body = {
-        "query": {
-            "match": {
-                "title": {
-                    "query": release.title,
-                    "operator": "AND"
-                }
-            }
-        },
-        "size": size,
-    }
-    resp = es.search(body=body, index="fatcat_release")
-    if resp["hits"]["total"] > 0:
-        return response_to_entity_list(resp, entity_type=ReleaseEntity, api=api)
-
-    # Get fuzzy.
-    # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
-    body = {
-        "query": {
-            "match": {
-                "title": {
-                    "query": release.title,
-                    "operator": "AND",
-                    "fuzziness": "AUTO",
-                }
-            }
-        },
-        "size": size,
-    }
-    resp = es.search(body=body, index="fatcat_release")
-    if resp["hits"]["total"] > 0:
-        return response_to_entity_list(resp, entity_type=ReleaseEntity, api=api)
-
-    return []
-
-
-def verify_serial_name(a: str, b: str) -> MatchStatus:
-    """
-    Serial name verification. Serial names are a subset of container names.
-    There are about 2M serials.
-    """
-    def verify(a: Set[str], b: Set[str]) -> MatchStatus:
-
-        # If any name yields multiple ISSN-L, we cannot decide.
-        if len(a) > 1:
-            return MatchStatus.AMBIGIOUS
-        if len(b) > 1:
-            return MatchStatus.AMBIGIOUS
-
-        # If both names point the same ISSN-L, it is an exact match.
-        if len(a) > 0 and len(a) == len(b):
-            if len(a & b) == len(a):
-                return MatchStatus.EXACT
-            else:
-                return MatchStatus.DIFFERENT
-
-        # Multiple names possible, but there is overlap.
-        if len(a & b) > 0:
-            return MatchStatus.STRONG
-
-        return MatchStatus.AMBIGIOUS
-
-    # First, try values as given.
-    issnls_for_a = serialsdb.get(a, set())
-    issnls_for_b = serialsdb.get(b, set())
-
-    status = verify(issnls_for_a, issnls_for_b)
-    if status != MatchStatus.AMBIGIOUS:
-        return status
-
-    # Try to match slightly cleaned up values.
-    issnls_for_a = serialsdb.get(a, set(), cleanup_pipeline=cleanups.basic)
-    issnls_for_b = serialsdb.get(b, set(), cleanup_pipeline=cleanups.basic)
-
-    return verify(issnls_for_a, issnls_for_b)
-
-
-def verify_container_name(a: str, b: str) -> MatchStatus:
-    status = verify_serial_name(a, b)
-    if status != MatchStatus.AMBIGIOUS:
-        return status
-
-    # TODO: add additional verification, string match and common patterns.
-
-
-def verify_container_match(a: ContainerEntity, b: ContainerEntity) -> MatchStatus:
-    pass
-
-
-def verify_release_match(a: ReleaseEntity, b: ReleaseEntity) -> MatchStatus:
-    assert isinstance(a, ReleaseEntity)
-    assert isinstance(b, ReleaseEntity)
-
-    if a == b:
-        return MatchStatus.EXACT
-
-    a_ext_ids, b_ext_ids = a.ext_ids, b.ext_ids
-    # Compare ext ids, result is a counter, we are interested in "hits" and
-    # "misses", only.
-    cmp_result = compare_ext_ids(a_ext_ids, b_ext_ids)
-
-    # Assume that if more ids match than mismatch, it is a good signal, e.g. if
-    # only a DOI is defined and they match, it is an exact match.
-    if cmp_result["hits"] > 0 and cmp_result["misses"] == 0:
-        return MatchStatus.EXACT
-    if cmp_result["hits"] > cmp_result["misses"]:
-        return MatchStatus.STRONG
-    if cmp_result["hits"] == 0 and cmp_result["misses"] > 0:
-        return MatchStatus.DIFFERENT
-    if cmp_result["hits"] < cmp_result["misses"]:
-        return MatchStatus.AMBIGIOUS
-
-    # TODO: do title verification, apply string cleanups, etc.
diff --git a/fuzzycat/issn.py b/fuzzycat/issn.py
deleted file mode 100644
index aa6b78a..0000000
--- a/fuzzycat/issn.py
+++ /dev/null
@@ -1,401 +0,0 @@
-"""
-Munge the ISSN data so we get some container name test data out of it.
-
-    ...
-    "issn": "0000-0019",
-    "mainTitle": "The publishers weekly.",
-    "name": [
-        "The publishers weekly.",
-        "Publishers weekly"
-    ],
-
-Public data from ISSN via:
-https://portal.issn.org/resource/ISSN/0874-2308?format=json, and issnlister
-(https://github.com/miku/issnlister) to aggregate.
-
-The dataset contains naming variants in "name".
-
-Q1: How many of these variants would our matching algorithm detect?
-
-For that, we need a dataset that generates pairs (a, b) from all names (the
-mainTitle is just one of the name).
-
-Example JSON LD response from ISSN:
-
-{
-    "@context": {
-        "format": {
-            "@id": "http://purl.org/dc/elements/1.1/format",
-            "@type": "@id"
-        },
-        "identifiedBy": {
-            "@id": "http://id.loc.gov/ontologies/bibframe/identifiedBy",
-            "@type": "@id"
-        },
-        "identifier": {
-            "@id": "http://purl.org/dc/elements/1.1/identifier"
-        },
-        "isPartOf": {
-            "@id": "http://schema.org/isPartOf",
-            "@type": "@id"
-        },
-        "issn": {
-            "@id": "http://purl.org/ontology/bibo/issn"
-        },
-        "label": {
-            "@id": "http://www.w3.org/2000/01/rdf-schema#label"
-        },
-        "location": {
-            "@id": "http://schema.org/location",
-            "@type": "@id"
-        },
-        "mainEntity": {
-            "@id": "http://schema.org/mainEntity",
-            "@type": "@id"
-        },
-        "modified": {
-            "@id": "http://purl.org/dc/terms/modified",
-            "@type": "http://www.w3.org/2001/XMLSchema#dateTime"
-        },
-        "name": {
-            "@id": "http://schema.org/name"
-        },
-        "publication": {
-            "@id": "http://schema.org/publication",
-            "@type": "@id"
-        },
-        "status": {
-            "@id": "http://id.loc.gov/ontologies/bibframe/status",
-            "@type": "@id"
-        },
-        "title": {
-            "@id": "http://id.loc.gov/ontologies/bibframe/title",
-            "@type": "@id"
-        },
-        "type": {
-            "@id": "http://purl.org/dc/terms/type",
-            "@type": "@id"
-        },
-        "value": {
-            "@id": "http://www.w3.org/1999/02/22-rdf-syntax-ns#value"
-        },
-        "wasAttributedTo": {
-            "@id": "http://www.w3.org/ns/prov#wasAttributedTo",
-            "@type": "@id"
-        }
-    },
-    "@graph": [
-        {
-            "@id": "http://id.loc.gov/vocabulary/countries/pl",
-            "label": "Poland"
-        },
-        {
-            "@id": "organization/ISSNCenter#57",
-            "@type": "http://schema.org/Organization"
-        },
-        {
-            "@id": "resource/ISSN-L/0001-4125",
-            "identifiedBy": "resource/ISSN/0001-4125#ISSN-L"
-        },
-        {
-            "@id": "resource/ISSN/0001-4125",
-            "@type": [
-                "http://schema.org/Periodical",
-                "http://id.loc.gov/ontologies/bibframe/Instance",
-                "http://id.loc.gov/ontologies/bibframe/Work"
-            ],
-            "format": "vocabularies/medium#Print",
-            "http://schema.org/issn": "0001-4125",
-            "identifiedBy": [
-                "resource/ISSN/0001-4125#ISSN-L",
-                "resource/ISSN/0001-4125#KeyTitle",
-                "resource/ISSN/0001-4125#ISSN"
-            ],
-            "identifier": "0001-4125",
-            "isPartOf": "resource/ISSN-L/0001-4125",
-            "issn": "0001-4125",
-            "name": "Bulletin de l'Académie Polonaise des Sciences. Série des Sciences Techniques",
-            "publication": "resource/ISSN/0001-4125#ReferencePublicationEvent",
-            "title": "resource/ISSN/0001-4125#KeyTitle",
-            "type": "http://marc21rdf.info/terms/formofmaterial#a"
-        },
-        {
-            "@id": "resource/ISSN/0001-4125#ISSN",
-            "@type": "http://id.loc.gov/ontologies/bibframe/Issn",
-            "status": "vocabularies/IdentifierStatus#Valid",
-            "value": "0001-4125"
-        },
-        {
-            "@id": "resource/ISSN/0001-4125#ISSN-L",
-            "@type": "http://id.loc.gov/ontologies/bibframe/IssnL",
-            "status": "vocabularies/IdentifierStatus#Valid",
-            "value": "0001-4125"
-        },
-        {
-            "@id": "resource/ISSN/0001-4125#KeyTitle",
-            "@type": [
-                "http://id.loc.gov/ontologies/bibframe/Identifier",
-                "http://id.loc.gov/ontologies/bibframe/KeyTitle"
-            ],
-            "value": "Bulletin de l'Académie Polonaise des Sciences. Série des Sciences Techniques"
-        },
-        {
-            "@id": "resource/ISSN/0001-4125#Record",
-            "@type": "http://schema.org/CreativeWork",
-            "mainEntity": "resource/ISSN/0001-4125",
-            "modified": "20051223105700.0",
-            "status": "vocabularies/RecordStatus#Register",
-            "wasAttributedTo": "organization/ISSNCenter#57"
-        },
-        {
-            "@id": "resource/ISSN/0001-4125#ReferencePublicationEvent",
-            "@type": "http://schema.org/PublicationEvent",
-            "location": "http://id.loc.gov/vocabulary/countries/pl"
-        }
-    ]
-}
-
-"""
-
-import argparse
-import collections
-import itertools
-import json
-import os
-import re
-import shelve
-import sys
-from typing import Any, Callable, Dict, Generator, Iterable, List, Tuple, Union
-
-from simhash import Simhash
-
-from fuzzycat import cleanups
-from fuzzycat.utils import SetEncoder
-
-
-def listify(v: Union[str, List[str]]) -> List[str]:
-    """
-    Sensible create a list.
-    """
-    if v is None:
-        return []
-    if isinstance(v, str):
-        return [v]
-    return v
-
-
-def jsonld_minimal(v: Dict[str, Any]) -> Dict[str, Any]:
-    """
-    Turn a JSON from issn.org into a smaller dict with a few core fields. Will
-    fail, if no ISSN-L is found in the input.
-
-        {
-          "issnl": "0001-4125",
-          "material": [],
-          "issns": [
-            "0001-4125"
-          ],
-          "urls": [],
-          "names": [
-            "Bulletin de l'Académie Polonaise des Sciences. Série des Sciences Techniques"
-          ]
-        }
-
-    """
-    items = v.get("@graph")
-    if not items:
-        return {}
-    doc = {}
-    for item in items:
-        # "@id": "resource/ISSN-L/0001-4125"
-        # "@id": "resource/ISSN/0001-4125"
-        # ...
-        id = item.get("@id")
-        if not id:
-            continue
-
-        # ISSN-L
-        match = re.match(r"^resource/ISSN-L/([0-9]{4,4}-[0-9]{3,3}[0-9xX])$", id)
-        if match:
-            doc["issnl"] = match.group(1)
-            continue
-
-        # The "main" issn entry.
-        match = re.match(r"^resource/ISSN/([0-9]{4,4}-[0-9]{3,3}[0-9xX])$", id)
-        if match:
-            # if we do not have ISSN-L yet, check "exampleOfWork",
-            # "resource/ISSN/2658-0705"
-            if not "issnl" in doc:
-                match = re.match(r"^resource/ISSN-L/([0-9]{4,4}-[0-9]{3,3}[0-9xX])$",
-                                 item.get("exampleOfWork", ""))
-                if match:
-                    doc["issnl"] = match.group(1)
-
-            # note material
-            doc["material"] = listify(item.get("material", []))
-
-            # collect ids
-            issns = set([match.group(1)])
-            if item.get("identifier"):
-                issns.add(item.get("identifier"))
-            if item.get("issn"):
-                issns.add(item.get("issn"))
-            doc["issns"] = issns
-            # add urls
-            doc["urls"] = listify(item.get("url", []))
-            # add names, variants
-            names = listify(item.get("name")) + listify(item.get("alternateName"))
-            doc["names"] = list(set(names))
-
-            # add related issn
-            for v in listify(item.get("isFormatOf", [])):
-                match = re.match(r"^resource/ISSN/([0-9]{4,4}-[0-9]{3,3}[0-9xX])$", v)
-                if match:
-                    doc["issns"].add(match.group(1))
-
-    if "issnl" not in doc:
-        raise ValueError("entry without issnl: {}".format(item))
-
-    return doc
-
-
-def de_jsonld(lines: Iterable):
-    """
-    Batch convert jsonld to minimal JSON and write to stdout.
-    """
-    for line in lines:
-        line = line.strip()
-        try:
-            doc = jsonld_minimal(json.loads(line))
-        except json.decoder.JSONDecodeError as exc:
-            print("failed to parse json: {}, data: {}".format(exc, line), file=sys.stderr)
-            continue
-        else:
-            print(json.dumps(doc, cls=SetEncoder))
-
-
-def generate_name_pairs(lines: Iterable,
-                        cleanup_pipeline: Callable[[str], str] = None,
-                        keep_original: bool = True) -> Generator[Tuple[str, str, str], None, None]:
-    """
-    Given JSON lines, yield a tuple (issnl, a, b) of test data. Will skip on
-    errors. Proto unit test data.
-
-    Example output:
-
-        0013-211X	Eendracht-bode (Tholen)	Eendracht-bode.
-        0012-7388	Dynamic maturity	Dynamic maturity.
-        0012-6055	Drehpunkt.	Drehpunkt (Basel. 1968)
-
-    Basically, these would be free test cases, since we would like to report
-    "match" on most of these.
-
-    That can be useful to detect various scripts refering to the same journal.
-
-        0040-2249	Tehnika kino i televideniâ.	Tehnika kino i televideniâ
-        0040-2249	Tehnika kino i televideniâ.	Техника кино и телевидения
-        0040-2249	Tehnika kino i televideniâ.	Техника кино и телевидения.
-        0040-2249	Tehnika kino i televideniâ	Техника кино и телевидения
-        0040-2249	Tehnika kino i televideniâ	Техника кино и телевидения.
-        0040-2249	Техника кино и телевидения	Техника кино и телевидения.
-
-    If cleanup_pipeline is given, additionally add
-    """
-    for line in lines:
-        line = line.strip()
-        try:
-            doc = jsonld_minimal(json.loads(line))
-        except json.decoder.JSONDecodeError as exc:
-            print("failed to parse json: {}, data: {}".format(exc, line), file=sys.stderr)
-            continue
-        for a, b in itertools.combinations(doc.get("names", []), 2):
-            if cleanup_pipeline is None or (cleanup_pipeline is not None and keep_original):
-                yield (doc["issnl"], a, b)
-            if cleanup_pipeline:
-                a = cleanup_pipeline(a)
-                b = cleanup_pipeline(b)
-                yield (doc["issnl"], a, b)
-
-
-def generate_name_issn_mapping(lines: Iterable, cleanup_pipeline: Callable[[str], str] = None):
-    """
-    Given JSON lines, generate a dictionary mapping names sets of ISSN. Names
-    might be reused.
-    """
-    mapping = collections.defaultdict(set)
-    for issnl, a, b in generate_name_pairs(lines, cleanup_pipeline=cleanup_pipeline):
-        mapping[a].add(issnl)
-        mapping[b].add(issnl)
-    return mapping
-
-
-def generate_shelve(lines: Iterable, output: str, cleanup_pipeline: Callable[[str], str] = None):
-    """
-    Generate a persistent key value store from name issn mappings. 5015523
-    entries, 1.1G take about 5min.
-    """
-    with shelve.open(output) as db:
-        mapping = generate_name_issn_mapping(lines, cleanup_pipeline=cleanup_pipeline)
-        for name, issnls in mapping.items():
-            db[name] = issnls
-        print("wrote {} keys to {}".format(len(db), output), file=sys.stderr)
-
-
-def generate_simhash(lines: Iterable):
-    """
-    Print TSV with simhash values.
-
-    Match and non-match count.
-
-        1069447 1
-         927120 0
-    """
-    for issnl, a, b in generate_name_pairs(lines):
-        ha = Simhash(a).value
-        hb = Simhash(b).value
-        row = (issnl, 0 if ha == hb else 1, ha, hb)
-        print("\t".join([str(v) for v in row]))
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("file",
-                        default=sys.stdin,
-                        type=argparse.FileType("r"),
-                        help="public data from issn, one JSON object per line")
-    parser.add_argument("--make-pairs",
-                        action="store_true",
-                        help="generate TSV and write to stdout")
-    parser.add_argument("--make-mapping",
-                        action="store_true",
-                        help="generate JSON mapping from name to list of ISSN")
-    parser.add_argument("--make-shelve",
-                        action="store_true",
-                        help="generate trie mapping from name to list of ISSN")
-    parser.add_argument("--make-simhash", action="store_true", help="print out simhash value")
-    parser.add_argument("-o",
-                        "--output",
-                        type=str,
-                        default="output.file",
-                        help="write output to file")
-    parser.add_argument("-c", "--cleanup", type=str, default=None, help="cleanup pipeline name")
-    parser.add_argument("--de-jsonld", action="store_true", help="break up the jsonld")
-
-    args = parser.parse_args()
-
-    # Add additional cleanup routines here.
-    cleanup = dict(basic=cleanups.basic).get(args.cleanup)
-
-    if args.make_mapping:
-        print(
-            json.dumps(generate_name_issn_mapping(args.file, cleanup_pipeline=cleanup),
-                       cls=SetEncoder))
-    if args.make_pairs:
-        for issn, a, b in generate_name_pairs(args.file, cleanup_pipeline=cleanup):
-            print("{}\t{}\t{}".format(issn, a, b))
-    if args.de_jsonld:
-        de_jsonld(args.file)
-    if args.make_shelve:
-        generate_shelve(args.file, output=args.output, cleanup_pipeline=cleanup)
-    if args.make_simhash:
-        generate_simhash(args.file)
diff --git a/fuzzycat/serials.py b/fuzzycat/serials.py
deleted file mode 100644
index 2f1782d..0000000
--- a/fuzzycat/serials.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# coding: utf-8
-"""
-Serial name matching. Includes names from issn database.
-"""
-
-import os
-import shelve
-
-__all__ = ["serialsdb"]
-
-
-class SerialsDatabase:
-    """
-    Lookup allows to lookup serial names, using a database of real serial names.
-
-        >>> from serials import serialsdb
-        >>> serialsdb.get("Philosophica")
-        {'1857-9272', '2232-299X', '2232-3007', '2232-3015'}
-
-    """
-    def __init__(self, path=None):
-        """
-        Note that shelve appends "db" to the name automatically. TODO: make this
-        auto-download into a cache directory.
-        """
-        if path is None:
-            path = os.path.join(os.path.expanduser("~"), ".cache/fuzzycat/names")
-        self.db = shelve.open(path, flag='r')
-
-    def __getitem__(self, v):
-        return self.db[v]
-
-    def get(self, v, default=None, cleanup_pipeline=None):
-        if not cleanup_pipeline:
-            return self.db.get(v, default=default)
-        return self.db.get(cleanup_pipeline(v), default=default)
-
-    def close(self):
-        self.db.close()
-
-
-# A singleton.
-serialsdb = SerialsDatabase()
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
deleted file mode 100644
index 9d2a2f7..0000000
--- a/fuzzycat/utils.py
+++ /dev/null
@@ -1,249 +0,0 @@
-# coding: utf-8
-
-import collections
-import itertools
-import json
-import re
-import string
-from typing import Any, Callable, DefaultDict, Dict, List, Optional, Sequence
-"""
-A couple of utilities, may be split up into separate modules.
-"""
-
-
-class SetEncoder(json.JSONEncoder):
-    """
-    Helper to encode python sets into JSON lists.
-    So you can write something like this:
-        json.dumps({"things": set([1, 2, 3])}, cls=SetEncoder)
-    """
-    def default(self, obj):
-        """
-        Decorate call to standard implementation.
-        """
-        if isinstance(obj, set):
-            return list(obj)
-        return json.JSONEncoder.default(self, obj)
-
-
-class StringPipeline:
-    """
-    Minimalistic grouping of functions applied on an input string to produce
-    some cleaned or normalized output. Pipeline functions are Func[[str], str].
-
-        >>> cleanups = StringPipeline([
-        ...     str.lower,
-        ...     remove_html_tags,
-        ...     normalize_whitespace,
-        ...     normalize_ampersand,
-        ... ])
-        >>> cleanups("Input & Output")
-        input and output
-
-    """
-    def __init__(self, fs: List[Callable[[str], str]]):
-        self.fs = fs
-
-    def __call__(self, s: str) -> str:
-        return self.run(s)
-
-    def run(self, s: str) -> str:
-        """
-        Apply all function and return result. Deprecated: just call the object.
-        """
-        for f in self.fs:
-            s = f(s)
-        return s
-
-
-class StringAnnotator:
-    """
-    Experimental, rationale: In some way, feature engineering; we want to
-    derive metrics, number from the string, do this consistently and compactly.
-    E.g. once we have dozens of "speaking" characteristics, a case based method
-    might become more readble.
-
-        if s.is_single_token and s.some_ratio > 0.4:
-            return MatchStatus.AMBIGIOUS
-
-    Could also subclass string and pluck more methods on it (might be even
-    reusable).
-
-    ....
-
-    Given a string, derive a couple of metrics, based on functions. The
-    annotation is a dict, mapping an annotation key to a value of any type.
-
-        >>> metrics = StringAnnotator([
-        ...     has_html_tags,
-        ...     has_only_printable_characters,
-        ...     is_single_token,
-        ...     length,
-        ...     has_year_in_parentheses,
-        ... ])
-        >>> metrics.run("Journal of Pataphysics 2038-2032")
-        {"value": "Journal of Pataphysics 2038-2032", "is_single_token": False, ... }
-
-    TODO(martin):
-
-    * SimpleNamespace, dotdict, Dataclass.
-    * string_utils.py or similar
-    * maybe adopt SpaCy or similar
-    """
-    def __init__(self, fs: List[Callable[[str], Dict[str, Any]]]):
-        self.fs = fs
-
-    def run(self, s: str) -> Dict[str, Any]:
-        annotations: DefaultDict[str, Any] = collections.defaultdict(dict)
-        for f in self.fs:
-            result = f(s)
-            annotations.update(result)
-        return annotations
-
-
-def normalize_whitespace(s: str) -> str:
-    """
-    Remove trailing spaces and normalize whitespace.
-    """
-    return re.sub(r"\s{2,}", " ", s.strip())
-
-
-def normalize_ampersand(s: str) -> str:
-    """
-    Normalize ampersand to and.
-    """
-    return s.replace(" & ", " and ")
-
-
-def letter_to_non_letter_ratio(s: str) -> float:
-    """
-    Non letters are defined by printable w/o letters.
-    """
-    if len(s) == 0:
-        return 0.0
-    non_letters = set(string.printable) - set(string.ascii_letters)
-    non_letter_count = sum(c in non_letters for c in s)
-    return non_letter_count / len(s)
-
-
-def alphanumeric_ratio(s: str) -> float:
-    """
-    Ratio of letters, digit and whitespace to total string length.
-    """
-    if len(s) == 0:
-        return 0.0
-    alphanumeric = set(string.ascii_letters) | set(string.digits) | set([" "])
-    alphanumeric_count = sum(c in alphanumeric for c in s)
-    return alphanumeric_count / len(s)
-
-
-def alphanumeric_only(s: str) -> str:
-    """
-    Remove all non-alphanumeric content from string.
-    """
-    alphanumeric = set(string.ascii_letters) | set(string.digits) | set([" "])
-    return "".join((c for c in s if c in alphanumeric))
-
-
-def parenthesized_year(s: str) -> Optional[str]:
-    """
-    Return the year only, if it is in parentheses, e.g. Hello (2020).
-    """
-    match = re.search(r"[\(\[]\s*([12][\d]{3})\s*[\]\)]", s)
-    if match:
-        return match.group(1)
-    return None
-
-
-def has_non_letters_ratio(s: str, threshold: float = 0.4) -> bool:
-    """
-    Check the ratio of non-letters in a string, e.g. for things like "A.R.G.H"
-    """
-    if len(s) == 0:
-        return False
-    return (sum(c not in string.ascii_letters for c in s) / len(s)) > threshold
-
-
-def is_single_word_printable(s: str) -> bool:
-    """
-    True, if s is a single token of printable characters.
-    """
-    return all(c in string.printable for c in s) and s.split() == 1
-
-
-def extract_wikidata_qids(s: str) -> List[str]:
-    """
-    Given a string, extract all qids.
-    """
-    return re.findall(r"Q[0-9]{1,10}", s)
-
-
-def extract_issns(s: str) -> List[str]:
-    """
-    Given a string return a list of valid ISSN.
-    """
-    pattern = r"[0-9]{4,4}-[0-9]{3,3}[0-9xX]"
-    return [v for v in re.findall(pattern, s) if is_valid_issn(v)]
-
-
-def longest_common_prefix(a: Sequence, b: Sequence) -> Sequence:
-    """
-    Return the longest common prefix of a and b. The length of the return value
-    is at most min(len(a), len(b)).
-    """
-    a, b = sorted((a, b), key=len)
-    for i, (u, v) in enumerate(zip(a, b)):
-        if u != v:
-            return a[:i]
-    return a
-
-
-def common_prefix_length_ratio(a: Sequence, b: Sequence) -> float:
-    """
-    Return a float between 0.0 and 1.0 expressing the ratio between the length
-    of the common shared prefix to the length of the longest sequence.
-    """
-    maxlen = max(len(a), len(b))
-    if maxlen == 0:
-        return 0.0
-    return len(longest_common_prefix(a, b)) / maxlen
-
-
-def hamming_distance(s: str, t: str) -> int:
-    """
-    Return hamming distance of s and t.
-    """
-    return sum((u != v for u, v in itertools.zip_longest(s, t)))
-
-
-def calculate_issn_checkdigit(s: str) -> str:
-    """
-    Given a string of length 7, return the ISSN check value (digit or X) as
-    string.
-    """
-    if len(s) != 7:
-        raise ValueError("seven digits required")
-    ss = sum((int(digit) * f for digit, f in zip(s, range(8, 1, -1))))
-    _, mod = divmod(ss, 11)
-    checkdigit = 0 if mod == 0 else 11 - mod
-    result = "X" if checkdigit == 10 else "{}".format(checkdigit)
-    return result
-
-
-def is_valid_issn(issn: str) -> bool:
-    """
-    Return True, if the ISSN is valid. This does not mean it is registered.
-    """
-    if "-" in issn:
-        issn = issn.replace("-", "")
-    if len(issn) != 8:
-        raise ValueError("invalid issn length: {}".format(issn))
-    checkdigit = calculate_issn_checkdigit(issn[:7])
-    return issn[7] == "{}".format(checkdigit)
-
-
-def keys_with_values(d: Dict) -> List[Any]:
-    """
-    Return all keys of a dictionary which have non-falsy values.
-    """
-    return [k for k, v in d.items() if v]
diff --git a/setup.py b/setup.py
index 33629e6..2e06672 100644
--- a/setup.py
+++ b/setup.py
@@ -23,8 +23,6 @@ with open("README.md", "r") as fh:
     python_requires=">=3.6",
     zip_safe=False,
     entry_points={"console_scripts": [
-        "fuzzycat=fuzzycat.fatcat.main:main",
-        "fuzzycat-issn=fuzzycat.issn:main",
         "fuzzycat-cluster=fuzzycat.cluster:main",
     ],},
     install_requires=[
diff --git a/tests/test_matching.py b/tests/test_matching.py
deleted file mode 100644
index 6ae393b..0000000
--- a/tests/test_matching.py
+++ /dev/null
@@ -1,4 +0,0 @@
-# coding: utf-8
-"""
-Test cases for fuzzy matching.
-""" diff --git a/tests/test_utils.py b/tests/test_utils.py deleted file mode 100644 index cc7fae0..0000000 --- a/tests/test_utils.py +++ /dev/null @@ -1,128 +0,0 @@ -# coding: utf-8 - -from typing import List, NamedTuple - -import pytest - -from fuzzycat.utils import * - - -def test_extract_issns(): - Case = NamedTuple("Case", [("s", str), ("result", List[str])]) - cases = ( - Case("", []), - Case("Hello 1234", []), - Case("Hello 1084-5100 World", ["1084-5100"]), - Case("Hello 10845100 World", []), - Case("Hello 1084-5100 1084-5100 World", ["1084-5100", "1084-5100"]), - Case("2323-573X 2169-1886 Journal", ["2323-573X", "2169-1886"]), - ) - for c in cases: - result = extract_issns(c.s) - assert result == c.result - - -def test_longest_common_prefix(): - Case = NamedTuple("Case", [("a", str), ("b", str), ("result", str)]) - cases = ( - Case("", "", ""), - Case("a", "", ""), - Case("ab", "a", "a"), - Case("123", "123", "123"), - ) - for c in cases: - result = longest_common_prefix(c.a, c.b) - assert result == c.result - - -def test_common_prefix_length_ratio(): - Case = NamedTuple("Case", [("a", str), ("b", str), ("result", float)]) - cases = ( - Case("", "", 0.0), - Case("a", "", 0.0), - Case("Hello World!", "ello", 0.0), - Case("ab", "a", 0.5), - Case("123", "123", 1.0), - Case("1234", "123", 0.75), - ) - for c in cases: - result = common_prefix_length_ratio(c.a, c.b) - assert result == c.result - - -def test_hamming_distance(): - Case = NamedTuple("Case", [("a", str), ("b", str), ("result", int)]) - cases = ( - Case("", "", 0), - Case("a", "a", 0), - Case("a", "ab", 1), - Case("abc", "cba", 2), - Case("1234", "", 4), - ) - for c in cases: - result = hamming_distance(c.a, c.b) - assert result == c.result - - -def test_is_valid_issn(): - cases = { - "value_error": ("", "1234", "123456", "111122223333", "XXXXXXXX"), - "valid": ( - "0710-4081", - "0011-7625", - "2268-5901", - "1809-0710", - "1533-7561", - "07104081", - "00117625", - "22685901", - "18090710", - "15337561", - ), - "invalid": ( - "0710-4080", - "0011-7626", - "2268-5902", - "1809-0709", - "1533-7560", - "07104080", - "00117626", - "22685902", - "18090709", - "15337560", - ), - } - for ve in cases["value_error"]: - with pytest.raises(ValueError): - is_valid_issn(ve) - for v in cases["valid"]: - assert is_valid_issn(v) == True - for v in cases["invalid"]: - assert is_valid_issn(v) == False - - -def test_keys_with_values(): - Case = NamedTuple("Case", [("d", Dict), ("result", List[Any])]) - cases = ( - Case({}, []), - Case({"a": "v"}, ["a"]), - Case({ - "a": "", - "b": "v" - }, ["b"]), - Case({ - "a": None, - "b": "v" - }, ["b"]), - Case({ - "a": [], - "b": "v" - }, ["b"]), - Case({ - "a": 0, - "b": "v" - }, ["b"]), - ) - for case in cases: - result = keys_with_values(case.d) - assert result == case.result -- cgit v1.2.3