author     Martin Czygan <martin.czygan@gmail.com>   2020-08-17 17:30:28 +0200
committer  Martin Czygan <martin.czygan@gmail.com>   2020-08-17 17:30:28 +0200
commit     3cdd049998ed85827dc6339725ea8fdda5a700aa (patch)
tree       c243547543cdbe11d235a13af42e016e633f6c6e
parent     5084ca9fe10b4c58afc28370d98cf3e798bd2109 (diff)
large overhaul

* separate all fatcat-related code into a fatcat submodule
* more type annotations
* add verify_serial_name for journal names (usage sketch below)
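The reorganized public API can be exercised as follows. This is a minimal sketch, not part of the commit: the journal names are only illustrative, and it assumes a local `names` shelve database has been built beforehand (for instance with `fuzzycat-issn --make-shelve`), since `fuzzycat.serials` opens it read-only at import time.

```python
# Minimal sketch (not part of the commit): assumes a local "names" shelve
# built via `fuzzycat-issn --make-shelve`; the journal names are examples.
from fuzzycat.fatcat.common import MatchStatus
from fuzzycat.fatcat.matching import verify_serial_name

status = verify_serial_name("Journal of Proteomics", "J. of Proteomics")
if status in (MatchStatus.EXACT, MatchStatus.STRONG):
    print("same serial")
```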
-rw-r--r--  fuzzycat/__init__.py         |   4
-rw-r--r--  fuzzycat/cleanups.py         |   6
-rw-r--r--  fuzzycat/fatcat/api_auth.py  |  45
-rw-r--r--  fuzzycat/fatcat/common.py    | 164
-rw-r--r--  fuzzycat/fatcat/entities.py  |  60
-rw-r--r--  fuzzycat/fatcat/matching.py  | 233
-rw-r--r--  fuzzycat/issn.py             |  46
-rw-r--r--  fuzzycat/journals.py         |  33
-rw-r--r--  fuzzycat/main.py             |   5
-rw-r--r--  fuzzycat/matching.py         | 147
-rw-r--r--  fuzzycat/serials.py          |  43
-rw-r--r--  fuzzycat/status.py           |  15
-rw-r--r--  fuzzycat/utils.py            |   4
-rw-r--r--  setup.py                     |   6
14 files changed, 577 insertions(+), 234 deletions(-)
diff --git a/fuzzycat/__init__.py b/fuzzycat/__init__.py
index 7feffd5..6c381d0 100644
--- a/fuzzycat/__init__.py
+++ b/fuzzycat/__init__.py
@@ -1,6 +1,4 @@
 __version__ = "0.1.1"

-from fuzzycat.matching import match_container_names
-from fuzzycat.status import MatchStatus
+from fuzzycat.serials import serialsdb
 from fuzzycat.utils import *
-from fuzzycat.journals import JournalLookup
diff --git a/fuzzycat/cleanups.py b/fuzzycat/cleanups.py
index d806e51..c2e021d 100644
--- a/fuzzycat/cleanups.py
+++ b/fuzzycat/cleanups.py
@@ -1,10 +1,8 @@
-
 """
 Various shared cleanup approaches.
 """

-from fuzzycat.utils import StringPipeline, normalize_whitespace, normalize_ampersand
-
+from fuzzycat.utils import (StringPipeline, normalize_ampersand, normalize_whitespace)

 # These transformations should not affect the name of a journal.
 basic = StringPipeline([
@@ -13,5 +11,3 @@ basic = StringPipeline([
     normalize_ampersand,
     lambda v: v.rstrip("."),
 ])
-
-
diff --git a/fuzzycat/fatcat/api_auth.py b/fuzzycat/fatcat/api_auth.py
new file mode 100644
index 0000000..0bad5e9
--- /dev/null
+++ b/fuzzycat/fatcat/api_auth.py
@@ -0,0 +1,45 @@
+# coding: utf-8
+"""
+API helper, taken from fatcat_tools/api_auth.py
+"""
+
+import os
+import sys
+
+import fatcat_openapi_client
+
+
+def public_api(host_uri):
+    """
+    Note: unlike the authenticated variant, this helper might get called even
+    if the API isn't going to be used, so it's important that it doesn't try to
+    actually connect to the API host or something.
+    """
+    conf = fatcat_openapi_client.Configuration()
+    conf.host = host_uri
+    return fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(conf))
+
+
+def authenticated_api(host_uri, token=None):
+    """
+    Note: if this helper is called, it's implied that an actual API connection
+    is needed, so it does try to connect and verify credentials.
+    """
+
+    conf = fatcat_openapi_client.Configuration()
+    conf.host = host_uri
+    if not token:
+        token = os.environ['FATCAT_API_AUTH_TOKEN']
+    if not token:
+        sys.stderr.write(
+            'This client requires a fatcat API token (eg, in env var FATCAT_API_AUTH_TOKEN)\n')
+        sys.exit(-1)
+
+    conf.api_key["Authorization"] = token
+    conf.api_key_prefix["Authorization"] = "Bearer"
+    api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(conf))
+
+    # verify up front that auth is working
+    api.auth_check()
+
+    return api
diff --git a/fuzzycat/fatcat/common.py b/fuzzycat/fatcat/common.py
new file mode 100644
index 0000000..7499ce4
--- /dev/null
+++ b/fuzzycat/fatcat/common.py
@@ -0,0 +1,164 @@
+# coding: utf-8
+"""
+Adapter for fatcat and fatcat entities.
+"""
+
+import collections
+from enum import Enum
+from typing import Dict, List, Type, Union
+
+from fatcat_openapi_client import (ApiException, ContainerEntity, DefaultApi, ReleaseEntity,
+                                   ReleaseExtIds, WorkEntity)
+
+from fuzzycat.fatcat.api_auth import public_api
+from fuzzycat.fatcat.entities import entity_from_dict, entity_from_json
+
+
+class MatchStatus(Enum):
+    """
+    When matching two entities, use these levels to express match strength.
+    When in doubt, use AMBIGIOUS. DIFFERENT should be used only when it is
+    certain that items do not match.
+    """
+
+    EXACT = 0
+    STRONG = 1
+    WEAK = 2
+    AMBIGIOUS = 3
+    DIFFERENT = 4
+
+
+def compare_ext_ids(a: ReleaseExtIds, b: ReleaseExtIds) -> Dict[str, int]:
+    """
+    Returns a dictionary with the number of existing, matching and differing
+    identifiers between entity a and b. TODO(martin): It might be helpful to
+    have some mapping service, that would relate qid to doi, or a mag to a
+    jstor id, if this information is known.
+    """
+    counter = collections.Counter({"a": 0, "b": 0, "both": 0, "hits": 0, "misses": 0})
+    attrs = (
+        "doi",
+        "wikidata_qid",
+        "isbn13",
+        "pmid",
+        "pmcid",
+        "core",
+        "arxiv",
+        "jstor",
+        "ark",
+        "mag",
+    )
+    for attr in attrs:
+        v = getattr(a, attr)
+        w = getattr(b, attr)
+        if v:
+            counter["a"] += 1
+        if w:
+            counter["b"] += 1
+        if not v or not w:
+            continue
+        counter["both"] += 1
+        if v == w:
+            counter["hits"] += 1
+        else:
+            counter["misses"] += 1
+    return counter
+
+
+def fetch_container_list(
+        ids: List[str],
+        api: DefaultApi = None,
+) -> List[ContainerEntity]:
+    """
+    Fetch a list of containers from the API.
+    """
+    if api is None:
+        api = public_api("https://api.fatcat.wiki/v0")
+    result = []
+    for id in ids:
+        try:
+            ce = api.get_container(id)
+            result.append(ce)
+        except ApiException as exc:
+            if exc.status == 404:
+                print("[err] failed to fetch container: {}".format(id), file=sys.stderr)
+                continue
+            raise
+    return result
+
+
+def fetch_release_list(
+        ids: List[str],
+        api: DefaultApi = None,
+) -> List[ReleaseEntity]:
+    """
+    Returns a list of entities. Some entities might be missing. Return all that
+    are accessible.
+    """
+    if api is None:
+        api = public_api("https://api.fatcat.wiki/v0")
+    result = []
+    for id in ids:
+        try:
+            re = api.get_release(id, hide="refs,abstracts", expand="container")
+            result.append(re)
+        except ApiException as exc:
+            if exc.status == 404:
+                print("[err] failed to fetch release: {}".format(id), file=sys.stderr)
+                continue
+            raise
+    return result
+
+
+def entity_comparable_attrs(
+        a: Union[ContainerEntity, ReleaseEntity],
+        b: Union[ContainerEntity, ReleaseEntity],
+        entity_type: Union[Type[ContainerEntity], Type[ReleaseEntity]],
+) -> List[str]:
+    """
+    Return a list of top-level attributes, which are defined on both entities
+    (i.e. we could actually compare them).
+    """
+    attrs = entity_type.attribute_map.keys()
+    comparable_attrs = []
+    for attr in attrs:
+        if getattr(a, attr) is None:
+            continue
+        if getattr(b, attr) is None:
+            continue
+        comparable_attrs.append(attr)
+    return comparable_attrs
+
+
+def response_to_entity_list(response, size=5, entity_type=ReleaseEntity, api=None):
+    """
+    Convert an elasticsearch result to a list of entities. Accepts both a
+    dictionary and an elasticsearch_dsl.response.Response.
+
+    We take the ids from elasticsearch and retrieve entities via API.
+    """
+    if isinstance(response, dict):
+        ids = [hit["_source"]["ident"] for hit in response["hits"]["hits"]][:size]
+    elif isinstance(response, elasticsearch_dsl.response.Response):
+        ids = [hit.to_dict().get("ident") for hit in response]
+
+    if entity_type == ReleaseEntity:
+        return fetch_release_list(ids, api=api)
+    if entity_type == ContainerEntity:
+        return fetch_container_list(ids, api=api)
+
+    raise ValueError("invalid entity type: {}".format(entity_type))
+
+
+def exact_release_match(a: ReleaseEntity, b: ReleaseEntity) -> bool:
+    """
+    Currently, entities implement comparison through object dictionaries.
+    """
+    return a == b
+
+
+def exact_work_match(a: WorkEntity, b: WorkEntity) -> bool:
+    """
+    Currently, entities implement comparison through object dictionaries.
+    """
+    return a == b
diff --git a/fuzzycat/fatcat/entities.py b/fuzzycat/fatcat/entities.py
new file mode 100644
index 0000000..351c2b8
--- /dev/null
+++ b/fuzzycat/fatcat/entities.py
@@ -0,0 +1,60 @@
+# coding: utf-8
+"""
+This is taken from fatcat_tools/transforms/entities.
+"""
+
+import collections
+import json
+
+import toml
+from fatcat_openapi_client import ApiClient
+
+
+def entity_to_dict(entity, api_client=None) -> dict:
+    """
+    Hack to take advantage of the code-generated serialization code.
+
+    Initializing/destroying ApiClient objects is surprisingly expensive
+    (because it involves a threadpool), so we allow passing an existing
+    instance. If you already have a full-on API connection `api`, you can
+    access the ApiClient object as `api.api_client`. This is such a speed-up
+    that this argument may become mandatory.
+    """
+    if not api_client:
+        api_client = ApiClient()
+    return api_client.sanitize_for_serialization(entity)
+
+
+def entity_from_json(json_str: str, entity_type, api_client=None):
+    """
+    Hack to take advantage of the code-generated deserialization code.
+
+    See note on `entity_to_dict()` about the api_client argument.
+    """
+    if not api_client:
+        api_client = ApiClient()
+    thing = collections.namedtuple('Thing', ['data'])
+    thing.data = json_str
+    return api_client.deserialize(thing, entity_type)
+
+
+def entity_from_dict(obj: dict, entity_type, api_client=None):
+    json_str = json.dumps(obj)
+    return entity_from_json(json_str, entity_type, api_client=api_client)
+
+
+def entity_to_toml(entity, api_client=None, pop_fields=None) -> str:
+    """
+    pop_fields parameter can be used to strip out some fields from the resulting
+    TOML. Eg, for fields which should not be edited, like the ident.
+    """
+    obj = entity_to_dict(entity, api_client=api_client)
+    pop_fields = pop_fields or []
+    for k in pop_fields:
+        obj.pop(k, None)
+    return toml.dumps(obj)
+
+
+def entity_from_toml(toml_str: str, entity_type, api_client=None):
+    obj = toml.loads(toml_str)
+    return entity_from_dict(obj, entity_type, api_client=api_client)
diff --git a/fuzzycat/fatcat/matching.py b/fuzzycat/fatcat/matching.py
new file mode 100644
index 0000000..194106d
--- /dev/null
+++ b/fuzzycat/fatcat/matching.py
@@ -0,0 +1,233 @@
+# coding: utf-8
+"""
+Public API for fuzzy matches for fatcat.
+
+Match methods return candidates, verify methods return a match status.
+
+    match_container_fuzzy -> List[ContainerEntity]
+    match_release_fuzzy -> List[ReleaseEntity]
+
+    verify_serial_name -> MatchStatus
+    verify_container_name -> MatchStatus
+    verify_container_match -> MatchStatus
+    verify_release_match -> MatchStatus
+
+Candidate generation will use external data from search and hence is
+expensive. Verification is fast.
+"""
+
+from typing import List
+
+import elasticsearch
+from fatcat_openapi_client import (ApiException, ContainerEntity, DefaultApi, ReleaseEntity,
+                                   ReleaseExtIds, WorkEntity)
+from fatcat_openapi_client.api.default_api import DefaultApi
+
+from fuzzycat.fatcat.common import MatchStatus, response_to_entity_list
+from fuzzycat.serials import serialsdb
+
+
+def match_container_fuzzy(container: ContainerEntity,
+                          size: int = 5,
+                          es: Optional[Union[str, elasticsearch.client.Elasticsearch]] = None,
+                          api: Optional[DefaultApi] = None) -> List[ContainerEntity]:
+    """
+    Given a container entity, which can be (very) partial, return a list of
+    candidate matches. Elasticsearch can be a hostport or the low level client
+    object.
+
+    Random data point: with 20 parallel workers calling match_container_fuzzy,
+    we get around 40 req/s.
+    """
+    assert isinstance(container, ContainerEntity)
+
+    if size is None or size == 0:
+        size = 10000  # or any large number
+
+    if isinstance(es, str):
+        es = elasticsearch.Elasticsearch([es])
+    if es is None:
+        es = elasticsearch.Elasticsearch()
+
+    # If we find any match by ISSN-L, we return only those.
+    if container.issnl:
+        s = (elasticsearch_dsl.Search(using=es, index="fatcat_container").query(
+            "term", issns=container.issnl).extra(size=size))
+        resp = s.execute()
+        if len(resp) > 0:
+            return response_to_entity_list(resp, entity_type=ContainerEntity, api=api)
+
+    # Do we have an exact QID match?
+    if container.wikidata_qid:
+        s = (elasticsearch_dsl.Search(using=es, index="fatcat_container").query(
+            "term", wikidata_qid=container.wikidata_qid).extra(size=size))
+        resp = s.execute()
+        if len(resp) > 0:
+            return response_to_entity_list(resp, entity_type=ContainerEntity, api=api)
+
+    # Start with exact name match.
+    #
+    # curl -s https://search.fatcat.wiki/fatcat_container/_mapping | jq .
+    #
+    #     "name": {
+    #       "type": "text",
+    #       "copy_to": [
+    #         "biblio"
+    #       ],
+    #       "analyzer": "textIcu",
+    #       "search_analyzer": "textIcuSearch"
+    #     },
+    #
+    body = {
+        "query": {
+            "match": {
+                "name": {
+                    "query": container.name,
+                    "operator": "AND"
+                }
+            }
+        },
+        "size": size,
+    }
+    resp = es.search(body=body, index="fatcat_container")
+    if resp["hits"]["total"] > 0:
+        return response_to_entity_list(resp, entity_type=ContainerEntity, api=api)
+
+    # Get fuzzy.
+    # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
+    body = {
+        "query": {
+            "match": {
+                "name": {
+                    "query": container.name,
+                    "operator": "AND",
+                    "fuzziness": "AUTO",
+                }
+            }
+        },
+        "size": size,
+    }
+    resp = es.search(body=body, index="fatcat_container")
+    if resp["hits"]["total"] > 0:
+        return response_to_entity_list(resp, entity_type=ContainerEntity, api=api)
+
+    return []
+
+
+def match_release_fuzzy(release: ReleaseEntity,
+                        size: int = 5,
+                        es: Optional[Union[str, elasticsearch.client.Elasticsearch]] = None,
+                        api: Optional[DefaultApi] = None) -> List[ReleaseEntity]:
+    """
+    Given a release entity, return a number of similar release entities from
+    fatcat using Elasticsearch.
+    """
+    assert isinstance(release, ReleaseEntity)
+
+    if size is None or size == 0:
+        size = 10000  # or any large number
+
+    if isinstance(es, str):
+        es = elasticsearch.Elasticsearch([es])
+    if es is None:
+        es = elasticsearch.Elasticsearch()
+
+    # Try to match by external identifier.
+    ext_ids = release.ext_ids
+    attrs = {
+        "doi": "doi",
+        "wikidata_qid": "wikidata_qid",
+        "isbn13": "isbn13",
+        "pmid": "pmid",
+        "pmcid": "pmcid",
+        "core": "code_id",
+        "arxiv": "arxiv_id",
+        "jstor": "jstor_id",
+        "ark": "ark_id",
+        "mag": "mag_id",
+    }
+    for attr, es_field in attrs.items():
+        value = getattr(ext_ids, attr)
+        if not value:
+            continue
+        s = (elasticsearch_dsl.Search(using=es,
+                                      index="fatcat_release").query("term", **{
+                                          es_field: value
+                                      }).extra(size=size))
+        resp = s.execute()
+        if len(resp) > 0:
+            return response_to_entity_list(resp, entity_type=ReleaseEntity, api=api)
+
+    body = {
+        "query": {
+            "match": {
+                "title": {
+                    "query": release.title,
+                    "operator": "AND"
+                }
+            }
+        },
+        "size": size,
+    }
+    resp = es.search(body=body, index="fatcat_release")
+    if resp["hits"]["total"] > 0:
+        return response_to_entity_list(resp, entity_type=ReleaseEntity, api=api)
+
+    # Get fuzzy.
+    # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
+    body = {
+        "query": {
+            "match": {
+                "title": {
+                    "query": release.title,
+                    "operator": "AND",
+                    "fuzziness": "AUTO",
+                }
+            }
+        },
+        "size": size,
+    }
+    resp = es.search(body=body, index="fatcat_release")
+    if resp["hits"]["total"] > 0:
+        return response_to_entity_list(resp, entity_type=ReleaseEntity, api=api)
+
+    return []
+
+
+def verify_serial_name(a: str, b: str) -> MatchStatus:
+    """
+    Serial name verification. Serial names are a subset of container names.
+    There are about 2M serials.
+    """
+    issnls_for_a = serialsdb.get(a, set())
+    issnls_for_b = serialsdb.get(b, set())
+
+    # If any name yields multiple ISSN-L, we cannot decide.
+    if len(issnls_for_a) > 1:
+        return MatchStatus.AMBIGIOUS
+    if len(issnls_for_b) > 1:
+        return MatchStatus.AMBIGIOUS
+
+    # If both names point to the same ISSN-L, it is an exact match.
+    if len(issnls_for_a) == 1 and len(issnls_for_b) == 1:
+        if len(issnls_for_a & issnls_for_b) == 1:
+            return MatchStatus.EXACT
+        else:
+            return MatchStatus.DIFFERENT
+
+    # Multiple names possible, but there is overlap.
+    if len(issnls_for_a & issnls_for_b) > 0:
+        return MatchStatus.STRONG
+
+    return MatchStatus.AMBIGIOUS
+
+
+def verify_container_name(a: str, b: str) -> MatchStatus:
+    pass
+
+
+def verify_container_match(a: ContainerEntity, b: ContainerEntity) -> MatchStatus:
+    pass
+
+
+def verify_release_match(a: ReleaseEntity, b: ReleaseEntity) -> MatchStatus:
+    pass
diff --git a/fuzzycat/issn.py b/fuzzycat/issn.py
index e866992..aa6b78a 100644
--- a/fuzzycat/issn.py
+++ b/fuzzycat/issn.py
@@ -165,13 +165,13 @@ import os
 import re
 import shelve
 import sys
-from typing import Dict, Iterable, List, Union
-
-from fuzzycat import cleanups
-from fuzzycat.utils import (SetEncoder, StringPipeline, normalize_ampersand, normalize_whitespace)
+from typing import Any, Callable, Dict, Generator, Iterable, List, Tuple, Union

 from simhash import Simhash

+from fuzzycat import cleanups
+from fuzzycat.utils import SetEncoder
+

 def listify(v: Union[str, List[str]]) -> List[str]:
     """
@@ -184,7 +184,7 @@ def listify(v: Union[str, List[str]]) -> List[str]:
     return v


-def jsonld_minimal(v: Dict) -> Dict:
+def jsonld_minimal(v: Dict[str, Any]) -> Dict[str, Any]:
     """
     Turn a JSON from issn.org into a smaller dict with a few core fields.
     Will fail, if no ISSN-L is found in the input.
@@ -207,7 +207,6 @@ def jsonld_minimal(v: Dict) -> Dict:
         return {}
     doc = {}
     for item in items:
-        pass
         # "@id": "resource/ISSN-L/0001-4125"
         # "@id": "resource/ISSN/0001-4125"
         # ...
@@ -262,7 +261,7 @@ def jsonld_minimal(v: Dict) -> Dict:

 def de_jsonld(lines: Iterable):
     """
-    Batch convert to minimal JSON.
+    Batch convert jsonld to minimal JSON and write to stdout.
     """
     for line in lines:
         line = line.strip()
@@ -275,7 +274,9 @@ def de_jsonld(lines: Iterable):
             print(json.dumps(doc, cls=SetEncoder))


-def generate_name_pairs(lines: Iterable, cleanup_pipeline=None, keep_original=True):
+def generate_name_pairs(lines: Iterable,
+                        cleanup_pipeline: Callable[[str], str] = None,
+                        keep_original: bool = True) -> Generator[Tuple[str, str, str], None, None]:
     """
     Given JSON lines, yield a tuple (issnl, a, b) of test data. Will skip on
     errors. Proto unit test data.
@@ -315,7 +316,8 @@ def generate_name_pairs(lines: Iterable, cleanup_pipeline=None, keep_original=Tr
                 b = cleanup_pipeline(b)
             yield (doc["issnl"], a, b)

-def generate_name_issn_mapping(lines: Iterable, cleanup_pipeline=None):
+
+def generate_name_issn_mapping(lines: Iterable, cleanup_pipeline: Callable[[str], str] = None):
     """
     Given JSON lines, generate a dictionary mapping names to sets of ISSN.
     Names might be reused.
@@ -327,19 +329,23 @@ def generate_name_issn_mapping(lines: Iterable, cleanup_pipeline=None):
     return mapping


-def generate_shelve(lines: Iterable, output: str, cleanup_pipeline=None):
+def generate_shelve(lines: Iterable, output: str, cleanup_pipeline: Callable[[str], str] = None):
     """
     Generate a persistent key value store from name issn mappings.
     5015523 entries, 1.1G take about 5min.
     """
     with shelve.open(output) as db:
-        for name, issnls in generate_name_issn_mapping(lines, cleanup_pipeline=cleanup_pipeline).items():
+        mapping = generate_name_issn_mapping(lines, cleanup_pipeline=cleanup_pipeline)
+        for name, issnls in mapping.items():
             db[name] = issnls
     print("wrote {} keys to {}".format(len(db), output), file=sys.stderr)

+
 def generate_simhash(lines: Iterable):
     """
-    simhash matches vs non-matches.
+    Print TSV with simhash values.
+
+    Match and non-match count.

     1069447 1
     927120 0
@@ -366,28 +372,24 @@ def main():
     parser.add_argument("--make-shelve",
                         action="store_true",
                         help="generate trie mapping from name to list of ISSN")
-    parser.add_argument("--make-simhash",
-                        action="store_true",
-                        help="print out simhash value")
+    parser.add_argument("--make-simhash", action="store_true", help="print out simhash value")
     parser.add_argument("-o",
                         "--output",
                         type=str,
                         default="output.file",
                         help="write output to file")
-    parser.add_argument("-c",
-                        "--cleanup",
-                        type=str,
-                        default=None,
-                        help="cleanup pipeline name")
+    parser.add_argument("-c", "--cleanup", type=str, default=None, help="cleanup pipeline name")
     parser.add_argument("--de-jsonld", action="store_true", help="break up the jsonld")

     args = parser.parse_args()

-    # Map more cleanup routines.
+    # Add additional cleanup routines here.
     cleanup = dict(basic=cleanups.basic).get(args.cleanup)

     if args.make_mapping:
-        print(json.dumps(generate_name_issn_mapping(args.file, cleanup_pipeline=cleanup), cls=SetEncoder))
+        print(
+            json.dumps(generate_name_issn_mapping(args.file, cleanup_pipeline=cleanup),
+                       cls=SetEncoder))

     if args.make_pairs:
         for issn, a, b in generate_name_pairs(args.file, cleanup_pipeline=cleanup):
             print("{}\t{}\t{}".format(issn, a, b))
diff --git a/fuzzycat/journals.py b/fuzzycat/journals.py
deleted file mode 100644
index bd76b7f..0000000
--- a/fuzzycat/journals.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# coding: utf-8
-
-"""
-Journal name matching. Includes names from issn database and abbreviations.
-"""
-
-import shelve
-
-
-class JournalLookup:
-    """
-    Lookup allows to lookup journals, using a database of real journal names.
-
-    >>> lookup = JournalLookup()
-    >>> lookup["Philosophica"]
-    {'1857-9272', '2232-299X', '2232-3007', '2232-3015'}
-
-    """
-    def __init__(self, namedb='names'):
-        """
-        Note that shelve appends "db" to the name automatically.
-        """
-        self.db = shelve.open(namedb)
-
-    def __getitem__(self, v):
-        return self.db[v]
-
-    def get(self, v, cleanup_pipeline=None):
-        if not cleanup_pipeline:
-            return self.db.get(v)
-        return self.db.get(cleanup_pipeline(v))
-
-    def close(self):
-        self.db.close()
diff --git a/fuzzycat/main.py b/fuzzycat/main.py
deleted file mode 100644
index 8da283b..0000000
--- a/fuzzycat/main.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from fuzzycat import __version__
-
-
-def main():
-    print("hello fuzzycat {}".format(__version__))
diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py
deleted file mode 100644
index cbadbc2..0000000
--- a/fuzzycat/matching.py
+++ /dev/null
@@ -1,147 +0,0 @@
-import re
-import string
-
-from ftfy import fix_text
-from unidecode import unidecode
-
-from fuzzycat.status import MatchStatus
-from fuzzycat.utils import *
-
-
-def match_container_names(a: str, b: str) -> MatchStatus:
-    """
-    Given two strings representing container names, return a match status.
-    TODO(martin): incorporate abbreviations mapping, other synonyms.
-
-    Some name stats over 146302 real names from fatcat.
-
-    In [11]: len(df)
-    Out[11]: 146302
-
-    In [12]: df.head()
-    Out[12]:
-                                                     name  nlen
-    0                        Sartre Studies International    28
-    1                                 Revolutionary world    19
-    2   Monograph Series on Nonlinear Science and Comp...    52
-    3                                   Hepatitis Monthly    17
-    4                                              TRACEY     6
-
-    In [13]: df.describe()
-    Out[13]:
-                    nlen
-    count  146302.000000
-    mean       33.891861
-    std        18.955551
-    min         2.000000
-    25%        20.000000
-    50%        31.000000
-    75%        44.000000
-    max       286.000000
-
-    Around 4000 names which are not [a-zA-Z ], e.g.:
-
-    In [23]: df[df.is_alpha_only == False].sample(n=5)
-    Out[23]:
-                                                         name  nlen  is_alpha_only
-    118497                     Журнал Фронтирных Исследований    30          False
-    124885  Õpetatud Eesti Seltsi Aastaraamat/Yearbook of ...    74          False
-    142217             Études économiques de l'OCDE : Norvège    38          False
-    34681             حولیة کلیة أصول الدین والدعوة بالمنوفیة    39          False
-    132251  Известия Российской академии наук Теория и сис...    61          False
-
-    """
-
-    if a is None or b is None:
-        raise ValueError("strings required, got: a = {}, b = {}".format(a, b))
-
-    # Basic normalisation, try to remove superfluous whitespace, which should
-    # never matter, "HNO Praxis"
-    string_cleanups = StringPipeline([
-        str.lower,
-        str.strip,
-        fix_text,
-        lambda s: re.sub(r"\s{2,}", " ", s),
-        lambda s: s.replace("&", "and"),
-    ])
-    a = string_cleanups.run(a)
-    b = string_cleanups.run(b)
-
-    # Derive some characteristics of the string. The keys are free form which
-    # may or may not be a problem. TODO(martin): maybe subclass str and just
-    # add additional methods?
-    sa = StringAnnotator([
-        lambda s: {
-            "is_short_string": len(s) < 15
-        },
-        lambda s: {
-            "is_printable_only": all(c in string.printable for c in s)
-        },
-        lambda s: {
-            "is_single_token": len(s.split()) < 2
-        },
-        lambda s: {
-            "letter_to_non_letter_ratio": letter_to_non_letter_ratio(s)
-        },
-        lambda s: {
-            "alphanumeric_ratio": alphanumeric_ratio(s)
-        },
-        lambda s: {
-            "has_diacritics": s != unidecode(s)
-        },
-        lambda s: {
-            "startswith_the": s.startswith("the ")
-        },
-        lambda s: {
-            "parenthesized_year": parenthesized_year(s)
-        },
-        lambda s: {
-            "alphanumeric_only": alphanumeric_only(s)
-        },
-    ])
-    asa = sa.run(a)
-    bsa = sa.run(b)
-
-    if asa["is_short_string"] and asa["letter_to_non_letter_ratio"] > 0.4:
-        if a == b:
-            return MatchStatus.EXACT
-
-    if not asa["is_short_string"] and not asa["is_single_token"]:
-        if a == b:
-            return MatchStatus.EXACT
-
-    # Short, single (ascii) word titles, like "Language" and the like. Single
-    # token "臨床皮膚科" needs to pass.
-    if asa["is_printable_only"] and asa["is_single_token"]:
-        return MatchStatus.AMBIGIOUS
-
-    if a == b:
-        return MatchStatus.EXACT
-
-    # Mostly ASCII, but with some possible artifacts.
-    if (asa["alphanumeric_ratio"] > 0.9 and asa["alphanumeric_only"] == bsa["alphanumeric_only"]):
-        return MatchStatus.STRONG
-
-    # Year in parentheses case, e.g. "Conf X (2018)" and "Conf X (2019)" should
-    # be different; about 3% of names contain a '(', 1% some possible date.
-    if (asa["parenthesized_year"] and asa["parenthesized_year"] == bsa["parenthesized_year"]):
-        return MatchStatus.DIFFERENT
-
-    # Common prefixes (maybe curate these manually):
-    common_prefixes = ("precarpathian bulletin of the shevchenko scientific society", )
-    for prefix in common_prefixes:
-        if a.startswith(prefix) and a != b:
-            return MatchStatus.DIFFERENT
-
-    if (not asa["is_short"] and not bsa["is_short"] and common_prefix_length_ratio(a, b) > 0.9):
-        return MatchStatus.STRONG
-
-    if (not asa["is_short"] and not bsa["is_short"] and common_prefix_length_ratio(a, b) > 0.7):
-        return MatchStatus.WEAK
-
-    # Address e.g. a char flip, but only, if we do not have diacritics.
-    if (not asa["is_short_string"] and not asa["is_single_token"] and not asa["has_diacritics"]
-            and hamming_distance(a, b) < 2):
-        return MatchStatus.STRONG
-
-    return MatchStatus.AMBIGIOUS
diff --git a/fuzzycat/serials.py b/fuzzycat/serials.py
new file mode 100644
index 0000000..5222084
--- /dev/null
+++ b/fuzzycat/serials.py
@@ -0,0 +1,43 @@
+# coding: utf-8
+"""
+Serial name matching. Includes names from issn database.
+"""
+
+import os
+import shelve
+
+__all__ = ["serialsdb"]
+
+
+class SerialsDatabase:
+    """
+    Lookup allows to lookup serial names, using a database of real serial names.
+
+    >>> from fuzzycat.serials import serialsdb
+    >>> serialsdb.get("Philosophica")
+    {'1857-9272', '2232-299X', '2232-3007', '2232-3015'}
+
+    """
+    def __init__(self, path='names'):
+        """
+        Note that shelve appends "db" to the name automatically. TODO: make this
+        auto-download into a cache directory.
+        """
+        if path is None:
+            path = os.path.join(os.path.expanduser("~"), ".cache/fuzzycat/names")
+        self.db = shelve.open(path, flag='r')
+
+    def __getitem__(self, v):
+        return self.db[v]
+
+    def get(self, v, default=None, cleanup_pipeline=None):
+        if not cleanup_pipeline:
+            return self.db.get(v, default=default)
+        return self.db.get(cleanup_pipeline(v), default=default)
+
+    def close(self):
+        self.db.close()
+
+
+# A singleton.
+serialsdb = SerialsDatabase()
diff --git a/fuzzycat/status.py b/fuzzycat/status.py
deleted file mode 100644
index f87c4e6..0000000
--- a/fuzzycat/status.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from enum import Enum
-
-
-class MatchStatus(Enum):
-    """
-    When matching two entities, use these levels to express match strength.
-    When in doubt, use AMBIGIOUS. DIFFERENT should be used only, when it is
-    certain, that items do not match.
-    """
-
-    EXACT = 0
-    STRONG = 1
-    WEAK = 2
-    AMBIGIOUS = 3
-    DIFFERENT = 4
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index ab693eb..9d2a2f7 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -37,7 +37,7 @@ class StringPipeline:
     ...    normalize_whitespace,
     ...    normalize_ampersand,
     ... ])
-    >>> cleanups.run("<a>Input & Output</a>")
+    >>> cleanups("<a>Input & Output</a>")
     input and output

     """
@@ -49,7 +49,7 @@

     def run(self, s: str) -> str:
         """
-        Apply all functions and return the result.
+        Apply all functions and return the result. Deprecated: just call the object.
        """
         for f in self.fs:
             s = f(s)
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -23,18 +23,20 @@ with open("README.md", "r") as fh:
     python_requires=">=3.6",
     zip_safe=False,
     entry_points={"console_scripts": [
-        "fuzzycat=fuzzycat.main:main",
         "fuzzycat-issn=fuzzycat.issn:main",
     ],},
     install_requires=[
         "fatcat-openapi-client",
         "ftfy",
         "simhash",
-        "unidecode",
+        "unidecode>=0.10",
+        "toml",
+        "elasticsearch>=7",
     ],
     extras_require={"dev": [
         "ipython",
         "isort",
+        "pylint",
         "jupyter",
         "matplotlib",
         "pandas",