author     Martin Czygan <martin.czygan@gmail.com>  2020-08-17 17:30:28 +0200
committer  Martin Czygan <martin.czygan@gmail.com>  2020-08-17 17:30:28 +0200
commit     3cdd049998ed85827dc6339725ea8fdda5a700aa (patch)
tree       c243547543cdbe11d235a13af42e016e633f6c6e
parent     5084ca9fe10b4c58afc28370d98cf3e798bd2109 (diff)
large overhaul

* separate all fatcat related code into fatcat submodule
* more type annotations
* add verify_serial_name for journal names
 fuzzycat/__init__.py        |   4
 fuzzycat/cleanups.py        |   6
 fuzzycat/fatcat/api_auth.py |  45
 fuzzycat/fatcat/common.py   | 166
 fuzzycat/fatcat/entities.py |  60
 fuzzycat/fatcat/matching.py | 233
 fuzzycat/issn.py            |  46
 fuzzycat/journals.py        |  33
 fuzzycat/main.py            |   5
 fuzzycat/matching.py        | 147
 fuzzycat/serials.py         |  43
 fuzzycat/status.py          |  15
 fuzzycat/utils.py           |   4
 setup.py                    |   6
 14 files changed, 579 insertions(+), 234 deletions(-)
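The reorganized public API, sketched below with placeholder values (the entity
name and the serial names are illustrative; a reachable Elasticsearch index and
the local names database are assumed):

    from fatcat_openapi_client import ContainerEntity
    from fuzzycat.fatcat.matching import match_container_fuzzy, verify_serial_name

    # Candidate generation queries the search index and is expensive.
    candidates = match_container_fuzzy(ContainerEntity(name="PLoS ONE"), size=5)

    # Verification is a fast, local lookup returning a MatchStatus.
    status = verify_serial_name("PLOS ONE", "PLoS ONE")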
diff --git a/fuzzycat/__init__.py b/fuzzycat/__init__.py
index 7feffd5..6c381d0 100644
--- a/fuzzycat/__init__.py
+++ b/fuzzycat/__init__.py
@@ -1,6 +1,4 @@
__version__ = "0.1.1"
-from fuzzycat.matching import match_container_names
-from fuzzycat.status import MatchStatus
+from fuzzycat.serials import serialsdb
from fuzzycat.utils import *
-from fuzzycat.journals import JournalLookup
diff --git a/fuzzycat/cleanups.py b/fuzzycat/cleanups.py
index d806e51..c2e021d 100644
--- a/fuzzycat/cleanups.py
+++ b/fuzzycat/cleanups.py
@@ -1,10 +1,8 @@
-
"""
Various shared cleanup approaches.
"""
-from fuzzycat.utils import StringPipeline, normalize_whitespace, normalize_ampersand
-
+from fuzzycat.utils import (StringPipeline, normalize_ampersand, normalize_whitespace)
# These transformations should not affect the name of a journal.
basic = StringPipeline([
@@ -13,5 +11,3 @@ basic = StringPipeline([
normalize_ampersand,
lambda v: v.rstrip("."),
])
-
-
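A small sketch of the basic pipeline in use; StringPipeline objects are callable
(see the doctest change in fuzzycat/utils.py below), and the diff context elides
the first two steps of the pipeline, so the exact output depends on them:

    from fuzzycat.cleanups import basic

    # Visible steps: normalize ampersands, strip trailing dots.
    print(basic("Input & Output."))  # e.g. "Input and Output"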
diff --git a/fuzzycat/fatcat/api_auth.py b/fuzzycat/fatcat/api_auth.py
new file mode 100644
index 0000000..0bad5e9
--- /dev/null
+++ b/fuzzycat/fatcat/api_auth.py
@@ -0,0 +1,45 @@
+# coding: utf-8
+"""
+API helper, taken from fatcat_tools/api_auth.py
+"""
+
+import os
+import sys
+
+import fatcat_openapi_client
+
+
+def public_api(host_uri):
+ """
+ Note: unlike the authenticated variant, this helper might get called even
+ if the API isn't going to be used, so it's important that it doesn't try to
+ actually connect to the API host or something.
+ """
+ conf = fatcat_openapi_client.Configuration()
+ conf.host = host_uri
+ return fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(conf))
+
+
+def authenticated_api(host_uri, token=None):
+ """
+ Note: if this helper is called, it's implied that an actual API connection
+ is needed, so it does try to connect and verify credentials.
+ """
+
+ conf = fatcat_openapi_client.Configuration()
+ conf.host = host_uri
+ if not token:
+ token = os.environ['FATCAT_API_AUTH_TOKEN']
+ if not token:
+ sys.stderr.write(
+ 'This client requires a fatcat API token (eg, in env var FATCAT_API_AUTH_TOKEN)\n')
+ sys.exit(-1)
+
+ conf.api_key["Authorization"] = token
+ conf.api_key_prefix["Authorization"] = "Bearer"
+ api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(conf))
+
+ # verify up front that auth is working
+ api.auth_check()
+
+ return api
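A usage sketch for these helpers; the host is the public fatcat API, and no
request is made until an endpoint is called:

    from fuzzycat.fatcat.api_auth import authenticated_api, public_api

    # Read-only client; constructing it does not contact the host.
    api = public_api("https://api.fatcat.wiki/v0")

    # Authenticated client; reads FATCAT_API_AUTH_TOKEN from the environment
    # (or takes token=...) and verifies credentials via auth_check() up front.
    # api = authenticated_api("https://api.fatcat.wiki/v0")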
diff --git a/fuzzycat/fatcat/common.py b/fuzzycat/fatcat/common.py
new file mode 100644
index 0000000..7499ce4
--- /dev/null
+++ b/fuzzycat/fatcat/common.py
@@ -0,0 +1,166 @@
+# coding: utf-8
+"""
+Adapter for fatcat and fatcat entities.
+"""
+
+import collections
+import sys
+from enum import Enum
+from typing import Dict, List, Type, Union
+
+import elasticsearch_dsl
+from fatcat_openapi_client import (ApiException, ContainerEntity, DefaultApi, ReleaseEntity,
+ ReleaseExtIds, WorkEntity)
+
+from fuzzycat.fatcat.api_auth import public_api
+from fuzzycat.fatcat.entities import entity_from_dict, entity_from_json
+
+
+class MatchStatus(Enum):
+ """
+ When matching two entities, use these levels to express match strength.
+    When in doubt, use AMBIGIOUS. DIFFERENT should be used only when it is
+    certain that items do not match.
+ """
+
+ EXACT = 0
+ STRONG = 1
+ WEAK = 2
+ AMBIGIOUS = 3
+ DIFFERENT = 4
+
+
+def compare_ext_ids(a: ReleaseExtIds, b: ReleaseExtIds) -> Dict[str, int]:
+ """
+    Returns a dictionary with the number of existing, matching and differing
+    identifiers between entities a and b. TODO(martin): It might be helpful to
+    have a mapping service that relates a qid to a doi, or a mag id to a
+    jstor id, if this information is known.
+ """
+ counter = collections.Counter({"a": 0, "b": 0, "both": 0, "hits": 0, "misses": 0})
+ attrs = (
+ "doi",
+ "wikidata_qid",
+ "isbn13",
+ "pmid",
+ "pmcid",
+ "core",
+ "arxiv",
+ "jstor",
+ "ark",
+ "mag",
+ )
+ for attr in attrs:
+ v = getattr(a, attr)
+ w = getattr(b, attr)
+ if v:
+ counter["a"] += 1
+ if w:
+ counter["b"] += 1
+ if not v or not w:
+ continue
+ counter["both"] += 1
+ if v == w:
+ counter["hits"] += 1
+ else:
+ counter["misses"] += 1
+ return counter
+
+
+def fetch_container_list(
+ ids: List[str],
+ api: DefaultApi = None,
+) -> List[ContainerEntity]:
+ """
+ Fetch a list of containers from the API.
+ """
+ if api is None:
+ api = public_api("https://api.fatcat.wiki/v0")
+ result = []
+ for id in ids:
+ try:
+ ce = api.get_container(id)
+ result.append(ce)
+ except ApiException as exc:
+ if exc.status == 404:
+ print("[err] failed to fetch container: {}".format(id), file=sys.stderr)
+ continue
+ raise
+ return result
+
+
+def fetch_release_list(
+ ids: List[str],
+ api: DefaultApi = None,
+) -> List[ReleaseEntity]:
+ """
+    Returns a list of release entities. Releases that cannot be fetched (404)
+    are skipped, so the result may be shorter than the input.
+ """
+ if api is None:
+ api = public_api("https://api.fatcat.wiki/v0")
+ result = []
+ for id in ids:
+ try:
+ re = api.get_release(id, hide="refs,abstracts", expand="container")
+ result.append(re)
+ except ApiException as exc:
+ if exc.status == 404:
+ print("[err] failed to fetch release: {}".format(id), file=sys.stderr)
+ continue
+ raise
+ return result
+
+
+def entity_comparable_attrs(
+ a: Union[ContainerEntity, ReleaseEntity],
+ b: Union[ContainerEntity, ReleaseEntity],
+ entity_type: Union[Type[ContainerEntity], Type[ReleaseEntity]],
+) -> List[str]:
+ """
+    Return a list of top-level attributes that are defined on both entities
+    (i.e. attributes we could actually compare).
+ """
+ attrs = entity_type.attribute_map.keys()
+ comparable_attrs = []
+ for attr in attrs:
+ if getattr(a, attr) is None:
+ continue
+ if getattr(b, attr) is None:
+ continue
+ comparable_attrs.append(attr)
+ return comparable_attrs
+
+
+def response_to_entity_list(response, size=5, entity_type=ReleaseEntity, api=None):
+ """
+ Convert an elasticsearch result to a list of entities. Accepts both a
+ dictionary and an elasticsearch_dsl.response.Response.
+
+ We take the ids from elasticsearch and retrieve entities via API.
+ """
+ if isinstance(response, dict):
+ ids = [hit["_source"]["ident"] for hit in response["hits"]["hits"]][:size]
+ elif isinstance(response, elasticsearch_dsl.response.Response):
+ ids = [hit.to_dict().get("ident") for hit in response]
+
+ if entity_type == ReleaseEntity:
+ return fetch_release_list(ids, api=api)
+ if entity_type == ContainerEntity:
+ return fetch_container_list(ids, api=api)
+
+ raise ValueError("invalid entity type: {}".format(entity_type))
+
+
+def exact_release_match(a: ReleaseEntity, b: ReleaseEntity) -> bool:
+ """
+ Currently, entities implement comparison through object dictionaries.
+ """
+ return a == b
+
+
+def exact_work_match(a: WorkEntity, b: WorkEntity) -> bool:
+ """
+ Currently, entities implement comparison through object dictionaries.
+ """
+ return a == b
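How compare_ext_ids tallies identifiers, sketched with made-up values:

    from fatcat_openapi_client import ReleaseExtIds
    from fuzzycat.fatcat.common import compare_ext_ids

    a = ReleaseExtIds(doi="10.1000/xyz123", arxiv="2008.01234")
    b = ReleaseExtIds(doi="10.1000/xyz123", pmid="12345678")

    # doi is set on both sides and equal; arxiv and pmid exist on one side only.
    print(compare_ext_ids(a, b))
    # Counter({'a': 2, 'b': 2, 'both': 1, 'hits': 1, 'misses': 0})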
diff --git a/fuzzycat/fatcat/entities.py b/fuzzycat/fatcat/entities.py
new file mode 100644
index 0000000..351c2b8
--- /dev/null
+++ b/fuzzycat/fatcat/entities.py
@@ -0,0 +1,60 @@
+# coding: utf-8
+"""
+This is taken from fatcat_tools/transforms/entities.
+"""
+
+import collections
+import json
+
+import toml
+from fatcat_openapi_client import ApiClient
+
+
+def entity_to_dict(entity, api_client=None) -> dict:
+ """
+ Hack to take advantage of the code-generated serialization code.
+
+ Initializing/destroying ApiClient objects is surprisingly expensive
+ (because it involves a threadpool), so we allow passing an existing
+ instance. If you already have a full-on API connection `api`, you can
+ access the ApiClient object as `api.api_client`. This is such a speed-up
+ that this argument may become mandatory.
+ """
+ if not api_client:
+ api_client = ApiClient()
+ return api_client.sanitize_for_serialization(entity)
+
+
+def entity_from_json(json_str: str, entity_type, api_client=None):
+ """
+ Hack to take advantage of the code-generated deserialization code
+
+ See note on `entity_to_dict()` about api_client argument.
+ """
+ if not api_client:
+ api_client = ApiClient()
+ thing = collections.namedtuple('Thing', ['data'])
+ thing.data = json_str
+ return api_client.deserialize(thing, entity_type)
+
+
+def entity_from_dict(obj: dict, entity_type, api_client=None):
+ json_str = json.dumps(obj)
+ return entity_from_json(json_str, entity_type, api_client=api_client)
+
+
+def entity_to_toml(entity, api_client=None, pop_fields=None) -> str:
+ """
+ pop_fields parameter can be used to strip out some fields from the resulting
+ TOML. Eg, for fields which should not be edited, like the ident.
+ """
+ obj = entity_to_dict(entity, api_client=api_client)
+ pop_fields = pop_fields or []
+ for k in pop_fields:
+ obj.pop(k, None)
+ return toml.dumps(obj)
+
+
+def entity_from_toml(toml_str: str, entity_type, api_client=None):
+ obj = toml.loads(toml_str)
+ return entity_from_dict(obj, entity_type, api_client=api_client)
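A round-trip sketch for these helpers; the container name is made up, and a
shared ApiClient instance avoids the threadpool setup cost mentioned above:

    from fatcat_openapi_client import ApiClient, ContainerEntity
    from fuzzycat.fatcat.entities import entity_from_dict, entity_to_dict

    api_client = ApiClient()
    ce = entity_from_dict({"name": "Philosophica"}, ContainerEntity, api_client=api_client)
    print(ce.name)                                    # Philosophica
    print(entity_to_dict(ce, api_client=api_client))  # None-valued fields are dropped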
diff --git a/fuzzycat/fatcat/matching.py b/fuzzycat/fatcat/matching.py
new file mode 100644
index 0000000..194106d
--- /dev/null
+++ b/fuzzycat/fatcat/matching.py
@@ -0,0 +1,233 @@
+# coding: utf-8
+"""
+Public API for fuzzy matches for fatcat.
+
+Match methods return candidates, verify methods return a match status.
+
+    match_container_fuzzy -> List[ContainerEntity]
+ match_release_fuzzy -> List[ReleaseEntity]
+
+ verify_serial_name -> MatchStatus
+ verify_container_name -> MatchStatus
+ verify_container_fuzzy -> MatchStatus
+ verify_release_fuzzy -> MatchStatus
+
+Candidate generation queries the search index and hence is expensive; verification is fast and local.
+"""
+
+from typing import List, Optional, Union
+
+import elasticsearch
+import elasticsearch_dsl
+from fatcat_openapi_client import (ApiException, ContainerEntity, DefaultApi, ReleaseEntity,
+                                   ReleaseExtIds, WorkEntity)
+
+from fuzzycat.fatcat.common import MatchStatus, response_to_entity_list
+from fuzzycat.serials import serialsdb
+
+
+def match_container_fuzzy(container: ContainerEntity,
+ size: int = 5,
+ es: Optional[Union[str, elasticsearch.client.Elasticsearch]] = None,
+ api: Optional[DefaultApi] = None) -> List[ContainerEntity]:
+ """
+ Given a container entity, which can be (very) partial, return a list of
+ candidate matches. Elasticsearch can be a hostport or the low level client
+ object.
+
+    Random data point: with 20 parallel workers calling match_container_fuzzy,
+ we get around 40 req/s.
+ """
+ assert isinstance(container, ContainerEntity)
+
+ if size is None or size == 0:
+ size = 10000 # or any large number
+
+ if isinstance(es, str):
+ es = elasticsearch.Elasticsearch([es])
+ if es is None:
+ es = elasticsearch.Elasticsearch()
+
+ # If we find any match by ISSN-L, we return only those.
+ if container.issnl:
+ s = (elasticsearch_dsl.Search(using=es, index="fatcat_container").query(
+ "term", issns=container.issnl).extra(size=size))
+ resp = s.execute()
+ if len(resp) > 0:
+ return response_to_entity_list(resp, entity_type=ContainerEntity, api=api)
+
+ # Do we have an exact QID match?
+ if container.wikidata_qid:
+ s = (elasticsearch_dsl.Search(using=es, index="fatcat_container").query(
+ "term", wikidata_qid=container.wikidata_qid).extra(size=size))
+ resp = s.execute()
+ if len(resp) > 0:
+ return response_to_entity_list(resp, entity_type=ContainerEntity, api=api)
+
+ # Start with exact name match.
+ #
+ # curl -s https://search.fatcat.wiki/fatcat_container/_mapping | jq .
+ #
+ # "name": {
+ # "type": "text",
+ # "copy_to": [
+ # "biblio"
+ # ],
+ # "analyzer": "textIcu",
+ # "search_analyzer": "textIcuSearch"
+ # },
+ #
+ body = {
+ "query": {
+ "match": {
+ "name": {
+ "query": container.name,
+ "operator": "AND"
+ }
+ }
+ },
+ "size": size,
+ }
+ resp = es.search(body=body, index="fatcat_container")
+ if resp["hits"]["total"] > 0:
+ return response_to_entity_list(resp, entity_type=ContainerEntity, api=api)
+
+ # Get fuzzy.
+ # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
+ body = {
+ "query": {
+ "match": {
+ "name": {
+ "query": container.name,
+ "operator": "AND",
+ "fuzziness": "AUTO",
+ }
+ }
+ },
+ "size": size,
+ }
+ resp = es.search(body=body, index="fatcat_container")
+ if resp["hits"]["total"] > 0:
+ return response_to_entity_list(resp, entity_type=ContainerEntity, api=api)
+
+ return []
+
+
+def match_release_fuzzy(release: ReleaseEntity,
+ size: int = 5,
+ es: Optional[Union[str, elasticsearch.client.Elasticsearch]] = None,
+ api: Optional[DefaultApi] = None) -> List[ReleaseEntity]:
+ """
+    Given a release entity, return a number of similar release entities from
+ fatcat using Elasticsearch.
+ """
+ assert isinstance(release, ReleaseEntity)
+
+ if size is None or size == 0:
+ size = 10000 # or any large number
+
+ if isinstance(es, str):
+ es = elasticsearch.Elasticsearch([es])
+ if es is None:
+ es = elasticsearch.Elasticsearch()
+
+ # Try to match by external identifier.
+ ext_ids = release.ext_ids
+ attrs = {
+ "doi": "doi",
+ "wikidata_qid": "wikidata_qid",
+ "isbn13": "isbn13",
+ "pmid": "pmid",
+ "pmcid": "pmcid",
+ "core": "code_id",
+ "arxiv": "arxiv_id",
+ "jstor": "jstor_id",
+ "ark": "ark_id",
+ "mag": "mag_id",
+ }
+ for attr, es_field in attrs.items():
+ value = getattr(ext_ids, attr)
+ if not value:
+ continue
+ s = (elasticsearch_dsl.Search(using=es,
+ index="fatcat_release").query("term", **{
+ es_field: value
+ }).extra(size=size))
+ resp = s.execute()
+ if len(resp) > 0:
+ return response_to_entity_list(resp, entity_type=ReleaseEntity, api=api)
+
+ body = {
+ "query": {
+ "match": {
+ "title": {
+ "query": release.title,
+ "operator": "AND"
+ }
+ }
+ },
+ "size": size,
+ }
+ resp = es.search(body=body, index="fatcat_release")
+ if resp["hits"]["total"] > 0:
+ return response_to_entity_list(resp, entity_type=ReleaseEntity, api=api)
+
+ # Get fuzzy.
+ # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
+ body = {
+ "query": {
+ "match": {
+ "title": {
+ "query": release.title,
+ "operator": "AND",
+ "fuzziness": "AUTO",
+ }
+ }
+ },
+ "size": size,
+ }
+ resp = es.search(body=body, index="fatcat_release")
+ if resp["hits"]["total"] > 0:
+ return response_to_entity_list(resp, entity_type=ReleaseEntity, api=api)
+
+ return []
+
+
+def verify_serial_name(a: str, b: str) -> MatchStatus:
+ """
+ Serial name verification. Serial names are a subset of container names.
+ There are about 2M serials.
+ """
+ issnls_for_a = serialsdb.get(a, set())
+ issnls_for_b = serialsdb.get(b, set())
+
+ # If any name yields multiple ISSN-L, we cannot decide.
+ if len(issnls_for_a) > 1:
+ return MatchStatus.AMBIGIOUS
+ if len(issnls_for_b) > 1:
+ return MatchStatus.AMBIGIOUS
+
+    # If both names point to the same ISSN-L, it is an exact match.
+ if len(issnls_for_a) == 1 and len(issnls_for_b) == 1:
+ if len(issnls_for_a & issnls_for_b) == 1:
+ return MatchStatus.EXACT
+ else:
+ return MatchStatus.DIFFERENT
+
+ # Multiple names possible, but there is overlap.
+ if len(issnls_for_a & issnls_for_b) > 0:
+ return MatchStatus.STRONG
+
+ return MatchStatus.AMBIGIOUS
+
+
+def verify_container_name(a: str, b: str) -> MatchStatus:
+ pass
+
+
+def verify_container_match(a: ContainerEntity, b: ContainerEntity) -> MatchStatus:
+ pass
+
+
+def verify_release_match(a: ReleaseEntity, b: ReleaseEntity) -> MatchStatus:
+ pass
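A sketch of verify_serial_name in action, using the example name from
fuzzycat/serials.py below:

    from fuzzycat.fatcat.matching import verify_serial_name

    # "Philosophica" maps to four ISSN-Ls in the names database, so any
    # comparison involving it cannot be decided.
    status = verify_serial_name("Philosophica", "Philosophica")
    print(status)  # MatchStatus.AMBIGIOUS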
diff --git a/fuzzycat/issn.py b/fuzzycat/issn.py
index e866992..aa6b78a 100644
--- a/fuzzycat/issn.py
+++ b/fuzzycat/issn.py
@@ -165,13 +165,13 @@ import os
import re
import shelve
import sys
-from typing import Dict, Iterable, List, Union
-
-from fuzzycat import cleanups
-from fuzzycat.utils import (SetEncoder, StringPipeline, normalize_ampersand, normalize_whitespace)
+from typing import Any, Callable, Dict, Generator, Iterable, List, Tuple, Union
from simhash import Simhash
+from fuzzycat import cleanups
+from fuzzycat.utils import SetEncoder
+
def listify(v: Union[str, List[str]]) -> List[str]:
"""
@@ -184,7 +184,7 @@ def listify(v: Union[str, List[str]]) -> List[str]:
return v
-def jsonld_minimal(v: Dict) -> Dict:
+def jsonld_minimal(v: Dict[str, Any]) -> Dict[str, Any]:
"""
Turn a JSON from issn.org into a smaller dict with a few core fields. Will
fail if no ISSN-L is found in the input.
@@ -207,7 +207,6 @@ def jsonld_minimal(v: Dict) -> Dict:
return {}
doc = {}
for item in items:
- pass
# "@id": "resource/ISSN-L/0001-4125"
# "@id": "resource/ISSN/0001-4125"
# ...
@@ -262,7 +261,7 @@ def jsonld_minimal(v: Dict) -> Dict:
def de_jsonld(lines: Iterable):
"""
- Batch convert to minimal JSON.
+ Batch convert jsonld to minimal JSON and write to stdout.
"""
for line in lines:
line = line.strip()
@@ -275,7 +274,9 @@ def de_jsonld(lines: Iterable):
print(json.dumps(doc, cls=SetEncoder))
-def generate_name_pairs(lines: Iterable, cleanup_pipeline=None, keep_original=True):
+def generate_name_pairs(lines: Iterable,
+ cleanup_pipeline: Callable[[str], str] = None,
+ keep_original: bool = True) -> Generator[Tuple[str, str, str], None, None]:
"""
Given JSON lines, yield a tuple (issnl, a, b) of test data. Will skip on
errors. Proto unit test data.
@@ -315,7 +316,8 @@ def generate_name_pairs(lines: Iterable, cleanup_pipeline=None, keep_original=Tr
b = cleanup_pipeline(b)
yield (doc["issnl"], a, b)
-def generate_name_issn_mapping(lines: Iterable, cleanup_pipeline=None):
+
+def generate_name_issn_mapping(lines: Iterable, cleanup_pipeline: Callable[[str], str] = None):
"""
Given JSON lines, generate a dictionary mapping names to sets of ISSNs. Names
might be reused.
@@ -327,19 +329,23 @@ def generate_name_issn_mapping(lines: Iterable, cleanup_pipeline=None):
return mapping
-def generate_shelve(lines: Iterable, output: str, cleanup_pipeline=None):
+def generate_shelve(lines: Iterable, output: str, cleanup_pipeline: Callable[[str], str] = None):
"""
Generate a persistent key-value store from name to ISSN mappings. 5,015,523
entries (1.1 GB) take about 5 min.
"""
with shelve.open(output) as db:
- for name, issnls in generate_name_issn_mapping(lines, cleanup_pipeline=cleanup_pipeline).items():
+ mapping = generate_name_issn_mapping(lines, cleanup_pipeline=cleanup_pipeline)
+ for name, issnls in mapping.items():
db[name] = issnls
print("wrote {} keys to {}".format(len(db), output), file=sys.stderr)
+
def generate_simhash(lines: Iterable):
"""
- simhash matches vs non-matches.
+ Print TSV with simhash values.
+
+ Match and non-match count.
1069447 1
927120 0
@@ -366,28 +372,24 @@ def main():
parser.add_argument("--make-shelve",
action="store_true",
help="generate trie mapping from name to list of ISSN")
- parser.add_argument("--make-simhash",
- action="store_true",
- help="print out simhash value")
+ parser.add_argument("--make-simhash", action="store_true", help="print out simhash value")
parser.add_argument("-o",
"--output",
type=str,
default="output.file",
help="write output to file")
- parser.add_argument("-c",
- "--cleanup",
- type=str,
- default=None,
- help="cleanup pipeline name")
+ parser.add_argument("-c", "--cleanup", type=str, default=None, help="cleanup pipeline name")
parser.add_argument("--de-jsonld", action="store_true", help="break up the jsonld")
args = parser.parse_args()
- # Map more cleanup routines.
+ # Add additional cleanup routines here.
cleanup = dict(basic=cleanups.basic).get(args.cleanup)
if args.make_mapping:
- print(json.dumps(generate_name_issn_mapping(args.file, cleanup_pipeline=cleanup), cls=SetEncoder))
+ print(
+ json.dumps(generate_name_issn_mapping(args.file, cleanup_pipeline=cleanup),
+ cls=SetEncoder))
if args.make_pairs:
for issn, a, b in generate_name_pairs(args.file, cleanup_pipeline=cleanup):
print("{}\t{}\t{}".format(issn, a, b))
diff --git a/fuzzycat/journals.py b/fuzzycat/journals.py
deleted file mode 100644
index bd76b7f..0000000
--- a/fuzzycat/journals.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# coding: utf-8
-
-"""
-Journal name matching. Includes names from issn database and abbreviations.
-"""
-
-import shelve
-
-class JournalLookup:
- """
- Lookup allows to lookup journals, using a database of real journal names.
-
- >>> lookup = JournalLookup()
- >>> lookup["Philosophica"]
- {'1857-9272', '2232-299X', '2232-3007', '2232-3015'}
-
- """
- def __init__(self, namedb='names'):
- """
- Note that shelve appends "db" to the name automatically.
- """
- self.db = shelve.open(namedb)
-
- def __getitem__(self, v):
- return self.db[v]
-
- def get(self, v, cleanup_pipeline=None):
- if not cleanup_pipeline:
- return self.db.get(v)
- return self.db.get(cleanup_pipeline(v))
-
- def close(self):
- self.db.close()
diff --git a/fuzzycat/main.py b/fuzzycat/main.py
deleted file mode 100644
index 8da283b..0000000
--- a/fuzzycat/main.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from fuzzycat import __version__
-
-
-def main():
- print("hello fuzzycat {}".format(__version__))
diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py
deleted file mode 100644
index cbadbc2..0000000
--- a/fuzzycat/matching.py
+++ /dev/null
@@ -1,147 +0,0 @@
-import re
-import string
-
-from ftfy import fix_text
-from unidecode import unidecode
-
-from fuzzycat.status import MatchStatus
-from fuzzycat.utils import *
-
-
-def match_container_names(a: str, b: str) -> MatchStatus:
- """
- Given two strings representing container names, return a match status.
- TODO(martin): incorporate abbreviations mapping, other synonyms.
-
- Some name stats over 146302 real names from fatcat.
-
- In [11]: len(df)
- Out[11]: 146302
-
- In [12]: df.head()
- Out[12]:
- name nlen
- 0 Sartre Studies International 28
- 1 Revolutionary world 19
- 2 Monograph Series on Nonlinear Science and Comp... 52
- 3 Hepatitis Monthly 17
- 4 TRACEY 6
-
- In [13]: df.describe()
- Out[13]:
- nlen
- count 146302.000000
- mean 33.891861
- std 18.955551
- min 2.000000
- 25% 20.000000
- 50% 31.000000
- 75% 44.000000
- max 286.000000
-
- Aroung 4000 names which are not [a-zA-z ], e.g.:
-
- In [23]: df[df.is_alpha_only == False].sample(n=5)
- Out[23]:
- name nlen is_alpha_only
- 118497 Журнал Фронтирных Исследований 30 False
- 124885 Õpetatud Eesti Seltsi Aastaraamat/Yearbook of ... 74 False
- 142217 Études économiques de l'OCDE : Norvège 38 False
- 34681 حولیة کلیة أصول الدین والدعوة بالمنوفیة 39 False
- 132251 Известия Российской академии наук Теория и сис... 61 False
-
- """
-
- if a is None or b is None:
- raise ValueError("strings required, got: a = {}, b = {}".format(a, b))
-
- # Basic normalisation, try to remove superfluous whitespace, which should
- # never matter, "HNO Praxis"
- string_cleanups = StringPipeline([
- str.lower,
- str.strip,
- fix_text,
- lambda s: re.sub(r"\s{2,}", " ", s),
- lambda s: s.replace("&", "and"),
- ])
- a = string_cleanups.run(a)
- b = string_cleanups.run(b)
-
- # Derive some characteristics of the string. The keys are free form which
- # may or may not be a problem. TODO(martin): maybe subclass str and just
- # add additional methods?
- sa = StringAnnotator([
- lambda s: {
- "is_short_string": len(s) < 15
- },
- lambda s: {
- "is_printable_only": all(c in string.printable for c in s)
- },
- lambda s: {
- "is_single_token": len(s.split()) < 2
- },
- lambda s: {
- "letter_to_non_letter_ratio": letter_to_non_letter_ratio(s)
- },
- lambda s: {
- "alphanumeric_ratio": alphanumeric_ratio(s)
- },
- lambda s: {
- "has_diacritics": s != unidecode(s)
- },
- lambda s: {
- "startswith_the": s.startswith("the ")
- },
- lambda s: {
- "parenthesized_year": parenthesized_year(s)
- },
- lambda s: {
- "alphanumeric_only": alphanumeric_only(s)
- },
- ])
- asa = sa.run(a)
- bsa = sa.run(b)
-
- if asa["is_short_string"] and asa["letter_to_non_letter_ratio"] > 0.4:
- if a == b:
- return MatchStatus.EXACT
-
- if not asa["is_short_string"] and not asa["is_single_token"]:
- if a == b:
- return MatchStatus.EXACT
-
- # Short, single (ascii) word titles, like "Language" and the like. Single
- # token "臨床皮膚科" needs to pass.
- if asa["is_printable_only"] and asa["is_single_token"]:
- return MatchStatus.AMBIGIOUS
-
- if a == b:
- return MatchStatus.EXACT
-
- # Mostly ASCII, but with some possible artifacts.
- if (asa["alphanumeric_ratio"] > 0.9 and asa["alphanumeric_only"] == bsa["alphanumeric_only"]):
- return MatchStatus.STRONG
-
- # Year in parentheses case, e.g. "Conf X (2018)" and "Conf X (2019)" should
- # be different; about 3% of names contain a '(', 1% some possible date.
- if (asa["parenthesized_year"] and asa["parenthesized_year"] == bsa["parenthesized_year"]):
- return MatchStatus.DIFFERENT
-
- # Common prefixes (maybe curate these manually):
- common_prefixes = ("precarpathian bulletin of the shevchenko scientific society", )
- for prefix in common_prefixes:
- if a.startswith(prefix) and a != b:
- return MatchStatus.DIFFERENT
-
- if (not asa["is_short"] and not bsa["is_short"] and common_prefix_length_ratio(a, b) > 0.9):
- return MatchStatus.STRONG
-
- if (not asa["is_short"] and not bsa["is_short"] and common_prefix_length_ratio(a, b) > 0.7):
- return MatchStatus.WEAK
-
- # Address e.g. a char flip, but only, if we do not have diacritics.
- if (not asa["is_short_string"] and not asa["is_single_token"] and not asa["has_diacritics"]
- and hamming_distance(a, b) < 2):
- return MatchStatus.STRONG
-
- return MatchStatus.AMBIGIOUS
diff --git a/fuzzycat/serials.py b/fuzzycat/serials.py
new file mode 100644
index 0000000..5222084
--- /dev/null
+++ b/fuzzycat/serials.py
@@ -0,0 +1,43 @@
+# coding: utf-8
+"""
+Serial name matching. Includes names from issn database.
+"""
+
+import os
+import shelve
+
+__all__ = ["serialsdb"]
+
+
+class SerialsDatabase:
+ """
+    Allows lookup of serial names, using a database of real serial names.
+
+    >>> from fuzzycat.serials import serialsdb
+ >>> serialsdb.get("Philosophica")
+ {'1857-9272', '2232-299X', '2232-3007', '2232-3015'}
+
+ """
+ def __init__(self, path='names'):
+ """
+ Note that shelve appends "db" to the name automatically. TODO: make this
+ auto-download into a cache directory.
+ """
+ if path is None:
+ path = os.path.join(os.path.expanduser("~"), ".cache/fuzzycat/names")
+ self.db = shelve.open(path, flag='r')
+
+ def __getitem__(self, v):
+ return self.db[v]
+
+ def get(self, v, default=None, cleanup_pipeline=None):
+ if not cleanup_pipeline:
+ return self.db.get(v, default=default)
+ return self.db.get(cleanup_pipeline(v), default=default)
+
+ def close(self):
+ self.db.close()
+
+
+# A singleton.
+serialsdb = SerialsDatabase()
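A lookup sketch; it assumes a readable names shelve database in the working
directory, e.g. built via fuzzycat-issn --make-shelve:

    from fuzzycat import cleanups
    from fuzzycat.serials import serialsdb

    print(serialsdb.get("Philosophica"))
    # {'1857-9272', '2232-299X', '2232-3007', '2232-3015'}

    # Run the query through a cleanup pipeline before the lookup; whether this
    # hits depends on the pipeline the database was built with.
    print(serialsdb.get("Philosophica.", default=set(), cleanup_pipeline=cleanups.basic))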
diff --git a/fuzzycat/status.py b/fuzzycat/status.py
deleted file mode 100644
index f87c4e6..0000000
--- a/fuzzycat/status.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from enum import Enum
-
-
-class MatchStatus(Enum):
- """
- When matching two entities, use these levels to express match strength.
- When in doubt, use AMBIGIOUS. DIFFERENT should be used only, when it is
- certain, that items do not match.
- """
-
- EXACT = 0
- STRONG = 1
- WEAK = 2
- AMBIGIOUS = 3
- DIFFERENT = 4
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index ab693eb..9d2a2f7 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -37,7 +37,7 @@ class StringPipeline:
... normalize_whitespace,
... normalize_ampersand,
... ])
- >>> cleanups.run("<a>Input & Output</a>")
+ >>> cleanups("<a>Input & Output</a>")
input and output
"""
@@ -49,7 +49,7 @@ class StringPipeline:
def run(self, s: str) -> str:
"""
- Apply all function and return result.
+        Apply all functions and return the result. Deprecated: just call the object.
"""
for f in self.fs:
s = f(s)
diff --git a/setup.py b/setup.py
index e2ff44b..25ccc46 100644
--- a/setup.py
+++ b/setup.py
@@ -23,18 +23,20 @@ with open("README.md", "r") as fh:
python_requires=">=3.6",
zip_safe=False,
entry_points={"console_scripts": [
- "fuzzycat=fuzzycat.main:main",
"fuzzycat-issn=fuzzycat.issn:main",
],},
install_requires=[
"fatcat-openapi-client",
"ftfy",
"simhash",
- "unidecode",
+ "unidecode>=0.10",
+ "toml",
+ "elasticsearch>=7",
],
extras_require={"dev": [
"ipython",
"isort",
+ "pylint",
"jupyter",
"matplotlib",
"pandas",