author     Martin Czygan <martin.czygan@gmail.com>   2020-08-17 17:30:28 +0200
committer  Martin Czygan <martin.czygan@gmail.com>   2020-08-17 17:30:28 +0200
commit     3cdd049998ed85827dc6339725ea8fdda5a700aa (patch)
tree       c243547543cdbe11d235a13af42e016e633f6c6e
parent     5084ca9fe10b4c58afc28370d98cf3e798bd2109 (diff)
large overhaul

* separate all fatcat-related code into a fatcat submodule
* more type annotations
* add verify_serial_name for journal names (usage sketch below)
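The reorganized public API can be exercised as follows. This is a minimal sketch, not part of the commit: the journal names are only illustrative, and it assumes a local `names` shelve database has been built beforehand (for instance with `fuzzycat-issn --make-shelve`), since `fuzzycat.serials` opens it read-only at import time.

```python
# Minimal sketch (not part of the commit): assumes a local "names" shelve
# built via `fuzzycat-issn --make-shelve`; the journal names are examples.
from fuzzycat.fatcat.common import MatchStatus
from fuzzycat.fatcat.matching import verify_serial_name

status = verify_serial_name("Journal of Proteomics", "J. of Proteomics")
if status in (MatchStatus.EXACT, MatchStatus.STRONG):
    print("same serial")
```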
-rw-r--r--  fuzzycat/__init__.py         |   4
-rw-r--r--  fuzzycat/cleanups.py         |   6
-rw-r--r--  fuzzycat/fatcat/api_auth.py  |  45
-rw-r--r--  fuzzycat/fatcat/common.py    | 164
-rw-r--r--  fuzzycat/fatcat/entities.py  |  60
-rw-r--r--  fuzzycat/fatcat/matching.py  | 233
-rw-r--r--  fuzzycat/issn.py             |  46
-rw-r--r--  fuzzycat/journals.py         |  33
-rw-r--r--  fuzzycat/main.py             |   5
-rw-r--r--  fuzzycat/matching.py         | 147
-rw-r--r--  fuzzycat/serials.py          |  43
-rw-r--r--  fuzzycat/status.py           |  15
-rw-r--r--  fuzzycat/utils.py            |   4
-rw-r--r--  setup.py                     |   6
14 files changed, 577 insertions(+), 234 deletions(-)
diff --git a/fuzzycat/__init__.py b/fuzzycat/__init__.py
index 7feffd5..6c381d0 100644
--- a/fuzzycat/__init__.py
+++ b/fuzzycat/__init__.py
@@ -1,6 +1,4 @@
 __version__ = "0.1.1"

-from fuzzycat.matching import match_container_names
-from fuzzycat.status import MatchStatus
+from fuzzycat.serials import serialsdb
 from fuzzycat.utils import *
-from fuzzycat.journals import JournalLookup
diff --git a/fuzzycat/cleanups.py b/fuzzycat/cleanups.py
index d806e51..c2e021d 100644
--- a/fuzzycat/cleanups.py
+++ b/fuzzycat/cleanups.py
@@ -1,10 +1,8 @@
-
 """
 Various shared cleanup approaches.
 """

-from fuzzycat.utils import StringPipeline, normalize_whitespace, normalize_ampersand
-
+from fuzzycat.utils import (StringPipeline, normalize_ampersand, normalize_whitespace)

 # These transformations should not affect the name of a journal.
 basic = StringPipeline([
@@ -13,5 +11,3 @@ basic = StringPipeline([
     normalize_ampersand,
     lambda v: v.rstrip("."),
 ])
-
-
diff --git a/fuzzycat/fatcat/api_auth.py b/fuzzycat/fatcat/api_auth.py
new file mode 100644
index 0000000..0bad5e9
--- /dev/null
+++ b/fuzzycat/fatcat/api_auth.py
@@ -0,0 +1,45 @@
+# coding: utf-8
+"""
+API helper, taken from fatcat_tools/api_auth.py
+"""
+
+import os
+import sys
+
+import fatcat_openapi_client
+
+
+def public_api(host_uri):
+    """
+    Note: unlike the authenticated variant, this helper might get called even
+    if the API isn't going to be used, so it's important that it doesn't try to
+    actually connect to the API host or something.
+    """
+    conf = fatcat_openapi_client.Configuration()
+    conf.host = host_uri
+    return fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(conf))
+
+
+def authenticated_api(host_uri, token=None):
+    """
+    Note: if this helper is called, it's implied that an actual API connection
+    is needed, so it does try to connect and verify credentials.
+    """
+
+    conf = fatcat_openapi_client.Configuration()
+    conf.host = host_uri
+    if not token:
+        token = os.environ['FATCAT_API_AUTH_TOKEN']
+    if not token:
+        sys.stderr.write(
+            'This client requires a fatcat API token (eg, in env var FATCAT_API_AUTH_TOKEN)\n')
+        sys.exit(-1)
+
+    conf.api_key["Authorization"] = token
+    conf.api_key_prefix["Authorization"] = "Bearer"
+    api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(conf))
+
+    # verify up front that auth is working
+    api.auth_check()
+
+    return api
diff --git a/fuzzycat/fatcat/common.py b/fuzzycat/fatcat/common.py
new file mode 100644
index 0000000..7499ce4
--- /dev/null
+++ b/fuzzycat/fatcat/common.py
@@ -0,0 +1,164 @@
+# coding: utf-8
+"""
+Adapter for fatcat and fatcat entities.
+"""
+
+import collections
+from enum import Enum
+from typing import Dict, List, Type, Union
+
+from fatcat_openapi_client import (ApiException, ContainerEntity, DefaultApi, ReleaseEntity,
+                                   ReleaseExtIds, WorkEntity)
+
+from fuzzycat.fatcat.api_auth import public_api
+from fuzzycat.fatcat.entities import entity_from_dict, entity_from_json
+
+
+class MatchStatus(Enum):
+    """
+    When matching two entities, use these levels to express match strength.
+    When in doubt, use AMBIGIOUS. DIFFERENT should be used only when it is
+    certain that items do not match.
+    """
+
+    EXACT = 0
+    STRONG = 1
+    WEAK = 2
+    AMBIGIOUS = 3
+    DIFFERENT = 4
+
+
+def compare_ext_ids(a: ReleaseExtIds, b: ReleaseExtIds) -> Dict[str, int]:
+    """
+    Returns a dictionary with the number of existing, matching and differing
+    identifiers between entity a and b. TODO(martin): It might be helpful to
+    have some mapping service, that would relate qid to doi, or a mag to a
+    jstor id, if this information is known.
+    """
+    counter = collections.Counter({"a": 0, "b": 0, "both": 0, "hits": 0, "misses": 0})
+    attrs = (
+        "doi",
+        "wikidata_qid",
+        "isbn13",
+        "pmid",
+        "pmcid",
+        "core",
+        "arxiv",
+        "jstor",
+        "ark",
+        "mag",
+    )
+    for attr in attrs:
+        v = getattr(a, attr)
+        w = getattr(b, attr)
+        if v:
+            counter["a"] += 1
+        if w:
+            counter["b"] += 1
+        if not v or not w:
+            continue
+        counter["both"] += 1
+        if v == w:
+            counter["hits"] += 1
+        else:
+            counter["misses"] += 1
+    return counter
+
+
+def fetch_container_list(
+        ids: List[str],
+        api: DefaultApi = None,
+) -> List[ContainerEntity]:
+    """
+    Fetch a list of containers from the API.
+    """
+    if api is None:
+        api = public_api("https://api.fatcat.wiki/v0")
+    result = []
+    for id in ids:
+        try:
+            ce = api.get_container(id)
+            result.append(ce)
+        except ApiException as exc:
+            if exc.status == 404:
+                print("[err] failed to fetch container: {}".format(id), file=sys.stderr)
+                continue
+            raise
+    return result
+
+
+def fetch_release_list(
+        ids: List[str],
+        api: DefaultApi = None,
+) -> List[ReleaseEntity]:
+    """
+    Returns a list of entities. Some entities might be missing. Return all that
+    are accessible.
+    """
+    if api is None:
+        api = public_api("https://api.fatcat.wiki/v0")
+    result = []
+    for id in ids:
+        try:
+            re = api.get_release(id, hide="refs,abstracts", expand="container")
+            result.append(re)
+        except ApiException as exc:
+            if exc.status == 404:
+                print("[err] failed to fetch release: {}".format(id), file=sys.stderr)
+                continue
+            raise
+    return result
+
+
+def entity_comparable_attrs(
+        a: Union[ContainerEntity, ReleaseEntity],
+        b: Union[ContainerEntity, ReleaseEntity],
+        entity_type: Union[Type[ContainerEntity], Type[ReleaseEntity]],
+) -> List[str]:
+    """
+    Return a list of top-level attributes, which are defined on both entities
+    (i.e. we could actually compare them).
+    """
+    attrs = entity_type.attribute_map.keys()
+    comparable_attrs = []
+    for attr in attrs:
+        if getattr(a, attr) is None:
+            continue
+        if getattr(b, attr) is None:
+            continue
+        comparable_attrs.append(attr)
+    return comparable_attrs
+
+
+def response_to_entity_list(response, size=5, entity_type=ReleaseEntity, api=None):
+    """
+    Convert an elasticsearch result to a list of entities. Accepts both a
+    dictionary and an elasticsearch_dsl.response.Response.
+
+    We take the ids from elasticsearch and retrieve entities via API.
+    """
+    if isinstance(response, dict):
+        ids = [hit["_source"]["ident"] for hit in response["hits"]["hits"]][:size]
+    elif isinstance(response, elasticsearch_dsl.response.Response):
+        ids = [hit.to_dict().get("ident") for hit in response]
+
+    if entity_type == ReleaseEntity:
+        return fetch_release_list(ids, api=api)
+    if entity_type == ContainerEntity:
+        return fetch_container_list(ids, api=api)
+
+    raise ValueError("invalid entity type: {}".format(entity_type))
+
+
+def exact_release_match(a: ReleaseEntity, b: ReleaseEntity) -> bool:
+    """
+    Currently, entities implement comparison through object dictionaries.
+    """
+    return a == b
+
+
+def exact_work_match(a: WorkEntity, b: WorkEntity) -> bool:
+    """
+    Currently, entities implement comparison through object dictionaries.
+    """
+    return a == b
diff --git a/fuzzycat/fatcat/entities.py b/fuzzycat/fatcat/entities.py
new file mode 100644
index 0000000..351c2b8
--- /dev/null
+++ b/fuzzycat/fatcat/entities.py
@@ -0,0 +1,60 @@
+# coding: utf-8
+"""
+This is taken from fatcat_tools/transforms/entities.
+"""
+
+import collections
+import json
+
+import toml
+from fatcat_openapi_client import ApiClient
+
+
+def entity_to_dict(entity, api_client=None) -> dict:
+    """
+    Hack to take advantage of the code-generated serialization code.
+
+    Initializing/destroying ApiClient objects is surprisingly expensive
+    (because it involves a threadpool), so we allow passing an existing
+    instance. If you already have a full-on API connection `api`, you can
+    access the ApiClient object as `api.api_client`. This is such a speed-up
+    that this argument may become mandatory.
+    """
+    if not api_client:
+        api_client = ApiClient()
+    return api_client.sanitize_for_serialization(entity)
+
+
+def entity_from_json(json_str: str, entity_type, api_client=None):
+    """
+    Hack to take advantage of the code-generated deserialization code.
+
+    See note on `entity_to_dict()` about the api_client argument.
+    """
+    if not api_client:
+        api_client = ApiClient()
+    thing = collections.namedtuple('Thing', ['data'])
+    thing.data = json_str
+    return api_client.deserialize(thing, entity_type)
+
+
+def entity_from_dict(obj: dict, entity_type, api_client=None):
+    json_str = json.dumps(obj)
+    return entity_from_json(json_str, entity_type, api_client=api_client)
+
+
+def entity_to_toml(entity, api_client=None, pop_fields=None) -> str:
+    """
+    pop_fields parameter can be used to strip out some fields from the resulting
+    TOML. Eg, for fields which should not be edited, like the ident.
+    """
+    obj = entity_to_dict(entity, api_client=api_client)
+    pop_fields = pop_fields or []
+    for k in pop_fields:
+        obj.pop(k, None)
+    return toml.dumps(obj)
+
+
+def entity_from_toml(toml_str: str, entity_type, api_client=None):
+    obj = toml.loads(toml_str)
+    return entity_from_dict(obj, entity_type, api_client=api_client)
diff --git a/fuzzycat/fatcat/matching.py b/fuzzycat/fatcat/matching.py
new file mode 100644
index 0000000..194106d
--- /dev/null
+++ b/fuzzycat/fatcat/matching.py
@@ -0,0 +1,233 @@
+# coding: utf-8
+"""
+Public API for fuzzy matches for fatcat.
+
+Match methods return candidates, verify methods return a match status.
+
+    match_container_fuzzy -> List[ContainerEntity]
+    match_release_fuzzy -> List[ReleaseEntity]
+
+    verify_serial_name -> MatchStatus
+    verify_container_name -> MatchStatus
+    verify_container_match -> MatchStatus
+    verify_release_match -> MatchStatus
+
+Candidate generation will use external data from search and hence is
+expensive. Verification is fast.
+"""
+
+from typing import List
+
+import elasticsearch
+from fatcat_openapi_client import (ApiException, ContainerEntity, DefaultApi, ReleaseEntity,
+                                   ReleaseExtIds, WorkEntity)
+from fatcat_openapi_client.api.default_api import DefaultApi
+
+from fuzzycat.fatcat.common import MatchStatus, response_to_entity_list
+from fuzzycat.serials import serialsdb
+
+
+def match_container_fuzzy(container: ContainerEntity,
+                          size: int = 5,
+                          es: Optional[Union[str, elasticsearch.client.Elasticsearch]] = None,
+                          api: Optional[DefaultApi] = None) -> List[ContainerEntity]:
+    """
+    Given a container entity, which can be (very) partial, return a list of
+    candidate matches. Elasticsearch can be a hostport or the low level client
+    object.
+
+    Random data point: with 20 parallel workers calling match_container_fuzzy,
+    we get around 40 req/s.
+    """
+    assert isinstance(container, ContainerEntity)
+
+    if size is None or size == 0:
+        size = 10000  # or any large number
+
+    if isinstance(es, str):
+        es = elasticsearch.Elasticsearch([es])
+    if es is None:
+        es = elasticsearch.Elasticsearch()
+
+    # If we find any match by ISSN-L, we return only those.
+    if container.issnl:
+        s = (elasticsearch_dsl.Search(using=es, index="fatcat_container").query(
+            "term", issns=container.issnl).extra(size=size))
+        resp = s.execute()
+        if len(resp) > 0:
+            return response_to_entity_list(resp, entity_type=ContainerEntity, api=api)
+
+    # Do we have an exact QID match?
+    if container.wikidata_qid:
+        s = (elasticsearch_dsl.Search(using=es, index="fatcat_container").query(
+            "term", wikidata_qid=container.wikidata_qid).extra(size=size))
+        resp = s.execute()
+        if len(resp) > 0:
+            return response_to_entity_list(resp, entity_type=ContainerEntity, api=api)
+
+    # Start with exact name match.
+    #
+    # curl -s https://search.fatcat.wiki/fatcat_container/_mapping | jq .
+    #
+    #     "name": {
+    #       "type": "text",
+    #       "copy_to": [
+    #         "biblio"
+    #       ],
+    #       "analyzer": "textIcu",
+    #       "search_analyzer": "textIcuSearch"
+    #     },
+    #
+    body = {
+        "query": {
+            "match": {
+                "name": {
+                    "query": container.name,
+                    "operator": "AND"
+                }
+            }
+        },
+        "size": size,
+    }
+    resp = es.search(body=body, index="fatcat_container")
+    if resp["hits"]["total"] > 0:
+        return response_to_entity_list(resp, entity_type=ContainerEntity, api=api)
+
+    # Get fuzzy.
+    # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
+    body = {
+        "query": {
+            "match": {
+                "name": {
+                    "query": container.name,
+                    "operator": "AND",
+                    "fuzziness": "AUTO",
+                }
+            }
+        },
+        "size": size,
+    }
+    resp = es.search(body=body, index="fatcat_container")
+    if resp["hits"]["total"] > 0:
+        return response_to_entity_list(resp, entity_type=ContainerEntity, api=api)
+
+    return []
+
+
+def match_release_fuzzy(release: ReleaseEntity,
+                        size: int = 5,
+                        es: Optional[Union[str, elasticsearch.client.Elasticsearch]] = None,
+                        api: Optional[DefaultApi] = None) -> List[ReleaseEntity]:
+    """
+    Given a release entity, return a number of similar release entities from
+    fatcat using Elasticsearch.
+    """
+    assert isinstance(release, ReleaseEntity)
+
+    if size is None or size == 0:
+        size = 10000  # or any large number
+
+    if isinstance(es, str):
+        es = elasticsearch.Elasticsearch([es])
+    if es is None:
+        es = elasticsearch.Elasticsearch()
+
+    # Try to match by external identifier.
+    ext_ids = release.ext_ids
+    attrs = {
+        "doi": "doi",
+        "wikidata_qid": "wikidata_qid",
+        "isbn13": "isbn13",
+        "pmid": "pmid",
+        "pmcid": "pmcid",
+        "core": "code_id",
+        "arxiv": "arxiv_id",
+        "jstor": "jstor_id",
+        "ark": "ark_id",
+        "mag": "mag_id",
+    }
+    for attr, es_field in attrs.items():
+        value = getattr(ext_ids, attr)
+        if not value:
+            continue
+        s = (elasticsearch_dsl.Search(using=es,
+                                      index="fatcat_release").query("term", **{
+                                          es_field: value
+                                      }).extra(size=size))
+        resp = s.execute()
+        if len(resp) > 0:
+            return response_to_entity_list(resp, entity_type=ReleaseEntity, api=api)
+
+    body = {
+        "query": {
+            "match": {
+                "title": {
+                    "query": release.title,
+                    "operator": "AND"
+                }
+            }
+        },
+        "size": size,
+    }
+    resp = es.search(body=body, index="fatcat_release")
+    if resp["hits"]["total"] > 0:
+        return response_to_entity_list(resp, entity_type=ReleaseEntity, api=api)
+
+    # Get fuzzy.
+    # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
+    body = {
+        "query": {
+            "match": {
+                "title": {
+                    "query": release.title,
+                    "operator": "AND",
+                    "fuzziness": "AUTO",
+                }
+            }
+        },
+        "size": size,
+    }
+    resp = es.search(body=body, index="fatcat_release")
+    if resp["hits"]["total"] > 0:
+        return response_to_entity_list(resp, entity_type=ReleaseEntity, api=api)
+
+    return []
+
+
+def verify_serial_name(a: str, b: str) -> MatchStatus:
+    """
+    Serial name verification. Serial names are a subset of container names.
+    There are about 2M serials.
+    """
+    issnls_for_a = serialsdb.get(a, set())
+    issnls_for_b = serialsdb.get(b, set())
+
+    # If any name yields multiple ISSN-L, we cannot decide.
+    if len(issnls_for_a) > 1:
+        return MatchStatus.AMBIGIOUS
+    if len(issnls_for_b) > 1:
+        return MatchStatus.AMBIGIOUS
+
+    # If both names point to the same ISSN-L, it is an exact match.
+    if len(issnls_for_a) == 1 and len(issnls_for_b) == 1:
+        if len(issnls_for_a & issnls_for_b) == 1:
+            return MatchStatus.EXACT
+        else:
+            return MatchStatus.DIFFERENT
+
+    # Multiple names possible, but there is overlap.
+    if len(issnls_for_a & issnls_for_b) > 0:
+        return MatchStatus.STRONG
+
+    return MatchStatus.AMBIGIOUS
+
+
+def verify_container_name(a: str, b: str) -> MatchStatus:
+    pass
+
+
+def verify_container_match(a: ContainerEntity, b: ContainerEntity) -> MatchStatus:
+    pass
+
+
+def verify_release_match(a: ReleaseEntity, b: ReleaseEntity) -> MatchStatus:
+    pass
diff --git a/fuzzycat/issn.py b/fuzzycat/issn.py
index e866992..aa6b78a 100644
--- a/fuzzycat/issn.py
+++ b/fuzzycat/issn.py
@@ -165,13 +165,13 @@ import os
 import re
 import shelve
 import sys
-from typing import Dict, Iterable, List, Union
-
-from fuzzycat import cleanups
-from fuzzycat.utils import (SetEncoder, StringPipeline, normalize_ampersand, normalize_whitespace)
+from typing import Any, Callable, Dict, Generator, Iterable, List, Tuple, Union

 from simhash import Simhash

+from fuzzycat import cleanups
+from fuzzycat.utils import SetEncoder
+

 def listify(v: Union[str, List[str]]) -> List[str]:
     """
@@ -184,7 +184,7 @@ def listify(v: Union[str, List[str]]) -> List[str]:
     return v


-def jsonld_minimal(v: Dict) -> Dict:
+def jsonld_minimal(v: Dict[str, Any]) -> Dict[str, Any]:
     """
     Turn a JSON from issn.org into a smaller dict with a few core fields.
     Will fail, if no ISSN-L is found in the input.
@@ -207,7 +207,6 @@ def jsonld_minimal(v: Dict) -> Dict:
         return {}
     doc = {}
     for item in items:
-        pass
         # "@id": "resource/ISSN-L/0001-4125"
         # "@id": "resource/ISSN/0001-4125"
         # ...
@@ -262,7 +261,7 @@ def jsonld_minimal(v: Dict) -> Dict:

 def de_jsonld(lines: Iterable):
     """
-    Batch convert to minimal JSON.
+    Batch convert jsonld to minimal JSON and write to stdout.
     """
     for line in lines:
         line = line.strip()
@@ -275,7 +274,9 @@ def de_jsonld(lines: Iterable):
             print(json.dumps(doc, cls=SetEncoder))


-def generate_name_pairs(lines: Iterable, cleanup_pipeline=None, keep_original=True):
+def generate_name_pairs(lines: Iterable,
+                        cleanup_pipeline: Callable[[str], str] = None,
+                        keep_original: bool = True) -> Generator[Tuple[str, str, str], None, None]:
     """
     Given JSON lines, yield a tuple (issnl, a, b) of test data. Will skip on
     errors. Proto unit test data.
@@ -315,7 +316,8 @@ def generate_name_pairs(lines: Iterable, cleanup_pipeline=None, keep_original=Tr
                 b = cleanup_pipeline(b)
             yield (doc["issnl"], a, b)

-def generate_name_issn_mapping(lines: Iterable, cleanup_pipeline=None):
+
+def generate_name_issn_mapping(lines: Iterable, cleanup_pipeline: Callable[[str], str] = None):
     """
     Given JSON lines, generate a dictionary mapping names to sets of ISSN.
     Names might be reused.
@@ -327,19 +329,23 @@ def generate_name_issn_mapping(lines: Iterable, cleanup_pipeline=None):
     return mapping


-def generate_shelve(lines: Iterable, output: str, cleanup_pipeline=None):
+def generate_shelve(lines: Iterable, output: str, cleanup_pipeline: Callable[[str], str] = None):
     """
     Generate a persistent key value store from name issn mappings.
     5015523 entries, 1.1G take about 5min.
     """
     with shelve.open(output) as db:
-        for name, issnls in generate_name_issn_mapping(lines, cleanup_pipeline=cleanup_pipeline).items():
+        mapping = generate_name_issn_mapping(lines, cleanup_pipeline=cleanup_pipeline)
+        for name, issnls in mapping.items():
             db[name] = issnls
     print("wrote {} keys to {}".format(len(db), output), file=sys.stderr)

+
 def generate_simhash(lines: Iterable):
     """
-    simhash matches vs non-matches.
+    Print TSV with simhash values.
+
+    Match and non-match count.

     1069447 1
     927120 0
@@ -366,28 +372,24 @@ def main():
     parser.add_argument("--make-shelve",
                         action="store_true",
                         help="generate trie mapping from name to list of ISSN")
-    parser.add_argument("--make-simhash",
-                        action="store_true",
-                        help="print out simhash value")
+    parser.add_argument("--make-simhash", action="store_true", help="print out simhash value")
     parser.add_argument("-o",
                         "--output",
                         type=str,
                         default="output.file",
                         help="write output to file")
-    parser.add_argument("-c",
-                        "--cleanup",
-                        type=str,
-                        default=None,
-                        help="cleanup pipeline name")
+    parser.add_argument("-c", "--cleanup", type=str, default=None, help="cleanup pipeline name")
     parser.add_argument("--de-jsonld", action="store_true", help="break up the jsonld")

     args = parser.parse_args()

-    # Map more cleanup routines.
+    # Add additional cleanup routines here.
     cleanup = dict(basic=cleanups.basic).get(args.cleanup)

     if args.make_mapping:
-        print(json.dumps(generate_name_issn_mapping(args.file, cleanup_pipeline=cleanup), cls=SetEncoder))
+        print(
+            json.dumps(generate_name_issn_mapping(args.file, cleanup_pipeline=cleanup),
+                       cls=SetEncoder))

     if args.make_pairs:
         for issn, a, b in generate_name_pairs(args.file, cleanup_pipeline=cleanup):
             print("{}\t{}\t{}".format(issn, a, b))
diff --git a/fuzzycat/journals.py b/fuzzycat/journals.py
deleted file mode 100644
index bd76b7f..0000000
--- a/fuzzycat/journals.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# coding: utf-8
-
-"""
-Journal name matching. Includes names from issn database and abbreviations.
-"""
-
-import shelve
-
-
-class JournalLookup:
-    """
-    Lookup allows to lookup journals, using a database of real journal names.
-
-    >>> lookup = JournalLookup()
-    >>> lookup["Philosophica"]
-    {'1857-9272', '2232-299X', '2232-3007', '2232-3015'}
-
-    """
-    def __init__(self, namedb='names'):
-        """
-        Note that shelve appends "db" to the name automatically.
-        """
-        self.db = shelve.open(namedb)
-
-    def __getitem__(self, v):
-        return self.db[v]
-
-    def get(self, v, cleanup_pipeline=None):
-        if not cleanup_pipeline:
-            return self.db.get(v)
-        return self.db.get(cleanup_pipeline(v))
-
-    def close(self):
-        self.db.close()
diff --git a/fuzzycat/main.py b/fuzzycat/main.py
deleted file mode 100644
index 8da283b..0000000
--- a/fuzzycat/main.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from fuzzycat import __version__
-
-
-def main():
-    print("hello fuzzycat {}".format(__version__))
diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py
deleted file mode 100644
index cbadbc2..0000000
--- a/fuzzycat/matching.py
+++ /dev/null
@@ -1,147 +0,0 @@
-import re
-import string
-
-from ftfy import fix_text
-from unidecode import unidecode
-
-from fuzzycat.status import MatchStatus
-from fuzzycat.utils import *
-
-
-def match_container_names(a: str, b: str) -> MatchStatus:
-    """
-    Given two strings representing container names, return a match status.
-    TODO(martin): incorporate abbreviations mapping, other synonyms.
-
-    Some name stats over 146302 real names from fatcat.
-
-    In [11]: len(df)
-    Out[11]: 146302
-
-    In [12]: df.head()
-    Out[12]:
-                                                     name  nlen
-    0                        Sartre Studies International    28
-    1                                 Revolutionary world    19
-    2   Monograph Series on Nonlinear Science and Comp...    52
-    3                                   Hepatitis Monthly    17
-    4                                              TRACEY     6
-
-    In [13]: df.describe()
-    Out[13]:
-                    nlen
-    count  146302.000000
-    mean       33.891861
-    std        18.955551
-    min         2.000000
-    25%        20.000000
-    50%        31.000000
-    75%        44.000000
-    max       286.000000
-
-    Around 4000 names which are not [a-zA-Z ], e.g.:
-
-    In [23]: df[df.is_alpha_only == False].sample(n=5)
-    Out[23]:
-                                                         name  nlen  is_alpha_only
-    118497                     Журнал Фронтирных Исследований    30          False
-    124885  Õpetatud Eesti Seltsi Aastaraamat/Yearbook of ...    74          False
-    142217             Études économiques de l'OCDE : Norvège    38          False
-    34681             حولیة کلیة أصول الدین والدعوة بالمنوفیة    39          False
-    132251  Известия Российской академии наук Теория и сис...    61          False
-
-    """
-
-    if a is None or b is None:
-        raise ValueError("strings required, got: a = {}, b = {}".format(a, b))
-
-    # Basic normalisation, try to remove superfluous whitespace, which should
-    # never matter, "HNO Praxis"
-    string_cleanups = StringPipeline([
-        str.lower,
-        str.strip,
-        fix_text,
-        lambda s: re.sub(r"\s{2,}", " ", s),
-        lambda s: s.replace("&", "and"),
-    ])
-    a = string_cleanups.run(a)
-    b = string_cleanups.run(b)
-
-    # Derive some characteristics of the string. The keys are free form which
-    # may or may not be a problem. TODO(martin): maybe subclass str and just
-    # add additional methods?
-    sa = StringAnnotator([
-        lambda s: {
-            "is_short_string": len(s) < 15
-        },
-        lambda s: {
-            "is_printable_only": all(c in string.printable for c in s)
-        },
-        lambda s: {
-            "is_single_token": len(s.split()) < 2
-        },
-        lambda s: {
-            "letter_to_non_letter_ratio": letter_to_non_letter_ratio(s)
-        },
-        lambda s: {
-            "alphanumeric_ratio": alphanumeric_ratio(s)
-        },
-        lambda s: {
-            "has_diacritics": s != unidecode(s)
-        },
-        lambda s: {
-            "startswith_the": s.startswith("the ")
-        },
-        lambda s: {
-            "parenthesized_year": parenthesized_year(s)
-        },
-        lambda s: {
-            "alphanumeric_only": alphanumeric_only(s)
-        },
-    ])
-    asa = sa.run(a)
-    bsa = sa.run(b)
-
-    if asa["is_short_string"] and asa["letter_to_non_letter_ratio"] > 0.4:
-        if a == b:
-            return MatchStatus.EXACT
-
-    if not asa["is_short_string"] and not asa["is_single_token"]:
-        if a == b:
-            return MatchStatus.EXACT
-
-    # Short, single (ascii) word titles, like "Language" and the like. Single
-    # token "臨床皮膚科" needs to pass.
-    if asa["is_printable_only"] and asa["is_single_token"]:
-        return MatchStatus.AMBIGIOUS
-
-    if a == b:
-        return MatchStatus.EXACT
-
-    # Mostly ASCII, but with some possible artifacts.
-    if (asa["alphanumeric_ratio"] > 0.9 and asa["alphanumeric_only"] == bsa["alphanumeric_only"]):
-        return MatchStatus.STRONG
-
-    # Year in parentheses case, e.g. "Conf X (2018)" and "Conf X (2019)" should
-    # be different; about 3% of names contain a '(', 1% some possible date.
-    if (asa["parenthesized_year"] and asa["parenthesized_year"] == bsa["parenthesized_year"]):
-        return MatchStatus.DIFFERENT
-
-    # Common prefixes (maybe curate these manually):
-    common_prefixes = ("precarpathian bulletin of the shevchenko scientific society", )
-    for prefix in common_prefixes:
-        if a.startswith(prefix) and a != b:
-            return MatchStatus.DIFFERENT
-
-    if (not asa["is_short"] and not bsa["is_short"] and common_prefix_length_ratio(a, b) > 0.9):
-        return MatchStatus.STRONG
-
-    if (not asa["is_short"] and not bsa["is_short"] and common_prefix_length_ratio(a, b) > 0.7):
-        return MatchStatus.WEAK
-
-    # Address e.g. a char flip, but only, if we do not have diacritics.
-    if (not asa["is_short_string"] and not asa["is_single_token"] and not asa["has_diacritics"]
-            and hamming_distance(a, b) < 2):
-        return MatchStatus.STRONG
-
-    return MatchStatus.AMBIGIOUS
diff --git a/fuzzycat/serials.py b/fuzzycat/serials.py
new file mode 100644
index 0000000..5222084
--- /dev/null
+++ b/fuzzycat/serials.py
@@ -0,0 +1,43 @@
+# coding: utf-8
+"""
+Serial name matching. Includes names from issn database.
+"""
+
+import os
+import shelve
+
+__all__ = ["serialsdb"]
+
+
+class SerialsDatabase:
+    """
+    Lookup allows to lookup serial names, using a database of real serial names.
+
+    >>> from fuzzycat.serials import serialsdb
+    >>> serialsdb.get("Philosophica")
+    {'1857-9272', '2232-299X', '2232-3007', '2232-3015'}
+
+    """
+    def __init__(self, path='names'):
+        """
+        Note that shelve appends "db" to the name automatically. TODO: make this
+        auto-download into a cache directory.
+        """
+        if path is None:
+            path = os.path.join(os.path.expanduser("~"), ".cache/fuzzycat/names")
+        self.db = shelve.open(path, flag='r')
+
+    def __getitem__(self, v):
+        return self.db[v]
+
+    def get(self, v, default=None, cleanup_pipeline=None):
+        if not cleanup_pipeline:
+            return self.db.get(v, default=default)
+        return self.db.get(cleanup_pipeline(v), default=default)
+
+    def close(self):
+        self.db.close()
+
+
+# A singleton.
+serialsdb = SerialsDatabase()
diff --git a/fuzzycat/status.py b/fuzzycat/status.py
deleted file mode 100644
index f87c4e6..0000000
--- a/fuzzycat/status.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from enum import Enum
-
-
-class MatchStatus(Enum):
-    """
-    When matching two entities, use these levels to express match strength.
-    When in doubt, use AMBIGIOUS. DIFFERENT should be used only, when it is
-    certain, that items do not match.
-    """
-
-    EXACT = 0
-    STRONG = 1
-    WEAK = 2
-    AMBIGIOUS = 3
-    DIFFERENT = 4
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index ab693eb..9d2a2f7 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -37,7 +37,7 @@ class StringPipeline:
     ...    normalize_whitespace,
     ...    normalize_ampersand,
     ... ])
-    >>> cleanups.run("<a>Input & Output</a>")
+    >>> cleanups("<a>Input & Output</a>")
     input and output

     """
@@ -49,7 +49,7 @@

     def run(self, s: str) -> str:
         """
-        Apply all functions and return the result.
+        Apply all functions and return the result. Deprecated: just call the object.
        """
         for f in self.fs:
             s = f(s)
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -23,18 +23,20 @@ with open("README.md", "r") as fh:
     python_requires=">=3.6",
     zip_safe=False,
     entry_points={"console_scripts": [
-        "fuzzycat=fuzzycat.main:main",
         "fuzzycat-issn=fuzzycat.issn:main",
     ],},
     install_requires=[
         "fatcat-openapi-client",
         "ftfy",
         "simhash",
-        "unidecode",
+        "unidecode>=0.10",
+        "toml",
+        "elasticsearch>=7",
     ],
     extras_require={"dev": [
         "ipython",
         "isort",
+        "pylint",
         "jupyter",
         "matplotlib",
         "pandas",