5 files changed, 0 insertions, 564 deletions
diff --git a/fuzzycat/fatcat/api_auth.py b/fuzzycat/fatcat/api_auth.py
deleted file mode 100644
index 0bad5e9..0000000
--- a/fuzzycat/fatcat/api_auth.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# coding: utf-8
-"""
-API helper, taken from fatcat_tools/api_auth.py
-"""
-
-import os
-import sys
-
-import fatcat_openapi_client
-
-
-def public_api(host_uri):
-    """
-    Note: unlike the authenticated variant, this helper might get called even
-    if the API isn't going to be used, so it's important that it doesn't try to
-    actually connect to the API host or something.
-    """
-    conf = fatcat_openapi_client.Configuration()
-    conf.host = host_uri
-    return fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(conf))
-
-
-def authenticated_api(host_uri, token=None):
-    """
-    Note: if this helper is called, it's implied that an actual API connection
-    is needed, so it does try to connect and verify credentials.
-    """
-
-    conf = fatcat_openapi_client.Configuration()
-    conf.host = host_uri
-    if not token:
-        token = os.environ['FATCAT_API_AUTH_TOKEN']
-    if not token:
-        sys.stderr.write(
-            'This client requires a fatcat API token (eg, in env var FATCAT_API_AUTH_TOKEN)\n')
-        sys.exit(-1)
-
-    conf.api_key["Authorization"] = token
-    conf.api_key_prefix["Authorization"] = "Bearer"
-    api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(conf))
-
-    # verify up front that auth is working
-    api.auth_check()
-
-    return api
diff --git a/fuzzycat/fatcat/common.py b/fuzzycat/fatcat/common.py
deleted file mode 100644
index 7499ce4..0000000
--- a/fuzzycat/fatcat/common.py
+++ /dev/null
@@ -1,164 +0,0 @@
-# coding: utf-8
-"""
-Adapter for fatcat and fatcat entities.
-"""
-
-import collections
-from enum import Enum
-from typing import Dict, List, Type, Union
-
-from fatcat_openapi_client import (ApiException, ContainerEntity, DefaultApi, ReleaseEntity,
-                                   ReleaseExtIds, WorkEntity)
-
-from fuzzycat.fatcat.api_auth import public_api
-from fuzzycat.fatcat.entities import entity_from_dict, entity_from_json
-
-
-class MatchStatus(Enum):
-    """
-    When matching two entities, use these levels to express match strength.
-    When in doubt, use AMBIGIOUS. DIFFERENT should be used only, when it is
-    certain, that items do not match.
-    """
-
-    EXACT = 0
-    STRONG = 1
-    WEAK = 2
-    AMBIGIOUS = 3
-    DIFFERENT = 4
-
-
-def compare_ext_ids(a: ReleaseExtIds, b: ReleaseExtIds) -> Dict[str, int]:
-    """
-    Returns a dictionary with number of existing, matching and differing
-    identifier between entity a and b. TODO(martin): It might be helpful to
-    have some mapping service, that would relate qid to doi, or a mag to a
-    jstor id, if this information is known.
-    """
-    counter = collections.Counter({"a": 0, "b": 0, "both": 0, "hits": 0, "misses": 0})
-    attrs = (
-        "doi",
-        "wikidata_qid",
-        "isbn13",
-        "pmid",
-        "pmcid",
-        "core",
-        "arxiv",
-        "jstor",
-        "ark",
-        "mag",
-    )
-    for attr in attrs:
-        v = getattr(a, attr)
-        w = getattr(b, attr)
-        if v:
-            counter["a"] += 1
-        if w:
-            counter["b"] += 1
-        if not v or not w:
-            continue
-        counter["both"] += 1
-        if v == w:
-            counter["hits"] += 1
-        else:
-            counter["misses"] += 1
-    return counter
-
-
-def fetch_container_list(
-    ids: List[str],
-    api: DefaultApi = None,
-) -> List[ContainerEntity]:
-    """
-    Fetch a list of containers from the API.
-    """
-    if api is None:
-        api = public_api("https://api.fatcat.wiki/v0")
-    result = []
-    for id in ids:
-        try:
-            ce = api.get_container(id)
-            result.append(ce)
-        except ApiException as exc:
-            if exc.status == 404:
-                print("[err] failed to fetch container: {}".format(id), file=sys.stderr)
-                continue
-            raise
-    return result
-
-
-def fetch_release_list(
-    ids: List[str],
-    api: DefaultApi = None,
-) -> List[ReleaseEntity]:
-    """
-    Returns a list of entities. Some entities might be missing. Return all that
-    are accessible.
-    """
-    if api is None:
-        api = public_api("https://api.fatcat.wiki/v0")
-    result = []
-    for id in ids:
-        try:
-            re = api.get_release(id, hide="refs,abstracts", expand="container")
-            result.append(re)
-        except ApiException as exc:
-            if exc.status == 404:
-                print("[err] failed to fetch release: {}".format(id), file=sys.stderr)
-                continue
-            raise
-    return result
-
-
-def entity_comparable_attrs(
-    a: Union[ContainerEntity, ReleaseEntity],
-    b: Union[ContainerEntity, ReleaseEntity],
-    entity_type: Union[Type[ContainerEntity], Type[ReleaseEntity]],
-) -> List[str]:
-    """
-    Return a list of top-level attributes, which are defined on both entities
-    (i.e. we could actually compare them).
-    """
-    attrs = entity_type.attribute_map.keys()
-    comparable_attrs = []
-    for attr in attrs:
-        if getattr(a, attr) is None:
-            continue
-        if getattr(b, attr) is None:
-            continue
-        comparable_attrs.append(attr)
-    return comparable_attrs
-
-
-def response_to_entity_list(response, size=5, entity_type=ReleaseEntity, api=None):
-    """
-    Convert an elasticsearch result to a list of entities. Accepts both a
-    dictionary and an elasticsearch_dsl.response.Response.
-
-    We take the ids from elasticsearch and retrieve entities via API.
-    """
-    if isinstance(response, dict):
-        ids = [hit["_source"]["ident"] for hit in response["hits"]["hits"]][:size]
-    elif isinstance(response, elasticsearch_dsl.response.Response):
-        ids = [hit.to_dict().get("ident") for hit in response]
-
-    if entity_type == ReleaseEntity:
-        return fetch_release_list(ids, api=api)
-    if entity_type == ContainerEntity:
-        return fetch_container_list(ids, api=api)
-
-    raise ValueError("invalid entity type: {}".format(entity_type))
-
-
-def exact_release_match(a: ReleaseEntity, b: ReleaseEntity) -> bool:
-    """
-    Currently, entities implement comparison through object dictionaries.
-    """
-    return a == b
-
-
-def exact_work_match(a: WorkEntity, b: WorkEntity) -> bool:
-    """
-    Currently, entities implement comparison through object dictionaries.
-    """
-    return a == b
diff --git a/fuzzycat/fatcat/entities.py b/fuzzycat/fatcat/entities.py
deleted file mode 100644
index 351c2b8..0000000
--- a/fuzzycat/fatcat/entities.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# coding: utf-8
-"""
-This is taken from fatcat_tools/transforms/entities.
-"""
-
-import collections
-import json
-
-import toml
-from fatcat_openapi_client import ApiClient
-
-
-def entity_to_dict(entity, api_client=None) -> dict:
-    """
-    Hack to take advantage of the code-generated serialization code.
-
-    Initializing/destroying ApiClient objects is surprisingly expensive
-    (because it involves a threadpool), so we allow passing an existing
-    instance. If you already have a full-on API connection `api`, you can
-    access the ApiClient object as `api.api_client`. This is such a speed-up
-    that this argument may become mandatory.
-    """
-    if not api_client:
-        api_client = ApiClient()
-    return api_client.sanitize_for_serialization(entity)
-
-
-def entity_from_json(json_str: str, entity_type, api_client=None):
-    """
-    Hack to take advantage of the code-generated deserialization code
-
-    See note on `entity_to_dict()` about api_client argument.
-    """
-    if not api_client:
-        api_client = ApiClient()
-    thing = collections.namedtuple('Thing', ['data'])
-    thing.data = json_str
-    return api_client.deserialize(thing, entity_type)
-
-
-def entity_from_dict(obj: dict, entity_type, api_client=None):
-    json_str = json.dumps(obj)
-    return entity_from_json(json_str, entity_type, api_client=api_client)
-
-
-def entity_to_toml(entity, api_client=None, pop_fields=None) -> str:
-    """
-    pop_fields parameter can be used to strip out some fields from the resulting
-    TOML. Eg, for fields which should not be edited, like the ident.
-    """
-    obj = entity_to_dict(entity, api_client=api_client)
-    pop_fields = pop_fields or []
-    for k in pop_fields:
-        obj.pop(k, None)
-    return toml.dumps(obj)
-
-
-def entity_from_toml(toml_str: str, entity_type, api_client=None):
-    obj = toml.loads(toml_str)
-    return entity_from_dict(obj, entity_type, api_client=api_client)
diff --git a/fuzzycat/fatcat/main.py b/fuzzycat/fatcat/main.py
deleted file mode 100644
index 07e4ad4..0000000
--- a/fuzzycat/fatcat/main.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# coding: utf-8
-"""
-Command line entry point for ad-hoc testing.
-"""
-
-import argparse
-
-from fatcat_openapi_client import ReleaseEntity, ReleaseExtIds
-
-from fuzzycat.fatcat.matching import match_release_fuzzy
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-R", "--release", help="match release", action="store_true")
-    parser.add_argument("-t", "--title", help="title")
-
-    args = parser.parse_args()
-
-    if args.release and args.title:
-        re = ReleaseEntity(title=args.title, ext_ids=ReleaseExtIds())
-        print(match_release_fuzzy(re, es="https://search.fatcat.wiki"))
diff --git a/fuzzycat/fatcat/matching.py b/fuzzycat/fatcat/matching.py
deleted file mode 100644
index 04ec275..0000000
--- a/fuzzycat/fatcat/matching.py
+++ /dev/null
@@ -1,273 +0,0 @@
-# coding: utf-8
-"""
-Public API for fuzzy matches for fatcat.
-
-Match methods return candidates, verify methods return a match status.
-
-    match_containar_fuzzy  -> List[ContainerEntity]
-    match_release_fuzzy    -> List[ReleaseEntity]
-
-    verify_serial_name     -> MatchStatus
-    verify_container_name  -> MatchStatus
-    verify_container_fuzzy -> MatchStatus
-    verify_release_fuzzy   -> MatchStatus
-
-Candidate generation will use external data from search and hence is expensive. Verification is fast.
-"""
-
-from typing import List, Optional, Set, Union
-
-import elasticsearch
-from fatcat_openapi_client import (ApiException, ContainerEntity, DefaultApi, ReleaseEntity,
-                                   ReleaseExtIds, WorkEntity)
-from fatcat_openapi_client.api.default_api import DefaultApi
-
-from fuzzycat import cleanups
-from fuzzycat.fatcat.common import (MatchStatus, compare_ext_ids, response_to_entity_list)
-from fuzzycat.serials import serialsdb
-
-
-def match_container_fuzzy(container: ContainerEntity,
-                          size: int = 5,
-                          es: Optional[Union[str, elasticsearch.client.Elasticsearch]] = None,
-                          api: Optional[DefaultApi] = None) -> List[ContainerEntity]:
-    """
-    Given a container entity, which can be (very) partial, return a list of
-    candidate matches. Elasticsearch can be a hostport or the low level client
-    object.
-
-    Random data point: with 20 parallel workers callind match_container_fuzzy,
-    we get around 40 req/s.
-    """
-    assert isinstance(container, ContainerEntity)
-
-    if size is None or size == 0:
-        size = 10000  # or any large number
-
-    if isinstance(es, str):
-        es = elasticsearch.Elasticsearch([es])
-    if es is None:
-        es = elasticsearch.Elasticsearch()
-
-    # If we find any match by ISSN-L, we return only those.
-    if container.issnl:
-        s = (elasticsearch_dsl.Search(using=es, index="fatcat_container").query(
-            "term", issns=container.issnl).extra(size=size))
-        resp = s.execute()
-        if len(resp) > 0:
-            return response_to_entity_list(resp, entity_type=ContainerEntity, api=api)
-
-    # Do we have an exact QID match?
-    if container.wikidata_qid:
-        s = (elasticsearch_dsl.Search(using=es, index="fatcat_container").query(
-            "term", wikidata_qid=container.wikidata_qid).extra(size=size))
-        resp = s.execute()
-        if len(resp) > 0:
-            return response_to_entity_list(resp, entity_type=ContainerEntity, api=api)
-
-    # Start with exact name match.
-    #
-    # curl -s https://search.fatcat.wiki/fatcat_container/_mapping  | jq .
-    #
-    # "name": {
-    #   "type": "text",
-    #   "copy_to": [
-    #     "biblio"
-    #   ],
-    #   "analyzer": "textIcu",
-    #   "search_analyzer": "textIcuSearch"
-    # },
-    #
-    body = {
-        "query": {
-            "match": {
-                "name": {
-                    "query": container.name,
-                    "operator": "AND"
-                }
-            }
-        },
-        "size": size,
-    }
-    resp = es.search(body=body, index="fatcat_container")
-    if resp["hits"]["total"] > 0:
-        return response_to_entity_list(resp, entity_type=ContainerEntity, api=api)
-
-    # Get fuzzy.
-    # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
-    body = {
-        "query": {
-            "match": {
-                "name": {
-                    "query": container.name,
-                    "operator": "AND",
-                    "fuzziness": "AUTO",
-                }
-            }
-        },
-        "size": size,
-    }
-    resp = es.search(body=body, index="fatcat_container")
-    if resp["hits"]["total"] > 0:
-        return response_to_entity_list(resp, entity_type=ContainerEntity, api=api)
-
-    return []
-
-
-def match_release_fuzzy(release: ReleaseEntity,
-                        size: int = 5,
-                        es: Optional[Union[str, elasticsearch.client.Elasticsearch]] = None,
-                        api: Optional[DefaultApi] = None) -> List[ReleaseEntity]:
-    """
-    Given a release entity, return a number similar release entities from
-    fatcat using Elasticsearch.
-    """
-    assert isinstance(release, ReleaseEntity)
-
-    if size is None or size == 0:
-        size = 10000  # or any large number
-
-    if isinstance(es, str):
-        es = elasticsearch.Elasticsearch([es])
-    if es is None:
-        es = elasticsearch.Elasticsearch()
-
-    # Try to match by external identifier.
-    ext_ids = release.ext_ids
-    attrs = {
-        "doi": "doi",
-        "wikidata_qid": "wikidata_qid",
-        "isbn13": "isbn13",
-        "pmid": "pmid",
-        "pmcid": "pmcid",
-        "core": "code_id",
-        "arxiv": "arxiv_id",
-        "jstor": "jstor_id",
-        "ark": "ark_id",
-        "mag": "mag_id",
-    }
-    for attr, es_field in attrs.items():
-        value = getattr(ext_ids, attr)
-        if not value:
-            continue
-        s = (elasticsearch_dsl.Search(using=es,
-                                      index="fatcat_release").query("term", **{
-                                          es_field: value
-                                      }).extra(size=size))
-        resp = s.execute()
-        if len(resp) > 0:
-            return response_to_entity_list(resp, entity_type=ReleaseEntity, api=api)
-
-    body = {
-        "query": {
-            "match": {
-                "title": {
-                    "query": release.title,
-                    "operator": "AND"
-                }
-            }
-        },
-        "size": size,
-    }
-    resp = es.search(body=body, index="fatcat_release")
-    if resp["hits"]["total"] > 0:
-        return response_to_entity_list(resp, entity_type=ReleaseEntity, api=api)
-
-    # Get fuzzy.
-    # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
-    body = {
-        "query": {
-            "match": {
-                "title": {
-                    "query": release.title,
-                    "operator": "AND",
-                    "fuzziness": "AUTO",
-                }
-            }
-        },
-        "size": size,
-    }
-    resp = es.search(body=body, index="fatcat_release")
-    if resp["hits"]["total"] > 0:
-        return response_to_entity_list(resp, entity_type=ReleaseEntity, api=api)
-
-    return []
-
-
-def verify_serial_name(a: str, b: str) -> MatchStatus:
-    """
-    Serial name verification. Serial names are a subset of container names.
-    There are about 2M serials.
-    """
-    def verify(a: Set[str], b: Set[str]) -> MatchStatus:
-
-        # If any name yields multiple ISSN-L, we cannot decide.
-        if len(a) > 1:
-            return MatchStatus.AMBIGIOUS
-        if len(b) > 1:
-            return MatchStatus.AMBIGIOUS
-
-        # If both names point the same ISSN-L, it is an exact match.
-        if len(a) > 0 and len(a) == len(b):
-            if len(a & b) == len(a):
-                return MatchStatus.EXACT
-            else:
-                return MatchStatus.DIFFERENT
-
-        # Multiple names possible, but there is overlap.
-        if len(a & b) > 0:
-            return MatchStatus.STRONG
-
-        return MatchStatus.AMBIGIOUS
-
-    # First, try values as given.
-    issnls_for_a = serialsdb.get(a, set())
-    issnls_for_b = serialsdb.get(b, set())
-
-    status = verify(issnls_for_a, issnls_for_b)
-    if status != MatchStatus.AMBIGIOUS:
-        return status
-
-    # Try to match slightly cleaned up values.
-    issnls_for_a = serialsdb.get(a, set(), cleanup_pipeline=cleanups.basic)
-    issnls_for_b = serialsdb.get(b, set(), cleanup_pipeline=cleanups.basic)
-
-    return verify(issnls_for_a, issnls_for_b)
-
-
-def verify_container_name(a: str, b: str) -> MatchStatus:
-    status = verify_serial_name(a, b)
-    if status != MatchStatus.AMBIGIOUS:
-        return status
-
-    # TODO: add additional verification, string match and common patterns.
-
-
-def verify_container_match(a: ContainerEntity, b: ContainerEntity) -> MatchStatus:
-    pass
-
-
-def verify_release_match(a: ReleaseEntity, b: ReleaseEntity) -> MatchStatus:
-    assert isinstance(a, ReleaseEntity)
-    assert isinstance(b, ReleaseEntity)
-
-    if a == b:
-        return MatchStatus.EXACT
-
-    a_ext_ids, b_ext_ids = a.ext_ids, b.ext_ids
-    # Compare ext ids, result is a counter, we are interested in "hits" and
-    # "misses", only.
-    cmp_result = compare_ext_ids(a_ext_ids, b_ext_ids)
-
-    # Assume that if more ids match than mismatch, it is a good signal, e.g. if
-    # only a DOI is defined and they match, it is an exact match.
-    if cmp_result["hits"] > 0 and cmp_result["misses"] == 0:
-        return MatchStatus.EXACT
-    if cmp_result["hits"] > cmp_result["misses"]:
-        return MatchStatus.STRONG
-    if cmp_result["hits"] == 0 and cmp_result["misses"] > 0:
-        return MatchStatus.DIFFERENT
-    if cmp_result["hits"] < cmp_result["misses"]:
-        return MatchStatus.AMBIGIOUS
-
-    # TODO: do title verification, apply string cleanups, etc.