diff options
-rw-r--r-- | Makefile | 22 | ||||
-rw-r--r-- | pytest.ini | 2 | ||||
-rw-r--r-- | tests/test_matching.py | 68 |
3 files changed, 62 insertions, 30 deletions
@@ -23,13 +23,6 @@ fmt: ## Apply import sorting and yapf source formatting on all files dist: ## Create source distribution and wheel python setup.py sdist bdist_wheel -# https://engineering.fb.com/2018/07/13/data-infrastructure/xars-a-more-efficient-open-source-system-for-self-contained-executables/ -# -# Required a build from source https://github.com/vasi/squashfuse, to get the squashfuse_ll (low level) executable. -.PHONY: xar -xar: ## Create a XAR standalone package (https://github.com/facebookincubator/xar, https://github.com/vasi/squashfuse) - python setup.py bdist_xar - .PHONY: cov cov: ## Run coverage report pipenv run pytest --cov=fuzzycat fuzzycat/*.py tests/ # --cov-report annotate:cov_annotate --cov-report html @@ -74,18 +67,3 @@ upload: dist ## Upload to pypi # For automatic package deployments, also see: .gitlab-ci.yml. twine upload $(TWINE_OPTS) dist/* -# ==== data related targets -# -# data/release_export_expanded.json.gz: ## Download release export -# mkdir -p data -# wget -c https://archive.org/download/$(FATCAT_BULK_EXPORT_ITEM)/release_export_expanded.json.gz -O $@ -# -# data/container_export.json.gz: ## Download container export -# mkdir -p data -# wget -c https://archive.org/download/$(FATCAT_BULK_EXPORT_ITEM)/container_export.json.gz -O $@ -# -# data/name_to_issn.json: data/issn.ndj ## Create a name to ISSN mapping (needs an ISSN JSON dump) -# fuzzycat-issn --make-mapping $^ > $@ -# -# names.db: data/issn.ndj -# fuzzycat-issn --make-shelve -c basic -o names $^ diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..401d96f --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +log_cli = True diff --git a/tests/test_matching.py b/tests/test_matching.py index 56999e6..997a9e6 100644 --- a/tests/test_matching.py +++ b/tests/test_matching.py @@ -1,3 +1,18 @@ +<<<<<<< HEAD +import logging +import warnings + +import elasticsearch +import pytest +import requests +from dynaconf import Dynaconf +from fatcat_openapi_client import ReleaseEntity + +from fuzzycat.entities import entity_from_dict +from fuzzycat.matching import anything_to_entity, match_release_fuzzy + +warnings.filterwarnings("ignore") # InsecureRequestWarning: Unverified HTTPS request is being made to host ... + from fuzzycat.matching import anything_to_entity, match_release_fuzzy from fuzzycat.config import settings from fatcat_openapi_client import ReleaseEntity @@ -8,22 +23,59 @@ import logging logger = logging.getLogger('test_matching') logger.setLevel(logging.DEBUG) +# ad-hoc override search server with: FUZZYCAT_FATCAT_SEARCH_URL=localhost:9200 pytest ... FATCAT_SEARCH_URL = settings.get("FATCAT_SEARCH_URL", "https://search.fatcat.wiki:443") +def is_not_reachable(url, timeout=3): + return not is_reachable(url) + +def is_reachable(url, timeout=3): + """ + Return true, if URL is reachable and returns HTTP 200. + """ + try: + return requests.get(url, verify=False, timeout=timeout).ok + except Exception: + return False + @pytest.fixture def es_client(): return elasticsearch.Elasticsearch([FATCAT_SEARCH_URL]) -@pytest.mark.skip(reason="we cannot use POST on es, which client uses: https://git.io/JLssk") -def test_match_release_fuzzy(es_client): - cases = (("wtv64ahbdzgwnan7rllwr3nurm", 2), ) - for case, count in cases: - entity = anything_to_entity(case, ReleaseEntity) - logger.info(entity.title) +@pytest.mark.skipif(is_not_reachable(FATCAT_SEARCH_URL), + reason="{} not reachable, use e.g. FUZZYCAT_FATCAT_SEARCH_URL=localhost:9200 to override".format(FATCAT_SEARCH_URL)) +def test_match_release_fuzzy(es_client, caplog): + cases = ( + ("wtv64ahbdzgwnan7rllwr3nurm", 1), + ("eqcgtpav3na5jh56o5vjsvb4ei", 1), + ) + for i, (ident, count) in enumerate(cases): + entity = anything_to_entity(ident, ReleaseEntity) + + result = match_release_fuzzy(entity, es=es_client) + logger.info("[{}] given {}, found {}".format(i, entity.title, len(result))) + assert len(result) == count + # Partial data. + cases = ( + ({ + "title": "digital libraries", + "ext_ids": {} + }, 5), + ({ + "title": "The Future of Digital Scholarship", + "contribs": [{ + "raw_name": "Costantino Thanos" + }], + "ext_ids": {} + }, 5), + ) + for i, (doc, count) in enumerate(cases): + entity = entity_from_dict(doc, ReleaseEntity) result = match_release_fuzzy(entity, es=es_client) - logger.info("given: {}".format(entity.title)) - logger.info("found: {}".format(len(result))) + with caplog.at_level(logging.INFO): + logging.info("[{}] given {}, found {}, {}".format(i, entity.title, len(result), + [v.title for v in result])) assert len(result) == count |