-rw-r--r--  fuzzycat/__main__.py    | 131
-rw-r--r--  fuzzycat/matching.py    |   2
-rw-r--r--  tests/test_matching.py  |  15
3 files changed, 130 insertions, 18 deletions
diff --git a/fuzzycat/__main__.py b/fuzzycat/__main__.py
index cf29eaa..45c41d3 100644
--- a/fuzzycat/__main__.py
+++ b/fuzzycat/__main__.py
@@ -1,15 +1,47 @@
 #!/usr/bin/env python
 """Usage: fuzzycat COMMAND [options]

-Commands: cluster, verify, verify-single
+COMMANDS

-Run, e.g. fuzzycat cluster --help for more options. Example:
+    cluster
+    verify
+    verify_single
+    release_match

-    $ zstdcat -T0 release_export_expanded.json.zst |
-        parallel --tmpdir /fast/tmp --roundrobin --pipe -j 4 |
-        python -m fuzzycat.main cluster --tmpdir /fast/tmp -t tnorm > clusters.jsonl
+    Run, e.g. fuzzycat cluster --help for more options.
+
+EXAMPLES
+
+    Clustering with GNU parallel.
+
+    $ zstdcat -T0 release_export_expanded.json.zst |
+        parallel --tmpdir /fast/tmp --roundrobin --pipe -j 4 |
+        python -m fuzzycat.main cluster --tmpdir /fast/tmp -t tnorm > clusters.jsonl
+
+    Bulk verification.
+
+    $ zstdcat -T0 cluster_tsandcrawler.json.zst |
+        python -m fuzzycat verify | zstd -c9 > verify.tsv.zst
+
+    Verify a randomly selected pair.
+
+    $ python -m fuzzycat verify-single | jq .
+    {
+      "extra": {
+        "q": "https://fatcat.wiki/release/search?q=processes"
+      },
+      "a": "https://fatcat.wiki/release/r7c33wa4frhx3lgzb3jejd7ijm",
+      "b": "https://fatcat.wiki/release/g6uqzmnt3zgald6blizi6x2wz4",
+      "r": [
+        "different",
+        "num_diff"
+      ]
+    }
+
+    Release match (non-bulk).
+
+    $ python -m fuzzycat release_match -q "hello world"

-TODO: add docs.
 """

 import argparse
@@ -28,14 +60,20 @@ import requests
 from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_ngram,
                               release_key_title_normalized, release_key_title_nysiis,
                               release_key_title_sandcrawler)
+from fuzzycat.entities import entity_to_dict
+from fuzzycat.matching import anything_to_entity, match_release_fuzzy
 from fuzzycat.utils import random_idents_from_query, random_word
 from fuzzycat.verify import GroupVerifier, verify
+from fatcat_openapi_client import ReleaseEntity

 logging.getLogger("requests").setLevel(logging.WARNING)
 logging.getLogger("urllib3").setLevel(logging.WARNING)


 def run_cluster(args):
+    """
+    Run clustering over release entities from database dump.
+    """
     logger = logging.getLogger('main.run_cluster')
     types = {
         'title': release_key_title,
@@ -59,29 +97,46 @@ def run_cluster(args):

 def run_verify(args):
     """
-    TODO. Ok, we should not fetch data we have on disk (at the clustering
-    step).
+    Run match verification over dataset from clustering step.
     """
-    gv = GroupVerifier(iterable=fileinput.input(files=args.files))
-    gv.run()
+    GroupVerifier(iterable=fileinput.input(files=args.files)).run()


 def run_verify_single(args):
     """
-    Run a single verification on a pair.
+    Run a single verification on a pair (or on a random pair, if none given).
+
+    $ python -m fuzzycat verify-single | jq .
+    {
+      "extra": {
+        "q": "https://fatcat.wiki/release/search?q=processes"
+      },
+      "a": "https://fatcat.wiki/release/r7c33wa4frhx3lgzb3jejd7ijm",
+      "b": "https://fatcat.wiki/release/g6uqzmnt3zgald6blizi6x2wz4",
+      "r": [
+        "different",
+        "num_diff"
+      ]
+    }
+
     """
     result = {}
     if args.a and args.b:
         a, b = args.a, args.b
     elif not args.a and not args.b:
         for _ in range(10):
+            # We try a few times, since not all random words might yield
+            # results.
             word = random_word(wordsfile='/usr/share/dict/words')
             try:
                 idents = random_idents_from_query(query=word, r=2)
+                result.update(
+                    {"extra": {
+                        "q": "https://fatcat.wiki/release/search?q={}".format(word)
+                    }})
+                a, b = idents
             except RuntimeError:
                 continue
-            result.update({"extra": {"q": "https://fatcat.wiki/release/search?q={}".format(word)}})
-            a, b = idents
             break
         else:
             raise RuntimeError('could not fetch random releases')
@@ -99,6 +154,34 @@ def run_verify_single(args):
     print(json.dumps(result))


+def run_release_match(args):
+    """
+    Given a release, return similar releases.
+    """
+    try:
+        entity = anything_to_entity(args.value, ReleaseEntity)
+        result = match_release_fuzzy(entity, size=args.size, es=args.es_url)
+    except Exception as err:
+        print("fuzzy match failed: {}".format(err), file=sys.stderr)
+    else:
+        if args.output_format == "tsv":
+            for ce in result:
+                vs = [ce.ident, ce.work_id, ce.container_id, ce.title]
+                print("\t".join((str(v) for v in vs)))
+        if args.output_format == "json":
+            matches = []
+            for ce in result:
+                vs = {
+                    "ident": ce.ident,
+                    "work_id": ce.work_id,
+                    "container_id": ce.container_id,
+                    "title": ce.title,
+                }
+                matches.append(vs)
+            vs = {"entity": entity_to_dict(entity), "matches": matches, "match_count": len(matches)}
+            print(json.dumps(vs))
+
+
 if __name__ == '__main__':
     logging.basicConfig(
         level=logging.DEBUG,
@@ -114,6 +197,14 @@ if __name__ == '__main__':
     parser.add_argument('--prefix', default='fuzzycat-', help='temp file prefix')
     parser.add_argument('--tmpdir', default=tempfile.gettempdir(), help='temporary directory')
     parser.add_argument('-P', '--profile', action='store_true', help='profile program')
+    parser.add_argument("--es-url",
+                        default="https://search.fatcat.wiki",
+                        help="fatcat elasticsearch")
+    parser.add_argument("-m",
+                        "--output-format",
+                        help="output format, e.g. tsv or json",
+                        default="tsv")
+    parser.add_argument("-s", "--size", help="number of results to return", default=5, type=int)
     subparsers = parser.add_subparsers()

     sub_cluster = subparsers.add_parser('cluster', help='group entities', parents=[parser])
@@ -140,6 +231,20 @@ if __name__ == '__main__':
     sub_verify_single.add_argument('-b', help='ident or url to release')
     sub_verify_single.set_defaults(func=run_verify_single)

+    sub_release_match = subparsers.add_parser(
+        "release_match",
+        help="find release matches",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        parents=[parser])
+
+    sub_release_match.add_argument(
+        "--value",
+        help="release title, issn, QID, filename to entity JSON, or JSON lines",
+        default="hello world",
+        type=str,
+    )
+    sub_release_match.set_defaults(func=run_release_match)
+
     args = parser.parse_args()
     if not args.__dict__.get("func"):
         print(__doc__, file=sys.stderr)
diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py
index b248024..48cc397 100644
--- a/fuzzycat/matching.py
+++ b/fuzzycat/matching.py
@@ -1,6 +1,6 @@
 import os
 import re
-from typing import List, Union, Type
+from typing import List, Type, Union

 import elasticsearch
 import elasticsearch_dsl
diff --git a/tests/test_matching.py b/tests/test_matching.py
index 927b383..cded2c8 100644
--- a/tests/test_matching.py
+++ b/tests/test_matching.py
@@ -2,18 +2,25 @@ from fuzzycat.matching import anything_to_entity, match_release_fuzzy
 from fatcat_openapi_client import ReleaseEntity
 import pytest
 import elasticsearch
+import logging
+
+logger = logging.getLogger('test_matching')
+logger.setLevel(logging.DEBUG)
+


 @pytest.fixture
 def es_client():
-    return elasticsearch.Elasticsearch(["https://search.fatcat.wiki:80"])
+    return elasticsearch.Elasticsearch(["https://search.fatcat.wiki:443"])
+

 @pytest.mark.skip
 def test_match_release_fuzzy(es_client):
-    cases = (
-        ("wtv64ahbdzgwnan7rllwr3nurm", 2),
-    )
+    cases = (("wtv64ahbdzgwnan7rllwr3nurm", 2), )
     for case, count in cases:
         entity = anything_to_entity(case, ReleaseEntity)
+        logger.info(entity.title)
         result = match_release_fuzzy(entity, es=es_client)
+        logger.info("given: {}".format(entity.title))
+        logger.info("found: {}".format(len(result)))
         assert len(result) == count