#!/usr/bin/env python """Usage: fuzzycat COMMAND [options] COMMANDS cluster verify verify_single verify_ref release_match unstructured Run, e.g. fuzzycat cluster --help for more options. EXAMPLES Clustering with GNU parallel. $ zstdcat -T0 release_export_expanded.json.zst | parallel --tmpdir /fast/tmp --roundrobin --pipe -j 4 | python -m fuzzycat.main cluster --tmpdir /fast/tmp -t tnorm > clusters.jsonl Bulk verification. $ zstdcat -T0 cluster_tsandcrawler.json.zst | python -m fuzzycat verify | zstd -c9 > verify.tsv.zst Verify a randomly selected pair. $ python -m fuzzycat verify-single | jq . { "extra": { "q": "https://fatcat.wiki/release/search?q=processes" }, "a": "https://fatcat.wiki/release/r7c33wa4frhx3lgzb3jejd7ijm", "b": "https://fatcat.wiki/release/g6uqzmnt3zgald6blizi6x2wz4", "r": [ "different", "num_diff" ] } Verify clustered refs: $ python -m fuzzycat verify-ref Release match (non-bulk). $ python -m fuzzycat release_match --value "hello world" TODO: Elasticsearch might not respond to POST queries (which is what the client library uses, see: https://git.io/JLssk). """ import argparse import cProfile as profile import fileinput import io import json import logging import pstats import sys import tempfile import elasticsearch import requests from fatcat_openapi_client import ReleaseEntity from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_ngram, release_key_title_normalized, release_key_title_nysiis, release_key_title_sandcrawler) from fuzzycat.entities import entity_to_dict from fuzzycat.grobid_unstructured import grobid_parse_unstructured from fuzzycat.matching import anything_to_entity, match_release_fuzzy from fuzzycat.refs import RefsGroupVerifier from fuzzycat.simple import closest_fuzzy_release_match from fuzzycat.utils import random_idents_from_query, random_word from fuzzycat.verify import GroupVerifier, verify logging.getLogger("requests").setLevel(logging.WARNING) logging.getLogger("urllib3").setLevel(logging.WARNING) def run_cluster(args): """ Run clustering over release entities from database dump. """ logger = logging.getLogger('main.run_cluster') types = { 'title': release_key_title, 'tnorm': release_key_title_normalized, 'tnysi': release_key_title_nysiis, 'tss': release_key_title_ngram, 'tsandcrawler': release_key_title_sandcrawler, } key_denylist = None if args.key_denylist: with open(args.key_denylist, 'r') as f: key_denylist = [l.strip() for l in f.readlines()] cluster = Cluster(iterable=fileinput.input(args.files), key=types.get(args.type), tmpdir=args.tmpdir, compress=args.compress, key_denylist=key_denylist, prefix=args.prefix) cluster.run() logger.debug(json.dumps(dict(cluster.counter))) def run_verify(args): """ Run match verification over dataset from clustering step. """ verifier = GroupVerifier(iterable=fileinput.input(files=args.files), verbose=args.verbose, max_cluster_size=args.max_cluster_size) verifier.run() def run_verify_single(args): """ Run a single verification on a pair (or on a random pair, if none given). $ python -m fuzzycat verify-single | jq . { "extra": { "q": "https://fatcat.wiki/release/search?q=processes" }, "a": "https://fatcat.wiki/release/r7c33wa4frhx3lgzb3jejd7ijm", "b": "https://fatcat.wiki/release/g6uqzmnt3zgald6blizi6x2wz4", "r": [ "different", "num_diff" ] } """ result = {} if args.a and args.b: a, b = args.a, args.b elif not args.a and not args.b: for _ in range(10): # We try a few times, since not all random words might yield # results. word = random_word(wordsfile='/usr/share/dict/words') try: idents = random_idents_from_query(query=word, r=2) result.update( {"extra": { "q": "https://fatcat.wiki/release/search?q={}".format(word) }}) a, b = idents except RuntimeError: continue break else: raise RuntimeError('could not fetch random releases') else: raise ValueError('specify either both -a, -b or none') def fetch_ident(ident): return requests.get("https://api.fatcat.wiki/v0/release/{}".format(ident)).json() result.update({ "a": "https://fatcat.wiki/release/{}".format(a), "b": "https://fatcat.wiki/release/{}".format(b), "r": verify(fetch_ident(a), fetch_ident(b)), }) print(json.dumps(result)) def run_ref_verify(args): verifier = RefsGroupVerifier(iterable=fileinput.input(files=args.files), verbose=args.verbose) verifier.run() def run_release_match(args): """ Given a release, return similar releases. """ try: entity = anything_to_entity(args.value, ReleaseEntity) result = match_release_fuzzy(entity, size=args.size, es=args.es_url) except Exception as err: print("fuzzy match failed: {}".format(err), file=sys.stderr) else: if args.output_format == "tsv": for ce in result: vs = [ce.ident, ce.work_id, ce.container_id, ce.title] print("\t".join((str(v) for v in vs))) if args.output_format == "json": matches = [] for ce in result: vs = { "ident": ce.ident, "work_id": ce.work_id, "container_id": ce.container_id, "title": ce.title, } matches.append(vs) vs = {"entity": entity_to_dict(entity), "matches": matches, "match_count": len(matches)} print(json.dumps(vs)) def run_unstructured(args): """ Given a raw citation string, parse it and find "closest" match. Uses lower-level routines instead of simple.closest_fuzzy_unstructured_match(raw_citation) """ es_client = elasticsearch.Elasticsearch(args.es_url) print("## Sending to GROBID...", file=sys.stderr) release = grobid_parse_unstructured(args.raw_citation) if not release: print("Did not parse") sys.exit(-1) else: print(entity_to_dict(release)) print("## Fuzzy matching...", file=sys.stderr) closest = closest_fuzzy_release_match(release, es_client=es_client) if not closest: print("Did not match/verify") sys.exit(-1) print(f"{closest.status.name}\t{closest.reason.name}\trelease_{closest.release.ident}") if __name__ == '__main__': logging.basicConfig( level=logging.DEBUG, format='%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') # TODO: group flags sensibly parser = argparse.ArgumentParser(prog='fuzzycat', description=__doc__, usage='%(prog)s command [options]', add_help=False, formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--prefix', default='fuzzycat-', help='temp file prefix') parser.add_argument('--tmpdir', default=tempfile.gettempdir(), help='temporary directory') parser.add_argument('-P', '--profile', action='store_true', help='profile program') parser.add_argument("--es-url", default="https://search.fatcat.wiki", help="fatcat elasticsearch") parser.add_argument("-m", "--output-format", help="output format, e.g. tsv or json", default="tsv") parser.add_argument("-s", "--size", help="number of results to return", default=5, type=int) parser.add_argument("-v", "--verbose", help="be verbose", action='store_true') subparsers = parser.add_subparsers() sub_cluster = subparsers.add_parser('cluster', help='group entities', parents=[parser]) sub_cluster.set_defaults(func=run_cluster) sub_cluster.add_argument('-C', '--compress', action="store_true", help='compress intermediate results') sub_cluster.add_argument('-f', '--files', default="-", help='input files') sub_cluster.add_argument('--key-denylist', help='file path to key denylist') sub_cluster.add_argument('--min-cluster-size', default=2, type=int, help='ignore smaller clusters') sub_cluster.add_argument('-t', '--type', default='title', help='cluster algorithm: title, tnorm, tnysi, tss, tsandcrawler') sub_verify = subparsers.add_parser('verify', help='verify groups', parents=[parser]) sub_verify.add_argument('-f', '--files', default="-", help='input files') sub_verify.add_argument('--max-cluster-size', default=10, type=int, help='ignore large clusters') sub_verify.set_defaults(func=run_verify) sub_verify_single = subparsers.add_parser('verify_single', help='verify a single pair', parents=[parser]) sub_verify_single.add_argument('-a', help='ident or url to release') sub_verify_single.add_argument('-b', help='ident or url to release') sub_verify_single.set_defaults(func=run_verify_single) sub_ref_verify = subparsers.add_parser('verify_ref', help='verify ref groups', parents=[parser]) sub_ref_verify.add_argument('-f', '--files', default="-", help='input files') sub_ref_verify.set_defaults(func=run_ref_verify) sub_release_match = subparsers.add_parser( "release_match", help="find release matches", formatter_class=argparse.ArgumentDefaultsHelpFormatter, parents=[parser]) sub_release_match.add_argument( "--value", help="release title, issn, QID, filename to entity JSON, or JSON lines", default="hello world", type=str, ) sub_release_match.set_defaults(func=run_release_match) sub_unstructured = subparsers.add_parser("unstructured", help="parse and match unstructured citation string", formatter_class=argparse.ArgumentDefaultsHelpFormatter) sub_unstructured.add_argument( "raw_citation", help="unstructured/raw citation string", type=str, ) sub_unstructured.set_defaults(func=run_unstructured) args = parser.parse_args() if not args.__dict__.get("func"): print(__doc__, file=sys.stderr) sys.exit(1) if args.profile: logging.disable(logging.DEBUG) pr = profile.Profile() pr.enable() args.func(args) if args.profile: pr.disable() s = io.StringIO() ps = pstats.Stats(pr, stream=s).sort_stats('cumulative') ps.print_stats() print(s.getvalue(), file=sys.stderr)