From c9cd6f76dd1dd080c1bc52159ab02ff5898f5f46 Mon Sep 17 00:00:00 2001
From: Martin Czygan
Date: Thu, 11 Feb 2021 13:51:13 +0100
Subject: add a batch verifier for ref groups

---
 fuzzycat/__main__.py | 16 ++++++++++
 fuzzycat/refs.py     | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 102 insertions(+)
 create mode 100644 fuzzycat/refs.py

(limited to 'fuzzycat')

diff --git a/fuzzycat/__main__.py b/fuzzycat/__main__.py
index cd94f9c..67ffa40 100644
--- a/fuzzycat/__main__.py
+++ b/fuzzycat/__main__.py
@@ -67,6 +67,7 @@ from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_ngra
 from fuzzycat.entities import entity_to_dict
 from fuzzycat.matching import anything_to_entity, match_release_fuzzy
+from fuzzycat.refs import RefsGroupVerifier
 from fuzzycat.utils import random_idents_from_query, random_word
 from fuzzycat.verify import GroupVerifier, verify
 
 logging.getLogger("requests").setLevel(logging.WARNING)
@@ -160,6 +161,15 @@ def run_verify_single(args):
     })
     print(json.dumps(result))
 
+
+def run_ref_verify(args):
+    """
+    Verify grouped release and reference documents read from files or stdin.
+    """
+    verifier = RefsGroupVerifier(iterable=fileinput.input(files=args.files),
+                                 verbose=args.verbose)
+    verifier.run()
+
 
 def run_release_match(args):
     """
@@ -248,6 +258,12 @@ if __name__ == '__main__':
     sub_verify_single.add_argument('-b', help='ident or url to release')
     sub_verify_single.set_defaults(func=run_verify_single)
 
+    # NOTE(review): the name 'verify' may collide with an existing subcommand
+    # (not visible in this hunk); confirm before merging.
+    sub_ref_verify = subparsers.add_parser('verify', help='verify ref groups', parents=[parser])
+    sub_ref_verify.add_argument('-f', '--files', default="-", help='input files')
+    sub_ref_verify.set_defaults(func=run_ref_verify)
+
     sub_release_match = subparsers.add_parser(
         "release_match",
         help="find release matches",
diff --git a/fuzzycat/refs.py b/fuzzycat/refs.py
new file mode 100644
index 0000000..04420d0
--- /dev/null
+++ b/fuzzycat/refs.py
@@ -0,0 +1,86 @@
+import collections
+import collections.abc
+import itertools
+import json
+import operator
+import sys
+
+from glom import PathAccessError, glom
+
+from fuzzycat.common import Reason, Status
+from fuzzycat.verify import verify
+
+
+def find_release_entity(docs):
+    """
+    Return one "pivot" release entity, i.e. the first document whose
+    "extra.skate.status" is missing or is anything other than "ref".
+
+    Raises ValueError, if docs is empty or every document is a reference.
+    """
+    for doc in docs:
+        try:
+            status = glom(doc, "extra.skate.status")
+        except PathAccessError:
+            # No skate status at all, so this is not a reference.
+            return doc
+        if status != "ref":
+            return doc
+
+    raise ValueError("docs do not contain any release")
+
+
+def ref_entities(docs):
+    """
+    Generator yielding ref entities only, i.e. documents where
+    "extra.skate.status" equals "ref".
+    """
+    for doc in docs:
+        try:
+            if glom(doc, "extra.skate.status") == "ref":
+                yield doc
+        except PathAccessError:
+            continue
+
+
+class RefsGroupVerifier:
+    """
+    A specific verifier for grouped releases and references. We do not need to
+    pair-wise compare, just compare one release to all references.
+
+    Input is an iterable of JSON lines; each document carries a key under "k"
+    and the grouped entities under "v". Verification reasons are tallied in
+    `counter`.
+    """
+    def __init__(self,
+                 iterable: collections.abc.Iterable,
+                 verbose=False):
+        self.iterable: collections.abc.Iterable = iterable
+        self.verbose: bool = verbose
+        self.counter: collections.Counter = collections.Counter()
+
+    def run(self):
+        """
+        Verify the pivot release of each group against each of its references
+        and print one line per pair: both release URLs, result and reason.
+
+        Raises ValueError, if a group contains no release.
+        """
+        get_key_values = operator.itemgetter("k", "v")
+        for i, line in enumerate(self.iterable):
+            if i % 20000 == 0 and self.verbose:
+                print(i, file=sys.stderr)
+            line = line.strip()
+            if not line:
+                continue
+            doc = json.loads(line)
+            _, vs = get_key_values(doc)
+            pivot = find_release_entity(vs)
+            for entity in ref_entities(vs):
+                result, reason = verify(pivot, entity)
+                self.counter[reason] += 1
+                print("https://fatcat.wiki/release/{}".format(pivot["ident"]),
+                      "https://fatcat.wiki/release/{}".format(entity["ident"]), result, reason)
+
+        # Exclude a previous total, so repeated runs do not count it twice.
+        self.counter["total"] = sum(v for k, v in self.counter.items() if k != "total")
-- 
cgit v1.2.3