about summary refs log tree commit diff stats
path: root/fuzzycat
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2021-02-11 13:51:13 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2021-02-11 13:51:13 +0100
commit c9cd6f76dd1dd080c1bc52159ab02ff5898f5f46 (patch)
tree 7ec8962c6d19d81f44463a517d8397942c242c49 /fuzzycat
parent e75a77fdedae4a4a37c5ddc12c796c70164900dc (diff)
download fuzzycat-c9cd6f76dd1dd080c1bc52159ab02ff5898f5f46.tar.gz
fuzzycat-c9cd6f76dd1dd080c1bc52159ab02ff5898f5f46.zip
add a batch verifier for ref groups
Diffstat (limited to 'fuzzycat')
-rw-r--r-- fuzzycat/__main__.py 9
-rw-r--r-- fuzzycat/refs.py 67
2 files changed, 76 insertions, 0 deletions
diff --git a/fuzzycat/__main__.py b/fuzzycat/__main__.py
index cd94f9c..67ffa40 100644
--- a/fuzzycat/__main__.py
+++ b/fuzzycat/__main__.py
@@ -67,6 +67,7 @@ from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_ngra
from fuzzycat.entities import entity_to_dict
from fuzzycat.matching import anything_to_entity, match_release_fuzzy
from fuzzycat.utils import random_idents_from_query, random_word
+from fuzzycat.refs import RefsGroupVerifier
from fuzzycat.verify import GroupVerifier, verify
logging.getLogger("requests").setLevel(logging.WARNING)
@@ -160,6 +161,10 @@ def run_verify_single(args):
})
print(json.dumps(result))
+def run_ref_verify(args):
+    """
+    Run the batch verifier for ref groups over the lines of the input files
+    named by `args.files`; `args.verbose` is passed through to the verifier.
+    """
+    lines = fileinput.input(files=args.files)
+    RefsGroupVerifier(iterable=lines, verbose=args.verbose).run()
def run_release_match(args):
"""
@@ -248,6 +253,10 @@ if __name__ == '__main__':
sub_verify_single.add_argument('-b', help='ident or url to release')
sub_verify_single.set_defaults(func=run_verify_single)
+ sub_ref_verify = subparsers.add_parser('verify', help='verify ref groups', parents=[parser])
+ sub_ref_verify.add_argument('-f', '--files', default="-", help='input files')
+ sub_ref_verify.set_defaults(func=run_ref_verify)
+
sub_release_match = subparsers.add_parser(
"release_match",
help="find release matches",
diff --git a/fuzzycat/refs.py b/fuzzycat/refs.py
new file mode 100644
index 0000000..04420d0
--- /dev/null
+++ b/fuzzycat/refs.py
@@ -0,0 +1,67 @@
+import collections
+import itertools
+import json
+import operator
+import sys
+
+from fuzzycat.verify import verify
+from fuzzycat.common import Reason, Status
+from glom import PathAccessError, glom
+
+
+def find_release_entity(docs):
+    """
+    Return one "pivot" release entity, i.e. the first document in `docs` that
+    does not have `extra.skate.status == "ref"`.
+
+    A document counts as a release if the `extra.skate.status` path is
+    missing, or if it holds any value other than "ref". Raises a ValueError,
+    if `docs` contains no such document.
+    """
+    for doc in docs:
+        try:
+            status = glom(doc, "extra.skate.status")
+        except PathAccessError:
+            # No skate status at all, so this cannot be a reference.
+            return doc
+        # A status other than "ref" does not mark a reference either; before,
+        # such a document was silently skipped instead of being returned.
+        if status != "ref":
+            return doc
+
+    raise ValueError("docs do not contain any release")
+
+def ref_entities(docs):
+    """
+    Generator yielding only those documents from `docs`, which carry
+    `extra.skate.status == "ref"`; documents lacking that path are skipped.
+    """
+    for candidate in docs:
+        try:
+            status = glom(candidate, "extra.skate.status")
+        except PathAccessError:
+            continue
+        if status == "ref":
+            yield candidate
+
+class RefsGroupVerifier:
+    """
+    A specific verifier for grouped releases and references. We do not need to
+    pair-wise compare, just compare one release to all references.
+
+    Every non-empty input line is parsed as a JSON document with the keys "k"
+    and "v", where "v" holds the entity documents of one group. The number of
+    results per reason is kept in `counter`.
+    """
+    def __init__(self,
+                 iterable: collections.abc.Iterable,
+                 verbose=False):
+        self.iterable: collections.abc.Iterable = iterable
+        self.verbose: bool = verbose
+        self.counter: collections.Counter = collections.Counter()
+
+    def run(self):
+        """
+        Verify each group from the iterable: pick one pivot release and
+        compare it against every ref entity of the same group. Prints one
+        line per comparison (both release URLs, result and reason) to stdout
+        and, if verbose, a progress line number to stderr every 20000 lines.
+
+        Errors from `json.loads` or `find_release_entity` (e.g. a group
+        without any release) are not caught here.
+        """
+        get_key_values = operator.itemgetter("k", "v")
+        for i, line in enumerate(self.iterable):
+            if i % 20000 == 0 and self.verbose:
+                print(i, file=sys.stderr)
+            line = line.strip()
+            if not line:
+                continue
+            doc = json.loads(line)
+            # Both keys are required; the group key itself is not used.
+            _, vs = get_key_values(doc)
+            pivot = find_release_entity(vs)
+            for entity in ref_entities(vs):
+                # Compare the pivot release against the current reference;
+                # the former names `a` and `b` were never defined here.
+                result, reason = verify(pivot, entity)
+                self.counter[reason] += 1
+                print("https://fatcat.wiki/release/{}".format(pivot["ident"]),
+                      "https://fatcat.wiki/release/{}".format(entity["ident"]), result, reason)
+
+        # Leave out a "total" from an earlier run, so that calling run() again
+        # does not count it twice.
+        self.counter["total"] = sum(count for key, count in self.counter.items()
+                                    if key != "total")
+