aboutsummaryrefslogtreecommitdiffstats
path: root/fuzzycat/refs.py
blob: 04420d0742662a961a8590ff325e3a4fb435c222 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import collections
import itertools
import json
import operator
import sys

from fuzzycat.verify import verify
from fuzzycat.common import Reason, Status
from glom import PathAccessError, glom


def find_release_entity(docs):
    """
    Return one "pivot" release entity, i.e. the first document in `docs` that
    does not have "extra.skate.status" == "ref".

    A document qualifies either when the path "extra.skate.status" cannot be
    resolved at all, or when it resolves to any value other than "ref".

    Raises ValueError if every document in `docs` is a ref (or `docs` is
    empty).
    """
    for doc in docs:
        try:
            status = glom(doc, "extra.skate.status")
        except PathAccessError:
            # No status recorded at all: this is not a ref document.
            return doc
        # Previously a document carrying some other status value was skipped
        # silently, which contradicts the documented contract.
        if status != "ref":
            return doc

    raise ValueError("docs do not contain any release")

def ref_entities(docs):
    """
    Generator yielding only those documents from `docs` whose
    "extra.skate.status" resolves to "ref". Documents where that path cannot
    be resolved are skipped.
    """
    for candidate in docs:
        try:
            status = glom(candidate, "extra.skate.status")
        except PathAccessError:
            # Path is missing, so this cannot be a ref document.
            continue
        if status == "ref":
            yield candidate

class RefsGroupVerifier:
    """
    A specific verifier for grouped releases and references. We do not need to
    pair-wise compare, just compare one release to all references.

    Each non-blank item of `iterable` is parsed as a JSON object that must
    carry the keys "k" and "v"; "v" holds the documents of one group. The
    number of comparisons per reason is accumulated in `self.counter`.
    """
    def __init__(self,
                 iterable: collections.abc.Iterable,
                 verbose=False):
        self.iterable: collections.abc.Iterable = iterable
        self.verbose: bool = verbose
        self.counter: collections.Counter = collections.Counter()

    def run(self):
        """
        Verify, for every group, the pivot release against each ref entity.

        For each comparison one line is printed to stdout: the fatcat URL of
        the pivot, the fatcat URL of the ref entity, and the result and reason
        returned by `verify`. With `verbose` set, the current line index is
        written to stderr every 20000 lines.

        After the input is exhausted, `self.counter["total"]` is set to the
        sum of all per-reason counts.

        Propagates errors from JSON decoding, a KeyError for lines lacking
        "k" or "v", and whatever `find_release_entity` raises for a group.
        """
        get_key_values = operator.itemgetter("k", "v")
        for i, line in enumerate(self.iterable):
            if i % 20000 == 0 and self.verbose:
                print(i, file=sys.stderr)
            line = line.strip()
            if not line:
                continue
            doc = json.loads(line)
            # The group key is not needed here, but itemgetter still enforces
            # that both "k" and "v" are present.
            _, vs = get_key_values(doc)
            pivot = find_release_entity(vs)
            for entity in ref_entities(vs):
                # Compare the single pivot release to each reference.
                result, reason = verify(pivot, entity)
                self.counter[reason] += 1
                print("https://fatcat.wiki/release/{}".format(pivot["ident"]),
                      "https://fatcat.wiki/release/{}".format(entity["ident"]), result, reason)

        # Exclude a "total" left over from an earlier run() so that repeated
        # calls do not count it twice.
        self.counter["total"] = sum(count for key, count in self.counter.items()
                                    if key != "total")