aboutsummaryrefslogtreecommitdiffstats
path: root/fuzzycat/refs.py
blob: 828bf2f1163c1f0c5e28db219984fa2f70b84ba7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import collections
import collections.abc
import json
import operator
import sys

from glom import PathAccessError, glom

from fuzzycat.verify import verify


def find_release_entity(docs):
    """
    Return one "pivot" release entity, i.e. the first document in `docs` that
    does not have "extra.skate.status" == "ref".

    A document qualifies if the path "extra.skate.status" cannot be resolved
    at all, or if it resolves to any value other than "ref".

    Raises ValueError if no such document exists (including empty `docs`).
    """
    for doc in docs:
        try:
            status = glom(doc, "extra.skate.status")
        except PathAccessError:
            # No skate status present: not a reference, use it as the pivot.
            return doc
        # A status that is present but not "ref" is not a reference either;
        # previously such documents were silently skipped.
        if status != "ref":
            return doc

    raise ValueError("docs do not contain any release")


def ref_entities(docs):
    """
    Generator yielding ref entities only, i.e. documents whose
    "extra.skate.status" equals "ref".

    Documents where that path cannot be resolved are skipped. A string-valued
    "release_year" is converted to int in place (on the yielded document)
    before the document is yielded.
    """
    for entity in docs:
        try:
            status = glom(entity, "extra.skate.status")
        except PathAccessError:
            continue
        if status != "ref":
            continue
        # XXX: on the fly fix for int/str years
        year = entity.get("release_year")
        if isinstance(year, str):
            entity["release_year"] = int(year)
        yield entity


class RefsGroupVerifier:
    """
    A specific verifier for grouped releases and references. We do not need to
    pair-wise compare, just compare one release to all references.

    The iterable yields lines, each a JSON object with the keys "k" and "v",
    where "v" is a list of documents. One document per group is picked as the
    pivot release (see `find_release_entity`) and verified against every
    reference document in the same group (see `ref_entities`).
    """

    def __init__(self, iterable: collections.abc.Iterable, verbose=False):
        """
        Set up the verifier; `iterable` yields JSON lines, `verbose` enables
        progress output on stderr.
        """
        self.iterable: collections.abc.Iterable = iterable
        self.verbose: bool = verbose
        # Number of verification results per reason, plus a "total" entry
        # that is written at the end of `run`.
        self.counter: collections.Counter = collections.Counter()

    def run(self):
        """
        Verify all groups, printing one line per (pivot, reference) pair to
        stdout: both release URLs, the result and the reason.

        Blank lines are skipped. Afterwards, `self.counter["total"]` holds the
        sum of all per-reason counts. A ValueError raised by
        `find_release_entity` for a group without a release is not caught here.
        """
        get_values = operator.itemgetter("v")
        for i, line in enumerate(self.iterable):
            if i % 20000 == 0 and self.verbose:
                print(i, file=sys.stderr)
            line = line.strip()
            if not line:
                continue
            doc = json.loads(line)
            vs = get_values(doc)
            pivot = find_release_entity(vs)
            for entity in ref_entities(vs):
                result, reason = verify(pivot, entity)
                self.counter[reason] += 1
                print("https://fatcat.wiki/release/{}".format(pivot["ident"]),
                      "https://fatcat.wiki/release/{}".format(entity["ident"]), result, reason)

        # Exclude a previously stored "total" so that calling run() more than
        # once does not count the old total as if it were a reason.
        self.counter["total"] = sum(count for reason, count in self.counter.items()
                                    if reason != "total")