1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
|
import collections
import collections.abc
import itertools
import json
import operator
import sys

from glom import PathAccessError, glom

from fuzzycat.common import Reason, Status
from fuzzycat.verify import verify
def find_release_entity(docs):
    """
    Return one "pivot" release entity, i.e. the first doc in `docs` that does
    not have "extra.skate.status" == "ref".

    A doc qualifies when the "extra.skate.status" path cannot be accessed at
    all, or when it holds any value other than "ref".

    Raises ValueError if no such doc exists (including when `docs` is empty).
    """
    for doc in docs:
        try:
            status = glom(doc, "extra.skate.status")
        except PathAccessError:
            # No skate status on this doc, so it is not marked as a ref.
            return doc
        if status != "ref":
            # Only the exact value "ref" marks a reference; a doc carrying
            # any other status must not be skipped as a pivot candidate.
            return doc
    raise ValueError("docs do not contain any release")
def ref_entities(docs):
    """
    Generator yielding only those docs whose "extra.skate.status" is "ref".

    Docs where that path cannot be accessed, or where it holds another value,
    are skipped. A string "release_year" on a yielded doc is converted to int
    in place (the doc itself is mutated) before it is yielded.
    """
    for candidate in docs:
        try:
            if glom(candidate, "extra.skate.status") != "ref":
                continue
            # XXX: on the fly fix for int/str years
            year = candidate.get("release_year")
            if isinstance(year, str):
                candidate["release_year"] = int(year)
            yield candidate
        except PathAccessError:
            # No status path on this doc: not a ref, move on to the next one.
            pass
class RefsGroupVerifier:
    """
    A specific verifier for grouped releases and references. We do not need to
    pair-wise compare, just compare one release to all references.

    Each non-blank item of `iterable` is parsed as a JSON document that must
    have the keys "k" and "v"; "v" is the collection of docs in the group
    (presumably "k" is the grouping key -- it is not used here).

    Attributes:
        iterable: the lines to process.
        verbose: if true, print a progress line number to stderr every
            20000 lines.
        counter: per-reason tally of verification outcomes, plus a "total"
            entry written at the end of `run`.
    """

    def __init__(self, iterable: collections.abc.Iterable, verbose=False):
        self.iterable: collections.abc.Iterable = iterable
        self.verbose: bool = verbose
        self.counter: collections.Counter = collections.Counter()

    def run(self):
        """
        Verify the pivot release of every group against each of its refs.

        For every (pivot, ref) pair one line is printed to stdout: the two
        fatcat release URLs followed by the result and reason returned by
        `verify`. The reason is tallied in `self.counter`; afterwards
        `self.counter["total"]` holds the sum of all per-reason counts.

        Raises:
            KeyError: if a parsed document lacks "k" or "v".
            ValueError: propagated from `find_release_entity` when a group
                contains no release (also raised by `json.loads` on invalid
                JSON, as JSONDecodeError is a ValueError subclass).
        """
        get_key_values = operator.itemgetter("k", "v")
        for i, line in enumerate(self.iterable):
            if i % 20000 == 0 and self.verbose:
                print(i, file=sys.stderr)
            line = line.strip()
            if not line:
                continue
            doc = json.loads(line)
            # Both keys are still required; only the values are used.
            _, vs = get_key_values(doc)
            pivot = find_release_entity(vs)
            for entity in ref_entities(vs):
                result, reason = verify(pivot, entity)
                self.counter[reason] += 1
                print("https://fatcat.wiki/release/{}".format(pivot["ident"]),
                      "https://fatcat.wiki/release/{}".format(entity["ident"]), result, reason)

        # Exclude a "total" left over from an earlier run() so that repeated
        # calls do not count it into itself.
        self.counter["total"] = sum(count for key, count in self.counter.items()
                                    if key != "total")
|