aboutsummaryrefslogtreecommitdiffstats
path: root/fuzzycat/matching.py
blob: 0c482e0e065ada284924b6fcf37e7ca911b99d17 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import string
import re
from unidecode import unidecode
from ftfy import fix_text

from fuzzycat import MatchStatus, StringPipeline, StringAnnotator
from fuzzycat.utils import *


def compare_container_name(a: str, b: str) -> MatchStatus:
    """
    Given two strings representing container names, return a match status. This
    would be a subproblem of verify_container_match in cases where only a
    string is given or the entity has only a name. Factored out for ease of
    testing. TODO(martin): incorporate abbreviations mapping, other synonyms.

    Raises ValueError if either argument is None.

    Some name stats over 146302 real names from fatcat.

        In [11]: len(df)
        Out[11]: 146302

        In [12]: df.head()
        Out[12]:
                                                        name  nlen
        0                       Sartre Studies International    28
        1                                Revolutionary world    19
        2  Monograph Series on Nonlinear Science and Comp...    52
        3                                  Hepatitis Monthly    17
        4                                             TRACEY     6

        In [13]: df.describe()
        Out[13]:
                        nlen
        count  146302.000000
        mean       33.891861
        std        18.955551
        min         2.000000
        25%        20.000000
        50%        31.000000
        75%        44.000000
        max       286.000000

    Around 4000 names which are not [a-zA-z ], e.g.:

        In [23]: df[df.is_alpha_only == False].sample(n=5)
        Out[23]:
                                                             name  nlen  is_alpha_only
        118497                     Журнал Фронтирных Исследований    30          False
        124885  Õpetatud Eesti Seltsi Aastaraamat/Yearbook of ...    74          False
        142217             Études économiques de l'OCDE : Norvège    38          False
        34681             حولیة کلیة أصول الدین والدعوة بالمنوفیة    39          False
        132251  Известия Российской академии наук Теория и сис...    61          False


    """
    if a is None or b is None:
        raise ValueError("strings required, got: a = {}, b = {}".format(a, b))

    # Whitespace cleanup. Try to remove superfluous whitespace, which should
    # never matter, "HNO    Praxis"
    string_cleanups = StringPipeline([
        str.lower,
        str.strip,
        fix_text,
        lambda s: re.sub(r"\s{2,}", " ", s),
        lambda s: s.replace("&", "and"),
    ])
    a = string_cleanups.run(a)
    b = string_cleanups.run(b)

    # Derive some characteristics of the string. The keys are free form which
    # may or may not be a problem. TODO(martin): maybe subclass str and just
    # add additional methods?
    sa = StringAnnotator([
        lambda s: {
            "is_short_string": len(s) < 15
        },
        lambda s: {
            "is_printable_only": all(c in string.printable for c in s)
        },
        lambda s: {
            "is_single_token": len(s.split()) < 2
        },
        lambda s: {
            "letter_to_non_letter_ratio": letter_to_non_letter_ratio(s)
        },
        lambda s: {
            "alphanumeric_ratio": alphanumeric_ratio(s)
        },
        lambda s: {
            "has_diacritics": s != unidecode(s)
        },
        lambda s: {
            "startswith_the": s.startswith("the ")
        },
        lambda s: {
            "parenthesized_year": parenthesized_year(s)
        },
        lambda s: {
            "alphanumeric_only": alphanumeric_only(s)
        },
    ])
    asa = sa.run(a)
    bsa = sa.run(b)

    if asa["is_short_string"] and asa["letter_to_non_letter_ratio"] > 0.4:
        if a == b:
            return MatchStatus.EXACT

    if not asa["is_short_string"] and not asa["is_single_token"]:
        if a == b:
            return MatchStatus.EXACT

    # Short, single (ascii) word titles, like "Language" and the like. Single
    # token "臨床皮膚科" needs to pass.
    if asa["is_printable_only"] and asa["is_single_token"]:
        return MatchStatus.AMBIGIOUS

    if a == b:
        return MatchStatus.EXACT

    # Mostly ASCII, but with some possible artifacts.
    if (asa["alphanumeric_ratio"] > 0.9 and asa["alphanumeric_only"] == bsa["alphanumeric_only"]):
        return MatchStatus.STRONG

    # Year in parentheses case, e.g. "Conf X (2018)" and "Conf X (2019)" should
    # be different; about 3% of names contain a '(', 1% some possible date.
    if (asa["parenthesized_year"] and asa["parenthesized_year"] == bsa["parenthesized_year"]):
        return MatchStatus.DIFFERENT

    # Common prefixes (maybe curate these manually):
    common_prefixes = ("precarpathian bulletin of the shevchenko scientific society", )
    for prefix in common_prefixes:
        if a.startswith(prefix) and a != b:
            return MatchStatus.DIFFERENT

    # BUGFIX: the annotator defines "is_short_string" (not "is_short"); the
    # previous key raised KeyError whenever these branches were reached.
    if (not asa["is_short_string"] and not bsa["is_short_string"]
            and common_prefix_length_ratio(a, b) > 0.9):
        return MatchStatus.STRONG

    if (not asa["is_short_string"] and not bsa["is_short_string"]
            and common_prefix_length_ratio(a, b) > 0.7):
        return MatchStatus.WEAK

    # Address e.g. a char flip, but only, if we do not have diacritics.
    if (not asa["is_short_string"] and not asa["is_single_token"] and not asa["has_diacritics"] and hamming_distance(a, b) < 2):
        return MatchStatus.STRONG

    return MatchStatus.AMBIGIOUS