author     Martin Czygan <martin.czygan@gmail.com>  2020-08-12 12:21:21 +0200
committer  Martin Czygan <martin.czygan@gmail.com>  2020-08-12 12:21:21 +0200
commit     a03200e1f1dc3e83674a92b617270371bbeb09e4 (patch)
tree       9032180bd8398b77f6652709b1ff6d5485078ff8
parent     ac2ea9627acbac7a73dba4fcd0aa828ec2f4be90 (diff)
download   fuzzycat-a03200e1f1dc3e83674a92b617270371bbeb09e4.tar.gz
           fuzzycat-a03200e1f1dc3e83674a92b617270371bbeb09e4.zip
add matching submodule
-rw-r--r--  fuzzycat/__init__.py    2
-rw-r--r--  fuzzycat/matching.py  147
2 files changed, 149 insertions, 0 deletions
diff --git a/fuzzycat/__init__.py b/fuzzycat/__init__.py
index 6f0eb54..39cdbc6 100644
--- a/fuzzycat/__init__.py
+++ b/fuzzycat/__init__.py
@@ -1,3 +1,5 @@
__version__ = "0.1.1"
from fuzzycat.status import MatchStatus
+from fuzzycat.utils import *
+from fuzzycat.matching import compare_container_name
diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py
new file mode 100644
index 0000000..0c482e0
--- /dev/null
+++ b/fuzzycat/matching.py
@@ -0,0 +1,147 @@
+import string
+import re
+from unidecode import unidecode
+from ftfy import fix_text
+
+from fuzzycat import MatchStatus, StringPipeline, StringAnnotator
+from fuzzycat.utils import *
+
+
+def compare_container_name(a: str, b: str) -> MatchStatus:
+ """
+ Given two strings representing container names, return a match status. This
+ would be a subproblem of verify_container_match in cases where only a
+ string is given or the entity has only a name. Factored out for ease of
+ testing. TODO(martin): incorporate abbreviations mapping, other synonyms.
+
+ Some name stats over 146302 real names from fatcat.
+
+ In [11]: len(df)
+ Out[11]: 146302
+
+ In [12]: df.head()
+ Out[12]:
+ name nlen
+ 0 Sartre Studies International 28
+ 1 Revolutionary world 19
+ 2 Monograph Series on Nonlinear Science and Comp... 52
+ 3 Hepatitis Monthly 17
+ 4 TRACEY 6
+
+ In [13]: df.describe()
+ Out[13]:
+ nlen
+ count 146302.000000
+ mean 33.891861
+ std 18.955551
+ min 2.000000
+ 25% 20.000000
+ 50% 31.000000
+ 75% 44.000000
+ max 286.000000
+
+ Aroung 4000 names which are not [a-zA-z ], e.g.:
+
+ In [23]: df[df.is_alpha_only == False].sample(n=5)
+ Out[23]:
+ name nlen is_alpha_only
+ 118497 Журнал Фронтирных Исследований 30 False
+ 124885 Õpetatud Eesti Seltsi Aastaraamat/Yearbook of ... 74 False
+ 142217 Études économiques de l'OCDE : Norvège 38 False
+ 34681 حولیة کلیة أصول الدین والدعوة بالمنوفیة 39 False
+ 132251 Известия Российской академии наук Теория и сис... 61 False
+
+
+ """
+    if a is None or b is None:
+        raise ValueError("strings required, got: a = {}, b = {}".format(a, b))
+
+    # Whitespace cleanup. Superfluous whitespace should never matter for
+    # matching, e.g. "HNO  Praxis" vs "HNO Praxis".
+    string_cleanups = StringPipeline([
+        str.lower,
+        str.strip,
+        fix_text,
+        lambda s: re.sub(r"\s{2,}", " ", s),
+        lambda s: s.replace("&", "and"),
+    ])
+    a = string_cleanups.run(a)
+    b = string_cleanups.run(b)
+
+    # Derive some characteristics of the string. The keys are free form, which
+    # may or may not be a problem. TODO(martin): maybe subclass str and just
+    # add additional methods?
+    sa = StringAnnotator([
+        lambda s: {
+            "is_short_string": len(s) < 15
+        },
+        lambda s: {
+            "is_printable_only": all(c in string.printable for c in s)
+        },
+        lambda s: {
+            "is_single_token": len(s.split()) < 2
+        },
+        lambda s: {
+            "letter_to_non_letter_ratio": letter_to_non_letter_ratio(s)
+        },
+        lambda s: {
+            "alphanumeric_ratio": alphanumeric_ratio(s)
+        },
+        lambda s: {
+            "has_diacritics": s != unidecode(s)
+        },
+        lambda s: {
+            "startswith_the": s.startswith("the ")
+        },
+        lambda s: {
+            "parenthesized_year": parenthesized_year(s)
+        },
+        lambda s: {
+            "alphanumeric_only": alphanumeric_only(s)
+        },
+    ])
+    asa = sa.run(a)
+    bsa = sa.run(b)
+
+    if asa["is_short_string"] and asa["letter_to_non_letter_ratio"] > 0.4:
+        if a == b:
+            return MatchStatus.EXACT
+
+    if not asa["is_short_string"] and not asa["is_single_token"]:
+        if a == b:
+            return MatchStatus.EXACT
+
+    # Short, single-token (ASCII) titles like "Language" remain ambiguous; a
+    # single non-ASCII token like "臨床皮膚科" still needs to pass.
+    if asa["is_printable_only"] and asa["is_single_token"]:
+        return MatchStatus.AMBIGIOUS
+
+    if a == b:
+        return MatchStatus.EXACT
+
+    # Mostly ASCII, but with some possible artifacts.
+    if (asa["alphanumeric_ratio"] > 0.9 and asa["alphanumeric_only"] == bsa["alphanumeric_only"]):
+        return MatchStatus.STRONG
+
+    # Year in parentheses case, e.g. "Conf X (2018)" and "Conf X (2019)" should
+    # be different; about 3% of names contain a '(', 1% some possible date.
+    if (asa["parenthesized_year"] and bsa["parenthesized_year"] and asa["parenthesized_year"] != bsa["parenthesized_year"]):
+        return MatchStatus.DIFFERENT
+
+    # Common prefixes (maybe curate these manually):
+    common_prefixes = ("precarpathian bulletin of the shevchenko scientific society", )
+    for prefix in common_prefixes:
+        if a.startswith(prefix) and a != b:
+            return MatchStatus.DIFFERENT
+
+    if (not asa["is_short_string"] and not bsa["is_short_string"] and common_prefix_length_ratio(a, b) > 0.9):
+        return MatchStatus.STRONG
+
+    if (not asa["is_short_string"] and not bsa["is_short_string"] and common_prefix_length_ratio(a, b) > 0.7):
+        return MatchStatus.WEAK
+
+    # Address e.g. a char flip, but only if we do not have diacritics.
+    if (not asa["is_short_string"] and not asa["is_single_token"] and not asa["has_diacritics"] and hamming_distance(a, b) < 2):
+        return MatchStatus.STRONG
+
+    return MatchStatus.AMBIGIOUS
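
The new module leans on StringPipeline and StringAnnotator, which are imported from the fuzzycat package but not defined in this diff. A minimal mental model, inferred only from how they are used above (the actual classes in fuzzycat may differ):

    from typing import Any, Callable, Dict, List

    class StringPipeline:
        """Sketch: applies a list of string-to-string callables in order."""
        def __init__(self, funcs: List[Callable[[str], str]]):
            self.funcs = funcs

        def run(self, s: str) -> str:
            for f in self.funcs:
                s = f(s)
            return s

    class StringAnnotator:
        """Sketch: runs annotation callables over a string and merges the resulting dicts."""
        def __init__(self, funcs: List[Callable[[str], Dict[str, Any]]]):
            self.funcs = funcs

        def run(self, s: str) -> Dict[str, Any]:
            annotations: Dict[str, Any] = {}
            for f in self.funcs:
                annotations.update(f(s))
            return annotations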
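
Since the docstring notes the function is factored out for ease of testing, a test along the following lines would exercise the clear-cut paths. This is only a sketch: pytest is assumed, the name pairs are illustrative, and borderline cases are left out because their status depends on the helpers in fuzzycat.utils.

    import pytest  # assumed test runner, not part of this commit

    from fuzzycat import MatchStatus
    from fuzzycat.matching import compare_container_name

    def test_compare_container_name():
        # Identical multi-token names normalize to the same string: EXACT.
        assert compare_container_name("Hepatitis Monthly", "hepatitis monthly") == MatchStatus.EXACT
        # Superfluous whitespace and "&" are normalized away before comparison.
        assert compare_container_name("Sartre  Studies International", "Sartre Studies International") == MatchStatus.EXACT
        assert compare_container_name("Annals of Chemistry & Physics", "Annals of Chemistry and Physics") == MatchStatus.EXACT
        # Missing values are rejected early.
        with pytest.raises(ValueError):
            compare_container_name(None, "Revolutionary world")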