author | Martin Czygan <martin.czygan@gmail.com> | 2020-11-20 23:38:04 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-11-20 23:38:04 +0100 |
commit | 668727a06cf2552783cc03af7721f6781b578d56 (patch) | |
tree | 81f45d35344723d15be83ac8f27e82ff715fd3e8 | |
parent | 4e392f175ede2bf5a8b6860d7ab9690deb345728 (diff) | |
download | fuzzycat-668727a06cf2552783cc03af7721f6781b578d56.tar.gz fuzzycat-668727a06cf2552783cc03af7721f6781b578d56.zip | |
wip: another contrib comparison
-rw-r--r-- | fuzzycat/verify.py | 106 |
1 file changed, 92 insertions, 14 deletions
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index a924738..9ed7fdd 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -88,6 +88,7 @@ class OK(str, Enum):
     TITLE_AUTHOR_MATCH = 'ok.title_author_match'
     PREPRINT_PUBLISHED = 'ok.preprint_published'
     SLUG_TITLE_AUTHOR_MATCH = 'ok.slug_title_author_match'
+    TOKENIZED_AUTHORS = 'ok.tokenized_authors'


 class Miss(str, Enum):
@@ -185,6 +186,19 @@ def compare(a, b):
     if re.match(r"appendix ?[^ ]*$", a.get("title", "").lower()):
         return (Status.AMBIGUOUS, Miss.APPENDIX)

+    # TODO: figshare versions
+    if (a.get("doi") or "").startswith("10.6084/") and (b.get("doi") or "").startswith("10.6084/"):
+        pass  # not implemented yet
+
+    # TODO: datacite specific vocabulary
+    # extra.datacite.relations[].{relationType=IsNewerVersionOf,relatedIdentifier=10...}
+    # beware: we have versions and "isPartOf", e.g.
+    # https://api.fatcat.wiki/v0/release/ybxygpeypbaq5pfrztu3z2itw4
+    # TODO: does glom help?
+    def datacite_relations(doc):
+        pass
+        # doc.get("extra", {}).get(...
+
     arxiv_id_a = a.get("ext_ids", {}).get("arxiv")
     arxiv_id_b = b.get("ext_ids", {}).get("arxiv")
     if arxiv_id_a and arxiv_id_b:
@@ -201,8 +215,10 @@ def compare(a, b):
             "release_type") and a.get("release_type") != b.get("release_type"):
         # TODO(martin): This can go wrong with "article" and "article-journal"
         # TODO(martin): Some arxiv articles are marked as release_type: report
+        # or paper-conference
+        # (https://fatcat.wiki/release/l4fyyvsckneuxkq7d3y2zvkvbe)
         types = set([a.get("release_type"), b.get("release_type")])
-        ignore_release_types = set(["article", "article-journal", "report"])
+        ignore_release_types = set(["article", "article-journal", "report", "paper-conference"])
         if len(types & ignore_release_types) == 0:
             return (Status.DIFFERENT, Miss.RELEASE_TYPE)
@@ -221,6 +237,18 @@ def compare(a, b):
             "type", "") == "component" and a.get("title") != b.get("title"):
         return (Status.DIFFERENT, Miss.COMPONENT)

+    # https://fatcat.wiki/release/knzhequchfcethcyyi3gsp5gry, some titles contain newlines
+    a_slug_title = slugify_string(a.get("title", "")).replace("\n", " ")
+    b_slug_title = slugify_string(b.get("title", "")).replace("\n", " ")
+
+    if a_slug_title == b_slug_title:
+        a_subtitles = a.get("extra", {}).get("subtitle", []) or []
+        b_subtitles = b.get("extra", {}).get("subtitle", []) or []
+        for a_sub in a_subtitles:
+            for b_sub in b_subtitles:
+                if slugify_string(a_sub) != slugify_string(b_sub):
+                    return (Status.DIFFERENT, Miss.SUBTITLE)
+
     arxiv_id_a = a.get("ext_ids", {}).get("arxiv")
     arxiv_id_b = b.get("ext_ids", {}).get("arxiv")
@@ -237,7 +265,8 @@ def compare(a, b):
         # https://fatcat.wiki/release/oceozrqtcbc4tloizhddxaj2ti
         # preprint and published work may not be published in the same
         # year; as a compromise, allow a small gap
-        if a_release_year and b_release_year and abs(int(a_release_year) - int(b_release_year)) > 1:
+        if a_release_year and b_release_year and abs(int(a_release_year) -
+                                                     int(b_release_year)) > 1:
             return (Status.DIFFERENT, Miss.YEAR)

         return (Status.EXACT, OK.TITLE_AUTHOR_MATCH)
@@ -252,18 +281,6 @@ def compare(a, b):
         if abs(int(a_release_year) - int(b_release_year)) > 2:
             return (Status.DIFFERENT, Miss.YEAR)

-    # https://fatcat.wiki/release/knzhequchfcethcyyi3gsp5gry, some titles contain newlines
-    a_slug_title = slugify_string(a.get("title", "")).replace("\n", " ")
-    b_slug_title = slugify_string(b.get("title", "")).replace("\n", " ")
-
-    if a_slug_title == b_slug_title:
-        a_subtitles = a.get("extra", {}).get("subtitle", []) or []
-        b_subtitles = b.get("extra", {}).get("subtitle", []) or []
-        for a_sub in a_subtitles:
-            for b_sub in b_subtitles:
-                if slugify_string(a_sub) != slugify_string(b_sub):
-                    return (Status.DIFFERENT, Miss.SUBTITLE)
-
     if contains_chemical_formula(a_slug_title) or contains_chemical_formula(b_slug_title) and (
             a_slug_title != b_slug_title):
         return (Status.DIFFERENT, Miss.CHEM_FORMULA)
@@ -286,12 +303,73 @@ def compare(a, b):
         return (Status.STRONG, OK.SLUG_TITLE_AUTHOR_MATCH)

     if a_authors and len(a_slug_authors & b_slug_authors) == 0:
+        # Before we bail out, run an authors similarity check. TODO: This is
+        # not the right place, but lives here now, since these cases popped up
+        # in this block.
+        Score = collections.namedtuple("Score", "a b value")
+        scores = []
+        # account for the possible arbitrary ordering of authors; use u, v so
+        # we do not shadow the a, b arguments of compare
+        for u, v in itertools.product(a_slug_authors, b_slug_authors):
+            scores.append(Score(u, v, author_similarity_score(u, v)))
+        # TODO: less arbitrary metric and threshold
+        top_scores = []
+        for _, g in itertools.groupby(scores, key=lambda s: s.a):
+            sorted_scores = sorted(g, key=lambda s: s.value, reverse=True)
+            if len(sorted_scores) > 0:
+                top_scores.append(sorted_scores[0])
+        if top_scores:
+            avg_score = sum(s.value for s in top_scores) / len(top_scores)
+            if avg_score > 0.5:
+                return (Status.STRONG, OK.TOKENIZED_AUTHORS)
+
+        # TODO: This misses spelling differences, e.g.
+        # https://fatcat.wiki/release/7nbcgsohrrak5cuyk6dnit6ega and
+        # https://fatcat.wiki/release/q66xv7drk5fnph7enwwlkyuwqm
         return (Status.DIFFERENT, Miss.CONTRIB_INTERSECTION_EMPTY)

     todo[a.get("title")] += 1
     return (Status.AMBIGUOUS, OK.DUMMY)


+def author_similarity_score(u, v):
+    """
+    Given two author strings, return a similarity score between 0 and 1.
+    """
+    return jaccard(set(token_n_grams(u)), set(token_n_grams(v)))
+
+
+def jaccard(a, b):
+    """
+    Jaccard similarity of sets a and b.
+    """
+    return len(a & b) / len(a | b)
+
+
+def token_n_grams(s):
+    """
+    Return n-grams, calculated per token.
+    """
+    return ["".join(v) for v in itertools.chain(*[nwise(v, n=2) for v in tokenize_string(s)])]
+
+
+def tokenize_string(s):
+    """
+    Normalize and tokenize, should be broken up.
+    """
+    return [token for token in s.lower().split()]
+
+
+def nwise(iterable, n=2):
+    """
+    Generalized :func:`pairwise`. Split an iterable after every
+    `n` items.
+    """
+    i = iter(iterable)
+    piece = tuple(itertools.islice(i, n))
+    while piece:
+        yield piece
+        piece = tuple(itertools.islice(i, n))
+
+
 def num_project(s):
     """
     Cf. https://fatcat.wiki/release/6b5yupd7bfcw7gp73hjoavbgfq,
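For context on the new helpers: `tokenize_string` lowercases and splits a name, `token_n_grams` cuts each token into non-overlapping character bigrams via `nwise`, and `jaccard` compares the two bigram sets. A minimal, self-contained sketch of that pipeline (condensed from the diff; the empty-set guard in `jaccard` is an addition here, not part of the commit):

```python
import itertools


def nwise(iterable, n=2):
    # split an iterable into consecutive, non-overlapping tuples of size n;
    # the last tuple may be shorter
    i = iter(iterable)
    piece = tuple(itertools.islice(i, n))
    while piece:
        yield piece
        piece = tuple(itertools.islice(i, n))


def token_n_grams(s):
    # per-token character bigrams: "john gruber" -> ["jo", "hn", "gr", "ub", "er"]
    return ["".join(v) for v in itertools.chain(*[nwise(tok) for tok in s.lower().split()])]


def jaccard(a, b):
    # set overlap; empty-set guard added here, the diff has no such guard
    if not a and not b:
        return 0
    return len(a & b) / len(a | b)


def author_similarity_score(u, v):
    return jaccard(set(token_n_grams(u)), set(token_n_grams(v)))


print(author_similarity_score("john gruber", "j gruber"))   # 0.5
print(author_similarity_score("jane doe", "marcus brody"))  # 0.0
```

Most of the signal comes from shared surname bigrams; a differing first name (e.g. an initial versus the full name) lowers the score but does not zero it, which is exactly the abbreviation case a plain set intersection on author slugs misses.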
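The new branch in `compare()` then aggregates these pairwise scores: each author slug on the `a` side keeps only its best-scoring counterpart on the `b` side, and the records count as a match when the mean of those best scores exceeds 0.5. A sketch of that aggregation under stated assumptions: `tokenized_authors_match` is a hypothetical wrapper name (the commit inlines this logic in `compare()`), and `author_similarity_score` is imported from `fuzzycat.verify`, where this diff defines it:

```python
import collections
import itertools

from fuzzycat.verify import author_similarity_score

Score = collections.namedtuple("Score", "a b value")


def tokenized_authors_match(a_slug_authors, b_slug_authors, threshold=0.5):
    # score every cross pair; product() yields tuples grouped by the first
    # iterable, so the groupby on s.a below sees consecutive runs without sorting
    scores = [Score(u, v, author_similarity_score(u, v))
              for u, v in itertools.product(a_slug_authors, b_slug_authors)]
    # keep only the best match per a-side author
    top_scores = [max(g, key=lambda s: s.value)
                  for _, g in itertools.groupby(scores, key=lambda s: s.a)]
    if not top_scores:
        return False
    avg_score = sum(s.value for s in top_scores) / len(top_scores)
    return avg_score > threshold


print(tokenized_authors_match(["john gruber", "jane doe"],
                              ["j gruber", "jane m doe"]))  # True
```

Note that the average must be taken over `s.value`; summing the `Score` namedtuples directly, as the raw wip diff did, would raise a `TypeError`. The 0.5 threshold is flagged as arbitrary by the commit's own TODO.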