author     Martin Czygan <martin.czygan@gmail.com> 2020-11-20 23:38:04 +0100
committer  Martin Czygan <martin.czygan@gmail.com> 2020-11-20 23:38:04 +0100
commit     668727a06cf2552783cc03af7721f6781b578d56 (patch)
tree       81f45d35344723d15be83ac8f27e82ff715fd3e8
parent     4e392f175ede2bf5a8b6860d7ab9690deb345728 (diff)
download   fuzzycat-668727a06cf2552783cc03af7721f6781b578d56.tar.gz
           fuzzycat-668727a06cf2552783cc03af7721f6781b578d56.zip
wip: another contrib comparison
-rw-r--r--  fuzzycat/verify.py  106
1 file changed, 92 insertions(+), 14 deletions(-)
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index a924738..9ed7fdd 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -88,6 +88,7 @@ class OK(str, Enum):
     TITLE_AUTHOR_MATCH = 'ok.title_author_match'
     PREPRINT_PUBLISHED = 'ok.preprint_published'
     SLUG_TITLE_AUTHOR_MATCH = 'ok.slug_title_author_match'
+    TOKENIZED_AUTHORS = 'ok.tokenized_authors'


 class Miss(str, Enum):
@@ -185,6 +186,19 @@ def compare(a, b):
     if re.match(r"appendix ?[^ ]*$", a.get("title", "").lower()):
         return (Status.AMBIGUOUS, Miss.APPENDIX)

+    # TODO: figshare versions
+
+    # guard against a missing DOI, which would otherwise raise AttributeError
+    if (a.get("doi") or "").startswith("10.6084/") and (b.get("doi") or "").startswith("10.6084/"):
+        pass  # TODO: compare figshare versions
+
+    # TODO: datacite specific vocabulary
+    # extra.datacite.relations[].{relationType=IsNewerVersionOf,relatedIdentifier=10...}
+    # beware: we have versions and "isPartOf", e.g. https://api.fatcat.wiki/v0/release/ybxygpeypbaq5pfrztu3z2itw4
+    # TODO: does glom help?
+    # ...
+    def datacite_relations(doc):
+        # per the path sketched above: extra.datacite.relations[]
+        return doc.get("extra", {}).get("datacite", {}).get("relations", []) or []
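+    # Illustrative sketch only (assumed record shape, not wired in yet):
+    # flag two releases as versions of the same work when one record
+    # carries an IsNewerVersionOf relation pointing at the other's DOI.
+    #
+    # for rel in datacite_relations(b):
+    #     if rel.get("relationType") == "IsNewerVersionOf" and \
+    #             rel.get("relatedIdentifier") == a.get("doi"):
+    #         ...  # treat as versions of the same work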
+
     arxiv_id_a = a.get("ext_ids", {}).get("arxiv")
     arxiv_id_b = b.get("ext_ids", {}).get("arxiv")
     if arxiv_id_a and arxiv_id_b:
@@ -201,8 +215,10 @@ def compare(a, b):
             "release_type") and a.get("release_type") != b.get("release_type"):
         # TODO(martin): This can go wrong with "article" and "article-journal"
         # TODO(martin): Some arxiv articles are marked as release_type: report
+        # or paper-conference
+        # (https://fatcat.wiki/release/l4fyyvsckneuxkq7d3y2zvkvbe)
         types = set([a.get("release_type"), b.get("release_type")])
-        ignore_release_types = set(["article", "article-journal", "report"])
+        ignore_release_types = set(["article", "article-journal", "report", "paper-conference"])
         if len(types & ignore_release_types) == 0:
             return (Status.DIFFERENT, Miss.RELEASE_TYPE)
@@ -221,6 +237,18 @@ def compare(a, b):
             "type", "") == "component" and a.get("title") != b.get("title"):
         return (Status.DIFFERENT, Miss.COMPONENT)

+    # https://fatcat.wiki/release/knzhequchfcethcyyi3gsp5gry, some titles contain newlines
+    a_slug_title = slugify_string(a.get("title", "")).replace("\n", " ")
+    b_slug_title = slugify_string(b.get("title", "")).replace("\n", " ")
+
+    if a_slug_title == b_slug_title:
+        a_subtitles = a.get("extra", {}).get("subtitle", []) or []
+        b_subtitles = b.get("extra", {}).get("subtitle", []) or []
+        for a_sub in a_subtitles:
+            for b_sub in b_subtitles:
+                if slugify_string(a_sub) != slugify_string(b_sub):
+                    return (Status.DIFFERENT, Miss.SUBTITLE)
+
     arxiv_id_a = a.get("ext_ids", {}).get("arxiv")
     arxiv_id_b = b.get("ext_ids", {}).get("arxiv")
@@ -237,7 +265,8 @@ def compare(a, b):
         # https://fatcat.wiki/release/oceozrqtcbc4tloizhddxaj2ti
         # preprint and published work may not be published in the same
         # year; compromise allow a small gap
-        if a_release_year and b_release_year and abs(int(a_release_year) - int(b_release_year)) > 1:
+        if a_release_year and b_release_year and abs(int(a_release_year) -
+                                                     int(b_release_year)) > 1:
             return (Status.DIFFERENT, Miss.YEAR)

         return (Status.EXACT, OK.TITLE_AUTHOR_MATCH)
@@ -252,18 +281,6 @@ def compare(a, b):
         if abs(int(a_release_year) - int(b_release_year)) > 2:
             return (Status.DIFFERENT, Miss.YEAR)

-    # https://fatcat.wiki/release/knzhequchfcethcyyi3gsp5gry, some title contain newlines
-    a_slug_title = slugify_string(a.get("title", "")).replace("\n", " ")
-    b_slug_title = slugify_string(b.get("title", "")).replace("\n", " ")
-
-    if a_slug_title == b_slug_title:
-        a_subtitles = a.get("extra", {}).get("subtitle", []) or []
-        b_subtitles = b.get("extra", {}).get("subtitle", []) or []
-        for a_sub in a_subtitles:
-            for b_sub in b_subtitles:
-                if slugify_string(a_sub) != slugify_string(b_sub):
-                    return (Status.DIFFERENT, Miss.SUBTITLE)
-
     if contains_chemical_formula(a_slug_title) or contains_chemical_formula(b_slug_title) and (
             a_slug_title != b_slug_title):
         return (Status.DIFFERENT, Miss.CHEM_FORMULA)
@@ -286,12 +303,73 @@ def compare(a, b):
             return (Status.STRONG, OK.SLUG_TITLE_AUTHOR_MATCH)

     if a_authors and len(a_slug_authors & b_slug_authors) == 0:
+        # Before we bail out, run an author similarity check. TODO: This is
+        # not the right place, but it lives here for now, since these cases
+        # popped up in this block.
+        Score = collections.namedtuple("Score", "a b value")
+        scores = []
+        # account for the possibly arbitrary ordering of authors; use fresh
+        # loop variables, so the function arguments a and b are not shadowed
+        for u, v in itertools.product(a_slug_authors, b_slug_authors):
+            scores.append(Score(u, v, author_similarity_score(u, v)))
+        # TODO: less arbitrary metric and threshold
+        # itertools.product yields runs grouped by the first component, so
+        # groupby works here without an extra sort
+        top_scores = []
+        for _, g in itertools.groupby(scores, key=lambda s: s.a):
+            sorted_scores = sorted(g, key=lambda s: s.value, reverse=True)
+            if len(sorted_scores) > 0:
+                top_scores.append(sorted_scores[0])
+        # average the numeric values, not the namedtuples; guard against an
+        # empty list, e.g. when one record has no authors at all
+        if top_scores:
+            avg_score = sum(s.value for s in top_scores) / len(top_scores)
+            if avg_score > 0.5:
+                return (Status.STRONG, OK.TOKENIZED_AUTHORS)
+
+        # TODO: This misses spelling differences, e.g.
+        # https://fatcat.wiki/release/7nbcgsohrrak5cuyk6dnit6ega and
+        # https://fatcat.wiki/release/q66xv7drk5fnph7enwwlkyuwqm
         return (Status.DIFFERENT, Miss.CONTRIB_INTERSECTION_EMPTY)

     todo[a.get("title")] += 1
     return (Status.AMBIGUOUS, OK.DUMMY)
+def author_similarity_score(u, v):
+    """
+    Given two author strings, return a similarity score between 0 and 1.
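+
+    Illustrative example, assuming the token bigram helpers below:
+
+    >>> author_similarity_score("john smith", "j smith")
+    0.5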
+ """
+ return jaccard(set(token_n_grams(u)), set(token_n_grams(v)))
+
+
+def jaccard(a, b):
+    """
+    Jaccard of sets a and b.
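+
+    Example:
+
+    >>> jaccard({1, 2}, {2, 3})
+    0.3333333333333333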
+ """
+ return len(a & b) / len(a | b)
+
+
+def token_n_grams(s):
+    """
+    Return n-grams, calculated per token.
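+
+    Example:
+
+    >>> token_n_grams("hello world")
+    ['he', 'll', 'o', 'wo', 'rl', 'd']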
+ """
+ return ["".join(v) for v in itertools.chain(*[nwise(v, n=2) for v in tokenize_string(s)])]
+
+
+def tokenize_string(s):
+    """
+    Normalize and tokenize, should be broken up.
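+
+    Example:
+
+    >>> tokenize_string("A Simple Title")
+    ['a', 'simple', 'title']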
+ """
+ return [token for token in s.lower().split()]
+
+
+def nwise(iterable, n=2):
+    """
+    Generalized :func:`pairwise`. Split an iterable after every
+    `n` items.
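+
+    Example:
+
+    >>> list(nwise("abcde", n=2))
+    [('a', 'b'), ('c', 'd'), ('e',)]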
+ """
+ i = iter(iterable)
+ piece = tuple(itertools.islice(i, n))
+ while piece:
+ yield piece
+ piece = tuple(itertools.islice(i, n))
+
+
 def num_project(s):
     """
     Cf. https://fatcat.wiki/release/6b5yupd7bfcw7gp73hjoavbgfq,