about summary refs log tree commit diff stats
path: root/fuzzycat
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2020-12-01 23:54:02 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2020-12-01 23:54:02 +0100
commit 72079139edd2e95a8d9e19dff5590771fee52202 (patch)
tree c0793a3bd1b538c693d633aa272a77c20d1b0e50 /fuzzycat
parent 8183e792ae122ae66b66299da1948697ae296ac7 (diff)
download fuzzycat-72079139edd2e95a8d9e19dff5590771fee52202.tar.gz
fuzzycat-72079139edd2e95a8d9e19dff5590771fee52202.zip
verify: bsi undated
Diffstat (limited to 'fuzzycat')
-rw-r--r-- fuzzycat/common.py 1
-rw-r--r-- fuzzycat/utils.py 9
-rw-r--r-- fuzzycat/verify.py 22
3 files changed, 23 insertions, 9 deletions
diff --git a/fuzzycat/common.py b/fuzzycat/common.py
index 2298185..34508b7 100644
--- a/fuzzycat/common.py
+++ b/fuzzycat/common.py
@@ -26,6 +26,7 @@ class OK(str, Enum):
TITLE_AUTHOR_MATCH = 'ok.title_author_match'
TOKENIZED_AUTHORS = 'ok.tokenized_authors'
CUSTOM_IEEE_ARXIV = 'ok.custom_ieee_arxiv'
+ CUSTOM_BSI_UNDATED = 'ok.custom_bsi_undated'
class Miss(str, Enum):
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index d6beb03..ef3b418 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -9,6 +9,15 @@ printable_no_punct = string.digits + string.ascii_letters + string.whitespace
CHEM_FORMULA = re.compile(r"([A-Z]{1,2}[0-9]{1,2})+")
+def has_doi_prefix(v, prefix="10.1234"):
+ """
+ Returns False, if we cannot parse v or prefix does not match.
+ """
+ if not v:
+ return False
+ return v.split("/")[0] == prefix
+
+
def slugify_string(s: str) -> str:
"""
Keeps ascii chars and single whitespace only.
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 84e17d8..e688d49 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -75,8 +75,8 @@ import sys
from glom import PathAccessError, glom
from fuzzycat.common import OK, Miss, Status
-from fuzzycat.utils import (author_similarity_score, contains_chemical_formula, num_project,
- slugify_string)
+from fuzzycat.utils import (author_similarity_score, contains_chemical_formula, has_doi_prefix,
+ num_project, slugify_string)
# The result of clustering are documents that have a key k and a list of values
# (of the cluster) v.
@@ -171,6 +171,15 @@ def compare(a, b):
except PathAccessError:
pass
+ try:
+ a_doi = glom(a, "ext_ids.doi")
+ b_doi = glom(b, "ext_ids.doi")
+ if has_doi_prefix(a_doi, "10.3403") and has_doi_prefix(b_doi, "10.3403"):
+ if a_doi + "u" == b_doi or b_doi + "u" == a_doi:
+ return (Status.STRONG, OK.CUSTOM_BSI_UNDATED)
+ except PathAccessError:
+ pass
+
if "Zweckverband Volkshochschule " in a_title and a_title != b_title:
return (Status.DIFFERENT, Miss.CUSTOM_VHS)
@@ -275,13 +284,6 @@ def compare(a, b):
a_slug_title = slugify_string(a.get("title", "")).replace("\n", " ")
b_slug_title = slugify_string(b.get("title", "")).replace("\n", " ")
- try:
- if glom(a, "ext_ids.doi") == "10.1109/nssmic.2013.6829591":
- print(a_slug_title)
- print(b_slug_title)
- except PathAccessError:
- pass
-
if a_slug_title == b_slug_title:
# via: https://fatcat.wiki/release/ij3yuoh6lrh3tkrv5o7gfk6yyi
# https://fatcat.wiki/release/tur236mqljdfdnlzbbnks2sily
@@ -294,6 +296,8 @@ def compare(a, b):
except PathAccessError:
pass
+ # TODO: we might want to have some light python DSL to express these
+ # (commute) things
result = ieee_arxiv_pair_check(a, b)
if result:
return result