about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Martin Czygan <martin.czygan@gmail.com> 2020-11-17 02:18:51 +0100
committer: Martin Czygan <martin.czygan@gmail.com> 2020-11-17 02:18:51 +0100
commit: fffe4b0dbae944bd7d2a3a3fefc30d6dab7daf4c (patch)
tree: 7055e724f534656fd4a1abccf6a826d65928762d
parent: eb5fdbc237a07994d0992b765869ee9ffd47bfd8 (diff)
download: fuzzycat-fffe4b0dbae944bd7d2a3a3fefc30d6dab7daf4c.tar.gz
download: fuzzycat-fffe4b0dbae944bd7d2a3a3fefc30d6dab7daf4c.zip
be less fine grained with datasets
-rw-r--r-- fuzzycat/verify.py 12
1 files changed, 11 insertions, 1 deletions
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index d7b2395..d277000 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -41,11 +41,11 @@ get_key_values = operator.itemgetter("k", "v")
TITLE_BLACKLIST = set([
"",
":{unav)",
- "abbildungsnachweis",
"[others]",
"[s.n.]",
"a correction",
"abbildung",
+ "abbildungsnachweis",
"abbreviations and acronyms",
"about the cover",
"about the editor",
@@ -65,6 +65,7 @@ TITLE_BLACKLIST = set([
"agradecimento",
"announcement",
"announcements",
+ "annual report",
"around the world",
"arthrobacter sp.",
"aufgaben",
@@ -77,6 +78,7 @@ TITLE_BLACKLIST = set([
"book reviews",
"books received",
"bookseller's catalogue",
+ "bureau of investigation",
"calendar",
"canto",
"canto",
@@ -127,6 +129,7 @@ TITLE_BLACKLIST = set([
"preliminary material",
"preservation image",
"references",
+ "regulations",
"reply",
"reviews of books",
"reviews",
@@ -185,6 +188,7 @@ class Miss(str, Enum):
YEAR = 'miss.year'
CUSTOM_VHS = 'miss.vhs' # https://fatcat.wiki/release/44gk5ben5vghljq6twm7lwmxla
NUM_DIFF = 'miss.num_diff'
+ DATASET_DOI = 'miss.dataset_doi'
class GroupVerifier:
"""
@@ -245,6 +249,12 @@ def compare(a, b):
if "Zweckverband Volkshochschule " in a.get("title") and a.get("title") != b.get("title"):
return (Status.DIFFERENT, Miss.CUSTOM_VHS)
+ if (a.get("extra", {}).get("crossref", {}).get("type", {}) == "dataset" and
+ b.get("extra", {}).get("crossref", {}).get("type", {}) == "dataset"):
+ if (a.get("ext_ids", {}).get("doi") and b.get("ext_ids", {}).get("doi") and
+ a.get("ext_ids", {}).get("doi") != b.get("ext_ids", {}).get("doi")):
+ return (Status.DIFFERENT, Miss.DATASET_DOI)
+
arxiv_id_a = a.get("ext_ids", {}).get("arxiv")
arxiv_id_b = b.get("ext_ids", {}).get("arxiv")