aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fuzzycat/utils.py7
-rw-r--r--fuzzycat/verify.py8
-rw-r--r--tests/data/release/2wx322n7pvbyxnbbrqrvbp7p7448
-rw-r--r--tests/data/release/lhgtefitvbd5lf6prb76mrgcci312
-rw-r--r--tests/data/verify.csv7
5 files changed, 376 insertions, 6 deletions
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index 1cdac47..84db5ec 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -53,6 +53,13 @@ def dict_key_exists(doc, path):
return True
+def doi_prefix(v):
+ """
+ Return the prefix of a DOI.
+ """
+ return v.split("/")[0]
+
+
def has_doi_prefix(v, prefix="10.1234"):
"""
Returns False, if we cannot parse v or prefix does not match.
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 21c1a15..ff4567b 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -91,7 +91,8 @@ from fuzzycat.data import (CONTAINER_NAME_BLACKLIST, PUBLISHER_BLACKLIST, TITLE_
TITLE_FRAGMENT_BLACKLIST)
from fuzzycat.entities import entity_to_dict
from fuzzycat.utils import (author_similarity_score, contains_chemical_formula, dict_key_exists,
- has_doi_prefix, jaccard, num_project, parse_page_string, slugify_string)
+ doi_prefix, has_doi_prefix, jaccard, num_project, parse_page_string,
+ slugify_string)
Verify = collections.namedtuple("Verify", "status reason")
@@ -526,7 +527,7 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]:
b_doi = glom(b, "ext_ids.doi")
if a_container_id == b_container_id and a_doi != b_doi and not has_doi_prefix(
- a_doi, "10.1126"):
+ a_doi, "10.1126") and doi_prefix(a_doi) == doi_prefix(b_doi):
return Verify(Status.DIFFERENT, Reason.SHARED_DOI_PREFIX)
except PathAccessError:
pass
@@ -541,6 +542,7 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]:
# explodes.
a_trimmed = sorted(a_slug_authors)[:5]
b_trimmed = sorted(b_slug_authors)[:5]
+ num_authors = min(len(a_trimmed), len(b_trimmed))
for a, b in itertools.product(a_trimmed, b_trimmed):
scores.append(Score(a, b, author_similarity_score(a, b)))
# TODO: less arbitrary metric and threshold
@@ -551,7 +553,7 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]:
top_scores.append(sorted_scores[0].value)
if len(top_scores) > 0:
avg_score = sum(top_scores) / len(top_scores)
- if avg_score > 0.5:
+ if (num_authors < 3 and avg_score > 0.9) or (num_authors >= 3 and avg_score > 0.5):
return Verify(Status.STRONG, Reason.TOKENIZED_AUTHORS)
else:
pass
diff --git a/tests/data/release/2wx322n7pvbyxnbbrqrvbp7p74 b/tests/data/release/2wx322n7pvbyxnbbrqrvbp7p74
new file mode 100644
index 0000000..c01e437
--- /dev/null
+++ b/tests/data/release/2wx322n7pvbyxnbbrqrvbp7p74
@@ -0,0 +1,48 @@
+{
+ "abstracts": [],
+ "container_id": "sjomsvi4zngnnh4gx5bz2onwye",
+ "contribs": [
+ {
+ "extra": {
+ "seq": "first"
+ },
+ "index": 0,
+ "raw_name": "J LARKIN",
+ "role": "author"
+ },
+ {
+ "index": 1,
+ "raw_name": "H SIMON",
+ "role": "author"
+ }
+ ],
+ "ext_ids": {
+ "doi": "10.1016/s0364-0213(87)80026-5"
+ },
+ "extra": {
+ "crossref": {
+ "alternative-id": [
+ "S0364021387800265"
+ ],
+ "subject": [
+ "Experimental and Cognitive Psychology",
+ "Cognitive Neuroscience",
+ "Artificial Intelligence"
+ ],
+ "type": "journal-article"
+ }
+ },
+ "ident": "2wx322n7pvbyxnbbrqrvbp7p74",
+ "language": "en",
+ "pages": "65-100",
+ "publisher": "Wiley",
+ "refs": [],
+ "release_stage": "published",
+ "release_type": "article-journal",
+ "release_year": 1987,
+ "revision": "acc4e4c6-f95c-467a-8873-de4dafd2f889",
+ "state": "active",
+ "title": "Why a Diagram is (Sometimes) Worth Ten Thousand Words",
+ "volume": "11",
+ "work_id": "22e2h3f2m5ghbp5mkdlmkoisxy"
+}
diff --git a/tests/data/release/lhgtefitvbd5lf6prb76mrgcci b/tests/data/release/lhgtefitvbd5lf6prb76mrgcci
new file mode 100644
index 0000000..f336152
--- /dev/null
+++ b/tests/data/release/lhgtefitvbd5lf6prb76mrgcci
@@ -0,0 +1,312 @@
+{
+ "abstracts": [],
+ "container_id": "sjomsvi4zngnnh4gx5bz2onwye",
+ "contribs": [
+ {
+ "extra": {
+ "seq": "first"
+ },
+ "index": 0,
+ "raw_name": "Jill H. Larkin",
+ "role": "author"
+ },
+ {
+ "index": 1,
+ "raw_name": "Herbert A. Simon",
+ "role": "author"
+ }
+ ],
+ "ext_ids": {
+ "doi": "10.1111/j.1551-6708.1987.tb00863.x",
+ "wikidata_qid": "Q30473523"
+ },
+ "extra": {
+ "crossref": {
+ "type": "journal-article"
+ }
+ },
+ "ident": "lhgtefitvbd5lf6prb76mrgcci",
+ "language": "en",
+ "pages": "65-100",
+ "publisher": "Wiley",
+ "refs": [
+ {
+ "container_name": "Psychological Review",
+ "extra": {
+ "authors": [
+ "Anderson"
+ ],
+ "doi": "10.1037/0033-295x.85.4.249",
+ "volume": "85"
+ },
+ "index": 0,
+ "key": "10.1111/j.1551-6708.1987.tb00863.x-BIB1|cit1",
+ "locator": "249",
+ "title": "Arguments concerning representations for mental imagery",
+ "year": 1978
+ },
+ {
+ "container_name": "Representational Types: A Tricode Proposal (Technical Report #82-1)",
+ "extra": {
+ "authors": [
+ "Anderson"
+ ],
+ "volume-title": "Representational Types: A Tricode Proposal (Technical Report #82-1)"
+ },
+ "index": 1,
+ "key": "10.1111/j.1551-6708.1987.tb00863.x-BIB2|cit2",
+ "year": 1984
+ },
+ {
+ "container_name": "Addison-Wesley Series in Artificial Intelligence. Programming Expert Systems in OPS5",
+ "extra": {
+ "authors": [
+ "Brownston"
+ ],
+ "volume-title": "Addison-Wesley Series in Artificial Intelligence. Programming Expert Systems in OPS5"
+ },
+ "index": 2,
+ "key": "10.1111/j.1551-6708.1987.tb00863.x-BIB3|cit3",
+ "year": 1985
+ },
+ {
+ "container_name": "Visual information processing",
+ "extra": {
+ "authors": [
+ "Chase"
+ ],
+ "volume-title": "Visual information processing"
+ },
+ "index": 3,
+ "key": "10.1111/j.1551-6708.1987.tb00863.x-BIB4|cit4",
+ "year": 1973
+ },
+ {
+ "container_name": "The Feynman Lectures on Physics. Figures 4-12",
+ "extra": {
+ "authors": [
+ "Feynman"
+ ],
+ "volume-title": "The Feynman Lectures on Physics. Figures 4-12"
+ },
+ "index": 4,
+ "key": "10.1111/j.1551-6708.1987.tb00863.x-BIB5|cit5",
+ "locator": "11",
+ "year": 1966
+ },
+ {
+ "container_name": "The psychology of invention in the mathematical field",
+ "extra": {
+ "authors": [
+ "Hadamard"
+ ],
+ "volume-title": "The psychology of invention in the mathematical field"
+ },
+ "index": 5,
+ "key": "10.1111/j.1551-6708.1987.tb00863.x-BIB6|cit6",
+ "year": 1945
+ },
+ {
+ "container_name": "Fundamentals of physics",
+ "extra": {
+ "authors": [
+ "Halliday"
+ ],
+ "volume-title": "Fundamentals of physics"
+ },
+ "index": 6,
+ "key": "10.1111/j.1551-6708.1987.tb00863.x-BIB7|cit7",
+ "year": 1970
+ },
+ {
+ "container_name": "Knowledge and cognition",
+ "extra": {
+ "authors": [
+ "Hayes"
+ ],
+ "volume-title": "Knowledge and cognition"
+ },
+ "index": 7,
+ "key": "10.1111/j.1551-6708.1987.tb00863.x-BIB8|cit8",
+ "year": 1974
+ },
+ {
+ "container_name": "Cognition and Instruction",
+ "extra": {
+ "authors": [
+ "Heller"
+ ],
+ "doi": "10.1207/s1532690xci0102_2"
+ },
+ "index": 8,
+ "key": "10.1111/j.1551-6708.1987.tb00863.x-BIB9|cit9",
+ "locator": "177",
+ "title": "Prescribing effective human problem-solving processes: Problem description in physics",
+ "year": 1984
+ },
+ {
+ "container_name": "Psychological Review",
+ "extra": {
+ "authors": [
+ "Kintsch"
+ ],
+ "doi": "10.1037/0033-295x.85.5.363",
+ "volume": "85"
+ },
+ "index": 9,
+ "key": "10.1111/j.1551-6708.1987.tb00863.x-BIB10|cit10",
+ "locator": "363",
+ "title": "Toward a model of text comprehension and production",
+ "year": 1978
+ },
+ {
+ "container_name": "Mechanisms of effective problem representation in physics (C.I.P. 434)",
+ "extra": {
+ "authors": [
+ "Larkin"
+ ],
+ "volume-title": "Mechanisms of effective problem representation in physics (C.I.P. 434)"
+ },
+ "index": 10,
+ "key": "10.1111/j.1551-6708.1987.tb00863.x-BIB11|cit11",
+ "year": 1983
+ },
+ {
+ "container_name": "Principles of economics",
+ "extra": {
+ "authors": [
+ "Marshall"
+ ],
+ "volume-title": "Principles of economics"
+ },
+ "index": 11,
+ "key": "10.1111/j.1551-6708.1987.tb00863.x-BIB12|cit12",
+ "year": 1890
+ },
+ {
+ "container_name": "Cognitive skills and their acquisition",
+ "extra": {
+ "authors": [
+ "Neves"
+ ],
+ "volume-title": "Cognitive skills and their acquisition"
+ },
+ "index": 12,
+ "key": "10.1111/j.1551-6708.1987.tb00863.x-BIB13|cit13",
+ "year": 1981
+ },
+ {
+ "container_name": "Proceedings of the Western Joint Conference on Artificial Intelligence",
+ "extra": {
+ "authors": [
+ "Newell"
+ ],
+ "volume-title": "Proceedings of the Western Joint Conference on Artificial Intelligence"
+ },
+ "index": 13,
+ "key": "10.1111/j.1551-6708.1987.tb00863.x-BIB14|cit14",
+ "year": 1959
+ },
+ {
+ "container_name": "Problem solving",
+ "extra": {
+ "authors": [
+ "Paige"
+ ],
+ "volume-title": "Problem solving"
+ },
+ "index": 14,
+ "key": "10.1111/j.1551-6708.1987.tb00863.x-BIB15|cit15",
+ "year": 1966
+ },
+ {
+ "container_name": "Psychological Bulletin",
+ "extra": {
+ "authors": [
+ "Pylyshyn"
+ ],
+ "doi": "10.1037/h0034650",
+ "volume": "80"
+ },
+ "index": 15,
+ "key": "10.1111/j.1551-6708.1987.tb00863.x-BIB16|cit16",
+ "title": "What the mind's eye tells the mind's brain: A critique of mental imagery",
+ "year": 1973
+ },
+ {
+ "container_name": "Physics",
+ "extra": {
+ "authors": [
+ "Sears"
+ ],
+ "volume-title": "Physics"
+ },
+ "index": 16,
+ "key": "10.1111/j.1551-6708.1987.tb00863.x-BIB17|cit17",
+ "year": 1981
+ },
+ {
+ "container_name": "Minnesota studies in the philosophy of science. Vol. ix: Perception and cognition: Issues in the foundations of psychology",
+ "extra": {
+ "authors": [
+ "Simon"
+ ],
+ "volume-title": "Minnesota studies in the philosophy of science. Vol. ix: Perception and cognition: Issues in the foundations of psychology"
+ },
+ "index": 17,
+ "key": "10.1111/j.1551-6708.1987.tb00863.x-BIB18|cit18",
+ "year": 1978
+ },
+ {
+ "container_name": "Psychological Review",
+ "extra": {
+ "authors": [
+ "Simon"
+ ],
+ "doi": "10.1037/h0028154",
+ "volume": "76"
+ },
+ "index": 18,
+ "key": "10.1111/j.1551-6708.1987.tb00863.x-BIB19|cit19",
+ "locator": "473",
+ "title": "Information Processing Analysis of Perceptual Processes in Problem Solving",
+ "year": 1969
+ },
+ {
+ "container_name": "Artificial Intelligence",
+ "extra": {
+ "authors": [
+ "Waterman"
+ ],
+ "doi": "10.1016/0004-3702(70)90004-4"
+ },
+ "index": 19,
+ "key": "10.1111/j.1551-6708.1987.tb00863.x-BIB20|cit20",
+ "locator": "121",
+ "title": "Generalization learning techniques for automating the learning of heuristics",
+ "year": 1970
+ },
+ {
+ "container_name": "Productive Thinking",
+ "extra": {
+ "authors": [
+ "Wertheimer"
+ ],
+ "volume-title": "Productive Thinking"
+ },
+ "index": 20,
+ "key": "10.1111/j.1551-6708.1987.tb00863.x-BIB21|cit21",
+ "locator": "228",
+ "year": 1959
+ }
+ ],
+ "release_date": "1987-01-03",
+ "release_stage": "published",
+ "release_type": "article-journal",
+ "release_year": 1987,
+ "revision": "542a0bc9-83db-4c69-9c47-2dae034cbcfb",
+ "state": "active",
+ "title": "Why a Diagram is (Sometimes) Worth Ten Thousand Words",
+ "volume": "11",
+ "work_id": "plzrdqz5l5cjnhizr2x7r64zum"
+}
diff --git a/tests/data/verify.csv b/tests/data/verify.csv
index fa80256..5822a9e 100644
--- a/tests/data/verify.csv
+++ b/tests/data/verify.csv
@@ -91,7 +91,7 @@ ocbjm53gxvavrhyupfqlynlq44,twzjy7m5irdu5gjkvrsp65hefa,Status.EXACT,WORK_ID
olhia3lm2jfsrg3jkisjjnowsu,twzjy7m5irdu5gjkvrsp65hefa,Status.EXACT,WORK_ID
q5qkykfazfbahbson4uiopnq4q,s4rrmyvibvdatgj7hiduekcjhe,Status.EXACT,TITLE_AUTHOR_MATCH
fuaz2iolhjegfpdmob3i3efvgm,uxzn4nznrfbttivwzdc7noptku,Status.EXACT,TITLE_AUTHOR_MATCH
-7j2dsplr45bhvdtrhqa7hykwka,pxzy4k45xjhgfgw6znf5xjayfa,Status.STRONG,TOKENIZED_AUTHORS
+7j2dsplr45bhvdtrhqa7hykwka,pxzy4k45xjhgfgw6znf5xjayfa,Status.STRONG,JACCARD_AUTHORS
7j2dsplr45bhvdtrhqa7hykwka,ud3tzdfacncvnkj232lkvvg34q,Status.EXACT,DOI
pxzy4k45xjhgfgw6znf5xjayfa,ud3tzdfacncvnkj232lkvvg34q,Status.DIFFERENT,RELEASE_TYPE
b5p5i7phjfejhiecjaz4arkp3m,rzicki3gcjayxaic7ckyx6bcmq,Status.DIFFERENT,SHARED_DOI_PREFIX
@@ -169,7 +169,7 @@ t3lw4dgwzfbuxjjoayxy6ow7la,vdlysz6eybbrtogx7hkfb3he6m,Status.STRONG,SLUG_TITLE_A
t3lw4dgwzfbuxjjoayxy6ow7la,y4eh7ypjlfefbgbaf6cwu5tcoy,Status.STRONG,SLUG_TITLE_AUTHOR_MATCH
vdlysz6eybbrtogx7hkfb3he6m,y4eh7ypjlfefbgbaf6cwu5tcoy,Status.STRONG,DATACITE_RELATED_ID
6nmlwrlcindltmrbrfb3k6bmmy,wtv64ahbdzgwnan7rllwr3nurm,Status.STRONG,TOKENIZED_AUTHORS
-c3m3t2l2urbkhmmy6qvvjnhb5q,z6427hizordgdghd2g26uwp45a,Status.STRONG,TOKENIZED_AUTHORS
+c3m3t2l2urbkhmmy6qvvjnhb5q,z6427hizordgdghd2g26uwp45a,Status.STRONG,JACCARD_AUTHORS
yzl6warijnehbnudaz6hcyxjnu,z6es4bb53zdkhggupvawc3koe4,Status.EXACT,TITLE_AUTHOR_MATCH
drzpue5r6zajlpa3fkyjdetuqy,fppfjl5kt5dsnfl2i5rarhqaaq,Status.DIFFERENT,CONTRIB_INTERSECTION_EMPTY
drzpue5r6zajlpa3fkyjdetuqy,zqqbuha3uzd2fcvekdy3ygxnni,Status.DIFFERENT,CONTRIB_INTERSECTION_EMPTY
@@ -227,7 +227,7 @@ kxjgpyz4ffbhzegxbx5qihb5ky,qq6hl3qk4zbzvdoaa5kp2x554i,Status.DIFFERENT,
kxjgpyz4ffbhzegxbx5qihb5ky,r5ey6krhrfbjrgvqqdzgora32m,Status.DIFFERENT,
kxjgpyz4ffbhzegxbx5qihb5ky,uvvfqlbtezh45ctyqrwqwfxlo4,Status.DIFFERENT,
kxjgpyz4ffbhzegxbx5qihb5ky,wfjdwsrr2rgd5aa3hsapivkp3m,Status.DIFFERENT,
-mynqrxlzmve7ti7wl5wdnyfidy,p7z6la3nbzajpiia6ce47tiqfa,Status.DIFFERENT,
+mynqrxlzmve7ti7wl5wdnyfidy,p7z6la3nbzajpiia6ce47tiqfa,Status.AMBIGUOUS,UNKNOWN
mynqrxlzmve7ti7wl5wdnyfidy,qq6hl3qk4zbzvdoaa5kp2x554i,Status.DIFFERENT,
mynqrxlzmve7ti7wl5wdnyfidy,r5ey6krhrfbjrgvqqdzgora32m,Status.DIFFERENT,
mynqrxlzmve7ti7wl5wdnyfidy,uvvfqlbtezh45ctyqrwqwfxlo4,Status.DIFFERENT,
@@ -243,3 +243,4 @@ r5ey6krhrfbjrgvqqdzgora32m,uvvfqlbtezh45ctyqrwqwfxlo4,Status.DIFFERENT,
r5ey6krhrfbjrgvqqdzgora32m,wfjdwsrr2rgd5aa3hsapivkp3m,Status.DIFFERENT,
uvvfqlbtezh45ctyqrwqwfxlo4,wfjdwsrr2rgd5aa3hsapivkp3m,Status.DIFFERENT,
ex2u4mgrpffp3asznccqd6n35q,zwpq2nocbzcixl6sswabsjg4ti,Status.DIFFERENT,CONTRIB_INTERSECTION_EMPTY
+2wx322n7pvbyxnbbrqrvbp7p74,lhgtefitvbd5lf6prb76mrgcci,Status.STRONG,JACCARD_AUTHORS