aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2020-12-09 01:11:19 +0100
committerMartin Czygan <martin.czygan@gmail.com>2020-12-09 01:11:19 +0100
commit26c71fe52d4cabc3cca8b29f6fa3d99c66e5f912 (patch)
treeb8952a9234e6396a140b2d2a63b703ef9a911526
parentda27c5b093af923b0cb462903b273af798f39403 (diff)
downloadfuzzycat-26c71fe52d4cabc3cca8b29f6fa3d99c66e5f912.tar.gz
fuzzycat-26c71fe52d4cabc3cca8b29f6fa3d99c66e5f912.zip
another case
-rw-r--r--fuzzycat/common.py1
-rw-r--r--fuzzycat/verify.py18
-rw-r--r--notes/2020_11_testruns.md7
-rw-r--r--tests/data/release/7x7tszf54zggvp4xkrhakp667u24
-rw-r--r--tests/data/release/eqcgtpav3na5jh56o5vjsvb4ei53
-rw-r--r--tests/data/verify.csv1
6 files changed, 104 insertions, 0 deletions
diff --git a/fuzzycat/common.py b/fuzzycat/common.py
index f03b7cc..d87bef4 100644
--- a/fuzzycat/common.py
+++ b/fuzzycat/common.py
@@ -24,6 +24,7 @@ class OK(str, Enum):
DOI = 'ok.doi'
DUMMY = 'ok.dummy'
FIGSHARE_VERSION = 'ok.figshare_version'
+ PMID_DOI_PAIR = 'ok.pmid_doi_pair'
PREPRINT_PUBLISHED = 'ok.preprint_published'
SLUG_TITLE_AUTHOR_MATCH = 'ok.slug_title_author_match'
TITLE_AUTHOR_MATCH = 'ok.title_author_match'
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index a223e48..e945a15 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -135,6 +135,18 @@ class GroupVerifier:
self.counter["total"] = sum(v for _, v in self.counter.items())
+def dict_key_exists(doc, path):
+ """
+ Return True if a value exists at the given path in doc. XXX: glom probably provides this already.
+ """
+ try:
+ _ = glom(doc, path)
+ except PathAccessError:
+ return False
+ else:
+ return True
+
+
def compare(a, b):
"""
Compare two entities, return match status and reason.
@@ -419,6 +431,12 @@ def compare(a, b):
if len(a_slug_authors & b_slug_authors) > 0:
return (Status.STRONG, OK.SLUG_TITLE_AUTHOR_MATCH)
+ if any([a_authors, b_authors]) and not (a_authors and b_authors):
+ if a_release_year == b_release_year and a_title_lower == b_title_lower:
+ if ((dict_key_exists(a, "ext_ids.pmid") and not dict_key_exists(a, "ext_ids.doi")) or
+ (dict_key_exists(b, "ext_ids.pmid") and not dict_key_exists(b, "ext_ids.doi"))):
+ return (Status.STRONG, OK.PMID_DOI_PAIR)
+
if a_authors and len(a_slug_authors & b_slug_authors) == 0:
# Before we bail out, run an authors similarity check. TODO: This is
# not the right place, but lives here now, since these cases popped up
diff --git a/notes/2020_11_testruns.md b/notes/2020_11_testruns.md
index 1b0b732..edc39d0 100644
--- a/notes/2020_11_testruns.md
+++ b/notes/2020_11_testruns.md
@@ -128,7 +128,14 @@ Choice review.
Two different reviews. md: one has an author, the other does not.
* [ ] https://fatcat.wiki/release/kqlifv7lyjdmbfictjzaoixahm https://fatcat.wiki/release/54ilu5kdj5fktohbs5zybtfq7y Status.AMBIGUOUS OK.DUMMY
+
+Defer.
+
* [ ] https://fatcat.wiki/release/7x7tszf54zggvp4xkrhakp667u https://fatcat.wiki/release/eqcgtpav3na5jh56o5vjsvb4ei Status.AMBIGUOUS OK.DUMMY
+
+Same work: one record has only a PubMed ID, the other an oxfordjournals DOI; same year.
+
+
* [ ] https://fatcat.wiki/release/idpgijvcsnbqrgs2dg36vzzdzm https://fatcat.wiki/release/wm2p5fznwffknjx56lvmr7hn4q Status.AMBIGUOUS OK.DUMMY
* [ ] https://fatcat.wiki/release/nqcfu4il45aixekvk3rwflahdm https://fatcat.wiki/release/72uzveph65ce7kfdct2wpgh5j4 Status.AMBIGUOUS OK.DUMMY
* [ ] https://fatcat.wiki/release/zizw6bgxu5cnxfx5h3v7q7gute https://fatcat.wiki/release/jwh6xci4m5dktmea6bphhc3mjy Status.AMBIGUOUS OK.DUMMY
diff --git a/tests/data/release/7x7tszf54zggvp4xkrhakp667u b/tests/data/release/7x7tszf54zggvp4xkrhakp667u
new file mode 100644
index 0000000..7b38aeb
--- /dev/null
+++ b/tests/data/release/7x7tszf54zggvp4xkrhakp667u
@@ -0,0 +1,24 @@
+{
+ "abstracts": [],
+ "container_id": "qd2paoh6sbg2xpfkny7g5j3ape",
+ "contribs": [],
+ "ext_ids": {
+ "doi": "10.1093/oxfordjournals.qjmed.a068298"
+ },
+ "extra": {
+ "crossref": {
+ "type": "journal-article"
+ }
+ },
+ "ident": "7x7tszf54zggvp4xkrhakp667u",
+ "language": "en",
+ "publisher": "Oxford University Press (OUP)",
+ "refs": [],
+ "release_stage": "published",
+ "release_type": "article-journal",
+ "release_year": 1989,
+ "revision": "671374c0-62d9-4b44-952f-d92108b95221",
+ "state": "active",
+ "title": "Predicting the Outcome of Acute Stroke: Do Multivariate Models Help?",
+ "work_id": "ss3yihoe3fcjze7uqyq3rvderm"
+}
diff --git a/tests/data/release/eqcgtpav3na5jh56o5vjsvb4ei b/tests/data/release/eqcgtpav3na5jh56o5vjsvb4ei
new file mode 100644
index 0000000..b4e6e09
--- /dev/null
+++ b/tests/data/release/eqcgtpav3na5jh56o5vjsvb4ei
@@ -0,0 +1,53 @@
+{
+ "abstracts": [
+ {
+ "content": "Data collected by a single observer on 362 patients taking part in an acute intervention trial were used to derive simple methods of predicting outcome in conscious stroke victims. The effectiveness of these univariate methods was then compared with that of multivariate models based on discriminant function analysis. The multivariate models were somewhat better at predicting death within the first month in newly-admitted patients, and in predicting lack of functional improvement between one and six months in those still in hospital at one month. Even so, these predictions could not be made with certainty and the improvements in probability offered by multivariate over univariate analysis are unlikely to be of practical benefit to the clinician. Single variables such as power in the affected arm and continence were as good as the discriminant functions at predicting discharge within the first month and first six months, respectively. The predictive power of these single simple clinical variables was validated in a separate group of 277 stroke patients. The various proposed methods of prognostication in stroke need to be compared in prospective studies, but until this is done, we recommend a simple approach using the best available clinical information.",
+ "lang": "en",
+ "mimetype": "text/plain",
+ "sha1": "4d38ae4fa2c0c3e2c78e882712ffcec9bf44b49e"
+ }
+ ],
+ "container_id": "rj4lpn3f6vbhbnoknd2zccrydm",
+ "contribs": [
+ {
+ "extra": {},
+ "given_name": "D H",
+ "index": 0,
+ "raw_affiliation": "Department of Medicine, University Hospital, Queens Medical Centre, Nottingham.",
+ "raw_name": "D H Barer",
+ "role": "author",
+ "surname": "Barer"
+ },
+ {
+ "extra": {},
+ "given_name": "J R",
+ "index": 1,
+ "raw_name": "J R Mitchell",
+ "role": "author",
+ "surname": "Mitchell"
+ }
+ ],
+ "ext_ids": {
+ "pmid": "2574484"
+ },
+ "extra": {
+ "pubmed": {
+ "pub_types": [
+ "Journal Article"
+ ]
+ }
+ },
+ "ident": "eqcgtpav3na5jh56o5vjsvb4ei",
+ "issue": "261",
+ "language": "en",
+ "pages": "27-39",
+ "refs": [],
+ "release_stage": "published",
+ "release_type": "article-journal",
+ "release_year": 1989,
+ "revision": "17e56502-50a5-4438-a691-0014285a2ae9",
+ "state": "active",
+ "title": "Predicting the outcome of acute stroke: do multivariate models help?",
+ "volume": "70",
+ "work_id": "ypaisxszjnhtjb5qdyvvpzsz7e"
+}
diff --git a/tests/data/verify.csv b/tests/data/verify.csv
index 43820d5..e03eec4 100644
--- a/tests/data/verify.csv
+++ b/tests/data/verify.csv
@@ -127,3 +127,4 @@ yp3rs3xb5ra2riyx5xayrlqfum,zphoquzqbfccjoqix3wkpyhrm4,Status.EXACT,OK.WORK_ID
mz6a32xbp5f67i2cnbco2hmzj4,fo5dsqeocfekfhqdzgqyng3z6q,Status.AMBIGUOUS,Miss.CUSTOM_PREFIX_10_5860_CHOICE_REVIEW
g2swo5fewnhv3ihmlpl32sojr4,ab2q56gokfdmzpccrmwfcdljgy,Status.AMBIGUOUS,Miss.CUSTOM_PREFIX_10_5860_CHOICE_REVIEW
tmlg73royrdwdhl6nijf6m7vzy,3w4tibll4rdernjrn4hkkyqsem,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY
+eqcgtpav3na5jh56o5vjsvb4ei,7x7tszf54zggvp4xkrhakp667u,Status.STRONG,OK.PMID_DOI_PAIR