aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2020-11-19 03:26:55 +0100
committerMartin Czygan <martin.czygan@gmail.com>2020-11-19 03:26:55 +0100
commitc4d403f7f55ec0a9fee476dee637b8b44b7b7596 (patch)
tree4633feb305d7b56584e0c093233ec144cd1c52bd
parentca0df663105335826d54baef8066f42a9d539961 (diff)
downloadfuzzycat-c4d403f7f55ec0a9fee476dee637b8b44b7b7596.tar.gz
fuzzycat-c4d403f7f55ec0a9fee476dee637b8b44b7b7596.zip
update notes
-rw-r--r--README.md23
-rw-r--r--fuzzycat/verify.py18
2 files changed, 29 insertions, 12 deletions
diff --git a/README.md b/README.md
index 72b91b4..342a03a 100644
--- a/README.md
+++ b/README.md
@@ -140,9 +140,28 @@ Cases
* common title, "Books by Our Readers", https://fatcat.wiki/release/4uv5jsy5vnhdvnxvzmucqlksvq, https://fatcat.wiki/release/4uv5jsy5vnhdvnxvzmucqlksvq
* common title, "The Future of Imprisonment"
-* same title "IEEE Transactions on Wireless Communications", same publisher, different year
+* common title, "In This Issue/Research Watch/News-in-Brief/News from the IASLC Tobacco Control Committee"
+* common title, "IEEE Transactions on Wireless Communications", same publisher, different year
+* common title, "ASMS News" (also different year)
+* common title, "AMERICAN INSTITUTE OF INSTRUCTION"
+* common title, "Contents lists"
* same, except DOI, but maybe the same item, after all? https://fatcat.wiki/release/kxgsbh66v5bwhobcaiuh4i7dwy, https://fatcat.wiki/release/thl7o44z3jgk3njdypixwrdbve
+Authors may be messy:
+
+* IR and published, but we currently yield `Miss.CONTRIB_INTERSECTION_EMPTY` -
+ https://fatcat.wiki/release/2kpa6ynwjzhtbbokqyxcl25gmm,
+https://fatcat.wiki/release/o4dh7w7nqvdknm4j336yrom4wy - may need to tokenize authors
+
Possible improvements:
-* when title and authors match, check the year, and maybe the doi prefix; doi with the same prefix may not be duplicates
+* [ ] when title and authors match, check the year, and maybe the doi prefix; doi with the same prefix may not be duplicates
+* [x] detect arxiv versions directly
+* [ ] if multiple authors, may require more than one overlap, e.g. "by Yuting
+ Yao, Yuting Yao, Yuting Yao, Imperial College London, Imperial College
+London" - will overlap with any other author including "Imperial College
+London" -- we label `OK.SLUG_TITLE_AUTHOR_MATCH`,
+https://fatcat.wiki/release/6qbne2adybegdf6plgb7dnly2a,
+https://fatcat.wiki/release/v6cjc6kxzncztebmfgzxwov7ym
+* [ ] "article-journal" and "article" `release_type` should be treated the same, https://fatcat.wiki/release/k5zdpb45ufcy7grrppqndtxxji, https://fatcat.wiki/release/ypyse6ff4nbzrfd44resyav25m
+
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 58fa0b8..88b7b71 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -182,6 +182,14 @@ def compare(a, b):
if re.match(r"appendix ?[^ ]*$", a.get("title", "").lower()):
return (Status.AMBIGUOUS, Miss.APPENDIX)
+ arxiv_id_a = a.get("ext_ids", {}).get("arxiv")
+ arxiv_id_b = b.get("ext_ids", {}).get("arxiv")
+ if arxiv_id_a and arxiv_id_b:
+ id_a, version_a = arxiv_id_a.split("v")
+ id_b, version_b = arxiv_id_b.split("v")
+ if id_a == id_b:
+ return (Status.STRONG, OK.ARXIV_VERSION)
+
if a.get("release_type") and b.get(
"release_type") and a.get("release_type") != b.get("release_type"):
return (Status.DIFFERENT, Miss.RELEASE_TYPE)
@@ -261,16 +269,6 @@ def compare(a, b):
if len(a_slug_authors & b_slug_authors) > 0:
return (Status.STRONG, OK.SLUG_TITLE_AUTHOR_MATCH)
- arxiv_id_a = a.get("ext_ids", {}).get("arxiv")
- arxiv_id_b = b.get("ext_ids", {}).get("arxiv")
- if arxiv_id_a and arxiv_id_b:
- id_a, version_a = arxiv_id_a.split("v")
- id_b, version_b = arxiv_id_b.split("v")
- if id_a == id_b:
- return (Status.STRONG, OK.ARXIV_VERSION)
- else:
- return (Status.DIFFERENT, Miss.ARXIV_VERSION)
-
if a_authors and len(a_slug_authors & b_slug_authors) == 0:
return (Status.DIFFERENT, Miss.CONTRIB_INTERSECTION_EMPTY)