diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-11-19 03:26:55 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-11-19 03:26:55 +0100 |
commit | c4d403f7f55ec0a9fee476dee637b8b44b7b7596 (patch) | |
tree | 4633feb305d7b56584e0c093233ec144cd1c52bd | |
parent | ca0df663105335826d54baef8066f42a9d539961 (diff) | |
download | fuzzycat-c4d403f7f55ec0a9fee476dee637b8b44b7b7596.tar.gz fuzzycat-c4d403f7f55ec0a9fee476dee637b8b44b7b7596.zip |
update notes
-rw-r--r-- | README.md | 23 | ||||
-rw-r--r-- | fuzzycat/verify.py | 18 |
2 files changed, 29 insertions, 12 deletions
@@ -140,9 +140,28 @@ Cases * common title, "Books by Our Readers", https://fatcat.wiki/release/4uv5jsy5vnhdvnxvzmucqlksvq, https://fatcat.wiki/release/4uv5jsy5vnhdvnxvzmucqlksvq * common title, "The Future of Imprisonment" -* same title "IEEE Transactions on Wireless Communications", same publisher, different year +* common title, "In This Issue/Research Watch/News-in-Brief/News from the IASLC Tobacco Control Committee" +* common title, "IEEE Transactions on Wireless Communications", same publisher, different year +* common title, "ASMS News" (also different year) +* common title, "AMERICAN INSTITUTE OF INSTRUCTION" +* common title, "Contents lists" * same, except DOI, but maybe the same item, after all? https://fatcat.wiki/release/kxgsbh66v5bwhobcaiuh4i7dwy, https://fatcat.wiki/release/thl7o44z3jgk3njdypixwrdbve +Authors may be messy: + +* IR and published, but we currently yield `Miss.CONTRIB_INTERSECTION_EMPTY` - + https://fatcat.wiki/release/2kpa6ynwjzhtbbokqyxcl25gmm, +https://fatcat.wiki/release/o4dh7w7nqvdknm4j336yrom4wy - may need to tokenize authors + Possible improvements: -* when title and authors match, check the year, and maybe the doi prefix; doi with the same prefix may not be duplicates +* [ ] when title and authors match, check the year, and maybe the doi prefix; doi with the same prefix may not be duplicates +* [x] detect arxiv versions directly +* [ ] if multiple authors, may require more than one overlap, e.g. 
"by Yuting + Yao, Yuting Yao, Yuting Yao, Imperial College London, Imperial College +London" - will overlap with any other author including "Imperial College +London" -- we label `OK.SLUG_TITLE_AUTHOR_MATCH`, +https://fatcat.wiki/release/6qbne2adybegdf6plgb7dnly2a, +https://fatcat.wiki/release/v6cjc6kxzncztebmfgzxwov7ym +* [ ] "article-journal" and "article" `release_type` should be treated the same, https://fatcat.wiki/release/k5zdpb45ufcy7grrppqndtxxji, https://fatcat.wiki/release/ypyse6ff4nbzrfd44resyav25m + diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 58fa0b8..88b7b71 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -182,6 +182,14 @@ def compare(a, b): if re.match(r"appendix ?[^ ]*$", a.get("title", "").lower()): return (Status.AMBIGUOUS, Miss.APPENDIX) + arxiv_id_a = a.get("ext_ids", {}).get("arxiv") + arxiv_id_b = b.get("ext_ids", {}).get("arxiv") + if arxiv_id_a and arxiv_id_b: + id_a, version_a = arxiv_id_a.split("v") + id_b, version_b = arxiv_id_b.split("v") + if id_a == id_b: + return (Status.STRONG, OK.ARXIV_VERSION) + if a.get("release_type") and b.get( "release_type") and a.get("release_type") != b.get("release_type"): return (Status.DIFFERENT, Miss.RELEASE_TYPE) @@ -261,16 +269,6 @@ def compare(a, b): if len(a_slug_authors & b_slug_authors) > 0: return (Status.STRONG, OK.SLUG_TITLE_AUTHOR_MATCH) - arxiv_id_a = a.get("ext_ids", {}).get("arxiv") - arxiv_id_b = b.get("ext_ids", {}).get("arxiv") - if arxiv_id_a and arxiv_id_b: - id_a, version_a = arxiv_id_a.split("v") - id_b, version_b = arxiv_id_b.split("v") - if id_a == id_b: - return (Status.STRONG, OK.ARXIV_VERSION) - else: - return (Status.DIFFERENT, Miss.ARXIV_VERSION) - if a_authors and len(a_slug_authors & b_slug_authors) == 0: return (Status.DIFFERENT, Miss.CONTRIB_INTERSECTION_EMPTY) |