diff options
-rw-r--r-- | README.md | 12 | ||||
-rw-r--r-- | fuzzycat/verify.py | 17 |
2 files changed, 20 insertions, 9 deletions
@@ -136,7 +136,7 @@ Notes on cadd28a version clustering (nysiis) and verification. 93240 OK.TITLE_AUTHOR_MATCH ``` -Cases +#### Cases * common title, "Books by Our Readers", https://fatcat.wiki/release/4uv5jsy5vnhdvnxvzmucqlksvq, https://fatcat.wiki/release/4uv5jsy5vnhdvnxvzmucqlksvq * common title, "The Future of Imprisonment" @@ -145,6 +145,7 @@ Cases * common title, "ASMS News" (also different year) * common title, "AMERICAN INSTITUTE OF INSTRUCTION" * common title, "Contents lists" +* common title, "Submissions" * same, except DOI, but maybe the same item, after all? https://fatcat.wiki/release/kxgsbh66v5bwhobcaiuh4i7dwy, https://fatcat.wiki/release/thl7o44z3jgk3njdypixwrdbve Authors may be messy: @@ -153,7 +154,14 @@ Authors may be messy: https://fatcat.wiki/release/2kpa6ynwjzhtbbokqyxcl25gmm, https://fatcat.wiki/release/o4dh7w7nqvdknm4j336yrom4wy - may need to tokenize authors -Possible improvements: +A DOI prefix (10.1210, The Endocrine Society) may choose to include the same +document in different publications: + +* https://fatcat.wiki/release/52lwj4ip3nbdbgrgk4uwolbjt4 +* https://fatcat.wiki/release/6tbrmc3pq5axzf3yhqayq256a4 +* https://fatcat.wiki/release/457lzlw7czeo7aspcyttccvyrq + +#### Possible fixes * [ ] when title and authors match, check the year, and maybe the doi prefix; doi with the same prefix may not be duplicates * [x] detect arxiv versions directly diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 88b7b71..bd40e6a 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -298,6 +298,10 @@ def contains_chemical_formula(s): TITLE_FRAGMENT_BLACKLIST = set([ "air quality data from the life+respira project in pamplona", "irish studies, seminar", + "untersuchung einzelner abdominaler regionen und organe", + "student government minutes:", + "ieee membership application", + "nouvelles du corps médical", ]) # There titles appear too often, so ignore them for now. @@ -313,8 +317,6 @@ TITLE_BLACKLIST = set([ "11. 経皮的胃電気活動記録 (egg) の食事負荷による影響の検討(第 25 回 日本小児消化管機能研究会)", "1200 multiple choice questions in pharmacology", "1299 chemical shifts and coupling constants for c10h13cl2n2o2ps", - "13 untersuchung einzelner abdominaler regionen und organe (13.1 - 13.3)", - "13 untersuchung einzelner abdominaler regionen und organe (13.4 - 13.6)", "141st smpte technical conference and exhibition: marriott marquis hotel, new york city, november 19–22, 1999", "147 モルモット tdi 喘息モデルの研究 : ii. 下気道の組織学的変化の検討", "1536 chemical shifts and coupling constants for c10h24no2psi", @@ -339,14 +341,9 @@ TITLE_BLACKLIST = set([ "1. 歯科修復物の品質管理 (qc) とその指導(第 50 回九州歯科学会総会講演抄録)", "2002–2003 aamd board of directors", "2004s-os8-3 内航不定期輸送シミュレーション手法の研究 : 荷主・オペレータ間のe-ビジネスの評価(オーガナイズドセッション(os8):物流/輸送シミュレーション)", - "(2007 - 2008) student government minutes: 2007-10-02", - "(2009 - 2010) student government minutes: 2009-04-21", "2009 ssr awards", "2010-03-personalien", - "2010-44 nouvelles du corps médical", "2011 editorial collaborators", - "2014 ieee membership application", - "(2015-2016) student government minutes: 2016-02-02", "2017 membership report", "2018 thank-yous", "2188 chemical shifts and coupling constants for c12h19o4ps", @@ -521,6 +518,7 @@ TITLE_BLACKLIST = set([ "aluminium alloy al-p2014a-t4 or t42. sheet and strip 0,4 mm ≤ a ≤ 6 mm", "ama insights", "american board of dermatology examination dates", + "american institute of instruction", "american nurses association", "among the contributors", "among the publishers", @@ -602,6 +600,7 @@ TITLE_BLACKLIST = set([ "asme conference presenter attendance policy and archival proceedings", "asm meetings calendar", "asm news", + "asms news", "association affairs", "association directory", "association intelligence", @@ -731,6 +730,7 @@ TITLE_BLACKLIST = set([ "books and publications received", "books available list", "books brief", + "books by our readers", "bookseller's catalogue", "bookseller's catalogues", "booksellers' catalogues", @@ -1004,6 +1004,7 @@ TITLE_BLACKLIST = set([ "construction", "construction of new plant for sidney roofing and paper company in burnaby, b.c.", "contemporary echoes", + "content lists", "contents", "contents and chemical science", "contents,editorial board,forthcoming articles", @@ -1577,6 +1578,7 @@ TITLE_BLACKLIST = set([ "ieee sensors council information", "ieee signal processing society information", "ieee systems, man, and cybernetics society information", + "ieee transactions of wireless communications", "ieee transactions on antennas and propagation institutional listings", "ieee transactions on circuits and systems—ii: express briefs publication information", "ieee transactions on computer-aided design of integrated circuits and systems information for authors", @@ -3101,6 +3103,7 @@ TITLE_BLACKLIST = set([ "the evening world", "the express", "the future", + "the future of imprisonment", "the future of the medical profession", "the general medical council", "the good old days", |