diff options
-rw-r--r-- | fuzzycat/verify.py | 1327 |
1 files changed, 1293 insertions, 34 deletions
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 730a5ee..e1ff4d8 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -64,14 +64,54 @@ get_key_values = operator.itemgetter("k", "v") # There titles appear too often, so ignore them for now. TITLE_BLACKLIST = set([ "", - ":{unav)", - "[others]", - "[s.n.]", - "a correction", - "a personal note", + "100 years ago", + "10. schlußbemerkungen", + "(11899) weill", + "11-i-6 シロネズミ肝臓アミドホスフォリボシルトランスフェラーゼ活性に及ぼす核酸関連物質およびビタミンb_<12>添加の影響(研究発表)(日本ビタミン学会 : 第25回大会研究発表要旨)", + "11. quellen und literatur", + "165 脳spectにおける収集条件の画質への影響(第30回秋季学術大会 一般研究発表予稿集)", + "1760 chemical shifts and coupling constants for c11h17o3ps", + "181. 気管支喘息患児の抗食餌アレルゲン特異ige抗体の変動について(喘息-病態生理iii)", + "19. syntheses", + "1. general introduction", + "1-iii-9 20位水酸化ビタミンd_3誘導体の合成研究(一般演題要旨,日本ビタミン学会第64回大会講演要旨)", + "1. introduzione", + "1. vorbemerkungen", + "1. 歯科修復物の品質管理 (qc) とその指導(第 50 回九州歯科学会総会講演抄録)", + "2010-44 nouvelles du corps médical", + "2011 editorial collaborators", + "2018 thank-yous", + "2188 chemical shifts and coupling constants for c12h19o4ps", + "25. literaturhinweise", + "2871 chemical shifts and coupling constants for c14h14o3p2s3", + "29p-je-14 強磁性層と超伝導層の境界における称序パラメーターの振舞いii(低温)", + "2-ii-24 新規フッ素化ビタミンdアナログの合成 : 第47回大会研究発表要旨", + "2. konzeptionelle grundlagen", + "2 the ymca", + "340-911-1-pb__1_.pdf", + "3. argentina", + "3 biochemische und klinische analyse", + "3. biochemische und klinische analyse", + "3. dezember 1941", + "3. psychiatry", + "4. netherlands", + "4. optical properties", + "50 & 100 years ago", + "5098703 interferon-alpha 76", + "50 years ago", + "5 empirische untersuchung", + "5. soul loss", + "6. some applications", + "7. zusammenfassung der ergebnisse", + "8. fazit und ausblick", + "8. personalnachrichten", + "95% success", + "9. the future", "aacci corporate members", + "a. allgemeines", "abbildung", "abbildungsnachweis", + "abbotsford post", "abbreviations and acronyms", "about the cover", "about the editor", @@ -80,200 +120,1409 @@ TITLE_BLACKLIST = set([ "about this journal", "about this title", "abréviations", - "abstract withdrawn", + "abstracts", "abstracts of papers from other journals", "abstracts of papers to appear in future issues", - "abstracts", + "abstract withdrawn", "acknowledgement of reviewers", - "acknowledgement to reviewers", - "acknowledgements to reviewers", "acknowledgements", + "acknowledgements to reviewers", + "acknowledgements to reviewers:", + "acknowledgement to our reviewers", + "acknowledgement to reviewers", + "acknowledgement to reviewers 1984", "acknowledgment of reviewers", "acknowledgments", - "actualités professionnelles", + "acknowledgment to 2015-2016 reviewers", + "acknowledgment to reviewers", + "a correction", + "acp-9-2289-2009.pdf", + "acronyms", + "acronyms and abbreviations", + "acs news", + "activities and announcements", + "activities of the society", "actualités", + "actualités professionnelles", "addenda", + "a. einleitung", + "aeq volume 24 issue 3 cover and front matter", + "aerospace series. electrical contacts used in elements of connection", "agradecimento", "agradecimientos", + "agricultural inventions", + "agriculture", + "aims and scope", + "aims & scope/editorial board", + "air quality data from the life+respira project in pamplona, spain: no2 in 2015 at 00h cet (dataset)", + "air quality data from the life+respira project in pamplona, spain: no2 in 2015 at 16h cet (dataset)", + "air quality data from the life+respira project in pamplona, spain: o3 in 2015 (16h cet) (interpolated geotiff)", + "air quality data from the life+respira project in pamplona, spain: o3 in 2015 at 06h cet (dataset)", + "aktuell", + "al 2 br 2 cl 4", + "algeria", + "allergy.", "all pdfs of this category", + "also of interest", + "amphotericin b", + "anemia", "an epitome of current medical literature", + "angina pectoris", "an invitation to membership", + "annex", "announcement", "announcements", + "announcements and calendar", + "announcements of future meetings", + "annual general meeting", "annual meeting", "annual report", + "annual report: 1988–1989", + "a note on the texts", + "answers", + "answers to cme examination", + "anthropology", + "anxiety disorders", + "anzeige.", + "a personal note", "appendix c", "appendix d", "appendix d.", + "appendix k:", + "appendix: tables", + "applied science laboratories inc.", "archaeology", + "archeologie", + "archives", "around the world", "arthrobacter sp.", + "article abstract", + "arts and decoration", + "aseptic midwifery", + "asme conference presenter attendance policy and archival proceedings", + "association directory", + "association intelligence", "association notes", + "association notice", + "association suisse pour le suffrage féminin", + "at the literary table", "aufgaben", "ausgewählte literatur", + "author biographies", + "author biography", "author index", + "author index subject index", "author response image 1. author response", - "back matter", + "autorenverzeichnis", + "autorinnenverzeichnis", + "autriche", + "avertissement", + "awards alert", "background", + "[back inside cover]", + "back matter", "backmatter", + "bangladesh", + "bangladesh: 1972–2012", + "bartholomew's hospital.", + "basic concepts", + "basic research", + "beckman instruments, inc.", + "beckman® instruments, inc", + "beckman® instruments, inc.", + "bedside teaching 高齢者における心疾患・2-高齢者の急性心筋梗塞", + "beitrÄge", + "belgium", "bericht", + "berichte", "beyond the flyleaf", + "bfh v. 21.7.1960 (ii)", + "bibliographical note", + "bibliographical notes", + "bibliographical notices", + "bibliographie sélective", "bibliography", + "bildnachweis", + "bildnachweise", + "biographical notes", + "biosketches", + "[blank page - back cover]", + "blank page [back cover]", + "bma affairs", + "board of editor", + "board of editors", + "boekbespreking", + "boekbesprekingen", + "book marks", "book received", "book review", + "book reviewers", "book reviews", - "books received", + "book reviews comptes rendus", + "book review section", + "book reviews/revue de livres", + "books and publications received", "bookseller's catalogue", + "bookseller's catalogues", + "booksellers' catalogues", + "book shelf", + "bookshelf", + "books in brief", + "books of interest", + "books received", + "books received but not reviewed", + "books reviewed", + "boston medical library", + "botany", + "botulinum toxin a", "boundary creek times", + "braham, john (1774?–1856)", + "brazil", + "brief an theodor zwinger iii", + "brief communications", "briefer notices", + "briefer notices.", + "briefly noted", + "brief notices", + "brief review", + "brief reviews", "briefs", + "british columbia record", + "british medical association", + "british veterinary association", + "buchanzeigen.", + "buchbesprechungen – book reviews – livres nouveaux", + "bücherbesprechungen", "bulletin board", + "bulletins & highlights", "bureau of investigation", + "bureau of legal medicine and legislation", + "bürgerliches recht", + "business and personal wants", + "c 15 h 20 n 1 o 3 p 1", + "c21h26n2o5 - structure no. 1098", + "c 22 h 18 f 6 o 5 sn 1", + "c 22 h 18 f 6 o 5 zn 1", + "c2 - editorial board", + "c2: editorial board", + "c2h6o and c4h8", + "c3h8o and c8h18", + "c4h8o and c7h8", + "c5h12o and c8h18o3", + "c5h8o2 and c6h12", + "c6h12o2 and c7h16", "calendar", + "calendar of courses, symposiums, and conferences", + "calendar of forthcoming events", + "calendar of forthcoming meetings", + "calendar of future meetings", + "call for submissions", "canto", - "canto", + "captopril", + "career opportunities", + "carta do editor", + "cary instruments", + "case of the month", + "ceramic abstracts", + "ceramic abstracts1", + "chapitre ii", + "chapitre iii", + "[chapitre] v", + "chapitre v", + "chapitre vi", + "chapitre xxxi", + "chapter 7. conclusion", + "chapter 9. conclusion", + "chapter iv", + "chapter news", + "chapter xl", + "chapter xxix", + "character list", + "checklist", + "chemicals", + "chemical shifts and coupling constants for c23h21o6p", + "chemical shifts and coupling constants for c27h27o5p", + "chemical shifts and coupling constants of c5h3cl6nos", + "chemical shifts and coupling constants of c8h6cl3no", + "chemistry of vegetable physiology and agriculture", + "chromatography calendar", + "chronic pyelonephritis", + "ciclosporin", + "cimetidine*", + "circulation: clinical summaries: original research put into perspective for the practicing clinician", + "classic pages", + "classified advertisements", + "classroom notes", + "clinical connections", + "clinical implications", + "clinical memoranda", + "clinical news", + "clinical summaries", + "cme accreditation page", + "cme calendar 1998", + "cme examination", + "co2: editorial board", + "coffee break アルコール常飲者のビタミン欠乏", + "coffee break 潰瘍の再発をめぐって(その4)-観察成績の多様さについて", + "collected essays", + "colloquium", + "coming events manifestations futures", + "coming up", + "committee list", "communications to the editor", "company news", + "comulative author index to volume 289", + "concluding reflections", + "concluding thoughts", + "conclusio", "conclusion", + "conclusion générale", "conclusions", + "conclusions and outlook", + "conclusions and prospects", + "conference", + "conference 1951", + "conference announcement", + "conference announcements", + "conference author index", + "conference program", "conference report", "conferences", + "congratulations!", + "congratulations", + "congress calendar", + "consequences", + "construction", "contents", + "contents and chemical science", + "contents of previous volumes", + "contents to volume 2", + "contents to volume 35 (1980)", + "contents to volume 53 (1982)", + "contents to volume 71", "continuing education", + "continuing medical education", + "continuing professional education questionnaire", + "contributing author", "contributors", + "contributors to this volume", + "cooperation 1893–94", "copyright", + "corporation and institutional members", "correction", + "corrections and clarifications", "correspondence", "corrigendum", + "costa rica", + "council on pharmacy and chemistry", + "courses", + "courses and conferences", "cover", + "cover and front matter", + "cover caption", + "cover, original table of contents, and editorial board", + "cover picture", + "cranbrook herald", + "critical notices", + "cumulative author index", + "cumulative author index to volumes 271, 272", + "cumulative author index to volumes 291–304", + "cumulative index to authors", + "cumulative subject index*1", + "current educational literature in the periodicals", + "current events", + "current literature survey", + "current papers in oral biology", + "current publications received", + "current research", + "current topics", + "curriculum vitae", + "daily british columbian", "daily building record", + "dapsone", + "dasatinib", "data_sheet_1.zip", + "de-ci, de-là...", + "decorative notes", "dedication", + "definition of drunkenness", + "definitions", + "demonstration", + "descriptive statistics", + "development", + "diagnostik", + "diamagnetic bulk susceptibility data of c6h14s2", "diary of events", + "diary of forthcoming events", + "diary of meetings", + "die autoren", + "die gdch-zeitschriften bringen", + "directory: aao officers and organizations", + "directory of otolaryngologic societies", + "disclaimer", "discussion", - "editorial board and publication information", + "discussions of a.i.e.e. papers — as recommended for publication by technical committees", + "diseases of infancy and childhood", + "diskussion der ergebnisse", + "dissertationes", + "dokument 119-133", + "dokument 18-19", + "druckfehler", + "drug watch", + "eastern questionnaire, blue book for interviewee 49275a, answers for pages 055-079", + "e-commerce", + "economic review", + "ed board", + "ed. board", + "editorial", + "editorial announcement", "editorial board", + "editorial board/ aims & scope", + "editorial board/aims & scope", + "editorial board, aims & scope, table of contents", + "editorial board and publication information", + "editorial board/copyright information", + "editorial board - english", + "editorial board members", + "editorial board / redaksieraad", + "editorial board/reviewing committee", "editorial committee", - "editorial", + "editorial foreword", + "editor's choice", + "editor's commentary", "editors/ editorial board", + "editor's foreword", + "editor's message", + "editor's report", + "editors' report", + "educational forum", "educational intelligence", + "educational intelligence.", + "educational news", "educators personally", + "egypt: 1948–2012", + "eigenvalues and eigenvectors", "eingesandte schriften", "einleitung", + "einzelbesprechungen", + "emanations", + "empirische untersuchung", + "employment ads information", + "employment information", + "endocrine-related resources from the national institutes of health", + "energy", + "engineering notes", + "england and wales", + "english abstracts", + "environment", + "envoi", + "eov editorial board", + "epidemic keratoconjunctivitis", + "epidemiology section", + "epÍlogo", + "epitome of current medical literature", + "epoprostenol*", + "ergebnisse", "erratum", + "escitalopram", + "essay reviews", + "estonia", + "european perspectives", + "evaluation", "events calendar", + "examples", + "exhibition", + "explanitory notes", + "external reviewers", + "fe de errata", + "figure 3—source data 1.", + "filmographie", + "filmography", + "financial statement", + "finlande", + "firmenschriften", + "food allergy", + "foot-and-mouth disease.", + "foreign and insular statistical reports of countries and cities: untabulated", "foreword", + "forthcoming conferences", + "forthcoming features", + "for the record", + "foundations", + "fragment unbekannten inhalts", + "free colour illustrations in the online version of articles", + "from foreign journals", + "from the blogosphere", + "from the iee archives", + "from the lecture commitee", + "from the president", "front & back matter", "front cover", "front matter", "frontmatter", + "frontmatter and index", + "full issue", + "full title", + "full title (editorial board)", + "functions", "fundraising", + "fürs laboratorium", + "future contributions to journal of statistical physics", + "future meetings", + "f.y.eye", + "gastro‐enterology", + "gaussian job archive for c13h26o4", + "gaussian job archive for c14h16o", + "gaussian job archive for c15h14o", + "gaussian job archive for c20h20cl2o6", + "gaussian job archive for c5h12o2", + "gaussian job archive for c5h13no4", "gbif occurrence download", + "gdch-bewerberliste", "geleitwort", + "gem-a notices", + "general", + "general conclusion", "general information", + "general introduction", "general medical council", - "general", + "general medical council.", + "general notes", + "general policy and analysis", + "genève.", "geographical notes", + "geography of the day", + "gerichtsverfassung und verfahren.", + "germany", + "gesellschaftsrecht", + "getting started", + "gilford instrument laboratories inc.", + "glasgow pathological and clinical society", + "gliederung", + "globalization", + "glossary of terms", + "glycine max (l.) merr. mutante, mut 116 a/2", + "golden era", + "government services", + "graphical abstract (angew. chem. int. ed. engl. 13/1994)", + "graphical contents list & author index", + "griseofulvin", + "guest editor", + "guest editors", + "guide to further reading", + "guy's hospital.", + "heart disease and pregnancy", + "helicobacter pylori", + "highlights from the literature", + "hinweise für autoren", + "histamine", + "histoire de lire", + "historical background", + "history", + "hospital and dispensary management", + "hospital and institutional news", + "hospital reports", + "hungary", + "hygiene", + "hyg volume 114 issue 2 cover and front matter", + "hyg volume 50 issue 3 cover and front matter", + "hyg volume 61 issue 1 cover and back matter", + "ibc: guide for authors", + "ieee computational intelligence society information", + "ieee electron device letters information for authors", + "ieee foundation [advertisement]", + "ieee geoscience and remote sensing letters publication information", + "ieee signal processing society information", + "ieee systems, man, and cybernetics society information", + "ieee transactions on computer-aided design of integrated circuits and systems information for authors", + "ieee transactions on electron devices information for authors", + "ieee transactions on magnetics institutional listings", + "i einleitung", "i. einleitung", - "in this issue", + "ifc (ed board)", + "ifc (ed. board)", + "ifc - ed board", + "ifc-ed. board", + "ifc - publication information", + "ifc ‐ publication information", + "ii. abteilung", + "i. introduction", + "illustration credits", + "image_1.jpg", + "image_2.png", + "image credits", + "image of the month answer", + "in case you haven't heard", + "in case you haven't heard…", + "index", "index des auteurs", + "index des noms", "index des noms de personnes", - "index", + "indexes", + "index locorum", + "index of ancient sources", + "index of first lines", + "index of modern authors", + "index of names and places", + "index to authors", + "index to current literature", + "index to subjects", + "index to surgical progress", + "index—volume 12", + "index volume 21 1998", + "index, volume 26, 2007", + "index – volume 55", + "index volume 7, 2004", + "india and the colonies", + "india: calcutta. cholera and plague", + "industrial communication networks. fieldbus specifications", "industrial literature", - "industry news", + "industrial news", "industry", + "industry & business", + "industry/business", + "industry news", + "industry update", + "infectious disease", + "infectious diseases", + "information exchange", + "information for contributors", + "in future issues", "inhalt", + "inhalt / contents", + "inhalt · contents", "inhalt-impressum", "inhalt.impressum", + "in kürze", + "innovations", + "in other journals", + "in process", + "in process citation", + "inra:grapevine:0bdxx30", + "inra-onf:populus nigra:6-a07", + "inra:quercus robur x quercus petraea:h549", + "inra:quercus robur x quercus petraea:h615", + "inra:quercus robur x quercus petraea:i021", + "inra:quercus robur x quercus petraea:i052a", + "inra:quercus robur x quercus petraea:i078a", + "inra:quercus robur x quercus petraea:i217", + "in response", + "in science fields", + "inside aaas", + "inside front cover (c2), editorial board", + "inside front cover - scope & editors", + "inside this issue", + "instruction to authors", + "instrumentation", + "instrumentation news", + "integration 1", + "intelligenzblatt nro. iv", "interlude", + "interlude 3", + "international", + "international conference calendar 1995", + "international meetings", + "international news", + "interview", + "in the news", + "in this issue", "introduction", + "introduction and background", + "introduction générale", + "introduction to part i", + "introduction to part iii", + "introduzione", + "introduzione 7", + "israel", + "issid pages", + "issue contents", "issue highlights", "issue information", + "issue information - cover", + "issue information - editorial board", + "issue information - table of content", + "issue information - table of contents", + "issues and events", + "items", + "it says in the jer …", + "jahresinhaltsverzeichnis 2008", + "jahresregister 2011", + "jama 75 years ago", + "jas volume 53 issue 2 cover and back matter", + "jas volume 59 issue 1 cover and back matter", + "jaz volume 20 issue 1 cover and back matter", + "jim report 電子カルテ構築記・4-パソコンの診療への応用", + "journal cme questions", + "journal of the society of motion picture engineers — table of contents", + "journals", + "journals.", + "journals and new books", "journal scan", + "journals received", + "j. t. baker chemical co.", + "judicial decisions", + "justice", + "kapitel 11", + "kapitel i. einleitung", + "kay figure 24 from: heterick b, castalanelli m, shattuck s (2017) revision of the ant genus melophorus (hymenoptera, formicidae). zookeys 700: 1-420. https://doi.org/10.3897/zookeys.700.11784", + "kazakhstan", + "ker volume 5 issue 1 cover and back matter", + "keywords", + "kleine mitteilungen", + "knowledge", + "kommentar", + "kongressankündigungen", + "kongresse - symposien - seminare - messen", "kongresskalender" + "kongresskalender", + "kongreßkalender", + "kongresskalender 2016", + "kurzbesprechungen", + "land use : u.e.l.", + "laparoscopic cholecystectomy", + "leads from the mmwr", + "learning", + "legacies", + "legal issues", "les auteurs", - "letter to the editor", + "letter: n.d.", + "letters, notes, and answers", + "[letters to editor]", "letters to the editor", + "letter to the editor", + "letter vii.", + "letter xxxii", + "library notes", + "libri novi", + "libri nuovi", + "libri ricevuti", + "liebe leserinnen, liebe leser,", + "liebe leserinnen und leser", + "liste des tableaux", + "listings", + "list of authors", + "list of contents", "list of delegates", "list of figures and tables", + "list of forthcoming papers", + "list of journals scanned", + "list of members", + "list of members, 1920", + "list of plates", + "list of referees", + "list of symbols", + "list of tables and figures", + "literary notes", + "literature review", + "literaturhinweise", + "literaturübersicht", + "literatur — Übersicht", + "literatur und quellen", + "litteraturübersicht", + "liverpool", + "liverpool.", + "liverpool medical institution.", + "livres reçus / books received", + "looking back.", + "los autores", + "lösungen", + "low back pain", + "lsy volume 11 issue 2 cover and front matter", + "lu pour vous", + "luxembourg", + "malaysia", + "manager's notices.", + "manuscripts received", "map projections", + "marketplace", + "massachusetts department of public health", + "master audio part 3", "masthead", + "mathematica", + "mauritania", + "measurement", + "media reviews", + "mediation", + "medical and surgical appliances", "medical annotations", "medical annotations.", - "medical diary of the week.", + "medical diary", "medical diary." "medical diary.", + "medical diary for the ensuing weck.", + "medical diary for the ensuing week.", + "medical diary of the week.", + "medical education", + "medical highlights", + "medical motion pictures", + "medical notes", + "medicine 1977.", + "medicine and the law", + "medicine and the law.", + "medicine and the media", + "medico-legal and medico-ethical", + "medico-meteorological observations", + "medico-parliamentary", + "meetings and conference", + "meetings and conferences", + "meet our new contributor", + "membership", + "membership application", + "membership notes", + "memoirs", + "mental health", + "message from general chair", + "message from the editor", + "message from the president", + "meteorological record", + "method", + "methods of test for petroleum and its products", "methotrexate", "mexico", + "microscopy", + "minerva", + "minimum wage", "miscellany", + "mitteilungen des bde", + "mitteilungen des bdi", + "mixtura mirabilis", + "mobius aromaticity and delocalization", + "motivation", + "moving forward", "moyie leader", + "ms18-1 多種類抗体対応・高感度マルチアレルゲン蛋白チップ開発(ms18 アレルゲン,ミニシンポジウム,第61回日本アレルギー学会秋季学術大会)", + "ms9-7 喘息に伴う慢性副鼻腔炎に対する内視鏡下副鼻腔手術の術後評価(ms9 副鼻腔炎とアスピリン喘息,ミニシンポジウム,第63回日本アレルギー学会秋季学術大会)", + "musa aaaa itc1283", + "musa aa itc0411", + "mutation analysis (mutsig 2cv v3.1)", + "mutation analysis (mutsigcv v0.9)", + "myasthenia gravis.", + "mycophenolate mofetil", + "nachtrag", + "nachträge", "nachwort", + "namenregister", + "namensregister", + "namensverzeichnis", + "namen- und sachregister", + "national health service", + "naval and military medical services", + "negotiation", + "nelson daily miner.", + "nelson tribune", + "netlines", + "network", + "neue ger�te und chemikalien", + "neues aus forschung und industrie", + "neue werke", + "new and nonofficial remedies", + "new and recent ieee publications", + "new books and publications", + "new books, etc", "new books, etc.", + "new books received", + "new editions", + "new equipment", + "new inventions", + "new inventions.", + "new literature", "new members", - "news section", + "new miscellaneous inventions", + "new product developments", "news", + "news, comments, and service announcements", + "news focus", + "news from our chapters", + "news from the field", + "news letter", + "newsletter", + "newsletter2001 (4)", + "news of science", + "news section", + "news & update", + "news & views", + "new york.", + "nichtmetallische anorganische werkstoffe", "nivolumab", + "nomenclature", "norway", "not available", + "note from the editors", "note of appreciation / note de reconnaissance", - "notes for contributors", + "note on transliteration", "notes", + "notes and abstracts", + "notes, comments and abstracts", + "notes, comments, and abstracts", + "notes, comments, and abstracts.", + "notes de lecture", + "notes for authors", + "notes for contributors", + "notes from the field", + "notes, short comments & answers to correspondents.", + "notes, short comments, & answers to correspondents.", + "notes, shortcomments, & answers to correspondents.", + "notes to contributors", + "notice about photo copying", + "notice about photocopying", + "notices biographiques", + "notices of books.", + "notices of meetings", + "notices of new books", + "notices of recent publications", + "notices to correspondence", + "notice to authors", "notice to contributors", + "nouvelles du corps médical", "nova et vetera", + "nr. 121 (23 mai 1885)", + "nr. 12 (dezember)", + "nr. 3 (1. maerz 1882)", + "nro 4 (1. april 1870)", + "nuclear-chicago", + "nuclear-chicago corporation", + "nuclear magnetic resonance data of c10h14o2s", + "nuclear magnetic resonance data of c10h28b10 2−", + "nuclear magnetic resonance data of c13h18os", + "nuclear magnetic resonance data of c14h12clno", + "nuclear magnetic resonance data of c14h38b11o−", + "nuclear magnetic resonance data of c16h13cln2", + "nuclear magnetic resonance data of c17h20o5", + "nuclear magnetic resonance data of c17h24os", + "nuclear magnetic resonance data of c18h18o2", + "nuclear magnetic resonance data of c22h18cln3", + "nuclear magnetic resonance data of c4h19b12o 2 −", + "nuclear magnetic resonance data of c5h10o", + "nuclear magnetic resonance data of c8h12o", + "nuclear magnetic resonance data of c9h18s", + "o02-01 局所進行肺癌手術症例の検討 : t3,t4症例を中心に(肺癌2,第25回日本呼吸器外科学会総会)", + "obesity", "obituary notices", + "obstetrical society of philadelphia", + "obstetrics.", "occurrence download", + "october 15, 1870", + "oeffentliche gesundheitspflege", + "oeffentliches sanitätswesen", + "official photograph taken on the british western front", "official photograph taken on the british western front in france", + "official publications", + "oils and fats", + "on the cover", + "on the cover.", + "ontvangen boeken", + "open access", + "opening address", + "ophthalmology", + "optimization", + "oral presentations", + "ordinary meeting", + "organization section", + "orthopedic surgery", + "ortsregister", + "ortsregister - 725", + "other countries", + "other publications", + "other publications received", + "[others]", "oup accepted manuscript", + "our book shelf", + "our bookshelf", "outside front cover", + "overzicht van buitenlandse tijdschriften", + "p22-4 高度な気管狭窄を認めた成人t細胞白血病リンパ腫の一例(リンパ腫,ポスター22,第34回日本呼吸器内視鏡学会学術集会)", + "pÆdiatrics", + "pakistan", + "panel and contract practice", + "panel discussion", + "panorama dermatologische praxis", + "paraneoplastic syndromes.", + "parliament", "parliamentary intelligence", + "parliamentary intelligence.", + "participants", + "part ii. reviews and bibliographical notices", + "patent claims", + "patent ductus arteriosus", + "patent list", "patent report", + "patent reports", + "patent review", + "patent reviews: 3,947,630; 3,974,383; 3,976,380; 3,976,882; 3,977,773; 3,978,281; 3,978,334; 3,980,403; 3,983,529; 3,983,573; 3,984,171; 3,985,419; 3,989,348; 3,989,355; 3,989,946; 3,990,771; 3,990,773", + "patent selections", + "patent selections:", + "pediatrics", + "pembrolizumab", + "peninsula times", "people and events", "people and places", + "periscope", + "personalien", + "personal-nachrichten", + "personalnachrichten", + "personal notes", + "personalnotizen", + "personal- und hochschulnachrichten", + "personenverzeichnis", + "perspective", + "perspectives", "petitions.xlsx", + "pharmanews", + "phaseolus vulgaris eca027", + "phaseolus vulgaris eca043", + "phaseolus vulgaris eca141", + "phaseolus vulgaris eca210", + "phaseolus vulgaris ece109", + "phaseolus vulgaris l. g8199k", + "phenomenology", + "philippines", + "philosophy", + "photovoltaic devices", + "physiological chemistry", + "physiology", + "pisum sp., 4301 /49", + "poetry", "positions available", + "positions wanted", + "position wanted", + "poster abstracts", + "poster presentations", + "poster session", + "poster session 1", + "poster session 30", + "postskriptum 3", + "potpourri", + "praefatio", "preface", + "preface and acknowledgements", + "prelim ii: editorial board", + "prelim(iii) editorial board", + "preliminaries", "preliminary material", + "preparation of papers", "preparations and appliances", + "presentation_1.pptx", "preservation image", + "president's page", + "[president's report]", + "prince rupert journal", + "principles of interior renovation", + "problems for solution", "proceedings of societies", + "proceedings of the society, 1891", + "proceedings of the society, 1899–1900", + "proceedings of the society, 1925–1926", + "proceedings of the society, 1931–1932.", + "product finder", "production", + "product review", + "products & materials", + "professional appointments", + "professional directory", + "professional notes", + "profile", + "progress of medical science", + "prÓlogo", + "prospector", + "proven, practical guidance from the planned giving experts", + "provincial medical & surgical journal", + "publication anouncement", + "publications received / ouvrages reçus / eingegangene schriften", + "publisher's announcement", + "queensland", + "queensland.", + "quellenverzeichnis", + "questions and comments", + "quiz of the month", + "quotations", + "radio unnameable", + "readers' comment", + "readers forum", + "readers' forum", + "recent actions regarding treaties to which the united states is a party", + "recent american and foreign patents", "recent books", + "recent cases", + "recently published papers", + "recent ornithological publications", + "reception", + "recordings", "recto", + "referee awards", + "referee comments", + "referees 2011", + "referees 2013", "references", + "références bibliographiques 215", + "reflections", "regulations", + "related articles", + "related publication, children and their families in the big cities, seminar 641, 1993-1994", + "religion", "reply", + "reply by the authors", + "réponses et questions", + "reported mortality", + "reporting on adverse clinical events", + "report of the annual meeting", + "reports and analyses and descriptions of new inventions, in medicine, surgery, dietetics, and the allied sciences", + "reports and publications", + "reports from national quarantine and inspection stations", + "reports of medical and surgical practice in the hospitals of great britain", + "reports of medical societies", + "reports of meetings", + "research", + "research brief", + "research briefs", + "research highlights", + "research in progress", "research items", + "research report", + "research reports", + "research roundup", + "resolution 2322 (2016)", + "resource center", + "resources", + "results", + "resümee", + "resumenes", + "retraction", + "revelation", + "review article", + "reviewer acknowledgement", + "reviewer acknowledgement 2011", + "reviewer acknowledgements", + "reviewer acknowledgements 2008", + "reviewer acknowledgment", + "reviewers list", + "review essays", + "review of current literature", + "review of current literature.", + "review of reviews", + "reviews", "reviews and notices", + "reviews in brief", "reviews of books", - "reviews", + "review symposium", + "revolution in retailing", + "rezensionen / reviews", + "rezensionen/reviews", + "richard ii", + "rio de janeiro", + "risk management", + "rivaroxaban", + "roster page", + "royal college of physicians", + "royal college of surgeons of england", + "royal society", + "sammlung göschen / bandnummernfolge", + "sarcoidosis", + "sardinia_2019", + "scanning sports", + "scanning the issue", + "scanning the issues", + "scheduled meetings", "schlussbemerkung", + "schlussbemerkungen", + "schlussfolgerungen", + "science", + "science and technology", + "science news", + "science news: 44", + "science notes", + "science/technology concentrate", + "science & technology concentrates", + "science/technology concentrates", + "scientific serials", + "scientific surgery", + "scurvy", + "secretary's report", + "section news", + "section news / nouvelles des sections", + "section news/nouvelles des sections", + "sections 1–82", + "selected titles", + "self-assessment questions", + "sem título", + "september 1847", + "september 1914", + "september 1933", + "series foreword", + "setting the scene", + "setting the stage", + "sheet 1 of 2", + "sheet 2 of 4", + "short notes", "short notices", + "short reviews", + "short term courses for graduate physical therapists", + "short-term courses for graduate physical therapists", + "similkameen star", + "sin título", + "sirolimus", + "[s.n.]", "society news", + "society news.", + "society related material", + "software survey section", + "soil action", + "solothurn", + "solutions", + "solutions to ✰-exercises", + "some applications", + "some papers to be published in future issues", + "some recent books", + "sources", + "spain.", "special announcement", + "special announcements", + "special notice", + "special report", + "special reports", + "specifications for essential oils", + "spectrum", + "spotlight", + "spotlights on recent jacs publications", + "spring 1940", + "stability", + "[staff list]", + "staff list", + "[staff listing]", + "stained glass orders 1907 ‐ 1926: page 119", + "stained glass orders 1907 ‐ 1926: page 128", + "stained glass orders 1907 ‐ 1926: page 174", + "stained glass orders 1907 ‐ 1926: page 183", + "standards and recommended practices", + "state board examinations—1961", + "state member board briefs", "st. bartholomew's hospital", + "st. bartholomew's hospital.", + "st. mary's hospital", + "st. mary's hospital.", + "stock watch", + "strafrecht und verfahren", "streptomyces sp.", + "stroke: highlights of selected articles", + "style", "subject index", + "subscribers page", + "subscription information", "subscription page", + "subscriptions page", + "summaries of articles", + "supplemental images", + "supplemental information 1: raw data", + "supplemental information 1: raw data.", + "supplementary file", + "supplementary file 1.", + "supplementary material", + "survey of india topo sheet 46c13 1951 1st edition", + "survey of india topo sheet 48m10 1977 1st edition", + "survey of india topo sheet 48m9 1975 1st edition", + "survey of india topo sheet 54m1 1976 1st edition", + "survey of india topo sheet 55o4 1977 1st edition", + "survey of india topo sheet 63a2 1977 1st edition", + "survey of india topo sheet 73a9 1983 1st edition", + "symposium24-1", + "synformissue 2012/02", + "tabellen", + "tabellenverzeichnis", + "table_1.doc", + "table_1.xls", + "table_6.xls", + "table of cases", "table of contents", + "table of contents, barcode", + "table of contents volume - 3", + "tacrolimus", + "tafeln", + "talking points from books", "taxonomic abstract for the species.", + "td1-p01379-p01379a.mp3", + "technical program", + "technical program committee", + "temperature of water", + "thailand", + "thanks to 2012 reviewers", + "thanks to our reviewers", + "thanks to reviewers", + "thanks to reviewers 2008", "thank you", + "thank yous", + "thank you to our reviewers", + "thank you to our reviewers 2016", + "thank you to reviewers, 2019", + "the american association for thoracic surgery", "the applause data release 2", + "théâtre", + "the british medical journal", + "the clinical chemist", + "the continental news", + "the end of indexes", + "the essenes", + "the evening sun", + "the express", + "the future", + "the general medical council", + "the grand forks sun and kettle valley orchardist", + "the heat equation", + "the last word", + "the ledge", + "the legacy and appearance of hospital buildings", + "the mail herald", + "the nelson tribune", "the new westminster news", + "the paystreak", + "the post‐graduate committee in medicine in the university of sydney", + "the postgraduate committee in medicine in the university of sydney", + "the president 1978-1979", + "the propaganda for reform", + "the prospector", + "the public service", + "the public service.", + "therapeutic index to advertisers", + "therapeutics", + "therapeutics.", + "the silvertonian", + "the state, seminar 401, 1952-1953", + "the twentieth century", + "the war", + "the war.", + "the week", + "the weekly news", + "this month in the journal", + "this week in business", + "this week in science", + "this week's issue", "titelseiten", + "title page / contents / foreword", + "title page / contents / preface", + "title page & editorial board", + "title page / editorial board", "title page/editorial board", + "title page i - volume 3", + "to-day's drugs", + "todes-anzeige", + "to john keble", + "to john keble (ii)", + "to new subscribers", + "to our readers", + "topics of the information community", "transactions", + "transactions and communications", + "transactions of branches", "transcript", + "transcriptions", + "transitions", + "translation", + "translators' preface", + "transport", + "trauma", + "trend of the times", + "trends and tangents", + "tweets of the week", + "Übersicht", + "ulcerative colitis", + ":{unav)", + "uncertain (bowl)", "unidentified", "united kingdom", + "units", + "universal decimal classification. english full edition", + "university and educational intelligence", + "university of london : appointments", + "university of london: appointments", + "untersuchungsmethoden", + "upcoming events", + "upcoming events 36.2", + "upcoming meetings related to alzheimer's disease", + "u.s.i. chemical news", + "vapor-liquid equilibrium of the mixture c3h6o3 + c3h8o2 (lb4812, evlm 1231)", + "vapor-liquid equilibrium of the mixture ch4o + c6h14o (lb4908, evlm 1231)", + "varia", + "venezuela", "veranstaltungen", "veranstaltungskalender", "verein deutscher chemiker", + "vereinsangelegenheiten", + "verwaltungsentscheidungen", + "verzeichnis der abbildungen", + "verzeichnis der abkürzungen", + "video_1.mp4", + "video_3.mp4", + "video views", + "vient de paraître", + "volkshochschule arnsberg: frühjahrsprogramm 2010", + "volkshochschule der stadt homburg-saar: jahresprogramm 2012/13", + "volkshochschule der stadt schweinfurt: herbstprogramm 2007", + "volkshochschule der stadt trier: programm 3. trimester 1949", + "volkshochschule kreis und stadt hersfeld: programm 2. semester 1963/1964", + "volkshochschule schongau: herbstprogramm 2008", + "volkshochschule schwelm: programm 1. semester 1971", + "volume 10 1986–1987 index", + "volume 13 index", + "volume 3 1978 index", + "volume author index", + "volume index", + "vorbemerkungen", "vorrede", + "vorwort der herausgeber", + "vorwort zur ersten auflage", + "vorwort zur zweiten auflage", + "wanted", + "warfarin", + "washington", + "washington news", + "web alert", + "weekly mortality table, cities of the united states", + "weekly mortality table, foreign and insular cities", + "welcome message", + "westminster hospital", + "westminster hospital,", + "what's on the web", + "what's your diagnosis?", + "what they say", + "where are we going?", + "winterthur, slm 2", + "wirtschaftlicher teil u. vereinsnachrichten", + "wissenschaftliche rundschau", + "with the technicians", + "woman's auxiliary", + "works cited", + "world in brief", + "world news", + "world wide chemistry", + "xxv. auszüge", + "xxvi. auszüge", + "zeitschriftenschau", + "zeittafel", + "zur besprechung eingelaufen.", + "Унежева З.С., Султанова А.М. Женский национальный пояс конца xviii- первой половине xix в.в.", + "آينه افغان ايد : نشريه داخلي سه ماهه مؤسسه افغان ايد = afghanaid mirror", + "ワークショップ(1〜7) 7 月 2 日・3 日 a・b・c 会場", + "印刷雑誌 = japan printer (総目次 36(1)-36(12))", + "基于系统发育分析的dna条形码技术在澄清芍药属牡丹组物种问题中的应用", + "大鼠骨骼肌肌质网非序列依赖性dna结合蛋白及其功能的初步研究*", "奥付", + "工業化と労働 : 1966年ilo第50回総会事務局長報告", "投稿規定", + "特集 common disease インストラクションマニュアル-患者に何をどう説明するか 呼吸器疾患 アスベスト関連疾患", + "特集 common disease インストラクションマニュアル-患者に何をどう説明するか 物理・化学的因子による疾患 虫刺症", + "特集 帰してはいけない「こども」を見逃さないために 今月のquestion & keyword index", + "特集 最近のトピックス2007 clinical dermatology 2007 5. 皮膚科医のための臨床トピックス 学校保健における皮膚科医の活動", + "特集 第41回日本臨床眼科学会講演集 (6) 学術展示 vdt作業に伴う涙液量と瞬目数の経時的変化について", + "特集 脳のシステム障害と理学療法 eoi(essences of the issue)", + "特集 退院支援-理学療法士はその先が見えているか eoi(essences of the issue)", + "特集 顔の総合診療 顔をみればわかること 今月のquestion & keyword index", + "猪瀬優理著, 『信仰はどのように継承されるか-創価学会にみる次世代育成-』, 北海道大学出版会, 2011年10月刊, a5判, v+296頁, 3,990円(書評とリプライ)", "目次", + "第22回日本小児外科学会秋季シンポジウム : 膵・胆管合流異常-小児例でのconsensusを目指して-(プログラム)", + "経セミ : 経済セミナー : the keizai seminar (429)", + "英文誌journal of nutritional science and vitaminology vol.46, no.1掲載論文要旨", "表紙", "裏表紙", + "診断と治療 = diagnosis and treatment 臨時增刋 第四編", + "週刊ダイヤモンド = diamond weekly 別冊", ]) CONTAINER_NAME_BLACKLIST = set([ @@ -326,6 +1575,7 @@ class Miss(str, Enum): CHEM_FORMULA = 'miss.chem_formula' SUBTITLE = 'miss.subtitle' + class GroupVerifier: """ Verifier. @@ -359,7 +1609,8 @@ class GroupVerifier: continue for a, b in itertools.combinations(vs, r=2): for re in (a, b): - if re.get("extra", {}).get("container_name", "").lower().strip() in CONTAINER_NAME_BLACKLIST: + if re.get("extra", {}).get("container_name", + "").lower().strip() in CONTAINER_NAME_BLACKLIST: self.counter["skip.container_name_blacklist"] += 1 continue if re.get("publisher", "").lower().strip() in PUBLISHER_BLACKLIST: @@ -372,6 +1623,12 @@ class GroupVerifier: self.counter["total"] = sum(v for _, v in self.counter.items()) print(json.dumps(dict(self.counter)), file=sys.stderr) + with open("xxxx-todo", "w") as f: + print(json.dumps(todo.most_common()), file=f) + + +todo = collections.Counter() + def compare(a, b): """ @@ -385,13 +1642,13 @@ def compare(a, b): if "Zweckverband Volkshochschule " in a.get("title") and a.get("title") != b.get("title"): return (Status.DIFFERENT, Miss.CUSTOM_VHS) - if a.get("release_type") and b.get("release_type") and a.get("release_type") != b.get("release_type"): + if a.get("release_type") and b.get( + "release_type") and a.get("release_type") != b.get("release_type"): return (Status.DIFFERENT, Miss.RELEASE_TYPE) - if (a.get("release_type") == "dataset" and - b.get("release_type") == "dataset"): - if (a.get("ext_ids", {}).get("doi") and b.get("ext_ids", {}).get("doi") and - a.get("ext_ids", {}).get("doi") != b.get("ext_ids", {}).get("doi")): + if (a.get("release_type") == "dataset" and b.get("release_type") == "dataset"): + if (a.get("ext_ids", {}).get("doi") and b.get("ext_ids", {}).get("doi") + and a.get("ext_ids", {}).get("doi") != b.get("ext_ids", {}).get("doi")): return (Status.DIFFERENT, Miss.DATASET_DOI) arxiv_id_a = a.get("ext_ids", {}).get("arxiv") @@ -425,7 +1682,8 @@ def compare(a, b): if slugify_string(a_sub) != slugify_string(b_sub): return (Status.DIFFERENT, Miss.SUBTITLE) - if contains_chemical_formula(a_slug_title) or contains_chemical_formula(b_slug_title) and (a_slug_title != b_slug_title): + if contains_chemical_formula(a_slug_title) or contains_chemical_formula(b_slug_title) and ( + a_slug_title != b_slug_title): return (Status.DIFFERENT, Miss.CHEM_FORMULA) if len(a_slug_title) < 10 and a_slug_title != b_slug_title: @@ -440,7 +1698,8 @@ def compare(a, b): if arxiv_id_a is not None and arxiv_id_b is None or arxiv_id_a is None and arxiv_id_b is not None: return (Status.STRONG, OK.PREPRINT_PUBLISHED) - if a_slug_title and b_slug_title and a_slug_title.strip().replace(" ", "") == b_slug_title.strip().replace(" ", ""): + if a_slug_title and b_slug_title and a_slug_title.strip().replace( + " ", "") == b_slug_title.strip().replace(" ", ""): if len(a_slug_authors & b_slug_authors) > 0: return (Status.STRONG, OK.SLUG_TITLE_AUTHOR_MATCH) @@ -457,6 +1716,7 @@ def compare(a, b): if a_authors and len(a_slug_authors & b_slug_authors) == 0: return (Status.DIFFERENT, Miss.CONTRIB_INTERSECTION_EMPTY) + todo[a.get("title")] += 1 return (Status.AMBIGUOUS, OK.DUMMY) @@ -469,6 +1729,7 @@ def num_project(s): """ return re.sub('\d+', '<NUM>', s) + def contains_chemical_formula(s): """ Returns true, if we find C3H8O or the like in title. @@ -476,5 +1737,3 @@ def contains_chemical_formula(s): for token in s.split(): if CHEM_FORMULA.search(token): return True - return False - |