aboutsummaryrefslogtreecommitdiffstats
path: root/fuzzycat
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2020-11-17 22:23:16 +0100
committerMartin Czygan <martin.czygan@gmail.com>2020-11-17 22:23:16 +0100
commit2868d23d0a2746954156cb9d36ed53512ed012a9 (patch)
treef54444c0ed0f03c2ab56c0191bdfed16b9be5acc /fuzzycat
parente16d83f62be4530d3086ae8db596934f4db471d8 (diff)
downloadfuzzycat-2868d23d0a2746954156cb9d36ed53512ed012a9.tar.gz
fuzzycat-2868d23d0a2746954156cb9d36ed53512ed012a9.zip
move blacklist to the end
Diffstat (limited to 'fuzzycat')
-rw-r--r--fuzzycat/verify.py893
1 files changed, 666 insertions, 227 deletions
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 4a011b1..9a89f4f 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -61,29 +61,280 @@ from fuzzycat.cluster import slugify_string
# Fetch the "k" and "v" entries of a mapping in a single call; returns a 2-tuple.
get_key_values = operator.itemgetter("k", "v")

# Approximation of a chemical formula: one or more groups of one or two
# uppercase letters followed by one or two digits (e.g. "C3H8").
# More correct: https://www.johndcook.com/blog/2016/02/04/regular-expression-to-match-a-chemical-element/
CHEM_FORMULA = re.compile(r"([A-Z]{1,2}[0-9]{1,2})+")
+
+
class Status(str, Enum):
    """
    Outcome of comparing two release documents.

    Members are also plain strings (``str`` mixin), so they compare equal to
    their values.
    """
    EXACT = "exact"
    DIFFERENT = "different"
    STRONG = "strong"
    WEAK = "weak"
    # NOTE(review): the value is misspelled ("ambigiuous"); it is left
    # untouched here because it is the member's runtime value.
    AMBIGUOUS = "ambigiuous"
+
+
class OK(str, Enum):
    """
    Reasons attached to a comparison that is taken to be a match (or, for
    DUMMY, to the fall-through case where no rule applied).
    """
    ARXIV_VERSION = "ok.arxiv_version"
    DUMMY = "ok.dummy"
    TITLE_AUTHOR_MATCH = "ok.title_author_match"
    PREPRINT_PUBLISHED = "ok.preprint_published"
    SLUG_TITLE_AUTHOR_MATCH = "ok.slug_title_author_match"
+
+
class Miss(str, Enum):
    """
    Reasons attached to a comparison that is taken to be a mismatch or that
    could not be decided.
    """
    ARXIV_VERSION = "miss.arxiv_version"
    BLACKLISTED = "miss.blacklisted"
    BLACKLISTED_FRAGMENT = "miss.blacklisted_fragment"
    CONTRIB_INTERSECTION_EMPTY = "miss.contrib_intersection_empty"
    SHORT_TITLE = "miss.short_title"
    YEAR = "miss.year"
    # Special case, cf. https://fatcat.wiki/release/44gk5ben5vghljq6twm7lwmxla
    CUSTOM_VHS = "miss.vhs"
    NUM_DIFF = "miss.num_diff"
    DATASET_DOI = "miss.dataset_doi"
    RELEASE_TYPE = "miss.release_type"
    CHEM_FORMULA = "miss.chem_formula"
    SUBTITLE = "miss.subtitle"
    BOOK_CHAPTER = "miss.book_chapter"
    TITLE_FILENAME = "miss.title_filename"
    COMPONENT = "miss.component"
    APPENDIX = "miss.appendix"
+
+
class GroupVerifier:
    """
    Verifier.

    Reads an iterable of JSON lines; each line is an object with a key "k" and
    a list of release documents under "v". Within such a group, we could have
    multiple sub clusters, e.g.

    > [AABAB]

    so every possible pair of the group is passed to `compare` and the outcome
    is printed to stdout.
    """
    def __init__(self, iterable: collections.abc.Iterable, max_cluster_size: int = 10):
        """
        iterable: yields JSON-encoded lines (str).
        max_cluster_size: groups with more documents than this are skipped.
        """
        self.iterable: collections.abc.Iterable = iterable
        # Honor the argument; previously the value was hard-coded to 10.
        self.max_cluster_size: int = max_cluster_size
        # Tallies skip reasons and compare() reasons; "total" is added at the end of run().
        self.counter = collections.Counter()

    @staticmethod
    def _blacklist_reason(release):
        """
        Return the counter key naming why `release` is blacklisted (container
        name first, then publisher), or None if it is not blacklisted.
        """
        container_name = (release.get("extra") or {}).get("container_name") or ""
        if container_name.lower().strip() in CONTAINER_NAME_BLACKLIST:
            return "skip.container_name_blacklist"
        publisher = release.get("publisher") or ""
        if publisher.lower().strip() in PUBLISHER_BLACKLIST:
            return "skip.publisher_blacklist"
        return None

    def run(self):
        """
        Compare all pairs of every group and print one line per compared pair
        (two release URLs, status, reason) to stdout.

        Progress (every 20000 lines) and the final counters go to stderr; the
        module-level `todo` counter is dumped to the file "xxxx-todo" in the
        current working directory.
        """
        for i, line in enumerate(self.iterable):
            if i % 20000 == 0:
                print(i, file=sys.stderr)
            line = line.strip()
            if not line:
                continue
            doc = json.loads(line)
            k, vs = get_key_values(doc)
            if len(vs) < 2:
                self.counter["skip.unique"] += 1
                continue
            if len(vs) > self.max_cluster_size:
                self.counter["skip.too_large"] += 1
                continue
            for a, b in itertools.combinations(vs, r=2):
                # Skip the whole pair if either side is blacklisted. The
                # previous inner loop only `continue`d itself, so blacklisted
                # pairs were counted as skipped but compared anyway.
                skip_reason = self._blacklist_reason(a) or self._blacklist_reason(b)
                if skip_reason:
                    self.counter[skip_reason] += 1
                    continue
                result, reason = compare(a, b)
                self.counter[reason] += 1
                print("https://fatcat.wiki/release/{}".format(a["ident"]),
                      "https://fatcat.wiki/release/{}".format(b["ident"]), result, reason)

        self.counter["total"] = sum(self.counter.values())
        print(json.dumps(dict(self.counter)), file=sys.stderr)
        with open("xxxx-todo", "w") as f:
            print(json.dumps(todo.most_common()), file=f)
+
+
# Titles of pairs for which no rule in compare() applied; dumped by GroupVerifier.run().
todo = collections.Counter()


def _arxiv_base_id(arxiv_id):
    """Return `arxiv_id` without a trailing version suffix such as "v2"."""
    return re.sub(r"v\d+$", "", arxiv_id)


def _looks_like_filename(title):
    """True if `title` is a single token that starts with "<anything>.<three lowercase letters>"."""
    return len(title.split()) == 1 and bool(re.match(r".*[.][a-z]{3,3}", title))


def compare(a, b):
    """
    Compare two entities, return match status.

    `a` and `b` are release documents (dicts). The keys read here are "title",
    "release_type", "release_year", "ext_ids" ("doi", "arxiv"), "extra"
    ("container_name", "subtitle", "crossref.type") and "contribs" (list of
    dicts with "raw_name"). Missing or None values are treated as empty.

    Returns a tuple (Status, reason), where reason is a member of OK or Miss.
    Rules are tried in order and the first one that applies wins; if none
    applies, the title of `a` is recorded in `todo` and
    (Status.AMBIGUOUS, OK.DUMMY) is returned.

    Note that the title-only checks at the top (short title, blacklists,
    appendix) look at `a` only.
    """
    a_title = a.get("title") or ""
    b_title = b.get("title") or ""
    a_title_lower = a_title.lower()

    if len(a_title) < 5:
        return (Status.AMBIGUOUS, Miss.SHORT_TITLE)
    if a_title_lower in TITLE_BLACKLIST:
        return (Status.AMBIGUOUS, Miss.BLACKLISTED)
    if any(fragment in a_title_lower for fragment in TITLE_FRAGMENT_BLACKLIST):
        return (Status.AMBIGUOUS, Miss.BLACKLISTED_FRAGMENT)

    if "Zweckverband Volkshochschule " in a_title and a_title != b_title:
        return (Status.DIFFERENT, Miss.CUSTOM_VHS)

    if re.match(r"appendix ?[^ ]*$", a_title_lower):
        return (Status.AMBIGUOUS, Miss.APPENDIX)

    a_type = a.get("release_type")
    b_type = b.get("release_type")
    if a_type and b_type and a_type != b_type:
        return (Status.DIFFERENT, Miss.RELEASE_TYPE)

    a_ext_ids = a.get("ext_ids") or {}
    b_ext_ids = b.get("ext_ids") or {}
    a_extra = a.get("extra") or {}
    b_extra = b.get("extra") or {}

    if a_type == "dataset" and b_type == "dataset":
        a_doi = a_ext_ids.get("doi")
        b_doi = b_ext_ids.get("doi")
        if a_doi and b_doi and a_doi != b_doi:
            return (Status.DIFFERENT, Miss.DATASET_DOI)

    if a_type == "chapter" and b_type == "chapter":
        a_container = a_extra.get("container_name")
        b_container = b_extra.get("container_name")
        if a_container and b_container and a_container != b_container:
            return (Status.DIFFERENT, Miss.BOOK_CHAPTER)

    if (a_extra.get("crossref") or {}).get("type", "") == "component" and a_title != b_title:
        return (Status.DIFFERENT, Miss.COMPONENT)

    arxiv_id_a = a_ext_ids.get("arxiv")
    arxiv_id_b = b_ext_ids.get("arxiv")

    # Raw names are kept as-is (a missing "raw_name" contributes None); the
    # slugified variants drop empty names.
    a_authors = {v.get("raw_name") for v in a.get("contribs") or []}
    b_authors = {v.get("raw_name") for v in b.get("contribs") or []}
    a_slug_authors = {slugify_string(v) for v in a_authors if v}
    b_slug_authors = {slugify_string(v) for v in b_authors if v}
    a_release_year = a.get("release_year")
    b_release_year = b.get("release_year")

    if a_title_lower == b_title.lower():
        if a_authors and a_authors == b_authors:
            if a_release_year and b_release_year and a_release_year != b_release_year:
                return (Status.DIFFERENT, Miss.YEAR)
            return (Status.EXACT, OK.TITLE_AUTHOR_MATCH)

    if _looks_like_filename(a_title) or _looks_like_filename(b_title):
        if a_title != b_title:
            return (Status.DIFFERENT, Miss.TITLE_FILENAME)

    if a_title == b_title and a_release_year and b_release_year:
        if abs(int(a_release_year) - int(b_release_year)) > 2:
            return (Status.DIFFERENT, Miss.YEAR)

    # https://fatcat.wiki/release/knzhequchfcethcyyi3gsp5gry, some title contain newlines
    a_slug_title = slugify_string(a_title).replace("\n", " ")
    b_slug_title = slugify_string(b_title).replace("\n", " ")

    if a_slug_title == b_slug_title:
        a_subtitles = a_extra.get("subtitle", []) or []
        b_subtitles = b_extra.get("subtitle", []) or []
        for a_sub in a_subtitles:
            for b_sub in b_subtitles:
                if slugify_string(a_sub) != slugify_string(b_sub):
                    return (Status.DIFFERENT, Miss.SUBTITLE)

    # Parenthesized on purpose: without the parentheses `and` bound tighter
    # than `or`, so a formula in `a` alone reported DIFFERENT even for
    # identical titles.
    if (contains_chemical_formula(a_slug_title)
            or contains_chemical_formula(b_slug_title)) and a_slug_title != b_slug_title:
        return (Status.DIFFERENT, Miss.CHEM_FORMULA)

    if len(a_slug_title) < 10 and a_slug_title != b_slug_title:
        return (Status.AMBIGUOUS, Miss.SHORT_TITLE)

    # Titles that differ only in their numbers.
    if re.search(r'\d', a_slug_title) and a_slug_title != b_slug_title and num_project(
            a_slug_title) == num_project(b_slug_title):
        return (Status.DIFFERENT, Miss.NUM_DIFF)

    if a_slug_title and b_slug_title and a_slug_title == b_slug_title:
        if a_authors and len(a_authors & b_authors) > 0:
            # Exactly one of the two has an arXiv id.
            if (arxiv_id_a is None) != (arxiv_id_b is None):
                return (Status.STRONG, OK.PREPRINT_PUBLISHED)

    if a_slug_title and b_slug_title and a_slug_title.strip().replace(
            " ", "") == b_slug_title.strip().replace(" ", ""):
        if len(a_slug_authors & b_slug_authors) > 0:
            return (Status.STRONG, OK.SLUG_TITLE_AUTHOR_MATCH)

    if arxiv_id_a and arxiv_id_b:
        # Compare ids with the version suffix removed; a plain split("v")
        # fails on ids without a version or with another "v" in them.
        if _arxiv_base_id(arxiv_id_a) == _arxiv_base_id(arxiv_id_b):
            return (Status.STRONG, OK.ARXIV_VERSION)
        return (Status.DIFFERENT, Miss.ARXIV_VERSION)

    if a_authors and len(a_slug_authors & b_slug_authors) == 0:
        return (Status.DIFFERENT, Miss.CONTRIB_INTERSECTION_EMPTY)

    todo[a.get("title")] += 1
    return (Status.AMBIGUOUS, OK.DUMMY)
+
+
def num_project(s):
    """
    Cf. https://fatcat.wiki/release/6b5yupd7bfcw7gp73hjoavbgfq,
    https://fatcat.wiki/release/7hgzqz3hrngq7omtwdxz4qx34u

    Unify every occurrence of a digit (or group of digits): each run of digits
    in `s` is replaced by the literal "<NUM>" and the resulting string is
    returned.
    """
    # Raw string: "\d" in a plain string literal is an invalid escape sequence.
    return re.sub(r'\d+', '<NUM>', s)
+
+
def contains_chemical_formula(s):
    """
    Returns True, if we find C3H8O or the like in title, i.e. if any
    whitespace-separated token of `s` matches CHEM_FORMULA; False otherwise
    (previously the no-match case fell through and returned None).
    """
    return any(CHEM_FORMULA.search(token) for token in s.split())
+
+
# Lowercase substrings; compare() reports a title of `a` containing any of
# them as blacklisted.
TITLE_FRAGMENT_BLACKLIST = {
    "air quality data from the life+respira project in pamplona",
    "irish studies, seminar",
}
+
# These titles appear too often, so ignore them for now.
TITLE_BLACKLIST = set([
"",
+ "05-20-t001-15 一流110mハードル選手の踏切および踏切準備動作に関するバイオメカニクス的研究(05 バイオメカニクス,一般研究発表抄録)",
+ "09方-3a-p31 スポーツ選手のid化による競技サポートの検討(09.体育方法,一般研究発表抄録)",
"100 years ago",
"10. schlußbemerkungen",
- "(11899) weill",
+ "11-8 イネ篩管液中のmrnaの検出(11.植物の栄養生態)",
"11-i-6 シロネズミ肝臓アミドホスフォリボシルトランスフェラーゼ活性に及ぼす核酸関連物質およびビタミンb_<12>添加の影響(研究発表)(日本ビタミン学会 : 第25回大会研究発表要旨)",
"11. quellen und literatur",
+ "11. 経皮的胃電気活動記録 (egg) の食事負荷による影響の検討(第 25 回 日本小児消化管機能研究会)",
"1200 multiple choice questions in pharmacology",
"1299 chemical shifts and coupling constants for c10h13cl2n2o2ps",
"13 untersuchung einzelner abdominaler regionen und organe (13.1 - 13.3)",
"13 untersuchung einzelner abdominaler regionen und organe (13.4 - 13.6)",
"141st smpte technical conference and exhibition: marriott marquis hotel, new york city, november 19–22, 1999",
+ "147 モルモット tdi 喘息モデルの研究 : ii. 下気道の組織学的変化の検討",
"1536 chemical shifts and coupling constants for c10h24no2psi",
+ "159 肺門部扁平上皮癌術後に発生した早期扁平上皮癌に対する気管支鏡的 nd-yag レーザー治療の経験(レーザー・腔内照射)",
"165 脳spectにおける収集条件の画質への影響(第30回秋季学術大会 一般研究発表予稿集)",
+ "16th australian dental congress",
"1746. september",
+ "1749-8104-4-2-s1.tiff",
"1760 chemical shifts and coupling constants for c11h17o3ps",
+ "17.発作時の冠動脈造影像からみたst上昇型狭心症とst下降型狭心疾の発症機序の差異 : 第43回日本循環器学会学術集会 : 虚血性心疾患",
"181. 気管支喘息患児の抗食餌アレルゲン特異ige抗体の変動について(喘息-病態生理iii)",
"1917-1930 administrative records",
+ "194.心筋spectにおける吸収体の影響 : 第44回総会学術大会会員研究発表予稿",
"1980 annual meeting",
"19. syntheses",
"1. filme",
"1. general introduction",
+ "1-i-20 ラット水晶体におけるビタミンb_2代謝,特に吸収について : 第34回大会研究発表要旨 : 日本ビタミン学会",
"1-iii-9 20位水酸化ビタミンd_3誘導体の合成研究(一般演題要旨,日本ビタミン学会第64回大会講演要旨)",
"1. introduzione",
"1. vorbemerkungen",
@@ -92,6 +343,7 @@ TITLE_BLACKLIST = set([
"2004s-os8-3 内航不定期輸送シミュレーション手法の研究 : 荷主・オペレータ間のe-ビジネスの評価(オーガナイズドセッション(os8):物流/輸送シミュレーション)",
"(2007 - 2008) student government minutes: 2007-10-02",
"(2009 - 2010) student government minutes: 2009-04-21",
+ "2009 ssr awards",
"2010-03-personalien",
"2010-44 nouvelles du corps médical",
"2011 editorial collaborators",
@@ -100,14 +352,23 @@ TITLE_BLACKLIST = set([
"2017 membership report",
"2018 thank-yous",
"2188 chemical shifts and coupling constants for c12h19o4ps",
+ "21世紀の情報専門職養成 (<特集>infostaシンポジウム'99)",
+ "23 (12 bl., c.)",
+ "25aps-71 鉄系超伝導体のミニマル・モデルに対する動的平均場理論ii(25aps 領域8ポスターセッション(低温),領域8(強相関系:高温超伝導,強相関f電子系など))",
+ "25a-t-3 σ-相v合金のv^<51>四重極効果",
"25. literaturhinweise",
+ "26-i-17 ビタミンb_1の脂質過酸化による分解(ii)(研究発表 日本ビタミン学会 : 第24回大会研究)",
+ "2805, 1778-03-15, maecus",
"2871 chemical shifts and coupling constants for c14h14o3p2s3",
"29p-je-14 強磁性層と超伝導層の境界における称序パラメーターの振舞いii(低温)",
+ "2a-s-11 n_2励起色素レーザーによる大気中のno_2の螢光寿命測定",
"2-ii-24 新規フッ素化ビタミンdアナログの合成 : 第47回大会研究発表要旨",
+ "2-iii-18 ビタミンdによる細胞増殖抑制の作用機構(一般演題要旨,日本ビタミン学会第64回大会講演要旨)",
"2. konzeptionelle grundlagen",
"2nd international conference on the female reproductive tract",
"2. neurology",
"2 the ymca",
+ "30p-e-4 光電子広域出現電圧分光法(eaps)の固体表面への応用",
"31e congrès de la société française d'hématologie",
"31 fe, mnが水稲の養分吸収におよぼす影響について : 植物の比較生理化学的研究(関東支部講演会講演要旨)",
"340-911-1-pb__1_.pdf",
@@ -126,6 +387,7 @@ TITLE_BLACKLIST = set([
"50 & 100 years ago",
"5098703 interferon-alpha 76",
"50 years ago",
+ "52.ctのスライス厚について(ct2 画像評価1)(第33回総会会員研究発表)",
"566. ultrasound guided ilio-inguinal block - the way forward",
"5 empirische untersuchung",
"5. measurable functions",
@@ -135,21 +397,27 @@ TITLE_BLACKLIST = set([
"6. some applications",
"7370 - recto",
"7. zusammenfassung der ergebnisse",
+ "8700532 monoclonal antibody",
"8. fazit und ausblick",
"8 numerical methods",
"8. personalnachrichten",
"95% success",
+ "9-66 イネケイ酸吸収遺伝子lsi1の輸送特性の解析(9. 植物の無機栄養, 2006年度秋田大会講演要旨)",
+ "9.spac t_4による血中サイロキシン濃度の測定(◇中部部会(第13回))",
"9. the future",
"aaas news and notes",
"aacci corporate members",
"aaem news and comments",
"aae volume 21 issue 1 cover and back matter",
"aae volume 47 issue 4 cover and back matter",
+ "aain today",
"a. allgemeines",
+ "aargau",
"abbildung",
"abbildungsnachweis",
"abbotsford post",
"abbreviations and acronyms",
+ "abortions, 1977",
"about the cover",
"about the editor",
"about the editors",
@@ -158,17 +426,20 @@ TITLE_BLACKLIST = set([
"about this title",
"abréviations",
"abréviations et sigles",
+ "abstract p1",
"abstracts",
"abstracts from other journals",
"abstracts from the current literature",
"abstracts from transactions published in japanese",
"abstracts of current computer literature",
"abstracts of interest",
+ "abstracts of japanese articles",
"abstracts of papers from other journals",
"abstracts of papers to appear in future issues",
"abstracts of technical papers",
"abstracts of the current literature",
"abstract withdrawn",
+ "ac briefs",
"acei exchange",
"acetylacetonato(dicarbonyl)rhodium",
"acknowledgement of reviewers",
@@ -184,6 +455,7 @@ TITLE_BLACKLIST = set([
"acknowledgment to reviewers",
"a correction",
"acp-9-2289-2009.pdf",
+ "acquisitions",
"acronyms",
"acronyms and abbreviations",
"acs news",
@@ -207,6 +479,7 @@ TITLE_BLACKLIST = set([
"advertiser/product index",
"advertisers' index/liste des annonceurs",
"advertising policy and guidelines",
+ "advisory board",
"a. einleitung",
"aeq volume 24 issue 3 cover and front matter",
"aerobic co/nhydroxysuccinimide-catalyzed oxidation of p-tolylsiloxanes to p-carboxyphenylsiloxanes: synthesis of functionalized siloxanes as promising building blocks for siloxane-based materials",
@@ -246,22 +519,27 @@ TITLE_BLACKLIST = set([
"all pdfs of this category",
"also noted",
"also of interest",
+ "altria group inc",
"aluminium alloy al-p2014a-t4 or t42. sheet and strip 0,4 mm ≤ a ≤ 6 mm",
+ "ama insights",
"american board of dermatology examination dates",
"american nurses association",
"among the contributors",
+ "among the publishers",
"amphotericin b",
"anaesthetic section of the royal society of medicine",
"analysis of formulated detergents",
"analytical chemistry",
+ "anatomy and physiology",
+ "anatomy and physiology.",
"andrews glacier",
"anemia",
"an epitome of current medical literature",
"angelegenheiten der redaction",
"angina pectoris",
- "an introduction to linear programming",
"an invitation to membership",
"anlagen und produkte",
+ "anmeldelser",
"annex",
"annoncen",
"annotated bibliography",
@@ -269,16 +547,15 @@ TITLE_BLACKLIST = set([
"announcement",
"announcement and call for papers",
"announcements",
- "announcements, 2000-02-24. irish studies, seminar 535, 1999-2000",
- "announcements, 2012-09-07. irish studies, seminar 535, 2012-2013",
"announcements and calendar",
"announcements and reports",
- "announcements, irish studies, seminar 535, 2000-2001",
"announcements of future meetings",
+ "annual awards",
"annual general meeting",
"annual meeting",
"annual report",
"annual report: 1988–1989",
+ "annual report for period ending april 30, 1970.",
"annual review of physiology.",
"a note on the texts",
"ans volume 21 issue 3 cover and front matter",
@@ -292,6 +569,7 @@ TITLE_BLACKLIST = set([
"a-organoelement nitrones: synthesis, properties, and ir and 13c nmr spectral and x-ray structural characterization",
"aorn proceedings",
"a personal note",
+ "apparition",
"appendix c",
"appendix d",
"appendix d.",
@@ -313,15 +591,20 @@ TITLE_BLACKLIST = set([
"article abstract",
"articles and papers",
"articles of significant interest selected from this issue by the editors",
+ "articles to appear in forthcoming issues",
"arts and decoration",
"art school notes",
+ "ascii 13(5)(143)",
+ "ascpt news",
"a selected public health bibliography with annotations",
"aseptic midwifery",
"asge update",
+ "asia-pacific",
"as i see it",
"asme conference presenter attendance policy and archival proceedings",
"asm meetings calendar",
"asm news",
+ "association affairs",
"association directory",
"association intelligence",
"association notes",
@@ -329,7 +612,13 @@ TITLE_BLACKLIST = set([
"association suisse pour le suffrage féminin",
"assurance responsabilité civile professionelle",
"astronomical phenomena for the week 1887 march 27-april2",
+ "astronomical topics",
+ "asylum reports.",
+ "at ams headquarters",
+ "ath. va 44 ἀϰαρπότερος ἀγρίππου.",
+ "atlin claim",
"atmospheric pressure (expressed in inches and hundredths)",
+ "at our intake desk",
"a travers les revues",
"attendance",
"at the literary table",
@@ -359,12 +648,15 @@ TITLE_BLACKLIST = set([
"autour de la guerre",
"autriche",
"avertissement",
+ "avis!",
+ "awards [3 awards]",
"awards alert",
"awards of excellence",
"background",
"[back inside cover]",
"back matter",
"backmatter",
+ "back to basics",
"b. analyse",
"bangladesh",
"bangladesh: 1972–2012",
@@ -381,6 +673,7 @@ TITLE_BLACKLIST = set([
"bedside teaching 高齢者における心疾患・2-高齢者の急性心筋梗塞",
"behavior",
"behavioral sciences",
+ "beiträge",
"beitrÄge",
"bej keywords",
"belgium",
@@ -399,25 +692,33 @@ TITLE_BLACKLIST = set([
"bibliographie sélective",
"bibliography",
"bibliography section",
+ "biblos",
"bildnachweis",
"bildnachweise",
"biochemical",
"biographical notes",
"biomarker benchmark - gene expression data from gene expression omnibus - gse46691",
"biosketches",
+ "birmingham.",
"bja volume 25 issue 3 cover and back matter",
"[blank page - back cover]",
"blank page [back cover]",
"bleomycin/cisplatin",
"bma affairs",
+ "b. migrations",
"board of directors",
"board of editor",
"board of editors",
"board of editors of the american journal of international law",
+ "body and mind",
"boekbespreking",
"boekbesprekingen",
+ "book browsing",
"book chronicle",
+ "book list",
+ "booklist",
"book marks",
+ "book pharm",
"book received",
"book review",
"book reviewers",
@@ -427,6 +728,7 @@ TITLE_BLACKLIST = set([
"book review section 3",
"book reviews / revue de livres",
"book reviews/revue de livres",
+ "books and materials received",
"books and odd volumes",
"books and publications received",
"books available list",
@@ -437,6 +739,7 @@ TITLE_BLACKLIST = set([
"book shelf",
"bookshelf",
"books in brief",
+ "books of essays",
"books of interest",
"books received",
"books received but not reviewed",
@@ -445,6 +748,7 @@ TITLE_BLACKLIST = set([
"boston medical library",
"botanischer tauschverein in wien",
"botany",
+ "botswana",
"botulinum toxin a",
"boundary creek times",
"bourg-ciné-sonore",
@@ -465,37 +769,66 @@ TITLE_BLACKLIST = set([
"briefs",
"british columbia federationist",
"british columbia record",
+ "british dental association.",
+ "british gallup poll: cq 023b",
+ "british gallup poll cq 385",
+ "british gallup poll cq 428",
+ "british gallup poll cq 613",
+ "british gallup poll cq 628",
+ "british gallup poll cq 739",
+ "british gallup poll cq 814",
+ "british gallup poll cq 866a",
+ "british gallup poll cq 906",
+ "british gallup poll cq 979",
"british medical association",
+ "british medical association.",
+ "british official photograph from the western front",
"british veterinary association",
+ "brown-forman corp.",
"buchanzeigen.",
+ "bucharest.",
+ "buchbesprechungen - book reviews",
"buchbesprechungen - book reviews - livres nouveaux",
"buchbesprechungen – book reviews – livres nouveaux",
"bücherbesprechungen",
"buchrezensionen",
+ "budapest",
"bulletin board",
"bulletin critique",
"bulletins & highlights",
"bureau of investigation",
"bureau of legal medicine and legislation",
"bürgerliches recht",
+ "business",
"business and personal wants",
"business meeting",
+ "by-laws",
+ "c 10 h 18 mo 1 o 4 p 2",
+ "c12h12cr2o8s2",
+ "c12h14cl2n2",
"c15h19nose",
"c 15 h 20 n 1 o 3 p 1",
+ "c 20 h 18 f 6 o 7 s 2 zn 1",
"c21h25no4 - structure no. 1093",
"c21h26n2o5 - structure no. 1098",
"c 22 h 18 f 6 o 5 sn 1",
"c 22 h 18 f 6 o 5 zn 1",
+ "c2/ ed. board",
"c2 - editorial board",
"c2: editorial board",
"c2h6o and c4h8",
"c3h8o and c8h18",
"c4h8o and c7h8",
+ "c5h10 and c7h12",
+ "c5h10o3 and c5h12o",
"c5h12o and c8h18o3",
"c5h8o2 and c6h12",
+ "c5h8o2 and c8h8o",
"c6h12o2 and c7h16",
+ "c6h6 and c6h12",
"c6h8, bicyclo[3.1.0]hex-2-ene",
"c7h2fe2o6se2",
+ "c7h8 and c7h16",
"calendar",
"calendar—89 (1–6), 471 (n), 949 (n), 1468 (n), 1827 (n), 2467 (n), 3014 (n)",
"calendar of courses, symposiums, and conferences",
@@ -504,14 +837,21 @@ TITLE_BLACKLIST = set([
"calendar of forthcoming meetings",
"calendar of future meetings",
"calendar of international conferences, symposia, lectures and meetings of interest",
+ "call for nominations",
"call for submissions",
+ "canadian gallup poll # 260",
+ "canadian gallup poll # 282",
+ "canadian gallup poll # 376",
"canadian medical association journal, toronto",
"canadian oxygen prize",
+ "canagliflozin",
+ "cancer in british colonies",
"candidates for ccm",
"canto",
"captopril",
"carbon monoxide poisoning",
"cardiopulmonary resuscitation",
+ "cardiovascular syphilis",
"career opportunities",
"caring for the aged.",
"carnegie church organs, salladasburg, pennsylvania",
@@ -526,6 +866,9 @@ TITLE_BLACKLIST = set([
"case notes",
"case of the month",
"case presentation 3",
+ "celanese* formaldehyde",
+ "central association of science and mathematics teachers",
+ "central scientific company",
"ceramic abstracts",
"ceramic abstracts1",
"ceramics japan = セラミックス : bulletin of the ceramic society of japan 28(3)",
@@ -551,6 +894,7 @@ TITLE_BLACKLIST = set([
"chapter xl",
"chapter xxix",
"character list",
+ "charing cross hospital",
"check list",
"checklist",
"chemical oceanography",
@@ -565,6 +909,7 @@ TITLE_BLACKLIST = set([
"chemie-dissertationen",
"chemistry of vegetable physiology and agriculture",
"chest x-ray quiz - question",
+ "cholera.",
"chromatography calendar",
"chronic pyelonephritis",
"church efficiency",
@@ -580,14 +925,19 @@ TITLE_BLACKLIST = set([
"climate change",
"clinical connections",
"clinical implications",
+ "clinical lecture",
"clinical memoranda",
"clinical news",
"clinical notes",
+ "clinical society of london.",
"clinical summaries",
+ "clinical training 検査の読み方-末梢血液検査",
"cme accreditation page",
"cme calendar",
"cme calendar 1998",
+ "cme enrollment form",
"cme examination",
+ "cme questions",
"co2: editorial board",
"coal and lignite production",
"cocaine abuse",
@@ -596,6 +946,7 @@ TITLE_BLACKLIST = set([
"coffee break 胃癌診断と天気予報",
"coke oven emissions",
"collected essays",
+ "collective bargaining",
"colloquium",
"colorectal cancer",
"colour blindness",
@@ -603,7 +954,9 @@ TITLE_BLACKLIST = set([
"coming events manifestations futures",
"coming events/ manifestations futures",
"coming in the next issue",
+ "comings and goings",
"coming up",
+ "comité éditorial",
"committee list",
"committee lists",
"committee member",
@@ -613,8 +966,10 @@ TITLE_BLACKLIST = set([
"communication networks and systems for power utility automation",
"communications of the european neurological society",
"communications to the editor",
+ "community",
"company news",
"comparison of the clustering results on the reduced space and on the original space (iii)",
+ "composition",
"comptes rendus de lecture",
"comptes-rendus des traités",
"comulative author index to volume 289",
@@ -639,12 +994,15 @@ TITLE_BLACKLIST = set([
"conference report",
"conferences",
"conferences and symposia",
+ "confirmation",
"conflict of interest",
"congratulations!",
"congratulations",
"congrès, formation, enseignement",
"congress calendar",
+ "congress information",
"consequences",
+ "constitution and by-laws",
"construction",
"construction of new plant for sidney roofing and paper company in burnaby, b.c.",
"contemporary echoes",
@@ -691,10 +1049,13 @@ TITLE_BLACKLIST = set([
"corrections and clarifications",
"correspondence",
"correspondence, 1962-02-27. organization and management, seminar 423, 1961-1962",
+ "correspondents.",
"corrigendum",
"corrosion commentary",
"costa rica",
"cost considerations …",
+ "council 1978—79",
+ "council on drugs",
"council on medical education and hospitals",
"council on pharmacy and chemistry",
"council on physical medicine",
@@ -712,8 +1073,10 @@ TITLE_BLACKLIST = set([
"cover picture",
"cpr news",
"cranbrook herald",
+ "creep",
"criminal law and practice in scotland",
"critical notices",
+ "crizotinib",
"croonian lectures on points connected with diabetes.",
"cross-cultural medicine",
"cruise hly04ta on uscgc healy",
@@ -736,15 +1099,19 @@ TITLE_BLACKLIST = set([
"current papers in oral biology",
"current publications received",
"current research",
+ "current review",
"current science",
"current topics",
"curriculum vitae",
"cv2-editorial board",
+ "cv3 - editorial board/english",
+ "cyclosporin",
"daily british columbian",
"daily building record",
"daily ledger",
"dapsone",
"dasatinib",
+ "data s1: raw data",
"data sheet",
"data_sheet_1.csv",
"data_sheet_1.zip",
@@ -809,6 +1176,7 @@ TITLE_BLACKLIST = set([
"dissertationen / dissertations",
"dissertationes",
"dissertations",
+ "doctors and overpopulation",
"dokument 119-133",
"dokument 18-19",
"domain pattern in electrodeposited nickel-iron thin films",
@@ -817,15 +1185,19 @@ TITLE_BLACKLIST = set([
"drug watch",
"dutch ph. d. theses",
"earn continuing education credit for reading articles in the counseling psychologist!",
+ "ear‐to‐ground",
"eastern questionnaire, blue book for interviewee 46259, answers for pages 103-118",
"eastern questionnaire, blue book for interviewee 47236, answers for pages 065-081",
"eastern questionnaire, blue book for interviewee 47277, answers for pages 158 -171",
"eastern questionnaire, blue book for interviewee 48172, answers for pages 043-059",
"eastern questionnaire, blue book for interviewee 49275a, answers for pages 055-079",
"e-commerce",
+ "economic indicators",
"economic review",
+ "ecuador",
"ed board",
"ed. board",
+ "edinburgh obstetrical society.",
"editorial",
"editorial announcement",
"editorial board",
@@ -838,6 +1210,7 @@ TITLE_BLACKLIST = set([
"editorial board; barcode",
"editorial board/copyright information",
"editorial board - english",
+ "editorial board (ifc)",
"editorial board / inside contents",
"editorial board members",
"editorial board: proteomics 1'18",
@@ -847,8 +1220,10 @@ TITLE_BLACKLIST = set([
"editorial board / redaksieraad",
"editorial board/reviewing committee",
"editorial calendar",
+ "editorial collaborators",
"editorial committee",
"editorial foreword",
+ "editorial notes and news",
"editorial statement & general notes",
"editor' note",
"editor / regional editors / advisory editorial board",
@@ -856,6 +1231,7 @@ TITLE_BLACKLIST = set([
"editor's choice",
"editor's commentary",
"editors/ editorial board",
+ "editors & editorial board. publication info",
"editor's foreword",
"editors for scripta materialia",
"editor's message",
@@ -867,9 +1243,11 @@ TITLE_BLACKLIST = set([
"educational intelligence",
"educational intelligence.",
"educational news",
+ "education in action",
"educators personally",
"efavirenz/emtricitabine/tenofovir-disoproxil-fumarate",
"effect of rf1 on alanine incorporation by tmrna",
+ "egypt.",
"egypt: 1948–2012",
"ehistology kaufman atlas plate 12 image b",
"eigenvalues and eigenvectors",
@@ -882,8 +1260,10 @@ TITLE_BLACKLIST = set([
"einzelunfallversicherung",
"electromagnetic compatibility for industrial-process measurement and control equipment",
"emanations",
+ "embryology",
"emissions radiophoniques",
"empirische untersuchung",
+ "employment",
"employment ads information",
"employment information",
"enderby press and walker's weekly",
@@ -908,6 +1288,7 @@ TITLE_BLACKLIST = set([
"ergänzende literatur",
"ergebnisse",
"erratum",
+ "erythromycin",
"escitalopram",
"escv membership",
"esp newsletter",
@@ -915,6 +1296,7 @@ TITLE_BLACKLIST = set([
"essential hypertension",
"estiv flyer",
"estonia",
+ "ethics",
"europe and the ussr",
"european news",
"european perspectives",
@@ -923,15 +1305,18 @@ TITLE_BLACKLIST = set([
"evaporation of initially heated sessile droplets and the resultant dried colloidal deposits on substrates held at ambient temperature",
"events calendar",
"events guide",
+ "examination results",
"examples",
"exercise-induced asthma.",
"exhibition",
+ "exhibitors' columns",
"explanitory notes",
"external reviewers",
"extra supplement-the nursing mirror",
"fachgruppen",
"fachinformationen, kennzahlen",
"fach- und personalnachrichten",
+ "faculty matters",
"failures '94",
"fe de errata",
"federal",
@@ -964,17 +1349,23 @@ TITLE_BLACKLIST = set([
"filmographie",
"filmography",
"film, review of the month",
+ "films",
"financial assets and liabilities of investment funds: portugal",
"financial statement",
"financial statements",
+ "finland",
"finlande",
"fire hazard testing",
"firmenschriften",
"first international symposium on thermal stresses and related topics thermal stresses '95",
+ "floor plans",
"flügelprofil",
"fluoxetine",
"food allergy",
"food and agriculture organization",
+ "food in hospitals",
+ "foods and drugs analysis",
+ "foot-and-mouth disease",
"foot-and-mouth disease.",
"foreign and insular statistical reports of countries and cities: untabulated",
"foreign and insular statistical reports of countries and cities—yearly and monthly",
@@ -985,14 +1376,17 @@ TITLE_BLACKLIST = set([
"fort george herald",
"forthcoming conferences",
"forthcoming features",
+ "for the history class",
"for the record",
"foundations",
"fracture of carpal scaphoid",
"fragebogen",
"fragment unbekannten inhalts",
+ "französisch",
"free colour illustrations in the online version of articles",
"french-language abstracts",
"from foreign journals",
+ "from lisa",
"from the archive",
"from the archives",
"from the archives of the archives",
@@ -1011,6 +1405,7 @@ TITLE_BLACKLIST = set([
"front matter",
"frontmatter",
"frontmatter and index",
+ "fujitsu (総索引 20(1)-29(7))",
"full issue",
"full length article",
"full title",
@@ -1025,8 +1420,11 @@ TITLE_BLACKLIST = set([
"future contributions tojournal of statistical physics",
"future meetings",
"f.y.eye",
+ "gallery",
"gamesman solutions",
+ "gas and power",
"gastro‐enterology",
+ "gastrotomy",
"gaussian job archive for brh3o",
"gaussian job archive for c10h24cu2i3n3p2",
"gaussian job archive for c13h26o4",
@@ -1082,6 +1480,7 @@ TITLE_BLACKLIST = set([
"globalization",
"glossary of abbreviations",
"glossary of terms",
+ "glycerine producers' association",
"glycine max (l.) merr. mutante, mut 116 a/2",
"glycine max (l.) merr. mutante, mut 321",
"glycine max (l.) merr. mutante, mut 4458",
@@ -1089,14 +1488,17 @@ TITLE_BLACKLIST = set([
"glycine max (l.) merr. mutante, mut 60",
"golden era",
"gordon research conferences",
+ "gout.",
"government services",
"graphical abstract (angew. chem. int. ed. engl. 13/1994)",
+ "graphical abstract toc cont'd",
"graphical abstract toc continued",
"graphical contents list & author index",
"graphic technology. print product metadata for pdf files",
"graphique 3 - encours de la dette extérieure totale (pourcentage du pib) et service de la dette (pourcentage des exportations des biens et services)",
"graubünden.",
"great britain",
+ "great britain g.b.2",
"grimsel",
"griseofulvin",
"guest editor",
@@ -1106,6 +1508,8 @@ TITLE_BLACKLIST = set([
"guidelines for manuscript preparation",
"guide to further reading",
"guy's hospital.",
+ "gynæcology",
+ "gynecology",
"haloperidol",
"halothane hepatitis",
"handelsrecht.",
@@ -1120,18 +1524,22 @@ TITLE_BLACKLIST = set([
"herbal medicine",
"herbal medicines",
"heritable disorders of connective tissue",
+ "hewlett packard",
"high-energy physics",
"highlights from the literature",
"highlights of recent meetings",
+ "hints and notions",
"hinweise für autoren",
"histamine",
"histoire de lire",
+ "histology.",
"história da palestina nos tempos do novo testamento (v)",
"historical background",
"historical news",
"historical section",
"history",
"hospital abuse.",
+ "hospital administration. no. iii",
"hospital and dispensary management",
"hospital and institutional news",
"hospital digest",
@@ -1140,6 +1548,7 @@ TITLE_BLACKLIST = set([
"hospital of surgery, panton square, st. james's",
"hospital reform",
"hospital reports",
+ "house of lords.",
"how do you measure experience?",
"hungary",
"huntington's chorea",
@@ -1148,6 +1557,7 @@ TITLE_BLACKLIST = set([
"hyg volume 114 issue 2 cover and front matter",
"hyg volume 50 issue 3 cover and front matter",
"hyg volume 61 issue 1 cover and back matter",
+ "hyperthyroidism",
"ibc: guide for authors",
"iceland - marginal tax wedge decomposition",
"iceland: other economic indicators",
@@ -1161,6 +1571,7 @@ TITLE_BLACKLIST = set([
"ieee foundation [advertisement]",
"ieee geoscience and remote sensing letters publication information",
"ieee instrumentation & measurement magazine - staff listing",
+ "ieee journal of solid-state circuits",
"ieee power engineering society information for authors",
"ieee professional communication society",
"ieee robotics and automation society",
@@ -1171,6 +1582,7 @@ TITLE_BLACKLIST = set([
"ieee transactions on antennas and propagation institutional listings",
"ieee transactions on circuits and systems—ii: express briefs publication information",
"ieee transactions on computer-aided design of integrated circuits and systems information for authors",
+ "ieee transactions on education information for authors",
"ieee transactions on electron devices information for authors",
"ieee transactions on fuzzy systems information for authors",
"ieee transactions on human-machine systems information for authors",
@@ -1186,6 +1598,7 @@ TITLE_BLACKLIST = set([
"ifc (ed board)",
"ifc (ed. board)",
"ifc - ed board",
+ "ifc - ed. board",
"ifc-ed. board",
"ifc: editorial board, aims and scope",
"ifc - publication information",
@@ -1200,6 +1613,7 @@ TITLE_BLACKLIST = set([
"image_4.png",
"image credits",
"image of the month answer",
+ "immigration",
"implant therapy outcomes, surgical aspects",
"in case you haven't heard",
"in case you haven't heard…",
@@ -1245,10 +1659,13 @@ TITLE_BLACKLIST = set([
"industry update",
"industry updates",
"in extremis",
+ "infant mortality",
"infectious disease",
"infectious diseases",
"infectious hepatitis",
"inflammatory bowel disease",
+ "influenza 1953",
+ "información",
"information* concerning the hague conventions on private international law",
"informationen",
"information exchange",
@@ -1261,7 +1678,9 @@ TITLE_BLACKLIST = set([
"inhalt · contents",
"inhalt heft 2",
"inhalt-impressum",
+ "inhalt. impressum",
"inhalt.impressum",
+ "inhalt nr. 5/6",
"inheritance and education, 1513–1582",
"initiates",
"in kürze",
@@ -1277,6 +1696,8 @@ TITLE_BLACKLIST = set([
"inra:grapevine:0bdxx30",
"inra:grapevine:6590bdx1",
"inra-onf:populus nigra:6-a07",
+ "inra-onf:populus nigra:all-022",
+ "inra-onf:populus nigra:uli-022",
"inra:quercus robur x quercus petraea:h549",
"inra:quercus robur x quercus petraea:h615",
"inra:quercus robur x quercus petraea:i021",
@@ -1300,7 +1721,10 @@ TITLE_BLACKLIST = set([
"institute news",
"institute news and radio notes",
"institute, vereine, fachveranstaltungen",
+ "institutional research",
+ "instruction for authors",
"instructions aux auteurs",
+ "instructions for contributors",
"instructions for obtaining anesthesiology continuing medical education (cme) credit",
"instruction to authors",
"instrumentation",
@@ -1321,6 +1745,7 @@ TITLE_BLACKLIST = set([
"international conference calendar 1995",
"international conferences 1981–1982–1983",
"international council of nurses",
+ "international court of justice",
"international diary",
"international meeting on sex hormones and anti-hormones in endocrine dependent pathology: basic and clinical aspects",
"international meetings",
@@ -1349,6 +1774,9 @@ TITLE_BLACKLIST = set([
"inventions new and interesting",
"invited speaker",
"iowa's notable dead",
+ "ipilimumab/nivolumab",
+ "isoniazid",
+ "isotech inc.",
"israel",
"issid pages",
"issn page",
@@ -1397,6 +1825,7 @@ TITLE_BLACKLIST = set([
"joint commission of the empress shôken fund. no 71",
"joi volume 13 issue 3 cover and front matter",
"joi volume 15 issue 2 cover and front matter",
+ "journal abstracts",
"journal announcements",
"journal cme questions",
"journal of the smpte — editorial board",
@@ -1409,6 +1838,7 @@ TITLE_BLACKLIST = set([
"journals received",
"jse volume 20 issue 2 cover and back matter",
"jspe membership guidance",
+ "jsps information",
"j. t. baker chemical co.",
"judicial decisions",
"justice",
@@ -1441,8 +1871,11 @@ TITLE_BLACKLIST = set([
"kongresskalender 2016",
"kootenay mail",
"kunstausstellungen",
+ "kursaal",
"kurzbesprechungen",
+ "labindustries",
"laboratory practice 病理:細胞像からここまでわかる 尿(1) 尿中に出現する非腫瘍性細胞",
+ "lamb family",
"land use : u.e.l.",
"laparoscopic cholecystectomy",
"la patrie suisse",
@@ -1450,15 +1883,20 @@ TITLE_BLACKLIST = set([
"latex allergies",
"law and science",
"lead coating on steel by hot dipping. [ii]",
+ "leading articles",
"leading the field since 1884 [advertisement]",
"leads from the mmwr",
"learning",
+ "learning and teaching",
"legacies",
"legal issues",
"legionnaires' disease.",
+ "legislation",
+ "législation",
"les auteurs",
"letter from charles dollard to richard sterner, july 23, 1941",
"letter: n.d.",
+ "letters from readers",
"letters, notes, and answers",
"[letters to editor]",
"letters to the editor",
@@ -1468,6 +1906,7 @@ TITLE_BLACKLIST = set([
"letter xxxii",
"libraries of small organic molecules",
"library notes",
+ "library table.",
"libri novi",
"libri nuovi",
"libri ricevuti",
@@ -1493,6 +1932,7 @@ TITLE_BLACKLIST = set([
"list of members, 1920",
"list of plates",
"list of referees",
+ "list of sponsors",
"list of symbols",
"list of tables and figures",
"literary notes",
@@ -1508,9 +1948,14 @@ TITLE_BLACKLIST = set([
"liver function test",
"liverpool",
"liverpool.",
+ "liverpool medical institution",
"liverpool medical institution.",
"livres",
"livres reçus / books received",
+ "local government department",
+ "local government department.",
+ "lolium multiflorum lam., gra 1099",
+ "london association of the medical women's federation.",
"looking back.",
"los autores",
"lösungen",
@@ -1520,11 +1965,14 @@ TITLE_BLACKLIST = set([
"lu pour vous",
"luxembourg",
"lxvii. notices respecting new books",
+ "madrid",
"magazine, radio and tv report",
"magnetic resonance imaging",
"malaysia",
"malignant melanoma of the skin",
"manager's notices.",
+ "manager's notices. to subscribers.",
+ "manchester.",
"manihot esculenta crantz col1463",
"manihot esculenta crantz col1781",
"manihot esculenta crantz col2151",
@@ -1533,6 +1981,7 @@ TITLE_BLACKLIST = set([
"manihot esculenta crantz col2426",
"manihot esculenta crantz col407",
"manuscript received date",
+ "manuscript referees, 1987-1988",
"manuscript referees, 2005-2006",
"manuscript reviewers",
"manuscripts accepted for publication",
@@ -1548,6 +1997,7 @@ TITLE_BLACKLIST = set([
"master index—volumes 91–100",
"masthead",
"materials",
+ "materials handling",
"materials news",
"mathematica",
"mauritania",
@@ -1577,7 +2027,9 @@ TITLE_BLACKLIST = set([
"medical miscellany",
"medical motion pictures",
"medical notes",
+ "medical progress.",
"medical societies",
+ "medicinal and dietetic preparations",
"medicine",
"medicine 1977.",
"medicine and politics",
@@ -1585,6 +2037,7 @@ TITLE_BLACKLIST = set([
"medicine and the law.",
"medicine and the media",
"medico-legal and medico-ethical",
+ "medico-legal society",
"medico-meteorological observations",
"medico-parliamentary",
"medizinrecht",
@@ -1596,18 +2049,23 @@ TITLE_BLACKLIST = set([
"meetings and conferences · tagungen und kongresse",
"meetings and courses",
"meetings and events",
+ "meetings and programs",
"meetings/courses",
"meetings of interest for spine physicians and surgeons",
+ "meetings of the ams",
"meet our new contributor",
"meet our new contributors",
"meet the editorial board",
+ "mehrsprachige beiträge",
"membership",
"membership application",
"membership notes",
"member societies",
"members of commissions, boards and committees",
+ "members of the philological society, 1899.",
"memoirs",
"memorial shiksha shodh sansthan",
+ "mental deficiency",
"mental health",
"mergers and acquisitions in the entertainment and media sector",
"message from general chair",
@@ -1627,6 +2085,7 @@ TITLE_BLACKLIST = set([
"mettler",
"mettler instrument corporation",
"mexico",
+ "michigan",
"microscope on washington",
"microscopy",
"military and naval medical services",
@@ -1636,36 +2095,50 @@ TITLE_BLACKLIST = set([
"minutes, 1959-04-22. mathematical methods in the social sciences, seminar 447, 1958-1959",
"minutes, 1978-11-17. ottoman & turkish studies, seminar 551, 1978-1979",
"minutes, 1985-12-16. women and society, seminar 545, 1985-1986",
+ "misc.",
"miscellanea medica",
"miscellaneous inventions",
"miscellany",
+ "mitsubishi chemical industries limited",
"mitteilungen - communications",
"mitteilungen der schriftleitung",
"mitteilungen des bde",
"mitteilungen des bdi",
"mixed news",
"mixtura mirabilis",
+ "mlc1-t43-a.mp3",
"mobile and portable dvb-t/h radio access",
"mobius aromaticity and delocalization",
"models of translatory rock bursting in coal",
"modern sociology",
"modules",
"molecular insights into division of single human cancer cells in on-chip transparent microtubes",
+ "money matters",
+ "monthly summary",
+ "moral hazard",
"more books",
+ "more news on reviews",
"motivation",
"movers & shakers",
"moving forward",
"moyie leader",
"ms18-1 多種類抗体対応・高感度マルチアレルゲン蛋白チップ開発(ms18 アレルゲン,ミニシンポジウム,第61回日本アレルギー学会秋季学術大会)",
"ms9-7 喘息に伴う慢性副鼻腔炎に対する内視鏡下副鼻腔手術の術後評価(ms9 副鼻腔炎とアスピリン喘息,ミニシンポジウム,第63回日本アレルギー学会秋季学術大会)",
+ "muḥammad ii",
"multiple choice questions",
"musa aaaa itc1283",
+ "musa aaaa itc1284",
"musa aa itc0411",
+ "musa aa itc0809",
+ "museum notes",
+ "music",
"mutation analysis (mutsig 2cv v3.1)",
+ "mutation analysis (mutsigcv v0.6)",
"mutation analysis (mutsigcv v0.9)",
"myasthenia gravis.",
"mycophenolate mofetil",
"nachtrag",
+ "nachtrag.",
"nachträge",
"nachwort",
"namenregister",
@@ -1678,6 +2151,7 @@ TITLE_BLACKLIST = set([
"national health service",
"national institute of mental health",
"national research council",
+ "natural gas production",
"naval and military medical services",
"naval notes",
"nécrologie",
@@ -1693,14 +2167,17 @@ TITLE_BLACKLIST = set([
"neue apparate",
"neue ger�te und chemikalien",
"neu eingegangene arbeiten",
+ "neue instrumente",
"neue literatur",
"neue patente",
"neue produkte · firmennachrichten",
+ "neuerscheinungen: bücher",
"neues aus forschung und industrie",
"neue spezialitäten (einschließl. nährpräparate und geheimmittel)",
"neue spezialitäten (einschl. nährpräparate und geheimmittel)",
"neue werke",
"neuropathology",
+ "nevada state medical association",
"new and nonofficial remedies",
"new and recent ieee publications",
"new appliance",
@@ -1711,6 +2188,8 @@ TITLE_BLACKLIST = set([
"new books, etc",
"new books, etc.",
"new books received",
+ "new chemicals and specialties",
+ "new drugs",
"new editions",
"new editor-in-chief professor degan shu",
"new equipment",
@@ -1724,10 +2203,12 @@ TITLE_BLACKLIST = set([
"new miscellaneous inventions",
"new product developments",
"new products: new products",
+ "new products this week",
"news",
"news @ a glance",
"news analysis",
"news and coming events",
+ "news and comment on recent developments from around the world",
"news and personals",
"newscap",
"news, comments, and service announcements",
@@ -1735,6 +2216,7 @@ TITLE_BLACKLIST = set([
"news focus",
"news from our chapters",
"news from the field",
+ "news from the united states",
"news letter",
"newsletter",
"newsletter2001 (4)",
@@ -1751,13 +2233,18 @@ TITLE_BLACKLIST = set([
"next month in neurosurgery",
"nichtmetallische anorganische werkstoffe",
"nicolet",
+ "nigeria",
"nivolumab",
"no. 22514. convention on the civil aspects of international child abduction. concluded at the hague on 25 october 1980",
"no. 27531. convention on the rights of the child. adopted by the general assembly of the united nations on 20 november 1989",
"no. 29447. international development association and china",
+ "no. 6 (1. juni)",
"no 98 (février 1894)",
"nomenclature",
"non-metallic inorganic materials",
+ "normung",
+ "northern counties notes.",
+ "north london hospital,",
"norway",
"norwegen",
"notarrecht",
@@ -1770,6 +2257,8 @@ TITLE_BLACKLIST = set([
"notes and abstracts",
"notes and announcements",
"notes and events",
+ "notes and notices",
+ "notes and reviews",
"notes, comments and abstracts",
"notes, comments, and abstracts",
"notes, comments, and abstracts.",
@@ -1785,6 +2274,7 @@ TITLE_BLACKLIST = set([
"notes on books, etc",
"notes on new books",
"notes on north american plants. ii.",
+ "notes on the formation of glazed frost1",
"notes, short comments & answers to correspondents.",
"notes, short comments, & answers to correspondents.",
"notes, shortcomments, & answers to correspondents.",
@@ -1809,11 +2299,19 @@ TITLE_BLACKLIST = set([
"nouvelles de l'industrie / neues aus der industrie",
"nouvelles du corps médical",
"nova et vetera",
+ "nr. 10 (10 mars)",
"nr. 121 (23 mai 1885)",
+ "nr. 12 (24 mars)",
"nr. 12 (dezember)",
"nr. 1 (31.dezember 1921)",
+ "nr. 13 (26 mars)",
+ "nr. 13 (29 mars)",
+ "nr. 24 (20 juin)",
+ "nr. 24 (28 juin)",
+ "nr. 24 (29 juin)",
"nr. 2 (februar 1905)",
"nr. 3 (1. maerz 1882)",
+ "nro. 39-51 (april)",
"nro 4 (1. april 1870)",
"nuclear-chicago",
"nuclear-chicago corporation",
@@ -1846,21 +2344,28 @@ TITLE_BLACKLIST = set([
"nye kandidater udgået fra københavns universitets geografiske institut",
"o02-01 局所進行肺癌手術症例の検討 : t3,t4症例を中心に(肺癌2,第25回日本呼吸器外科学会総会)",
"obesity",
+ "obituary listing",
"obituary notices",
+ "observations at honolulu",
"obstetrical society of philadelphia",
"obstetrics.",
+ "obturator hernia.",
"occurrence download",
"october 15, 1870",
"october 2003",
"oeffentliche gesundheitspflege",
"oeffentliches sanitätswesen",
"ofc - contents list",
+ "official photographs taken on the british western front",
"official photograph taken on the british western front",
"official photograph taken on the british western front in france",
"official publications",
"official reports",
+ "of optics and opticists",
"of special interest",
+ "o.i.corporation",
"oils and fats",
+ "old testament",
"omineca herald",
"on the cover",
"on the cover.",
@@ -1872,17 +2377,20 @@ TITLE_BLACKLIST = set([
"open access",
"opening address",
"ophthalmology",
+ "ophthalmology.",
"optimierung",
"optimization",
"oral abstracts",
"oral presentations",
"oral presentations 292-312",
"orchard city record",
+ "order form",
"order form for reprints",
"ordinary meeting",
"organic solvent-free fabrication of durable and multifunctional superhydrophobic paper from waterborne fluorinated cellulose nanofiber building blocks",
"organization of payment",
"organization section",
+ "orientation",
"orthopedic surgery",
"ortsregister",
"ortsregister - 725",
@@ -1903,7 +2411,10 @@ TITLE_BLACKLIST = set([
"oxidative addition, transmetalation, and reductive elimination at a 2,2-bipyridyl-ligated gold center",
"p-100 胎児期の臍帯過捻転が原因と考えられた新生児臍帯基部皮膚欠損の2例(ポスター 新生児1,science and art for sick children,第46回日本小児外科学会学術集会)",
"p22-4 高度な気管狭窄を認めた成人t細胞白血病リンパ腫の一例(リンパ腫,ポスター22,第34回日本呼吸器内視鏡学会学術集会)",
+ "p233 病診連携により一般開業医へ引き継がれた気管支喘息患者のqolと重症度の推移に関する検討(気管支喘息治療3,第20回日本アレルギー学会春季臨床大会)",
+ "p-36 気管分岐部癌術後の吻合部肉芽狭窄にバルーン拡張術が有効であった 1 例(示説 (v))(第 16 回日本気管支学会総会)",
"pädagogische chronik",
+ "pædiatrics",
"pÆdiatrics",
"pakistan",
"panel and contract practice",
@@ -1914,12 +2425,16 @@ TITLE_BLACKLIST = set([
"papers to appear",
"papers to appear in ecotoixcology and environmental safety environmental research, section b",
"papers to be published",
+ "papers to come",
"paraneoplastic syndromes.",
"parkinson's disease",
"parliament",
"parliamentary intelligence",
"parliamentary intelligence.",
+ "parliamentary proceedings.",
"parliament, press, radio and tv",
+ "part 1 of 1",
+ "part 1 of 2",
"participants",
"part ii. reviews and bibliographical notices",
"part number 1",
@@ -1941,9 +2456,11 @@ TITLE_BLACKLIST = set([
"patentschau",
"patent selections",
"patent selections:",
+ "pathology",
"pdf not yet available in ieee xplore",
"pediatric radiology continuing medical education activity",
"pediatrics",
+ "pelvic inflammatory disease",
"pembrolizumab",
"penicillin therapy in acute tonsillitis, phlegmonous tonsillitis and ulcerative tonsillitis",
"peninsula times",
@@ -1955,6 +2472,7 @@ TITLE_BLACKLIST = set([
"periodicals",
"periodicals received",
"periscope",
+ "periscopic review",
"personal and bibliographical.",
"personal and other items",
"personalien",
@@ -1991,18 +2509,24 @@ TITLE_BLACKLIST = set([
"philippines",
"philosophy",
"photovoltaic devices",
+ "physical therapy",
"physiological chemistry",
"physiology",
"pisum sp., 4301 /49",
+ "pisum sp., pis 6462",
+ "pittsburgh",
"placement",
"pneumococcal 13 valent crm197 vaccine conjugate",
"pneumococcal-13-valent-crm197-vaccine-conjugate",
+ "pneumonic influenza",
"poetry",
"poetry and medicine",
"police office, (single boatshed). kavha archaeological report 17, part 1. [norfolk island] [colour version 2 of 2 pdfs]",
+ "poliomyelitis",
"political notebook",
"polyvinylpyrrolidine/thionyl chloride as a new polymeric reagent for facile conversion of epoxides to β-chlorohydrins.",
"ports, waterways and railways",
+ "positional cloning",
"positions available",
"positions wanted",
"position wanted",
@@ -2012,6 +2536,7 @@ TITLE_BLACKLIST = set([
"poster session 1",
"poster session 30",
"post-graduate courses",
+ "postgraduate work",
"postskriptum 3",
"potpourri",
"pour en savoir plus",
@@ -2022,6 +2547,7 @@ TITLE_BLACKLIST = set([
"preface",
"preface and acknowledgements",
"prelim 3: full title (editorial board) (issue 1 only)",
+ "prelim ii(edi board)",
"prelim ii: editorial board",
"prelim(iii) editorial board",
"preliminaries",
@@ -2039,12 +2565,15 @@ TITLE_BLACKLIST = set([
"prevalence of disease: foreign",
"prevalence of disease: in certain states and cities",
"prevalence of disease: united states",
+ "preventive medicine.",
"previous issues",
+ "primary anatomy",
"prince rupert journal",
"principles 101",
"principles of interior renovation",
"problems for solution",
"proceedings of societies",
+ "proceedings of the association of applied biologists",
"proceedings of the pathological society of dublin",
"proceedings of the royal college of veterinary surgeons and veterinary medical societies, etc",
"proceedings of the royal scottish geographical society",
@@ -2055,11 +2584,13 @@ TITLE_BLACKLIST = set([
"proceedings of the society, 1931–1932.",
"proceedings of the society of public analysts",
"proceedings of the statistical society",
+ "product capsules",
"product finder",
"production",
"product review",
"products & materials",
"product update",
+ "produktforum",
"produktinformationen / product information",
"produktnachrichten",
"professional appointments",
@@ -2071,6 +2602,7 @@ TITLE_BLACKLIST = set([
"professional services",
"professional status of the biologist",
"profile",
+ "prognosis and treatment",
"program at-a-glance",
"progress of medical science",
"progress of microscopical science",
@@ -2080,17 +2612,23 @@ TITLE_BLACKLIST = set([
"prostate cancer",
"proven, practical guidance from the planned giving experts",
"provincial medical & surgical journal",
+ "psjm2005 第62回直腸肛門奇形研究会",
+ "psychology",
"psychosocial factors and health",
"publication announcement",
"publication anouncement",
"publications of interest",
"publications received / ouvrages reçus / eingegangene schriften",
"publications recently receiveds",
+ "public health engineering",
"public health services.",
"public relations profession",
"publisher note",
"publisher's acknowledgement",
"publisher's announcement",
+ "pulmonary embolism",
+ "pump faqs",
+ "quanta",
"quarterly chronicle and documentation",
"quarterly list of new publications",
"quarterly summary",
@@ -2101,10 +2639,13 @@ TITLE_BLACKLIST = set([
"questions and comments",
"quiz of the month",
"quotations",
+ "radiation standards",
+ "radiology",
"radio unnameable",
"random noise",
"random vectors",
"ratifications accessions subsequent agreements etc. concerning treaties and international agreements registered with the secretariat of the united nations",
+ "rb5-bantu_ekoid_bakor_nkim-wordlists_08_49_51.mp3",
"rcr meetings",
"readers' comment",
"readers forum",
@@ -2120,15 +2661,19 @@ TITLE_BLACKLIST = set([
"recent earthquakes",
"recent events",
"recent foreign theology",
+ "recent inventions",
"recently published papers",
"recent ornithological publications",
"reception",
"recommendations",
"recordings",
+ "recordings received",
"recto",
"referee awards",
"referee comments",
+ "refereed papers",
"referees",
+ "referees 1985–1986",
"referees 2004",
"referees 2011",
"referees 2013",
@@ -2152,8 +2697,11 @@ TITLE_BLACKLIST = set([
"reporting on adverse clinical events",
"report of immigration at new york",
"report of the annual meeting",
+ "report of the treasurer (1949)",
+ "report of the treasurer (1957)",
"reports and analyses and descriptions of new inventions in medicine, surgery, dietetics, and the allied sciences",
"reports and analyses and descriptions of new inventions, in medicine, surgery, dietetics, and the allied sciences",
+ "reports and documents",
"reports and other publications",
"reports and publications",
"reports from national quarantine and inspection stations",
@@ -2163,6 +2711,7 @@ TITLE_BLACKLIST = set([
"reports of medical societies",
"reports of meetings",
"reports of states and yearly and monthly reports of cities",
+ "reports on medical and surgical practice in the hospitals and asylums of the british empire",
"reports on the progress of astronomy",
"reproduction",
"research",
@@ -2185,6 +2734,7 @@ TITLE_BLACKLIST = set([
"resümee",
"resumenes",
"resúmenes al español",
+ "resumes",
"résumés / resumenes / zusammenfassungen",
"res volume 16 issue 2 cover and back matter",
"retraction",
@@ -2201,6 +2751,7 @@ TITLE_BLACKLIST = set([
"reviewer acknowledgment",
"reviewer list 2019",
"reviewers list",
+ "reviewers of manuscripts, volume 62, 1975",
"review essays",
"review index",
"review of 1956 dental research",
@@ -2214,6 +2765,7 @@ TITLE_BLACKLIST = set([
"reviews in cardiology",
"reviews of books",
"reviews of environmental contamination and toxicology",
+ "reviews of recent books",
"reviews & previews",
"review symposium",
"revolution in retailing",
@@ -2231,7 +2783,10 @@ TITLE_BLACKLIST = set([
"roster page",
"round the hospitals",
"royal college of physicians",
+ "royal college of physicians.",
"royal college of surgeons of england",
+ "royal college of surgeons of england.",
+ "royal navy and army medical services",
"royal society",
"ruh volume 8 issue 1 cover and front matter",
"ruh volume 9 issue 1 cover and front matter",
@@ -2252,6 +2807,7 @@ TITLE_BLACKLIST = set([
"school problems",
"schools approved for training physical therapists",
"schools of physical therapy",
+ "schrifttum",
"science",
"science and technology",
"science news",
@@ -2262,6 +2818,7 @@ TITLE_BLACKLIST = set([
"science/technology concentrate",
"science & technology concentrates",
"science/technology concentrates",
+ "scientific books",
"scientific events",
"scientific journals and articles",
"scientific program",
@@ -2282,6 +2839,7 @@ TITLE_BLACKLIST = set([
"selected highlights from other journals",
"selected titles",
"select filmography",
+ "selection of patients",
"selective exposure",
"self-assessment questions",
"seminare / séminaires 2009",
@@ -2292,23 +2850,29 @@ TITLE_BLACKLIST = set([
"september 1914",
"september 1933",
"series foreword",
+ "sermon cxxvii",
"setting the scene",
"setting the stage",
"sheet 1 of 2",
"sheet 2 of 4",
+ "sheffield medico - chirurgical society.",
"short notes",
"short notices",
"short reviews",
+ "short takes",
"short term courses for graduate physical therapists",
"short-term courses for graduate physical therapists",
+ "silicon-29 nmr data of c40h38s8si4",
"similkameen star",
"simulation in the service of society",
"sinninseln",
"sin título",
"sirolimus",
+ "slovakia",
"slr volume 1 issue i1 cover and back matter",
"smallpox in the united states",
"[s.n.]",
+ "social darwinism",
"social media",
"social work and global health inequalities: practice and policy developments",
"society announcements",
@@ -2318,6 +2882,8 @@ TITLE_BLACKLIST = set([
"society news.",
"society news of the european confederation of neuropathological societies",
"society notices",
+ "society of anæsthetists.",
+ "society of medical officers of health.",
"society of public analysts",
"society related material",
"software/online briefs",
@@ -2328,15 +2894,19 @@ TITLE_BLACKLIST = set([
"solutions",
"solutions to ✰-exercises",
"some applications",
+ "some new publications",
"some papers to be published in future issues",
"some recent books",
"sorafenib",
"source code",
"sources",
+ "source wanted",
+ "south australia",
"southeast asia",
"southern africa",
"southwark, archbishop of, (rc), (most rev. peter david smith) (born 21 oct. 1943)",
"spain.",
+ "spanisch",
"spatial cycling of rab gtpase, driven by the gtpase cycle, controls rabs subcellular distribution",
"speaker's paper, 1982-11-19. law and social and economic change in the american past, seminar 519, 1982-1983",
"special announcement",
@@ -2353,6 +2923,7 @@ TITLE_BLACKLIST = set([
"spectral data of porphyrin derivative c100h142n4o8",
"spectrum",
"spine",
+ "sponsoring organizations and liaisons",
"spotlight",
"spotlights on recent jacs publications",
"spring 1940",
@@ -2360,6 +2931,7 @@ TITLE_BLACKLIST = set([
"[staff list]",
"staff list",
"[staff listing]",
+ "staff listing",
"stained glass orders 1907 ‐ 1926: page 119",
"stained glass orders 1907 ‐ 1926: page 128",
"stained glass orders 1907 ‐ 1926: page 174",
@@ -2383,6 +2955,7 @@ TITLE_BLACKLIST = set([
"stereochemistry abstracts",
"steric scale of common substituents from rotational barriers of n(osubstituted aryl)thiazoline-2-thione atropisomers",
"steuerprisma",
+ "st. kilda",
"st. mary's hospital",
"st. mary's hospital.",
"stocks consolidés, par instrument et par secteur : belgique",
@@ -2390,15 +2963,22 @@ TITLE_BLACKLIST = set([
"stock watch",
"strafrecht und verfahren",
"streptomyces sp.",
+ "streptomycin",
"stress alert",
"stroke: highlights of selected articles",
+ "st. thomas's hospital.",
+ "student activities",
+ "student behavior",
"student chapter news",
+ "student characteristics",
+ "student development",
"studies on the carotenogenesis of rhodotorula sp. part ii",
"style",
"style guide",
"subarachnoid hemorrhage.",
"subject fields of editors",
"subject index",
+ "submissions",
"subscribers page",
"subscription information",
"subscription page",
@@ -2437,6 +3017,8 @@ TITLE_BLACKLIST = set([
"surviving the night shift",
"sustaining members",
"sutures, 1754",
+ "symposia and supplements",
+ "symposium",
"symposium24-1",
"synformissue 2012/02",
"synthesis and characterization of stable hypervalent carbon compounds (10-c-5) bearing a 2, 6-bis(p- substituted phenyloxymethyl)benzene ligand",
@@ -2465,17 +3047,22 @@ TITLE_BLACKLIST = set([
"talking points from books",
"talks with architects",
"tam volume 48 issue 3 cover and front matter",
+ "tasmania",
"taxes and royalties attributable to petroleum production",
"taxonomic abstract for the species.",
+ "td1-p00174-a.mp3",
+ "td1-p002-a.mp3",
"td1-p01379-p01379a.mp3",
"technical program",
"technical program committee",
"technical section",
+ "telecommunications: africa",
"telecommunications, computer and information services",
"telemedicine effective for perioperative orientation program",
"television, 1990s",
"temperature of water",
"temporary empty doi 1",
+ "ten geleide",
"termine",
"tetrahedron symposia-in-print",
"thailand",
@@ -2516,12 +3103,16 @@ TITLE_BLACKLIST = set([
"the evening world",
"the express",
"the future",
+ "the future of the medical profession",
"the general medical council",
"the good old days",
"the grand forks sun and kettle valley orchardist",
"the heat equation",
+ "the institute of brewing",
"the institute of chemistry of great britain and ireland. journal and proceedings. 1926. part vi",
"the international congress on tuberculosis",
+ "the introductory addresses.",
+ "the lancet 100 years ago",
"the last word",
"the lawyers' reports annotated. 1918d",
"the ledge",
@@ -2529,20 +3120,26 @@ TITLE_BLACKLIST = set([
"the literature of heterocyclic chemistry, part xvi, 2016",
"the mail herald",
"[the marine trust company of buffal, bank of buffalo branch, staff desks (neg. no. 4949)]",
+ "the mathematical association of america",
"the medical bookman",
+ "the medical directories.",
"the medical sickness and accident society",
"the morning ledger",
+ "the naval medical service.",
"the nelson tribune",
+ "the netherlands",
"the neuroscientist comments",
"the newsletter for newcomers to gift planning",
"the new westminster news",
"the open channel",
"the paystreak",
"the phytochemical society of europe",
+ "the plague",
"the post‐graduate committee in medicine in the university of sydney",
"the postgraduate committee in medicine in the university of sydney",
"the president 1978-1979",
"the propaganda for reform",
+ "the proposed nile reservoir.",
"the prospector",
"the public service",
"the public service.",
@@ -2555,14 +3152,17 @@ TITLE_BLACKLIST = set([
"the role of level and function of high density lipoprotein\n(hdl) in cardiovascular diseases",
"the royal african society",
"the scientific american",
+ "the secretary's office",
"the silvertonian",
"the state, seminar 401, 1952-1953",
"the twentieth century",
"the university of london and degrees for london medical students",
+ "the university of sydney",
"the use of fungi as food and in food processing",
"the use of graphene and its derivatives for liquid-phase transmission electron microscopy of radiation-sensitive specimens",
"the war",
"the war.",
+ "the war in south africa.",
"the way we were",
"the week",
"the weekly news",
@@ -2572,7 +3172,9 @@ TITLE_BLACKLIST = set([
"the year in review",
"this issue at a glance",
"this month in anesthesiology",
+ "this month in clinical urology",
"this month in the journal",
+ "this month in wjm",
"this week in business",
"this week in <emph>jama</emph>",
"this week in science",
@@ -2602,6 +3204,8 @@ TITLE_BLACKLIST = set([
"to the editor of the american historical review",
"touching base",
"toxicokinetics",
+ "toxoplasmosis",
+ "tpds information for authors",
"transactions",
"transactions and communications",
"transactions of branches",
@@ -2609,29 +3213,40 @@ TITLE_BLACKLIST = set([
"transcriptions",
"transitions",
"translation",
+ "translations",
"translators' preface",
"transport",
"transportation loads. measurement and evaluation of dynamic mechanical loads",
+ "transvaal",
"trauma",
+ "treasurer's report",
"treatment",
"trend: bureau of labor statistics. employment and unemployment - seasonally adjusted: unemployment rate - seasonally adjusted, 01/1948 - 07/2019. data planet™ statistical datasets: a sage publishing resource dataset-id: 002-001-001",
"trend of the times",
"trends and tangents",
"trends in measles cases in bayelsa state, nigeria: a five-year review of case-based surveillance data (2014- 2018).",
"tribune",
+ "trinidad and tobago",
"tropical medicine & international health",
+ "ts1-b030-1.mp3",
+ "ts1-b044-1.mp3",
"tuberculosis in childhood",
"tuberkulin",
+ "tuesday, may 9, 1854.",
+ "tutorials",
+ "tv‐tipps",
"tweets of the week",
"[ubc library staff meeting minutes]",
"übersicht",
"Übersicht",
"übersichtsreferat",
+ "ugi corp.",
"uk balance of payments",
"ukraine",
"ulcerative colitis",
":{unav)",
"uncertain (bowl)",
+ "unhcr and international concern",
"unidentified",
"[unidentified european travel photograph]",
"united kingdom",
@@ -2639,12 +3254,15 @@ TITLE_BLACKLIST = set([
"units",
"universal decimal classification. english full edition",
"university and educational intelligence",
+ "university and educational notes",
+ "university of durham",
"university of london : appointments",
"university of london: appointments",
"unreviewed reports",
"unsre noten und bilder",
"untersuchungen über die befallsbereitschaft von baumarten für sekundärschädlinge1",
"untersuchungsmethoden",
+ "upcoming critical care meetings",
"upcoming events",
"upcoming events 36.2",
"upcoming meetings",
@@ -2652,15 +3270,20 @@ TITLE_BLACKLIST = set([
"upcoming meetings related to alzheimer's disease",
"upcoming meetings related to alzheimer's disease*",
"urological letter 原発性尿道アミロイド症/尿道アミロイド症",
+ "urology",
"uro-telegramm",
"u. s. i. chemical news",
"u.s.i. chemical news",
+ "ut ita dicam",
"value of t1 mapping on gadoxetic acid-enhanced mri for microvascular invasion of hepatocellular carcinoma: a retrospective study",
+ "vancomycin",
"vapor-liquid equilibrium of the mixture c3h6o3 + c3h8o2 (lb4812, evlm 1231)",
"vapor-liquid equilibrium of the mixture ch4o + c6h14o (lb4908, evlm 1231)",
"vapor-liquid equilibrium of the mixture ch4o + c6h6 (lb5199, evlm 1231)",
"varia",
+ "varian",
"variantes",
+ "vd-02-4 胸腔鏡下手術における病変同定のためのゴールドマーカー挿入(胸腔鏡手術(2), 第24回日本呼吸器外科学会総会号)",
"vegetable physiology",
"venezuela",
"veranstaltungen",
@@ -2686,9 +3309,12 @@ TITLE_BLACKLIST = set([
"video views",
"vient de paraître",
"view in brief",
+ "vignettes",
+ "vinyls? you can make them better with gulf isooctyl alcohol",
"violence against nurses",
"virginia militia in the revolution (continued)",
"vital statistics table",
+ "vitamin b\n 1",
"volkshochschule altenburger land: frühjahrsprogramm 2006, s. 46",
"volkshochschule altenburger land: frühjahrsprogramm 2008, s. 18",
"volkshochschule altenburger land: frühjahrsprogramm 2010, s. 51",
@@ -2727,6 +3353,7 @@ TITLE_BLACKLIST = set([
"volume index",
"volumes in series",
"vom heute fürs morgen",
+ "vomiting in the newborn",
"voraussetzungen und hilfsmittel",
"vorbemerkungen",
"vorrede",
@@ -2734,6 +3361,8 @@ TITLE_BLACKLIST = set([
"vorwort der herausgeber",
"vorwort zur ersten auflage",
"vorwort zur zweiten auflage",
+ "v.—the later middle ages, 1200-c. 1500",
+ "wales and western counties notes.",
"wanted",
"war and government in the french provinces: picardy, 1470-1560",
"warfarin",
@@ -2750,18 +3379,23 @@ TITLE_BLACKLIST = set([
"welcome note",
"werbung",
"west florida and its attempt on mobile, 1810–1811",
+ "west london medico-chirurgical society.",
"westminster daily news",
"westminster hospital",
"westminster hospital,",
+ "westminster hospital.",
"what is onlineearly?",
"what's hot on newscientist.com",
"what's on the web",
"what's your diagnosis?",
+ "what's your diagnosis?[183] コブラツイスト",
"what they say",
"where are we going?",
+ "who's who in education",
"widmung",
"wiederkäuer",
"[william c. whitney house, construction view]",
+ "wind erosion",
"winds",
"winterthur, slm 2",
"wirtschaftlicher teil u. vereinsnachrichten",
@@ -2785,12 +3419,17 @@ TITLE_BLACKLIST = set([
"your medication information",
"zeitschriftenberichte",
"zeitschriftenschau",
+ "zeitschrift für philosophie und philosophische kritik",
"zeittafel",
"zinso-rechtsprechungsreport",
"zum neujahr",
"zur besprechung eingelaufen.",
+ "zur diskussion",
"zusatz des herausgebers",
"zwangsvollstreckungsrecht",
+ "игровое начало в русской религиозно-философской литературе конца xix-начала xx века",
+ "татары мусульмане и русские в мещанских общинах среднего поволжья в конце xix – начале xx века",
+ "унежева з.с., султанова а.м. женский национальный пояс конца xviii- первой половине xix в.в.",
"Унежева З.С., Султанова А.М. Женский национальный пояс конца xviii- первой половине xix в.в.",
"электронный журнал cloud of science",
"آينه افغان ايد : نشريه داخلي سه ماهه مؤسسه افغان ايد = afghanaid mirror",
@@ -2798,37 +3437,59 @@ TITLE_BLACKLIST = set([
"パネルディスカッション 2. 癌転移研究の進歩: translational research を目指して",
"ワークショップ(1〜7) 7 月 2 日・3 日 a・b・c 会場",
"���bio-care���[dow]",
+ "一般演題 macroangiopathy",
+ "人事行政 5(7);[jul・1954]",
+ "今月のkey画像",
+ "今月の主題 ウイルス肝炎 実地診療a to z ウイルス肝炎の的確な診断のために 肝障害患者の診断の進め方",
"今月の症例 helicobacter pylori陰性胃に発生した早期胃癌の1例",
+ "令和2年 基礎・材料・共通部門大会 情報/基礎・材料・共通部門 論文誌 2020年4月号(vol. 140, no. 4)目次/特集号の論文募集「電気電子工学関連分野における教育フロンティア」特集/上級会員制度のご案内/協賛・後援依頼",
"「低温生物工学会誌」(cryobiology and cryotechnology)編集要綱",
+ "『分子でよむ環境汚染』, 鈴木聡編著, 東海大学出版会, isbn978-4-486-01812-4, 3,500円",
"十二指腸線状潰瘍における十二指腸球部粘膜血行動態の検討 : 酸分泌抑制薬ならびにhelicobacter pylori除菌療法の意義",
"印刷雑誌 = japan printer 35(4)",
"印刷雑誌 = japan printer (総目次 36(1)-36(12))",
+ "基于物候记录的中国中部关中地区公元<bold>600~902</bold>年冬半年温度重建",
"基于系统发育分析的dna条形码技术在澄清芍药属牡丹组物种问题中的应用",
"増刊号 common disease200の治療戦略 循環器疾患 心臓神経症",
+ "増刊号 尿検査法 ii.各論 23.微生物検査 2)尿路感染症の検査法 (3)培養法-b.特殊な微生物",
+ "多層スラリーキャスト法による空気極基体管方式円筒型sofcの作製とその特性",
"大鼠骨骼肌肌质网非序列依赖性dna结合蛋白及其功能的初步研究*",
"奥付",
+ "学校図書館を考える会・近畿編, 学んだ,広げた,「学校図書館」-「考える会・近畿」20年-, 自費出版, 2012.10, 204p, 30cm, 定価1,000円",
"工業化と労働 : 1966年ilo第50回総会事務局長報告",
+ "<彙報>青年期の精神健康に関する研究 : 特に高校生に関する,maslow 理論からの動機論的アプローチ(昭和 47 年度修士論文概要)",
"急性膵炎における腎尿細管障害の発生機序に関する研究 : 特にphospholipase a[2]の作用と尿細管細胞膜流動性の変化について",
"投稿規定",
+ "整形外科philosophy 運動器外科を意識した整形外科の位置付け",
+ "松村高夫著, 『イギリスの鉄道争議と裁判-タフ・ヴェイル判決の労働史-』, ミネルヴァ書房, 2005年3月, xiv+239+42頁, 6,300円",
"特集 common disease インストラクションマニュアル-患者に何をどう説明するか 呼吸器疾患 アスベスト関連疾患",
"特集 common disease インストラクションマニュアル-患者に何をどう説明するか 物理・化学的因子による疾患 虫刺症",
+ "特集 医療の質を高めるpos-第11回pos研究会記録 ワークショップ 継続ケアのためのサマリー活用",
"特集 帰してはいけない「こども」を見逃さないために 今月のquestion & keyword index",
"特集 最近のトピックス2007 clinical dermatology 2007 5. 皮膚科医のための臨床トピックス 学校保健における皮膚科医の活動",
"特集 最近のトピックス clinical dermatology 1999 4 皮膚疾患治療のポイント 白癬菌の足底への付着とその予防",
"特集 理学療法士がめざす安心と安全 eoi(essences of the issue)",
+ "特集 第10回日本病院学会シンポジウム symposium 病院・診療書の連繋について 診療科目別にみた病院,診療所の地域的諸関係について",
"特集 第41回日本臨床眼科学会講演集 (6) 学術展示 vdt作業に伴う涙液量と瞬目数の経時的変化について",
"特集 脳のシステム障害と理学療法 eoi(essences of the issue)",
"特集 膵炎診療のcontroversy 慢性膵炎の手術適応と術式の選択 q & a",
+ "特集 診療力を上げる! 症例問題集 第3章 腎 臓 症例問題 血漿hco3−の著明な低下と低カリウム血症を呈した症例",
"特集 退院支援-理学療法士はその先が見えているか eoi(essences of the issue)",
"特集 顔の総合診療 顔をみればわかること 今月のquestion & keyword index",
"特集 高齢者診療アップグレード-コツとピットフォール 今月のquestion & keyword index",
"猪瀬優理著, 『信仰はどのように継承されるか-創価学会にみる次世代育成-』, 北海道大学出版会, 2011年10月刊, a5判, v+296頁, 3,990円(書評とリプライ)",
+ "电针显著增加大鼠纹状体和垂体中脑啡肽原mrna含量",
+ "症例報告 鼠径リンパ節腫脹を主訴としたchurg-strauss症候群の1例",
"目次",
+ "研究 頭蓋histiocytosis xの臨床病理学的検討",
+ "第110回「忘年会にて」(infosta forum)",
"第22回日本小児外科学会秋季シンポジウム : 膵・胆管合流異常-小児例でのconsensusを目指して-(プログラム)",
"経セミ : 経済セミナー : the keizai seminar (429)",
"『脳神経外科ジャーナル』(japanese journal of neurosurgery)投稿ならびに執筆規定",
"臨床報告 roux-en-y法で挙上した空腸内に24年後に発症した真性腸石の1例",
+ "英文誌journal of nutritional science and vitaminology vol. 35, no. 6掲載論文要旨",
"英文誌journal of nutritional science and vitaminology vol.46, no.1掲載論文要旨",
+ "英文誌 journal of nutritional science and vitaninology vol. 27 no. 2 掲載論文要旨",
"表紙",
"裏表紙",
"診断と治療 = diagnosis and treatment 臨時增刋 第四編",
@@ -2847,225 +3508,3 @@ CONTAINER_NAME_BLACKLIST = set([
PUBLISHER_BLACKLIST = set([
"test accounts",
])
-
-# More correct: https://www.johndcook.com/blog/2016/02/04/regular-expression-to-match-a-chemical-element/
-CHEM_FORMULA = re.compile(r"([A-Z]{1,2}[0-9]{1,2})+")
-
-
-class Status(str, Enum):
- """
- Match status.
- """
- EXACT = 'exact'
- DIFFERENT = 'different'
- STRONG = 'strong'
- WEAK = 'weak'
- AMBIGUOUS = 'ambigiuous'
-
-
-class OK(str, Enum):
- """
- Reason for assuming we have a match.
- """
- ARXIV_VERSION = 'ok.arxiv_version'
- DUMMY = 'ok.dummy'
- TITLE_AUTHOR_MATCH = 'ok.title_author_match'
- PREPRINT_PUBLISHED = 'ok.preprint_published'
- SLUG_TITLE_AUTHOR_MATCH = 'ok.slug_title_author_match'
-
-
-class Miss(str, Enum):
- """
- Reasons indicating mismatch.
- """
- ARXIV_VERSION = 'miss.arxiv_version'
- BLACKLISTED = 'miss.blacklisted'
- CONTRIB_INTERSECTION_EMPTY = 'miss.contrib_intersection_empty'
- SHORT_TITLE = 'miss.short_title'
- YEAR = 'miss.year'
- CUSTOM_VHS = 'miss.vhs' # https://fatcat.wiki/release/44gk5ben5vghljq6twm7lwmxla
- NUM_DIFF = 'miss.num_diff'
- DATASET_DOI = 'miss.dataset_doi'
- RELEASE_TYPE = 'miss.release_type'
- CHEM_FORMULA = 'miss.chem_formula'
- SUBTITLE = 'miss.subtitle'
- BOOK_CHAPTER = 'miss.book_chapter'
- TITLE_FILENAME = 'miss.title_filename'
-
-
-class GroupVerifier:
- """
- Verifier.
-
- Within a group, we could have multiple sub clusters, e.g.
-
- > [AABAB]
-
- We would need to compare each possible pair and decide whether they are the
- same.
- """
- def __init__(self, iterable: collections.abc.Iterable, max_cluster_size: int = 10):
- self.iterable: collections.abc.Iterable = iterable
- self.max_cluster_size: int = 10
- self.counter = collections.Counter()
-
- def run(self):
- for i, line in enumerate(self.iterable):
- if i % 20000 == 0:
- print(i, file=sys.stderr)
- line = line.strip()
- if not line:
- continue
- doc = json.loads(line)
- k, vs = get_key_values(doc)
- if len(vs) < 2:
- self.counter["skip.unique"] += 1
- continue
- if len(vs) > self.max_cluster_size:
- self.counter["skip.too_large"] += 1
- continue
- for a, b in itertools.combinations(vs, r=2):
- for re in (a, b):
- if re.get("extra", {}).get("container_name",
- "").lower().strip() in CONTAINER_NAME_BLACKLIST:
- self.counter["skip.container_name_blacklist"] += 1
- continue
- if re.get("publisher", "").lower().strip() in PUBLISHER_BLACKLIST:
- self.counter["skip.publisher_blacklist"] += 1
- continue
- result, reason = compare(a, b)
- self.counter[reason] += 1
- print("https://fatcat.wiki/release/{}".format(a["ident"]),
- "https://fatcat.wiki/release/{}".format(b["ident"]), result, reason)
-
- self.counter["total"] = sum(v for _, v in self.counter.items())
- print(json.dumps(dict(self.counter)), file=sys.stderr)
- with open("xxxx-todo", "w") as f:
- print(json.dumps(todo.most_common()), file=f)
-
-
-todo = collections.Counter()
-
-
-def compare(a, b):
- """
- Compare two entities, return match status.
- """
- if len(a.get("title", "")) < 5:
- return (Status.AMBIGUOUS, Miss.SHORT_TITLE)
- if a.get("title", "").lower() in TITLE_BLACKLIST:
- return (Status.AMBIGUOUS, Miss.BLACKLISTED)
-
- if "Zweckverband Volkshochschule " in a.get("title") and a.get("title") != b.get("title"):
- return (Status.DIFFERENT, Miss.CUSTOM_VHS)
-
- if a.get("release_type") and b.get(
- "release_type") and a.get("release_type") != b.get("release_type"):
- return (Status.DIFFERENT, Miss.RELEASE_TYPE)
-
- if (a.get("release_type") == "dataset" and b.get("release_type") == "dataset"):
- if (a.get("ext_ids", {}).get("doi") and b.get("ext_ids", {}).get("doi")
- and a.get("ext_ids", {}).get("doi") != b.get("ext_ids", {}).get("doi")):
- return (Status.DIFFERENT, Miss.DATASET_DOI)
-
- if (a.get("release_type") == "chapter" and b.get("release_type") == "chapter"
- and a.get("extra", {}).get("container_name")
- and b.get("extra", {}).get("container_name") and
- a.get("extra", {}).get("container_name") != b.get("extra", {}).get("container_name")):
- return (Status.DIFFERENT, Miss.BOOK_CHAPTER)
-
- if a.get("extra", {}).get("crossref", {}).get("type", "") == "component" and a.get("title") != b.get("title"):
- return (Status.DIFFERENT, Miss.COMPONENT)
-
- arxiv_id_a = a.get("ext_ids", {}).get("arxiv")
- arxiv_id_b = b.get("ext_ids", {}).get("arxiv")
-
- a_authors = set([v.get("raw_name") for v in a.get("contribs", [])])
- b_authors = set([v.get("raw_name") for v in b.get("contribs", [])])
- a_slug_authors = set((slugify_string(v) for v in a_authors if v))
- b_slug_authors = set((slugify_string(v) for v in b_authors if v))
- a_release_year = a.get("release_year")
- b_release_year = b.get("release_year")
-
- if a.get("title", "").lower() == b.get("title", "").lower():
- if a_authors and (a_authors == b_authors):
- if a_release_year and b_release_year and a_release_year != b_release_year:
- return (Status.DIFFERENT, Miss.YEAR)
- return (Status.EXACT, OK.TITLE_AUTHOR_MATCH)
-
- if (len(a.get("title", "").split()) == 1 and re.match(r".*[.][a-z]{3,3}", a.get("title", "")) or
- len(b.get("title", "").split()) == 1 and re.match(r".*[.][a-z]{3,3}", b.get("title", ""))):
- if a.get("title") != b.get("title"):
- return (Status.DIFFERENT, Miss.TITLE_FILENAME)
-
- if a.get("title") and a.get("title") == b.get("title"):
- if a_release_year and b_release_year:
- if abs(int(a_release_year) - int(b_release_year)) > 2:
- return (Status.DIFFERENT, Miss.YEAR)
-
- # https://fatcat.wiki/release/knzhequchfcethcyyi3gsp5gry, some title contain newlines
- a_slug_title = slugify_string(a.get("title", "")).replace("\n", " ")
- b_slug_title = slugify_string(b.get("title", "")).replace("\n", " ")
-
- if a_slug_title == b_slug_title:
- a_subtitles = a.get("extra", {}).get("subtitle", []) or []
- b_subtitles = b.get("extra", {}).get("subtitle", []) or []
- for a_sub in a_subtitles:
- for b_sub in b_subtitles:
- if slugify_string(a_sub) != slugify_string(b_sub):
- return (Status.DIFFERENT, Miss.SUBTITLE)
-
- if contains_chemical_formula(a_slug_title) or contains_chemical_formula(b_slug_title) and (
- a_slug_title != b_slug_title):
- return (Status.DIFFERENT, Miss.CHEM_FORMULA)
-
- if len(a_slug_title) < 10 and a_slug_title != b_slug_title:
- return (Status.AMBIGUOUS, Miss.SHORT_TITLE)
-
- if re.search(r'\d', a_slug_title) and a_slug_title != b_slug_title and num_project(
- a_slug_title) == num_project(b_slug_title):
- return (Status.DIFFERENT, Miss.NUM_DIFF)
-
- if a_slug_title and b_slug_title and a_slug_title == b_slug_title:
- if a_authors and len(a_authors & b_authors) > 0:
- if arxiv_id_a is not None and arxiv_id_b is None or arxiv_id_a is None and arxiv_id_b is not None:
- return (Status.STRONG, OK.PREPRINT_PUBLISHED)
-
- if a_slug_title and b_slug_title and a_slug_title.strip().replace(
- " ", "") == b_slug_title.strip().replace(" ", ""):
- if len(a_slug_authors & b_slug_authors) > 0:
- return (Status.STRONG, OK.SLUG_TITLE_AUTHOR_MATCH)
-
- arxiv_id_a = a.get("ext_ids", {}).get("arxiv")
- arxiv_id_b = b.get("ext_ids", {}).get("arxiv")
- if arxiv_id_a and arxiv_id_b:
- id_a, version_a = arxiv_id_a.split("v")
- id_b, version_b = arxiv_id_b.split("v")
- if id_a == id_b:
- return (Status.STRONG, OK.ARXIV_VERSION)
- else:
- return (Status.DIFFERENT, Miss.ARXIV_VERSION)
-
- if a_authors and len(a_slug_authors & b_slug_authors) == 0:
- return (Status.DIFFERENT, Miss.CONTRIB_INTERSECTION_EMPTY)
-
- todo[a.get("title")] += 1
- return (Status.AMBIGUOUS, OK.DUMMY)
-
-def num_project(s):
- """
- Cf. https://fatcat.wiki/release/6b5yupd7bfcw7gp73hjoavbgfq,
- https://fatcat.wiki/release/7hgzqz3hrngq7omtwdxz4qx34u
-
- Unify every occurence of a digit (or group of digits).
- """
- return re.sub('\d+', '<NUM>', s)
-
-
-def contains_chemical_formula(s):
- """
- Returns true, if we find C3H8O or the like in title.
- """
- for token in s.split():
- if CHEM_FORMULA.search(token):
- return True