#!/usr/bin/env python3
"""Filter noisy extracted (GROBID-style) publication metadata.

Reads tab-separated lines (or bare JSON lines) from stdin, parses the JSON
metadata object, cleans/validates title, authors, and journal name, and
prints lines that pass the filters. With --invert, prints the raw JSON of
lines that were *rejected* instead.

Requires a `title_slug_blacklist.txt` file in the working directory
(one slug per line).
"""

import sys
import json

# Slug blacklist is loaded from disk, then extended with hard-coded junk slugs.
with open('title_slug_blacklist.txt', 'r') as f:
    TITLE_BLACKLIST = [l.strip() for l in f]

TITLE_BLACKLIST.extend((
    'editorial',
    'advertisement',
    'bookreviews',
    'reviews',
    'nr',
    'abstractoriginalarticle',
    'originalarticle',
    'impactfactor',
    'articlenumber',
))

# The full name can't *entirely* be one of these
NAME_BLACKLIST = (
    'phd',
    'phdstudent',
)


def tokenize(s, remove_whitespace=True):
    """Reduce a string to a lowercase, alpha-only "slug" for blacklist matching.

    Args:
        s: input string (e.g. a title or name token).
        remove_whitespace: if True, all whitespace is stripped from the result.

    Returns:
        ASCII-only slug; non-ASCII characters are dropped entirely.
    """
    # BUG FIX: the original called s.replace(...) and discarded the result
    # (str is immutable). The source literal was garbled in extraction;
    # presumably it normalized the Unicode right single quote (U+2019) to an
    # ASCII apostrophe -- TODO confirm against upstream.
    s = s.replace('\u2019', "'")
    # Remove non-alphanumeric characters
    s = ''.join(c for c in s.lower() if c.isalpha() or c.isspace())

    if remove_whitespace:
        s = ''.join(s.split())

    # Encode as dumb ASCII (TODO: this is horrible).
    # decode('ascii') instead of the original decode('utf8'): the bytes are
    # pure ASCII after encode('ascii', 'replace'), so behavior is identical.
    return s.encode('ascii', 'replace').decode('ascii').replace('?', '')


# Sanity checks run at import time (cheap, ASCII-only inputs).
assert tokenize("Impact Factor: 2.114") == "impactfactor"
assert tokenize("Impact Factor: 2.114") in TITLE_BLACKLIST


def filter_title(title):
    """Validate and clean a title string.

    Returns the cleaned title, or None if the title looks like junk
    (too long/short, blacklisted slug, journal front-matter, etc.).
    """
    title = title.strip()
    if len(title) > 500:
        return None
    title_slug = tokenize(title, remove_whitespace=True)
    if len(title_slug) < 10 or title_slug in TITLE_BLACKLIST:
        return None
    if title_slug.startswith('nr'):
        return None
    if title.lower().replace('.', '').startswith('int j '):
        return None

    for prefix in ("Title: ", "Original Article: ", "Article: ", "Original Article "):
        if title.startswith(prefix):
            # BUG FIX: original did `title.replace(prefix, '')` and discarded
            # the result, so prefixes were never stripped. Slicing also only
            # removes the leading occurrence, matching the startswith() check.
            title = title[len(prefix):]

    if title.startswith("The Journal of "):
        return None
    if "volume" in title_slug and "issue" in title_slug:
        return None
    if "downloadedfrom" in title_slug:
        return None
    if title_slug.startswith("issn"):
        return None

    # titles with too many or too few words in title
    title_words = len(title.split())
    if title_words > 50 or title_words < 2:
        return None

    # titles with spaces between every letter (more than N such single-char words)
    if len([True for w in title.split() if len(w) == 1]) > 12:
        return None

    # too deep subtitling/splitting
    if title.count(':') > 3 or title.count('|') > 1 or title.count('.') > 1:
        return None

    return title


def filter_author_name(name):
    """Clean a single author entry (a dict with a 'name' key).

    Returns the cleaned name string, or None if the whole name is blacklisted.
    """
    name = name['name']
    # NOTE(review): literal was split across a line break in extraction;
    # reconstructed as removing spaces, consistent with the space-free
    # NAME_BLACKLIST entries ('phdstudent').
    if name.strip().lower().replace(' ', '') in NAME_BLACKLIST:
        return None
    # Keep only tokens that survive tokenization (drops pure-punctuation tokens).
    return ' '.join(t for t in name.split() if tokenize(t))


def filter_authors(l):
    """Filter a list of author dicts, dropping empty/blacklisted names."""
    return [dict(name=n) for n in map(filter_author_name, l) if n and len(n) > 1]


def filter_refs(l):
    """Pass references through unchanged (placeholder)."""
    # TODO:
    return l


def filter_journal_name(name):
    """Validate and clean a journal name; returns None for junk names."""
    # same blacklist, for now
    if not name:
        return None
    name = name.replace(' e-ISSN', '').replace(' p-ISSN', '')
    slug_name = tokenize(name)
    if slug_name in TITLE_BLACKLIST or len(slug_name) < 4 or name == "N.º":
        return None
    for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ",
                   "Original Article ", "Research Article ",
                   "Available online www.jocpr.com "):
        if name.startswith(prefix):
            # BUG FIX: replace() removed ALL occurrences anywhere in the
            # string; slicing strips only the leading prefix as intended.
            name = name[len(prefix):]
    for suffix in (" Available online at www.sciarena.com", " Original Article",
                   " Available online at", " ISSN", " ISSUE"):
        if name.endswith(suffix):
            # BUG FIX: same replace()-everywhere issue, for suffixes.
            name = name[:-len(suffix)]
    if "====================" in name:
        return None
    if len(name) > 150:
        return None
    # Collapse runs of whitespace.
    return ' '.join(name.split())


def filter_metadata(obj):
    """Filter one metadata object in place; returns it, or None if rejected."""
    if not (obj.get('title') and obj.get('authors')):
        return None

    title = filter_title(obj['title'])
    if not title:
        #sys.stderr.write("bad title\n")
        return None
    else:
        obj['title'] = title
    obj['authors'] = filter_authors(obj['authors'])
    # NOTE(review): assumes 'citations' and 'journal'->'name' keys exist;
    # a missing key raises KeyError here -- confirm upstream schema.
    obj['citations'] = filter_refs(obj['citations'])
    obj['journal']['name'] = filter_journal_name(obj['journal']['name'])

    return obj


def run(invert=False):
    """Process stdin line by line.

    Lines are either 5 tab-separated fields (JSON in field 5) or a bare JSON
    line. Accepted records are re-serialized and printed; with invert=True,
    only rejected records' raw JSON is printed.
    """
    for line in sys.stdin:
        fields = line.split('\t')
        if len(fields) == 5:
            raw = fields[4]
        elif len(fields) == 1:
            raw = fields[0]
        else:
            sys.stderr.write("bad line\n")
            continue
        obj = json.loads(raw)
        processed = filter_metadata(obj)
        if processed:
            if not invert:
                processed = json.dumps(processed)
                if len(fields) == 5:
                    fields[4] = processed
                else:
                    fields[0] = processed
                print('\t'.join(fields))
        elif invert:
            print(raw.strip())


if __name__ == "__main__":
    run(invert="--invert" in sys.argv)