diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-09-26 01:55:22 +0000 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-09-26 01:55:22 +0000 |
commit | 0bbe8e1f6689da846944d60a53e620adc2b7622b (patch) | |
tree | 38cac67e061e4948d9cf1d17f64c25abca486635 | |
parent | 7159fdf1ec55a4c9c096afb5eb1ce57b9a51f1e8 (diff) | |
download | sandcrawler-0bbe8e1f6689da846944d60a53e620adc2b7622b.tar.gz sandcrawler-0bbe8e1f6689da846944d60a53e620adc2b7622b.zip |
some progress on a crude grobid metadata filter
-rwxr-xr-x | python/filter_grobid_metadata.py | 157 | ||||
l--------- | python/title_slug_blacklist.txt | 1 |
2 files changed, 151 insertions, 7 deletions
"""
Crude filter for GROBID-extracted metadata records.

Reads records from stdin, one per line.  A line is either a single JSON
document, or five tab-separated columns with the JSON document in the fifth
column.  Records that pass the heuristics below are re-serialized (with
cleaned title, authors and journal name) and printed; with ``--invert`` the
rejected records are printed instead.
"""

import sys
import json
import os

# Name of the slug denylist file.  In the repository this is a symlink to
# ../scalding/src/main/resources/slug-denylist.txt
_TITLE_BLACKLIST_FILENAME = 'title_slug_blacklist.txt'


def _load_title_blacklist(filename=_TITLE_BLACKLIST_FILENAME):
    """Return the denylisted title slugs, one per line of `filename`.

    The file is looked up relative to the current working directory first
    (the original behavior) and then next to this script.  If it is found
    in neither place a warning is written to stderr and an empty list is
    returned, so that the built-in entries below still apply.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    for path in (filename, os.path.join(script_dir, filename)):
        try:
            with open(path, 'r') as f:
                return [l.strip() for l in f]
        except FileNotFoundError:
            continue
    sys.stderr.write(
        "WARNING: %s not found; using built-in title blacklist only\n" % filename)
    return []


TITLE_BLACKLIST = _load_title_blacklist()

TITLE_BLACKLIST.extend((
    'editorial',
    'advertisement',
    'bookreviews',
    'reviews',
    'nr',
    'abstractoriginalarticle',
    'originalarticle',
    'impactfactor',
    'articlenumber',
))

# Membership is tested once per title and once per journal name, so use a
# hashed copy for lookups; TITLE_BLACKLIST itself stays a list.
_TITLE_BLACKLIST_SET = frozenset(TITLE_BLACKLIST)

# The full name can't *entirely* be one of these
NAME_BLACKLIST = (
    'phd',
    'phdstudent',
)

# Boilerplate that is stripped from the front of a title.
_TITLE_PREFIXES = (
    "Title: ",
    "Original Article: ",
    "Article: ",
    "Original Article ",
)

# Boilerplate that is stripped from the front / end of a journal name.
_JOURNAL_PREFIXES = (
    "/ ",
    "~ ",
    "& ",
    "© ",
    "Original Research Article ",
    "Original Article ",
    "Research Article ",
    "Available online www.jocpr.com ",
)
_JOURNAL_SUFFIXES = (
    " Available online at www.sciarena.com",
    " Original Article",
    " Available online at",
    " ISSN",
    " ISSUE",
)


def tokenize(s, remove_whitespace=True):
    """Reduce `s` to a lowercase, ASCII-letters-only slug.

    Everything that is not a letter or whitespace is dropped, then all
    non-ASCII characters are dropped.  Whitespace is removed as well unless
    `remove_whitespace` is false.
    """
    # str.replace() returns a new string; the result has to be kept,
    # otherwise the letters of the entity ("apos") leak into the slug.
    s = s.replace('&apos;', "'")
    # Remove non-alphanumeric characters
    s = ''.join(c for c in s.lower() if c.isalpha() or c.isspace())

    if remove_whitespace:
        s = ''.join(s.split())

    # Drop whatever cannot be represented in ASCII (crude, but this is only
    # used for matching, never for display).
    return s.encode('ascii', 'ignore').decode('ascii')


assert tokenize("Impact Factor: 2.114") == "impactfactor"
assert tokenize("Impact Factor: 2.114") in TITLE_BLACKLIST


def filter_title(title):
    """Return a cleaned-up `title`, or None if it does not look like a title."""
    title = title.strip()
    if len(title) > 500:
        return None

    # Strip boilerplate prefixes first, so that every check below looks at
    # the actual title.  Slicing removes only the leading occurrence.
    for prefix in _TITLE_PREFIXES:
        if title.startswith(prefix):
            title = title[len(prefix):].lstrip()

    title_slug = tokenize(title, remove_whitespace=True)
    if len(title_slug) < 10 or title_slug in _TITLE_BLACKLIST_SET:
        return None
    if title_slug.startswith('nr'):
        return None
    if title.lower().replace('.', '').startswith('int j '):
        return None

    if title.startswith("The Journal of "):
        return None

    if "volume" in title_slug and "issue" in title_slug:
        return None

    if "downloadedfrom" in title_slug:
        return None

    if title_slug.startswith("issn"):
        return None

    # titles with too many or too few words in title
    words = title.split()
    if len(words) > 50 or len(words) < 2:
        return None

    # titles with spaces between every letter (more than N such single-char words)
    if sum(1 for w in words if len(w) == 1) > 12:
        return None

    # too deep subtitling/splitting
    if title.count(':') > 3 or title.count('|') > 1 or title.count('.') > 1:
        return None

    return title


def filter_author_name(name):
    """Return the cleaned name from an author dict, or None to drop the author.

    `name` is an author object with a 'name' key.  Tokens that contain no
    ASCII letters at all are removed from the name.
    """
    full_name = name.get('name')
    if not full_name:
        return None
    if full_name.strip().lower().replace(' ', '') in NAME_BLACKLIST:
        return None
    return ' '.join(t for t in full_name.split() if tokenize(t))


def filter_authors(l):
    """Return the author dicts of `l` whose cleaned name is longer than one character."""
    return [dict(name=n) for n in map(filter_author_name, l) if n and len(n) > 1]


def filter_refs(l):
    """Return the citation list unchanged (filtering not implemented yet)."""
    # TODO:
    return l


def filter_journal_name(name):
    """Return a cleaned-up journal `name`, or None if it looks like junk."""
    # same blacklist, for now
    if not name:
        return None
    name = name.replace(' e-ISSN', '').replace(' p-ISSN', '')
    slug_name = tokenize(name)
    if slug_name in _TITLE_BLACKLIST_SET or len(slug_name) < 4 or name == "N.º":
        return None
    # Slice instead of str.replace(): only the leading / trailing occurrence
    # is boilerplate, the same text may legitimately appear inside the name.
    for prefix in _JOURNAL_PREFIXES:
        if name.startswith(prefix):
            name = name[len(prefix):]
    for suffix in _JOURNAL_SUFFIXES:
        if name.endswith(suffix):
            name = name[:-len(suffix)]
    if "====================" in name:
        return None
    if len(name) > 150:
        return None
    return ' '.join(name.split())


def filter_metadata(obj):
    """Clean one metadata record in place.

    Returns `obj` with filtered title, authors, citations and journal name,
    or None if the record has no usable title or no authors.  Records
    without 'citations' or 'journal' are accepted; those fields are simply
    left alone.
    """
    if not isinstance(obj, dict):
        return None
    if not (obj.get('title') and obj.get('authors')):
        return None

    title = filter_title(obj['title'])
    if not title:
        #sys.stderr.write("bad title\n")
        return None

    obj['title'] = title
    obj['authors'] = filter_authors(obj['authors'])
    if obj.get('citations') is not None:
        obj['citations'] = filter_refs(obj['citations'])
    journal = obj.get('journal')
    if isinstance(journal, dict) and 'name' in journal:
        journal['name'] = filter_journal_name(journal['name'])

    return obj


def run(invert=False):
    """Filter records from stdin to stdout.

    With `invert` false, records that pass are printed with the JSON column
    replaced by the cleaned record.  With `invert` true, only the records
    that were rejected are printed.
    """
    for line in sys.stdin:
        fields = line.split('\t')
        if len(fields) == 5:
            json_col = 4
        elif len(fields) == 1:
            json_col = 0
        else:
            sys.stderr.write("bad line\n")
            continue
        raw = fields[json_col]
        try:
            obj = json.loads(raw)
        except ValueError:
            # json.JSONDecodeError is a ValueError; treat undecodable JSON
            # like any other malformed line instead of aborting the stream.
            sys.stderr.write("bad line\n")
            continue
        processed = filter_metadata(obj)
        if processed:
            if not invert:
                fields[json_col] = json.dumps(processed)
                print('\t'.join(fields))
        elif invert:
            # NOTE(review): for 5-column input this prints only the JSON
            # column, not the whole line -- confirm that is intended.
            print(raw.strip())


if __name__ == "__main__":
    run(invert="--invert" in sys.argv)
\ No newline at end of file |