diff options
Diffstat (limited to 'python/scripts/filter_grobid_metadata.py')
-rwxr-xr-x | python/scripts/filter_grobid_metadata.py | 18 |
1 files changed, 15 insertions, 3 deletions
diff --git a/python/scripts/filter_grobid_metadata.py b/python/scripts/filter_grobid_metadata.py index d0666ce..a474393 100755 --- a/python/scripts/filter_grobid_metadata.py +++ b/python/scripts/filter_grobid_metadata.py @@ -24,6 +24,7 @@ NAME_DENYLIST = ( 'phdstudent', ) + def tokenize(s, remove_whitespace=True): s.replace('&apos;', "'") @@ -36,9 +37,11 @@ def tokenize(s, remove_whitespace=True): # Encode as dumb ASCII (TODO: this is horrible) return s.encode('ascii', 'replace').decode('utf8').replace('?', '') + assert tokenize("Impact Factor: 2.114") == "impactfactor" assert tokenize("Impact Factor: 2.114") in TITLE_DENYLIST + def filter_title(title): title = title.strip() @@ -83,19 +86,23 @@ def filter_title(title): return title + def filter_author_name(name): name = name['name'] if name.strip().lower().replace(' ', '') in NAME_DENYLIST: return None return ' '.join([t for t in name.split() if tokenize(t)]) + def filter_authors(l): return [dict(name=n) for n in map(filter_author_name, l) if n and len(n) > 1] + def filter_refs(l): # TODO: return l + def filter_journal_name(name): # same denylist, for now if not name: @@ -104,10 +111,12 @@ def filter_journal_name(name): slug_name = tokenize(name) if slug_name in TITLE_DENYLIST or len(slug_name) < 4 or name == "N.º": return None - for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ", "Research Article ", "Available online www.jocpr.com "): + for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ", + "Research Article ", "Available online www.jocpr.com "): if name.startswith(prefix): name = name.replace(prefix, '') - for suffix in (" Available online at www.sciarena.com", " Original Article", " Available online at", " ISSN", " ISSUE"): + for suffix in (" Available online at www.sciarena.com", " Original Article", + " Available online at", " ISSN", " ISSUE"): if name.endswith(suffix): name = name.replace(suffix, '') if "====================" in name: @@ 
-116,6 +125,7 @@ def filter_journal_name(name): return None return ' '.join(name.split()) + def filter_metadata(obj): if not (obj.get('title') and obj.get('authors')): return None @@ -132,6 +142,7 @@ def filter_metadata(obj): return obj + def run(invert=False): for line in sys.stdin: fields = line.split('\t') @@ -155,5 +166,6 @@ def run(invert=False): elif invert: print(raw.strip()) -if __name__=="__main__": + +if __name__ == "__main__": run(invert="--invert" in sys.argv) |