Diffstat (limited to 'python/scripts/filter_grobid_metadata.py')
-rwxr-xr-x  python/scripts/filter_grobid_metadata.py  111
1 file changed, 69 insertions(+), 42 deletions(-)
diff --git a/python/scripts/filter_grobid_metadata.py b/python/scripts/filter_grobid_metadata.py
index dc4bea7..8fce0d9 100755
--- a/python/scripts/filter_grobid_metadata.py
+++ b/python/scripts/filter_grobid_metadata.py
@@ -1,44 +1,49 @@
#!/usr/bin/env python3
-import sys
import json
+import sys
-with open('title_slug_denylist.txt', 'r') as f:
+with open("title_slug_denylist.txt", "r") as f:
TITLE_DENYLIST = [l.strip() for l in f]
-TITLE_DENYLIST.extend((
- 'editorial',
- 'advertisement',
- 'bookreviews',
- 'reviews',
- 'nr',
- 'abstractoriginalarticle',
- 'originalarticle',
- 'impactfactor',
- 'articlenumber',
-))
+TITLE_DENYLIST.extend(
+ (
+ "editorial",
+ "advertisement",
+ "bookreviews",
+ "reviews",
+ "nr",
+ "abstractoriginalarticle",
+ "originalarticle",
+ "impactfactor",
+ "articlenumber",
+ )
+)
# The full name can't *entirely* be one of these
NAME_DENYLIST = (
- 'phd',
- 'phdstudent',
+ "phd",
+ "phdstudent",
)
+
def tokenize(s, remove_whitespace=True):
-    s.replace('’', "'")
+    s.replace("’", "'")
# Remove non-alphanumeric characters
- s = ''.join([c for c in s.lower() if c.isalpha() or c.isspace()])
+ s = "".join([c for c in s.lower() if c.isalpha() or c.isspace()])
if remove_whitespace:
- s = ''.join(s.split())
+ s = "".join(s.split())
# Encode as dumb ASCII (TODO: this is horrible)
- return s.encode('ascii', 'replace').decode('utf8').replace('?', '')
+ return s.encode("ascii", "replace").decode("utf8").replace("?", "")
+
assert tokenize("Impact Factor: 2.114") == "impactfactor"
assert tokenize("Impact Factor: 2.114") in TITLE_DENYLIST
+
def filter_title(title):
title = title.strip()
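
A quick aside on behavior: tokenize() above lowercases, keeps only letters
(and optionally spaces), and drops anything non-ASCII. A standalone sketch
for illustration only; unlike the version in the diff, it assigns the result
of the apostrophe replacement (str.replace returns a new string, so both the
old and new lines above discard it):

    # Illustration only; mirrors tokenize() from the diff, but keeps the
    # return value of str.replace() instead of throwing it away.
    def tokenize_sketch(s, remove_whitespace=True):
        s = s.replace("\u2019", "'")  # normalize curly apostrophes
        s = "".join([c for c in s.lower() if c.isalpha() or c.isspace()])
        if remove_whitespace:
            s = "".join(s.split())
        # non-ASCII letters are dropped entirely, not transliterated
        return s.encode("ascii", "replace").decode("utf8").replace("?", "")

    assert tokenize_sketch("Impact Factor: 2.114") == "impactfactor"
    assert tokenize_sketch("Éditorial") == "ditorial"  # accented char is lost
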
@@ -47,14 +52,14 @@ def filter_title(title):
title_slug = tokenize(title, remove_whitespace=True)
if len(title_slug) < 10 or title_slug in TITLE_DENYLIST:
return None
- if title_slug.startswith('nr'):
+ if title_slug.startswith("nr"):
return None
- if title.lower().replace('.', '').startswith('int j '):
+ if title.lower().replace(".", "").startswith("int j "):
return None
for prefix in ("Title: ", "Original Article: ", "Article: ", "Original Article "):
if title.startswith(prefix):
- title.replace(prefix, '')
+ title.replace(prefix, "")
if title.startswith("The Journal of "):
return None
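
Worth flagging while these lines are being touched: the prefix stripping
above is a no-op in both the old and new versions, since the result of
str.replace() is never assigned. A minimal assigning variant, as a
hypothetical helper that is not part of this diff:

    # Sketch only: slice the prefix off and keep the result, instead of
    # calling str.replace() and discarding what it returns.
    def strip_title_prefix(title):
        for prefix in ("Title: ", "Original Article: ", "Article: ", "Original Article "):
            if title.startswith(prefix):
                return title[len(prefix):]
        return title

    assert strip_title_prefix("Title: On Filtering") == "On Filtering"
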
@@ -78,63 +83,84 @@ def filter_title(title):
return None
# too deep subtitling/splitting
- if title.count(':') > 3 or title.count('|') > 1 or title.count('.') > 1:
+ if title.count(":") > 3 or title.count("|") > 1 or title.count(".") > 1:
return None
return title
+
def filter_author_name(name):
- name = name['name']
- if name.strip().lower().replace(' ', '') in NAME_DENYLIST:
+ name = name["name"]
+ if name.strip().lower().replace(" ", "") in NAME_DENYLIST:
return None
- return ' '.join([t for t in name.split() if tokenize(t)])
+ return " ".join([t for t in name.split() if tokenize(t)])
+
def filter_authors(l):
return [dict(name=n) for n in map(filter_author_name, l) if n and len(n) > 1]
+
def filter_refs(l):
# TODO:
return l
+
def filter_journal_name(name):
# same denylist, for now
if not name:
return None
- name = name.replace(' e-ISSN', '').replace(' p-ISSN', '')
+ name = name.replace(" e-ISSN", "").replace(" p-ISSN", "")
slug_name = tokenize(name)
if slug_name in TITLE_DENYLIST or len(slug_name) < 4 or name == "N.º":
return None
- for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ", "Research Article ", "Available online www.jocpr.com "):
+ for prefix in (
+ "/ ",
+ "~ ",
+ "& ",
+ "© ",
+ "Original Research Article ",
+ "Original Article ",
+ "Research Article ",
+ "Available online www.jocpr.com ",
+ ):
if name.startswith(prefix):
- name = name.replace(prefix, '')
- for suffix in (" Available online at www.sciarena.com", " Original Article", " Available online at", " ISSN", " ISSUE"):
+ name = name.replace(prefix, "")
+ for suffix in (
+ " Available online at www.sciarena.com",
+ " Original Article",
+ " Available online at",
+ " ISSN",
+ " ISSUE",
+ ):
if name.endswith(suffix):
- name = name.replace(suffix, '')
+ name = name.replace(suffix, "")
if "====================" in name:
return None
if len(name) > 150:
return None
- return ' '.join(name.split())
+ return " ".join(name.split())
+
def filter_metadata(obj):
- if not (obj.get('title') and obj.get('authors')):
+ if not (obj.get("title") and obj.get("authors")):
return None
- title = filter_title(obj['title'])
+ title = filter_title(obj["title"])
if not title:
- #sys.stderr.write("bad title\n")
+ # sys.stderr.write("bad title\n")
return None
else:
- obj['title'] = title
- obj['authors'] = filter_authors(obj['authors'])
- obj['citations'] = filter_refs(obj['citations'])
- obj['journal']['name'] = filter_journal_name(obj['journal']['name'])
+ obj["title"] = title
+ obj["authors"] = filter_authors(obj["authors"])
+ obj["citations"] = filter_refs(obj["citations"])
+ obj["journal"]["name"] = filter_journal_name(obj["journal"]["name"])
return obj
+
def run(invert=False):
for line in sys.stdin:
- fields = line.split('\t')
+ fields = line.split("\t")
if len(fields) == 5:
raw = fields[4]
elif len(fields) == 1:
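
To make the record shape concrete, here is a hypothetical GROBID metadata
object as filter_metadata() consumes it; the key names come from the code
above, the values are invented:

    # Hypothetical input; only the keys referenced by filter_metadata()
    # are shown.
    example = {
        "title": "Original Article: A Study of Something Reasonably Long",
        "authors": [{"name": "Jane Doe"}, {"name": "PhD Student"}],
        "citations": [],
        "journal": {"name": "Journal of Examples e-ISSN 2455-1234"},
    }
    # filter_metadata(example) would drop the "PhD Student" author (its
    # slug is in NAME_DENYLIST) and strip " e-ISSN" from the journal
    # name; it returns None outright when the title or authors are
    # missing or rejected.
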
@@ -151,9 +177,10 @@ def run(invert=False):
fields[4] = processed
else:
fields[0] = processed
- print('\t'.join(fields))
+ print("\t".join(fields))
elif invert:
print(raw.strip())
-if __name__=="__main__":
+
+if __name__ == "__main__":
run(invert="--invert" in sys.argv)
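
For reference, run() reads either TSV rows (JSON metadata in the fifth
column) or bare JSON lines from stdin, and "--invert" prints the rejected
rows instead of the kept ones; title_slug_denylist.txt must exist in the
working directory. A hypothetical smoke test, assuming the elided middle of
run() serializes the filtered record back out (record values invented):

    # Hypothetical; assumes the script is in the current directory.
    import json
    import subprocess

    line = json.dumps({
        "title": "A Sufficiently Long and Plausible Example Title",
        "authors": [{"name": "Jane Doe"}],
        "citations": [],
        "journal": {"name": "Journal of Examples"},
    })
    result = subprocess.run(
        ["./filter_grobid_metadata.py"],
        input=line, capture_output=True, text=True,
    )
    print(result.stdout.strip())  # the kept record, if it passed the filters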