aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2018-09-26 01:55:22 +0000
committerBryan Newbold <bnewbold@archive.org>2018-09-26 01:55:22 +0000
commit0bbe8e1f6689da846944d60a53e620adc2b7622b (patch)
tree38cac67e061e4948d9cf1d17f64c25abca486635
parent7159fdf1ec55a4c9c096afb5eb1ce57b9a51f1e8 (diff)
downloadsandcrawler-0bbe8e1f6689da846944d60a53e620adc2b7622b.tar.gz
sandcrawler-0bbe8e1f6689da846944d60a53e620adc2b7622b.zip
some progress on a crude grobid metadata filter
-rwxr-xr-xpython/filter_grobid_metadata.py157
l---------python/title_slug_blacklist.txt1
2 files changed, 151 insertions, 7 deletions
diff --git a/python/filter_grobid_metadata.py b/python/filter_grobid_metadata.py
index 7f619db..c33ab86 100755
--- a/python/filter_grobid_metadata.py
+++ b/python/filter_grobid_metadata.py
@@ -3,14 +3,157 @@
import sys
import json
# Title slugs that disqualify a record: the shared denylist file plus a
# few entries maintained inline here.
with open('title_slug_blacklist.txt', 'r') as f:
    TITLE_BLACKLIST = [line.strip() for line in f]

TITLE_BLACKLIST.extend((
    'editorial',
    'advertisement',
    'bookreviews',
    'reviews',
    'nr',
    'abstractoriginalarticle',
    'originalarticle',
    'impactfactor',
    'articlenumber',
))

# The full name can't *entirely* be one of these
NAME_BLACKLIST = (
    'phd',
    'phdstudent',
)
+
def tokenize(s, remove_whitespace=True):
    """Normalize a string to a lowercase, letters-only slug.

    Decodes the one HTML entity seen in practice (&apos;), drops every
    character that is not a letter or whitespace (digits included), then
    optionally removes all whitespace. Non-ASCII letters are crudely
    stripped via ASCII-encode-with-replace.
    """
    # BUG FIX: str.replace() returns a new string (strings are immutable);
    # the original discarded the result, so the entity was never decoded.
    s = s.replace('&apos;', "'")
    # Remove non-alphabetic characters (digits and punctuation are dropped)
    s = ''.join([c for c in s.lower() if c.isalpha() or c.isspace()])

    if remove_whitespace:
        s = ''.join(s.split())

    # Encode as dumb ASCII (TODO: this is horrible)
    return s.encode('ascii', 'replace').decode('utf8').replace('?', '')
+
# Sanity checks: the tokenizer's normalization must line up with how the
# blacklist entries are written.
assert tokenize("Impact Factor: 2.114") == "impactfactor"
assert tokenize("Impact Factor: 2.114") in TITLE_BLACKLIST
+
def filter_title(title):
    """Return a cleaned title string, or None if the title looks bogus.

    Rejects over-long/short titles, blacklisted slugs, journal-front-matter
    patterns, letter-spaced OCR junk, and over-subtitled strings.
    """

    title = title.strip()
    if len(title) > 500:
        return None
    title_slug = tokenize(title, remove_whitespace=True)
    if len(title_slug) < 10 or title_slug in TITLE_BLACKLIST:
        return None
    if title_slug.startswith('nr'):
        return None
    # journal-abbreviation lead-in like "Int. J. ..."
    if title.lower().replace('.', '').startswith('int j '):
        return None

    # Strip boilerplate lead-ins.
    # BUG FIX: the original called title.replace(prefix, '') and discarded
    # the return value, so these prefixes were never actually removed.
    for prefix in ("Title: ", "Original Article: ", "Article: ", "Original Article "):
        if title.startswith(prefix):
            title = title[len(prefix):]

    if title.startswith("The Journal of "):
        return None

    if "volume" in title_slug and "issue" in title_slug:
        return None

    if "downloadedfrom" in title_slug:
        return None

    if title_slug.startswith("issn"):
        return None

    # titles with too many or too few words in title
    title_words = len(title.split())
    if title_words > 50 or title_words < 2:
        return None

    # titles with spaces between every letter (more than N such single-char words)
    if len([True for w in title.split() if len(w) == 1]) > 12:
        return None

    # too deep subtitling/splitting
    if title.count(':') > 3 or title.count('|') > 1 or title.count('.') > 1:
        return None

    return title
+
def filter_author_name(name):
    """Return a cleaned author name string, or None if blacklisted.

    Expects a GROBID author dict with a 'name' key; keeps only the
    whitespace-separated tokens that survive tokenization.
    """
    raw = name['name']
    if raw.strip().lower().replace(' ', '') in NAME_BLACKLIST:
        return None
    kept_tokens = [tok for tok in raw.split() if tokenize(tok)]
    return ' '.join(kept_tokens)
+
def filter_authors(l):
    """Clean a list of author dicts, dropping empty or one-character names."""
    cleaned = (filter_author_name(author) for author in l)
    return [dict(name=n) for n in cleaned if n and len(n) > 1]
+
def filter_refs(l):
    """Reference filtering is not implemented yet; pass citations through."""
    # TODO:
    return l
+
def filter_journal_name(name):
    """Return a cleaned journal name, or None if missing or unusable."""
    # same blacklist, for now
    if not name:
        return None
    name = name.replace(' e-ISSN', '').replace(' p-ISSN', '')
    slug_name = tokenize(name)
    if slug_name in TITLE_BLACKLIST or len(slug_name) < 4 or name == "N.º":
        return None
    # BUG FIX: the original used str.replace(), which removes *every*
    # occurrence of the boilerplate, not just the leading/trailing one;
    # slice instead so only the matched prefix/suffix is cut.
    for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ", "Research Article ", "Available online www.jocpr.com "):
        if name.startswith(prefix):
            name = name[len(prefix):]
    for suffix in (" Available online at www.sciarena.com", " Original Article", " Available online at", " ISSN", " ISSUE"):
        if name.endswith(suffix):
            name = name[:-len(suffix)]
    if "====================" in name:
        return None
    if len(name) > 150:
        return None
    # collapse internal whitespace runs
    return ' '.join(name.split())
+
def filter_metadata(obj):
    """Clean a GROBID metadata dict in place; return it, or None to reject.

    Requires a non-empty title and authors list; scrubs title, authors,
    citations, and journal name via the helpers above.
    """
    if not (obj.get('title') and obj.get('authors')):
        return None

    title = filter_title(obj['title'])
    if not title:
        #sys.stderr.write("bad title\n")
        return None
    obj['title'] = title
    obj['authors'] = filter_authors(obj['authors'])
    # ROBUSTNESS FIX: the original indexed obj['citations'] and
    # obj['journal']['name'] unconditionally, raising KeyError/TypeError on
    # partial GROBID output; guard the optional fields.
    if 'citations' in obj:
        obj['citations'] = filter_refs(obj['citations'])
    journal = obj.get('journal')
    if isinstance(journal, dict) and 'name' in journal:
        journal['name'] = filter_journal_name(journal['name'])

    return obj
+
def run(invert=False):
    """Filter JSON metadata lines read from stdin and print survivors.

    Accepts either bare-JSON lines or 5-column TSV lines with JSON in the
    last column; other shapes are reported to stderr and skipped. With
    invert=True, prints only the raw JSON of *rejected* lines instead.
    """
    for line in sys.stdin:
        fields = line.split('\t')
        if len(fields) == 5:
            json_col = 4
        elif len(fields) == 1:
            json_col = 0
        else:
            sys.stderr.write("bad line\n")
            continue
        raw = fields[json_col]
        processed = filter_metadata(json.loads(raw))
        if processed:
            if not invert:
                fields[json_col] = json.dumps(processed)
                print('\t'.join(fields))
        elif invert:
            print(raw.strip())

if __name__=="__main__":
    run(invert="--invert" in sys.argv)
diff --git a/python/title_slug_blacklist.txt b/python/title_slug_blacklist.txt
new file mode 120000
index 0000000..5bca386
--- /dev/null
+++ b/python/title_slug_blacklist.txt
@@ -0,0 +1 @@
+../scalding/src/main/resources/slug-denylist.txt \ No newline at end of file