some progress on a crude grobid metadata filter

author: Bryan Newbold <bnewbold@archive.org> 2018-09-26 01:55:22 +0000
committer: Bryan Newbold <bnewbold@archive.org> 2018-09-26 01:55:22 +0000
commit: 0bbe8e1f6689da846944d60a53e620adc2b7622b (patch)
tree: 38cac67e061e4948d9cf1d17f64c25abca486635
parent: 7159fdf1ec55a4c9c096afb5eb1ce57b9a51f1e8 (diff)
download: sandcrawler-0bbe8e1f6689da846944d60a53e620adc2b7622b.tar.gz
sandcrawler-0bbe8e1f6689da846944d60a53e620adc2b7622b.zip
2 files changed, 151 insertions, 7 deletions
diff --git a/python/filter_grobid_metadata.py b/python/filter_grobid_metadata.py
index 7f619db..c33ab86 100755
--- a/python/filter_grobid_metadata.py
+++ b/python/filter_grobid_metadata.py
@@ -3,14 +3,157 @@
 import sys
 import json
 
-def grobid_ok(obj):
-    return True
+# Load the slug denylist: one entry per line of the file, with surrounding
+# whitespace (including the trailing newline) stripped from each entry.
+with open('title_slug_blacklist.txt', 'r') as f:
+    TITLE_BLACKLIST = list(map(str.strip, f))
 
-def run():
+# Additional junk title slugs, appended in place to the list loaded from the
+# denylist file.
+TITLE_BLACKLIST += [
+    'editorial',
+    'advertisement',
+    'bookreviews',
+    'reviews',
+    'nr',
+    'abstractoriginalarticle',
+    'originalarticle',
+    'impactfactor',
+    'articlenumber',
+]
+
+# An author name is rejected only when the *whole* name (lowercased, spaces
+# removed) equals one of these entries; names merely containing them are kept.
+NAME_BLACKLIST = ('phd', 'phdstudent')
+
+def tokenize(s, remove_whitespace=True):
+    """Reduce a string to a lowercase, ASCII, letters-only slug.
+
+    The XML entity ``&apos;`` is decoded first, then everything except
+    letters and whitespace is dropped (digits and punctuation included).
+    Non-ASCII letters are removed from the result.
+
+    :param s: input string
+    :param remove_whitespace: if true (the default), all whitespace is
+        removed as well; otherwise whitespace is left in place
+    :returns: the slug, possibly an empty string
+    """
+    # str.replace() returns a new string, so the result has to be kept;
+    # otherwise the letters "apos" from the entity leak into the slug.
+    s = s.replace('&apos;', "'")
+    # Keep only letters and whitespace
+    s = ''.join([c for c in s.lower() if c.isalpha() or c.isspace()])
+
+    if remove_whitespace:
+        s = ''.join(s.split())
+
+    # Encode as dumb ASCII, dropping anything that does not fit
+    # (TODO: this is horrible)
+    return s.encode('ascii', 'ignore').decode('ascii')
+
+# Sanity checks evaluated when the module is loaded: tokenize() drops digits
+# and punctuation, and the resulting slug is one of the blacklisted titles.
+assert tokenize("Impact Factor: 2.114") == "impactfactor"
+assert tokenize("Impact Factor: 2.114") in TITLE_BLACKLIST
+
+def filter_title(title):
+    """Return a cleaned-up title, or None if it looks like extraction junk.
+
+    A title is rejected when it is longer than 500 characters, when its slug
+    (see tokenize()) is shorter than 10 characters or is on TITLE_BLACKLIST,
+    or when it matches one of the heuristics below (journal/issue headers,
+    download banners, letter-spaced text, too many or too few words, too
+    much sub-titling). Known boilerplate prefixes such as "Title: " are
+    stripped from the returned title.
+
+    :param title: raw title string
+    :returns: the stripped title, or None if it should be discarded
+    """
+    title = title.strip()
+    if len(title) > 500:
+        return None
+    title_slug = tokenize(title, remove_whitespace=True)
+    if len(title_slug) < 10 or title_slug in TITLE_BLACKLIST:
+        return None
+    if title_slug.startswith('nr'):
+        return None
+    if title.lower().replace('.', '').startswith('int j '):
+        return None
+
+    # Strip boilerplate prefixes. The result must be assigned back (strings
+    # are immutable), and slicing removes only the leading occurrence.
+    for prefix in ("Title: ", "Original Article: ", "Article: ", "Original Article "):
+        if title.startswith(prefix):
+            title = title[len(prefix):].lstrip()
+
+    if title.startswith("The Journal of "):
+        return None
+
+    if "volume" in title_slug and "issue" in title_slug:
+        return None
+
+    if "downloadedfrom" in title_slug:
+        return None
+
+    if title_slug.startswith("issn"):
+        return None
+
+    # titles with too many or too few words in title
+    words = title.split()
+    if len(words) > 50 or len(words) < 2:
+        return None
+
+    # titles with spaces between every letter (more than N such single-char words)
+    if sum(1 for w in words if len(w) == 1) > 12:
+        return None
+
+    # too deep subtitling/splitting
+    if title.count(':') > 3 or title.count('|') > 1 or title.count('.') > 1:
+        return None
+
+    return title
+
+def filter_author_name(name):
+    """Return a cleaned author name string, or None if it should be dropped.
+
+    :param name: author entry, a dict whose 'name' key holds the full name
+    :returns: None when the name is missing/null or when the whole name
+        (lowercased, spaces removed) is in NAME_BLACKLIST; otherwise the
+        name with every whitespace-separated token that contains no ASCII
+        letters removed (this may be an empty string)
+    """
+    # Tolerate entries with a missing or null name instead of raising
+    full_name = name.get('name')
+    if full_name is None:
+        return None
+    if full_name.strip().lower().replace(' ', '') in NAME_BLACKLIST:
+        return None
+    # tokenize() returns '' for tokens without letters (e.g. "*", "1,2")
+    return ' '.join([t for t in full_name.split() if tokenize(t)])
+
+def filter_authors(l):
+    """Clean a list of author entries, dropping unusable ones.
+
+    Each entry is passed through filter_author_name(); results that are
+    empty/None or only one character long are discarded, and the survivors
+    are returned as fresh ``{'name': ...}`` dicts in their original order.
+    """
+    kept = []
+    for author in l:
+        cleaned = filter_author_name(author)
+        if cleaned and len(cleaned) > 1:
+            kept.append(dict(name=cleaned))
+    return kept
+
+def filter_refs(l):
+    """Placeholder for reference filtering; currently returns *l* unchanged."""
+    # TODO: no filtering is implemented yet
+    return l
+
+def filter_journal_name(name):
+    """Return a cleaned-up journal name, or None if it looks like junk.
+
+    Rejected: empty names, names whose slug (see tokenize()) is on
+    TITLE_BLACKLIST or shorter than 4 characters, the literal "N.º", names
+    containing a long run of '=' characters, and names longer than 150
+    characters after trimming. Known boilerplate prefixes and suffixes are
+    trimmed, and runs of whitespace are collapsed to single spaces.
+
+    :param name: raw journal name (may be None or empty)
+    :returns: the cleaned name, or None if it should be discarded
+    """
+    # same blacklist as titles, for now
+    if not name:
+        return None
+    name = name.replace(' e-ISSN', '').replace(' p-ISSN', '')
+    slug_name = tokenize(name)
+    if slug_name in TITLE_BLACKLIST or len(slug_name) < 4 or name == "N.º":
+        return None
+    # Slice instead of str.replace(): replace() would also delete matching
+    # text from the middle of the name (e.g. the "& " in "A & B").
+    for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ", "Research Article ", "Available online www.jocpr.com "):
+        if name.startswith(prefix):
+            name = name[len(prefix):]
+    for suffix in (" Available online at www.sciarena.com", " Original Article", " Available online at", " ISSN", " ISSUE"):
+        if name.endswith(suffix):
+            name = name[:-len(suffix)]
+    if "====================" in name:
+        return None
+    if len(name) > 150:
+        return None
+    return ' '.join(name.split())
+
+def filter_metadata(obj):
+    """Clean one metadata record in place, or reject it.
+
+    :param obj: dict with at least 'title' and 'authors'; 'citations' and
+        'journal' are cleaned when present
+    :returns: the same dict with its fields cleaned, or None when the title
+        or authors are missing/empty or the title is rejected by
+        filter_title()
+    """
+    if not (obj.get('title') and obj.get('authors')):
+        return None
+
+    title = filter_title(obj['title'])
+    if not title:
+        #sys.stderr.write("bad title\n")
+        return None
+    obj['title'] = title
+    obj['authors'] = filter_authors(obj['authors'])
+    # Records without citations or journal info are passed through rather
+    # than aborting the whole run with a KeyError/TypeError.
+    if 'citations' in obj:
+        obj['citations'] = filter_refs(obj['citations'])
+    journal = obj.get('journal')
+    if isinstance(journal, dict):
+        journal['name'] = filter_journal_name(journal.get('name'))
+
+    return obj
+
+def run(invert=False):
+    """Filter metadata records read from stdin, writing results to stdout.
+
+    Each input line is either a bare JSON object or five tab-separated
+    columns with the JSON object in the last one. Lines with any other
+    column count, or whose JSON does not parse to an object, are reported
+    as "bad line" on stderr and skipped.
+
+    :param invert: when false, print each record accepted by
+        filter_metadata() with its JSON column replaced by the cleaned
+        version; when true, print only the raw JSON of rejected records
+    """
     for line in sys.stdin:
-        obj = json.loads(line)
-        if grobid_ok(obj):
-            print(line.strip())
+        fields = line.split('\t')
+        if len(fields) not in (1, 5):
+            sys.stderr.write("bad line\n")
+            continue
+        # in both accepted layouts the JSON is the last column
+        raw = fields[-1]
+        try:
+            obj = json.loads(raw)
+        except ValueError:
+            # json.JSONDecodeError is a ValueError; covers blank lines too
+            sys.stderr.write("bad line\n")
+            continue
+        if not isinstance(obj, dict):
+            sys.stderr.write("bad line\n")
+            continue
+        processed = filter_metadata(obj)
+        if processed:
+            if not invert:
+                fields[-1] = json.dumps(processed)
+                print('\t'.join(fields))
+        elif invert:
+            print(raw.strip())
 
 if __name__=="__main__":
-    run()
+    # Passing "--invert" anywhere on the command line switches run() to
+    # printing the rejected records instead of the accepted ones.
+    run(invert="--invert" in sys.argv)
diff --git a/python/title_slug_blacklist.txt b/python/title_slug_blacklist.txt
new file mode 120000
index 0000000..5bca386
--- /dev/null
+++ b/python/title_slug_blacklist.txt
@@ -0,0 +1 @@
+../scalding/src/main/resources/slug-denylist.txt
+\ No newline at end of file
author	Bryan Newbold <bnewbold@archive.org>	2018-09-26 01:55:22 +0000
committer	Bryan Newbold <bnewbold@archive.org>	2018-09-26 01:55:22 +0000
commit	0bbe8e1f6689da846944d60a53e620adc2b7622b (patch)
tree	38cac67e061e4948d9cf1d17f64c25abca486635
parent	7159fdf1ec55a4c9c096afb5eb1ce57b9a51f1e8 (diff)
download	sandcrawler-0bbe8e1f6689da846944d60a53e620adc2b7622b.tar.gz sandcrawler-0bbe8e1f6689da846944d60a53e620adc2b7622b.zip