diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-04-09 18:27:47 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-04-09 18:27:49 -0700 |
commit | 807a0d3481f725a4fab81b84f6e7053bde5ef797 (patch) | |
tree | 761a53dd741de139473e579b9d5f2eaac4fa523a /fatcat_covid19/transform.py | |
parent | 020c29105393bda68e493be06c0c7d124b2e20ea (diff) | |
download | fatcat-covid19-807a0d3481f725a4fab81b84f6e7053bde5ef797.tar.gz fatcat-covid19-807a0d3481f725a4fab81b84f6e7053bde5ef797.zip |
attempt somewhat more robust abstract cleaning
Note: there is still a security and robustness issue here, in that
highlights are marked "safe". We should come up with a better mechanism
for escaping/sanitizing.
Diffstat (limited to 'fatcat_covid19/transform.py')
-rw-r--r-- | fatcat_covid19/transform.py | 11 |
1 files changed, 4 insertions, 7 deletions
```diff
diff --git a/fatcat_covid19/transform.py b/fatcat_covid19/transform.py
index 7e1a47f..1d2fa7c 100644
--- a/fatcat_covid19/transform.py
+++ b/fatcat_covid19/transform.py
@@ -4,6 +4,8 @@
 import json
 import argparse
 import datetime
+from bs4 import BeautifulSoup
+
 
 from fatcat_covid19.common import *
 
@@ -80,13 +82,8 @@ def fulltext_to_elasticsearch(row, force_bool=True):
 
     for a in release['abstracts']:
         # hack to (partially) clean up common JATS abstract display case
-        if a.get('mimetype') == 'application/xml+jats':
-            for tag in ('p', 'b', 'i', 'br', 'jats', 'jats:p', 'jats:title'):
-                a['content'] = a['content'].replace('<{}>'.format(tag), '')
-                a['content'] = a['content'].replace('</{}>'.format(tag), '')
-                # ugh, double encoding happens
-                a['content'] = a['content'].replace('&lt;/{}&gt;'.format(tag), '')
-                a['content'] = a['content'].replace('&lt;{}&gt;'.format(tag), '')
+        if a.get('mimetype') == 'application/xml+jats' or "</" in a['content']:
+            a['content'] = BeautifulSoup(a['content'], "lxml").get_text()
 
         # hack to remove abstract prefixes
         for prefix in UNWANTED_ABSTRACT_PREFIXES:
```