about summary refs log tree commit diff stats
path: root/fatcat_covid19
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-04-09 17:56:23 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-04-09 17:56:23 -0700
commit 020c29105393bda68e493be06c0c7d124b2e20ea (patch)
tree bc23f3a53a47b7d992860bec4477cf45217adf7f /fatcat_covid19
parent 09e8e07602d04acef6fada5c82f1ae6268d03d3f (diff)
download fatcat-covid19-020c29105393bda68e493be06c0c7d124b2e20ea.tar.gz
fatcat-covid19-020c29105393bda68e493be06c0c7d124b2e20ea.zip
transform: remove more tags from abstracts
Diffstat (limited to 'fatcat_covid19')
-rw-r--r-- fatcat_covid19/transform.py 2
1 files changed, 1 insertions, 1 deletions
diff --git a/fatcat_covid19/transform.py b/fatcat_covid19/transform.py
index 338b3e8..7e1a47f 100644
--- a/fatcat_covid19/transform.py
+++ b/fatcat_covid19/transform.py
@@ -81,7 +81,7 @@ def fulltext_to_elasticsearch(row, force_bool=True):
# hack to (partially) clean up common JATS abstract display case
if a.get('mimetype') == 'application/xml+jats':
- for tag in ('p', 'jats', 'jats:p', 'jats:title'):
+ for tag in ('p', 'b', 'i', 'br', 'jats', 'jats:p', 'jats:title'):
a['content'] = a['content'].replace('<{}>'.format(tag), '')
a['content'] = a['content'].replace('</{}>'.format(tag), '')
# ugh, double encoding happens