From 020c29105393bda68e493be06c0c7d124b2e20ea Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 9 Apr 2020 17:56:23 -0700 Subject: transform: remove more tags from abstracts --- fatcat_covid19/transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fatcat_covid19/transform.py b/fatcat_covid19/transform.py index 338b3e8..7e1a47f 100644 --- a/fatcat_covid19/transform.py +++ b/fatcat_covid19/transform.py @@ -81,7 +81,7 @@ def fulltext_to_elasticsearch(row, force_bool=True): # hack to (partially) clean up common JATS abstract display case if a.get('mimetype') == 'application/xml+jats': - for tag in ('p', 'jats', 'jats:p', 'jats:title'): + for tag in ('p', 'b', 'i', 'br', 'jats', 'jats:p', 'jats:title'): a['content'] = a['content'].replace('<{}>'.format(tag), '') a['content'] = a['content'].replace('</{}>'.format(tag), '') # ugh, double encoding happens -- cgit v1.2.3