aboutsummaryrefslogtreecommitdiffstats
path: root/fatcat_covid19
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-04-08 15:08:04 -0700
committerBryan Newbold <bnewbold@archive.org>2020-04-08 15:08:04 -0700
commitb92aef0bb235e8e38af2fbc25e33639f3654b191 (patch)
treef9db84128aa05b3cf2683c6caa5ee67b8934e7c6 /fatcat_covid19
parent7f5a6e64a865ff2a4de9b5bafeed68a0195597bf (diff)
downloadfatcat-covid19-b92aef0bb235e8e38af2fbc25e33639f3654b191.tar.gz
fatcat-covid19-b92aef0bb235e8e38af2fbc25e33639f3654b191.zip
transform: try to cleanup abstracts
Diffstat (limited to 'fatcat_covid19')
-rw-r--r--fatcat_covid19/transform.py34
1 files changed, 31 insertions, 3 deletions
diff --git a/fatcat_covid19/transform.py b/fatcat_covid19/transform.py
index 3f942ba..0eb1b0a 100644
--- a/fatcat_covid19/transform.py
+++ b/fatcat_covid19/transform.py
@@ -7,6 +7,17 @@ import datetime
from fatcat_covid19.common import *
+UNWANTED_ABSTRACT_PREFIXES = [
+ # keep roughly sorted long to short: only the first matching prefix is stripped
+ 'Abstract No Abstract ',
+ 'Publisher Summary ',
+ 'Abstract ',
+ 'ABSTRACT ',
+ 'Summary ',
+ 'Background: ',
+ 'Background ',
+]
+
def fulltext_to_elasticsearch(row, force_bool=True):
"""
Converts from fulltext content and release model/schema to elasticsearch
@@ -67,9 +78,26 @@ def fulltext_to_elasticsearch(row, force_bool=True):
if release.get('abstracts'):
for a in release['abstracts']:
- abstracts.append(a['content'])
- if a.get('lang'):
- abstract_langs.append(a['lang'])
+
+ # hack to (partially) clean up common JATS abstract display case
+ if a.get('mimetype') == 'application/xml+jats':
+ for tag in ('p', 'jats', 'jats:p', 'jats:title'):
+ a['content'] = a['content'].replace('<{}>'.format(tag), '')
+ a['content'] = a['content'].replace('</{}>'.format(tag), '')
+ # tags sometimes arrive double-encoded as HTML entities; strip those forms too
+ a['content'] = a['content'].replace('&lt;/{}&gt;'.format(tag), '')
+ a['content'] = a['content'].replace('&lt;{}&gt;'.format(tag), '')
+
+ # hack to remove abstract prefixes
+ for prefix in UNWANTED_ABSTRACT_PREFIXES:
+ if a['content'].startswith(prefix):
+ a['content'] = a['content'][len(prefix):]
+ break
+ a['content'] = a['content'].strip()
+ if a['content']:
+ abstracts.append(a['content'].strip())
+ if a.get('lang'):
+ abstract_langs.append(a['lang'])
contrib_names = []
contrib_affiliations = []