diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-04-08 15:08:04 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-04-08 15:08:04 -0700 |
commit | b92aef0bb235e8e38af2fbc25e33639f3654b191 (patch) | |
tree | f9db84128aa05b3cf2683c6caa5ee67b8934e7c6 /fatcat_covid19 | |
parent | 7f5a6e64a865ff2a4de9b5bafeed68a0195597bf (diff) | |
download | fatcat-covid19-b92aef0bb235e8e38af2fbc25e33639f3654b191.tar.gz fatcat-covid19-b92aef0bb235e8e38af2fbc25e33639f3654b191.zip |
transform: try to cleanup abstracts
Diffstat (limited to 'fatcat_covid19')
-rw-r--r-- | fatcat_covid19/transform.py | 34 |
1 files changed, 31 insertions, 3 deletions
diff --git a/fatcat_covid19/transform.py b/fatcat_covid19/transform.py index 3f942ba..0eb1b0a 100644 --- a/fatcat_covid19/transform.py +++ b/fatcat_covid19/transform.py @@ -7,6 +7,17 @@ import datetime from fatcat_covid19.common import * +UNWANTED_ABSTRACT_PREFIXES = [ + # roughly sort this long to short + 'Abstract No Abstract ', + 'Publisher Summary ', + 'Abstract ', + 'ABSTRACT ', + 'Summary ', + 'Background: ', + 'Background ', +] + def fulltext_to_elasticsearch(row, force_bool=True): """ Converts from fulltext content and release model/schema to elasticsearch @@ -67,9 +78,26 @@ def fulltext_to_elasticsearch(row, force_bool=True): if release.get('abstracts'): for a in release['abstracts']: - abstracts.append(a['content']) - if a.get('lang'): - abstract_langs.append(a['lang']) + + # hack to (partially) clean up common JATS abstract display case + if a.get('mimetype') == 'application/xml+jats': + for tag in ('p', 'jats', 'jats:p', 'jats:title'): + a['content'] = a['content'].replace('<{}>'.format(tag), '') + a['content'] = a['content'].replace('</{}>'.format(tag), '') + # ugh, double encoding happens + a['content'] = a['content'].replace('&lt;/{}&gt;'.format(tag), '') + a['content'] = a['content'].replace('&lt;{}&gt;'.format(tag), '') + + # hack to remove abstract prefixes + for prefix in UNWANTED_ABSTRACT_PREFIXES: + if a['content'].startswith(prefix): + a['content'] = a['content'][len(prefix):] + break + a['content'] = a['content'].strip() + if a['content']: + abstracts.append(a['content'].strip()) + if a.get('lang'): + abstract_langs.append(a['lang']) contrib_names = [] contrib_affiliations = [] |