about summary refs log tree commit diff stats
path: root/python/parse_pubmed_xml.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2019-03-05 17:35:59 -0800
committer Bryan Newbold <bnewbold@robocracy.org> 2019-03-05 17:35:59 -0800
commit dcdebd34883d1a3afa568acea5ebc30842cda538 (patch)
tree 19d7c4eb4013cffa9e84df3532778d7646429eba /python/parse_pubmed_xml.py
parent c8d24dd3c743a2413eb87ec02a5a9e5e67c4f7a1 (diff)
download fatcat-dcdebd34883d1a3afa568acea5ebc30842cda538.tar.gz
fatcat-dcdebd34883d1a3afa568acea5ebc30842cda538.zip
small bugfixes to pubmed xml parser
Diffstat (limited to 'python/parse_pubmed_xml.py')
-rw-r--r-- python/parse_pubmed_xml.py 24
1 files changed, 13 insertions, 11 deletions
diff --git a/python/parse_pubmed_xml.py b/python/parse_pubmed_xml.py
index 9350e9a4..413333cc 100644
--- a/python/parse_pubmed_xml.py
+++ b/python/parse_pubmed_xml.py
@@ -161,20 +161,22 @@ class PubMedParser():
if pages:
pages = pages.string
- title = medline.Article.ArticleTitle.string, # always present
- if type(title) is tuple:
- title = ': '.join(title)
- if title.endswith('.'):
- title = title[:-1]
- # this hides some "special" titles, but the vast majority are
- # translations; translations don't always include the original_title
- if title.startswith('[') and title.endswith(']'):
- title = title[1:-1]
+ title = medline.Article.ArticleTitle.string # always present
+ if title:
+ if title.endswith('.'):
+ title = title[:-1]
+ # this hides some "special" titles, but the vast majority are
+ # translations; translations don't always include the original_title
+ if title.startswith('[') and title.endswith(']'):
+ title = title[1:-1]
+ else:
+ # TODO: will filter out later
+ title = None
original_title = medline.Article.find("VernacularTitle", recurse=False)
if original_title:
- original_title = original_title.string
- if original_title.endswith('.'):
+ original_title = original_title.string or None
+ if original_title and original_title.endswith('.'):
original_title = original_title[:-1]
# TODO: happening in alpha order, not handling multi-language well.