diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-03-05 17:35:59 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-03-05 17:35:59 -0800 |
commit | dcdebd34883d1a3afa568acea5ebc30842cda538 (patch) | |
tree | 19d7c4eb4013cffa9e84df3532778d7646429eba | |
parent | c8d24dd3c743a2413eb87ec02a5a9e5e67c4f7a1 (diff) | |
download | fatcat-dcdebd34883d1a3afa568acea5ebc30842cda538.tar.gz fatcat-dcdebd34883d1a3afa568acea5ebc30842cda538.zip |
small bugfixes to pubmed xml parser
-rw-r--r-- | python/parse_pubmed_xml.py | 24 |
1 file changed, 13 insertions, 11 deletions
```diff
diff --git a/python/parse_pubmed_xml.py b/python/parse_pubmed_xml.py
index 9350e9a4..413333cc 100644
--- a/python/parse_pubmed_xml.py
+++ b/python/parse_pubmed_xml.py
@@ -161,20 +161,22 @@ class PubMedParser():
         if pages:
             pages = pages.string
 
-        title = medline.Article.ArticleTitle.string, # always present
-        if type(title) is tuple:
-            title = ': '.join(title)
-        if title.endswith('.'):
-            title = title[:-1]
-        # this hides some "special" titles, but the vast majority are
-        # translations; translations don't always include the original_title
-        if title.startswith('[') and title.endswith(']'):
-            title = title[1:-1]
+        title = medline.Article.ArticleTitle.string # always present
+        if title:
+            if title.endswith('.'):
+                title = title[:-1]
+            # this hides some "special" titles, but the vast majority are
+            # translations; translations don't always include the original_title
+            if title.startswith('[') and title.endswith(']'):
+                title = title[1:-1]
+        else:
+            # TODO: will filter out later
+            title = None
 
         original_title = medline.Article.find("VernacularTitle", recurse=False)
         if original_title:
-            original_title = original_title.string
-            if original_title.endswith('.'):
+            original_title = original_title.string or None
+            if original_title and original_title.endswith('.'):
                 original_title = original_title[:-1]
 
         # TODO: happening in alpha order, not handling multi-language well.
```