about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2019-03-05 17:35:59 -0800
committer Bryan Newbold <bnewbold@robocracy.org> 2019-05-21 11:41:29 -0700
commit a987af927686725f7778475f4c383d59c8c494bf (patch)
tree f620f4897ce810198c069fbb2f2f5a2570772ea9 /python
parent 3ec275c7d78aa261027f35c26366a382c5dd7a6c (diff)
download fatcat-a987af927686725f7778475f4c383d59c8c494bf.tar.gz
fatcat-a987af927686725f7778475f4c383d59c8c494bf.zip
small bugfixes to pubmed xml parser
Diffstat (limited to 'python')
-rw-r--r-- python/parse_pubmed_xml.py 24
1 files changed, 13 insertions, 11 deletions
diff --git a/python/parse_pubmed_xml.py b/python/parse_pubmed_xml.py
index 9350e9a4..413333cc 100644
--- a/python/parse_pubmed_xml.py
+++ b/python/parse_pubmed_xml.py
@@ -161,20 +161,22 @@ class PubMedParser():
if pages:
pages = pages.string
- title = medline.Article.ArticleTitle.string, # always present
- if type(title) is tuple:
- title = ': '.join(title)
- if title.endswith('.'):
- title = title[:-1]
- # this hides some "special" titles, but the vast majority are
- # translations; translations don't always include the original_title
- if title.startswith('[') and title.endswith(']'):
- title = title[1:-1]
+ title = medline.Article.ArticleTitle.string # always present
+ if title:
+ if title.endswith('.'):
+ title = title[:-1]
+ # this hides some "special" titles, but the vast majority are
+ # translations; translations don't always include the original_title
+ if title.startswith('[') and title.endswith(']'):
+ title = title[1:-1]
+ else:
+ # TODO: will filter out later
+ title = None
original_title = medline.Article.find("VernacularTitle", recurse=False)
if original_title:
- original_title = original_title.string
- if original_title.endswith('.'):
+ original_title = original_title.string or None
+ if original_title and original_title.endswith('.'):
original_title = original_title[:-1]
# TODO: happening in alpha order, not handling multi-language well.