small bugfixes to pubmed xml parser

author: Bryan Newbold <bnewbold@robocracy.org> 2019-03-05 17:35:59 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-05-21 11:41:29 -0700
commit: a987af927686725f7778475f4c383d59c8c494bf (patch)
tree: f620f4897ce810198c069fbb2f2f5a2570772ea9
parent: 3ec275c7d78aa261027f35c26366a382c5dd7a6c (diff)
download: fatcat-a987af927686725f7778475f4c383d59c8c494bf.tar.gz fatcat-a987af927686725f7778475f4c383d59c8c494bf.zip
1 file changed, 13 insertions, 11 deletions
diff --git a/python/parse_pubmed_xml.py b/python/parse_pubmed_xml.py
index 9350e9a4..413333cc 100644
--- a/python/parse_pubmed_xml.py
+++ b/python/parse_pubmed_xml.py
@@ -161,20 +161,22 @@ class PubMedParser():
         if pages:
             pages = pages.string
 
-        title = medline.Article.ArticleTitle.string, # always present
-        if type(title) is tuple:
-            title = ': '.join(title)
-        if title.endswith('.'):
-            title = title[:-1]
-        # this hides some "special" titles, but the vast majority are
-        # translations; translations don't always include the original_title
-        if title.startswith('[') and title.endswith(']'):
-            title = title[1:-1]
+        title = medline.Article.ArticleTitle.string # always present
+        if title:
+            if title.endswith('.'):
+                title = title[:-1]
+            # this hides some "special" titles, but the vast majority are
+            # translations; translations don't always include the original_title
+            if title.startswith('[') and title.endswith(']'):
+                title = title[1:-1]
+        else:
+            # TODO: will filter out later
+            title = None
 
         original_title = medline.Article.find("VernacularTitle", recurse=False)
         if original_title:
-            original_title = original_title.string
-            if original_title.endswith('.'):
+            original_title = original_title.string or None
+            if original_title and original_title.endswith('.'):
                 original_title = original_title[:-1]
 
         # TODO: happening in alpha order, not handling multi-language well.
author	Bryan Newbold <bnewbold@robocracy.org>	2019-03-05 17:35:59 -0800
committer	Bryan Newbold <bnewbold@robocracy.org>	2019-05-21 11:41:29 -0700
commit	a987af927686725f7778475f4c383d59c8c494bf (patch)
tree	f620f4897ce810198c069fbb2f2f5a2570772ea9
parent	3ec275c7d78aa261027f35c26366a382c5dd7a6c (diff)
download	fatcat-a987af927686725f7778475f4c383d59c8c494bf.tar.gz fatcat-a987af927686725f7778475f4c383d59c8c494bf.zip