diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2019-03-05 17:35:59 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-21 11:41:29 -0700 | 
| commit | a987af927686725f7778475f4c383d59c8c494bf (patch) | |
| tree | f620f4897ce810198c069fbb2f2f5a2570772ea9 | |
| parent | 3ec275c7d78aa261027f35c26366a382c5dd7a6c (diff) | |
| download | fatcat-a987af927686725f7778475f4c383d59c8c494bf.tar.gz fatcat-a987af927686725f7778475f4c383d59c8c494bf.zip | |
small bugfixes to pubmed xml parser
| -rw-r--r-- | python/parse_pubmed_xml.py | 24 | 
1 file changed, 13 insertions, 11 deletions
```diff
diff --git a/python/parse_pubmed_xml.py b/python/parse_pubmed_xml.py
index 9350e9a4..413333cc 100644
--- a/python/parse_pubmed_xml.py
+++ b/python/parse_pubmed_xml.py
@@ -161,20 +161,22 @@ class PubMedParser():
         if pages:
             pages = pages.string
 
-        title = medline.Article.ArticleTitle.string, # always present
-        if type(title) is tuple:
-            title = ': '.join(title)
-        if title.endswith('.'):
-            title = title[:-1]
-        # this hides some "special" titles, but the vast majority are
-        # translations; translations don't always include the original_title
-        if title.startswith('[') and title.endswith(']'):
-            title = title[1:-1]
+        title = medline.Article.ArticleTitle.string # always present
+        if title:
+            if title.endswith('.'):
+                title = title[:-1]
+            # this hides some "special" titles, but the vast majority are
+            # translations; translations don't always include the original_title
+            if title.startswith('[') and title.endswith(']'):
+                title = title[1:-1]
+        else:
+            # TODO: will filter out later
+            title = None
 
         original_title = medline.Article.find("VernacularTitle", recurse=False)
         if original_title:
-            original_title = original_title.string
-            if original_title.endswith('.'):
+            original_title = original_title.string or None
+            if original_title and original_title.endswith('.'):
                 original_title = original_title[:-1]
 
         # TODO: happening in alpha order, not handling multi-language well.
```
