basic pubmed parser

author: Bryan Newbold <bnewbold@robocracy.org> 2019-03-05 14:27:31 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-03-05 14:27:31 -0800
commit: bee73c6adb8eea919e5581a5050ba0311dd5ed22 (patch)
tree: bd34dd968246fd39c48762270fa5888b4389bcba /python/parse_pubmed_xml.py
parent: 30a8fc5f78eb15f641d6e8da0e7c0741d3f82b13 (diff)
download: fatcat-bee73c6adb8eea919e5581a5050ba0311dd5ed22.tar.gz
fatcat-bee73c6adb8eea919e5581a5050ba0311dd5ed22.zip
1 files changed, 370 insertions, 0 deletions
diff --git a/python/parse_pubmed_xml.py b/python/parse_pubmed_xml.py
new file mode 100644
index 00000000..9350e9a4
--- /dev/null
+++ b/python/parse_pubmed_xml.py
@@ -0,0 +1,370 @@
+
+import sys
+import json
+import datetime
+from bs4 import BeautifulSoup
+from bs4.element import NavigableString
+
+# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
+PUBMED_RELEASE_TYPE_MAP = {
+    #Adaptive Clinical Trial
+    "Address": "speech",
+    "Autobiography": "book",
+    #Bibliography
+    "Biography": "book",
+    #Case Reports
+    "Classical Article": "article-journal",
+    #Clinical Conference
+    #Clinical Study
+    #Clinical Trial
+    #Clinical Trial, Phase I
+    #Clinical Trial, Phase II
+    #Clinical Trial, Phase III
+    #Clinical Trial, Phase IV
+    #Clinical Trial Protocol
+    #Clinical Trial, Veterinary
+    #Collected Works
+    #Comparative Study
+    #Congress
+    #Consensus Development Conference
+    #Consensus Development Conference, NIH
+    #Controlled Clinical Trial
+    "Dataset": "dataset",
+    #Dictionary
+    #Directory
+    #Duplicate Publication
+    "Editorial": "editorial",
+    #English Abstract   # doesn't indicate that this is abstract-only
+    #Equivalence Trial
+    #Evaluation Studies
+    #Expression of Concern
+    #Festschrift
+    #Government Document
+    #Guideline
+    "Historical Article": "article-journal",
+    #Interactive Tutorial
+    "Interview": "interview",
+    "Introductory Journal Article": "article-journal",
+    "Journal Article": "article-journal",
+    "Lecture": "speech",
+    "Legal Case": "legal_case",
+    "Legislation": "legislation",
+    "Letter": "letter",
+    #Meta-Analysis
+    #Multicenter Study
+    #News
+    "Newspaper Article": "article-newspaper",
+    #Observational Study
+    #Observational Study, Veterinary
+    #Overall
+    #Patient Education Handout
+    #Periodical Index
+    #Personal Narrative
+    #Portrait
+    #Practice Guideline
+    #Pragmatic Clinical Trial
+    #Publication Components
+    #Publication Formats
+    #Publication Type Category
+    #Randomized Controlled Trial
+    #Research Support, American Recovery and Reinvestment Act
+    #Research Support, N.I.H., Extramural
+    #Research Support, N.I.H., Intramural
+    #Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S.
+    #Research Support, U.S. Gov't, P.H.S.
+    #Review     # in the "literature review" sense, not "product review"
+    #Scientific Integrity Review
+    #Study Characteristics
+    #Support of Research
+    #Systematic Review
+    "Technical Report": "report",
+    #Twin Study
+    #Validation Studies
+    #Video-Audio Media
+    #Webcasts
+}
+
+MONTH_ABBR_MAP = {
+    "Jan":  1, "01":  1,
+    "Feb":  2, "02":  2,
+    "Mar":  3, "03":  3,
+    "Apr":  4, "04":  4,
+    "May":  5, "05":  5,
+    "Jun":  6, "06":  6,
+    "Jul":  7, "07":  7,
+    "Aug":  8, "08":  8,
+    "Sep":  9, "09":  9,
+    "Oct": 10, "10": 10,
+    "Nov": 11, "11": 11,
+    "Dec": 12, "12": 12,
+}
+
+class PubMedParser():
+    """
+    Converts PubMed/MEDLINE XML into in release entity (which can dump as JSON)
+
+    TODO: MEDLINE doesn't include PMC/OA license; could include in importer?
+    TODO: clean (ftfy) title, original title, etc
+    """
+
+    def __init__(self):
+        pass
+
+    def parse_file(self, handle):
+
+        # 1. open with beautiful soup
+        soup = BeautifulSoup(handle, "xml")
+
+        # 2. iterate over articles, call parse_article on each
+        for article in soup.find_all("PubmedArticle"):
+            resp = self.parse_article(article)
+            print(json.dumps(resp))
+            #sys.exit(-1)
+
+    def parse_article(self, a):
+
+        medline = a.MedlineCitation
+        # PubmedData isn't required by DTD, but seems to always be present
+        pubmed = a.PubmedData
+        extra = dict()
+        extra_pubmed = dict()
+
+        identifiers = pubmed.ArticleIdList
+        doi = identifiers.find("ArticleId", IdType="doi")
+        if doi:
+            doi = doi.string.lower()
+
+        pmcid = identifiers.find("ArticleId", IdType="pmc")
+        if pmcid:
+            pmcid = pmcid.string
+
+        release_type = None
+        for pub_type in medline.Article.PublicationTypeList.find_all("PublicationType"):
+            if pub_type.string in PUBMED_RELEASE_TYPE_MAP:
+                release_type = PUBMED_RELEASE_TYPE_MAP[pub_type.string]
+            break
+        if medline.Article.PublicationTypeList.find(string="Retraction of Publication"):
+            release_type = "retraction"
+            retraction_of = medline.find("CommentsCorrections", RefType="RetractionOf")
+            if retraction_of:
+                extra_pubmed['retraction_of_raw'] = retraction_of.RefSource.string
+                extra_pubmed['retraction_of_pmid'] = retraction_of.PMID.string
+
+        # everything in medline is published
+        release_status = "published"
+        if medline.Article.PublicationTypeList.find(string="Corrected and Republished Article"):
+            release_status = "updated"
+        if medline.Article.PublicationTypeList.find(string="Retracted Publication"):
+            release_status = "retracted"
+
+        pages = medline.find('MedlinePgn')
+        if pages:
+            pages = pages.string
+
+        title = medline.Article.ArticleTitle.string, # always present
+        if type(title) is tuple:
+            title = ': '.join(title)
+        if title.endswith('.'):
+            title = title[:-1]
+        # this hides some "special" titles, but the vast majority are
+        # translations; translations don't always include the original_title
+        if title.startswith('[') and title.endswith(']'):
+            title = title[1:-1]
+
+        original_title = medline.Article.find("VernacularTitle", recurse=False)
+        if original_title:
+            original_title = original_title.string
+            if original_title.endswith('.'):
+                original_title = original_title[:-1]
+
+        # TODO: happening in alpha order, not handling multi-language well.
+        # also need to convert lang codes: https://www.nlm.nih.gov/bsd/language_table.html
+        language = medline.Article.Language
+        if language:
+            language = language.string
+            # TODO: map to two-letter
+            if language in ("und", "un"):
+                # "undetermined"
+                language = None
+
+        ### Journal/Issue Metadata
+        # MedlineJournalInfo is always present
+        container = dict()
+        container_extra = dict()
+        mji = medline.MedlineJournalInfo
+        if mji.find("Country"):
+            container_extra['country_name'] = mji.Country.string
+        if mji.find("ISSNLinking"):
+            container['issnl'] = mji.ISSNLinking.string
+
+        journal = medline.Article.Journal
+        issnp = journal.find("ISSN", IssnType="Print")
+        if issnp:
+            container_extra['issnp'] = issnp.string
+
+        pub_date = journal.PubDate
+        release_date = None
+        if pub_date.find("MedlineDate"):
+            release_year = int(pub_date.MedlineDate.string.split()[0][:4])
+        else:
+            release_year = int(pub_date.Year.string)
+            if pub_date.find("Day") and pub_date.find("Month"):
+                release_date = datetime.date(
+                    release_year,
+                    MONTH_ABBR_MAP[pub_date.Month.string],
+                    int(pub_date.Day.string))
+                release_date = release_date.isoformat()
+       
+        ji = journal.JournalIssue
+        volume = None
+        if ji.find("Volume"):
+            volume = ji.Volume.string
+        issue = None
+        if ji.find("Issue"):
+            issue = ji.Issue.string
+        if journal.find("Title"):
+            container['name'] = journal.Title.string
+
+        if extra_pubmed:
+            extra['pubmed'] = extra_pubmed
+        if not extra:
+            extra = None
+
+        ### Abstracts
+        # "All abstracts are in English"
+        abstracts = []
+        first_abstract = medline.find("AbstractText")
+        if first_abstract and first_abstract.get('NlmCategory'):
+            joined = "\n".join([m.get_text() for m in medline.find_all("AbstractText")])
+            abstracts.append(dict(
+                content=joined,
+                mimetype="text/plain",
+                lang="en",
+            ))
+        else:
+            for abstract in medline.find_all("AbstractText"):
+                abstracts.append(dict(
+                    content=abstract.get_text().strip(),
+                    mimetype="text/plain",
+                    lang="en",
+                ))
+                if abstract.find('math'):
+                    abstracts.append(dict(
+                        # strip the <AbstractText> tags
+                        content=str(abstract)[14:-15],
+                        mimetype="application/mathml+xml",
+                        lang="en",
+                    ))
+        if not abstracts:
+            abstracts = None
+
+        ### Contribs
+        contribs = []
+        if medline.AuthorList:
+            for author in medline.AuthorList.find_all("Author"):
+                contrib = dict(
+                    role="author",
+                )
+                if author.ForeName:
+                    contrib['raw_name'] = "{} {}".format(author.ForeName.string, author.LastName.string)
+                elif author.LastName:
+                    contrib['raw_name'] = author.LastName.string
+                contrib_extra = dict()
+                orcid = author.find("Identifier", Source="ORCID")
+                if orcid:
+                    # needs re-formatting from, eg, "0000000179841889"
+                    orcid = orcid.string
+                    if orcid.startswith("http://orcid.org/"):
+                        orcid = orcid.replace("http://orcid.org/", "")
+                    elif orcid.startswith("https://orcid.org/"):
+                        orcid = orcid.replace("https://orcid.org/", "")
+                    elif not '-' in orcid:
+                        orcid = "{}-{}-{}-{}".format(
+                            orcid[0:4],
+                            orcid[4:8],
+                            orcid[8:12],
+                            orcid[12:16],
+                        )
+                    contrib_extra['orcid'] = orcid
+                affiliation = author.find("Affiliation")
+                if affiliation:
+                    contrib['raw_affiliation'] = affiliation.string
+                if author.find("EqualContrib"):
+                    # TODO: schema for this?
+                    contrib_extra['equal_contrib'] = True
+                if contrib_extra:
+                    contrib['extra'] = contrib_extra
+                contribs.append(contrib)
+
+            if medline.AuthorList['CompleteYN'] == 'N':
+                contribs.append(dict(raw_name="et al."))
+        if not contribs:
+            contribs = None
+
+        ### References
+        refs = []
+        if pubmed.ReferenceList:
+            for ref in pubmed.ReferenceList.find_all('Reference'):
+                ref_obj = dict()
+                ref_extra = dict()
+                ref_pmid = ref.find("ArticleId", IdType="pubmed")
+                if ref_pmid:
+                    ref_extra['pmid'] = ref_pmid.string
+                ref_raw = ref.Citation
+                if ref_raw:
+                    ref_extra['raw'] = ref_raw.string
+                if ref_extra:
+                    ref_obj['extra'] = ref_extra
+                refs.append(ref_obj)
+        if not refs:
+            refs = None
+
+        re = dict(
+            work_id=None,
+            title=title,
+            original_title=original_title,
+            release_type=release_type,
+            release_status=release_status,
+            release_date=release_date,
+            release_year=release_year,
+            doi=doi,
+            pmid=int(medline.PMID.string), # always present
+            pmcid=pmcid,
+            #isbn13     # never in Article
+            volume=volume,
+            issue=issue,
+            pages=pages,
+            #publisher  # not included?
+            language=language,
+            #license_slug   # not in MEDLINE
+
+            # content, mimetype, lang
+            abstracts=abstracts,
+
+            # raw_name, role, raw_affiliation, extra
+            contribs=contribs,
+
+            # key, year, container_name, title, locator
+            # extra: volume, authors, issue, publisher, identifiers
+            refs=refs,
+
+            #   name, type, publisher, issnl
+            #   extra: issnp, issne, original_name, languages, country
+            container=container,
+
+            # extra:
+            #   withdrawn_date
+            #   translation_of
+            #   subtitle
+            #   aliases
+            #   container_name
+            #   group-title
+            #   pubmed: retraction refs
+            extra=extra,
+        )
+
+        return re
+
+if __name__=='__main__':
+    parser = PubMedParser()
+    parser.parse_file(open(sys.argv[1]))
author	Bryan Newbold <bnewbold@robocracy.org>	2019-03-05 14:27:31 -0800
committer	Bryan Newbold <bnewbold@robocracy.org>	2019-03-05 14:27:31 -0800
commit	bee73c6adb8eea919e5581a5050ba0311dd5ed22 (patch)
tree	bd34dd968246fd39c48762270fa5888b4389bcba /python/parse_pubmed_xml.py
parent	30a8fc5f78eb15f641d6e8da0e7c0741d3f82b13 (diff)
download	fatcat-bee73c6adb8eea919e5581a5050ba0311dd5ed22.tar.gz fatcat-bee73c6adb8eea919e5581a5050ba0311dd5ed22.zip