diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-03-05 14:27:31 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-21 11:41:29 -0700 |
commit | 5b4c51fbbe0ce78fe72b3b756dea2cf9bdcf7969 (patch) | |
tree | 03741236d944949277079bf148fc582c96408d26 /python/parse_pubmed_xml.py | |
parent | d1d04396c7a995eaf1129b123644bc52b14a73ac (diff) | |
download | fatcat-5b4c51fbbe0ce78fe72b3b756dea2cf9bdcf7969.tar.gz fatcat-5b4c51fbbe0ce78fe72b3b756dea2cf9bdcf7969.zip |
basic pubmed parser
Diffstat (limited to 'python/parse_pubmed_xml.py')
-rw-r--r-- | python/parse_pubmed_xml.py | 370 |
1 files changed, 370 insertions, 0 deletions
# Recovered from the flattened diff:
#   diff --git a/python/parse_pubmed_xml.py b/python/parse_pubmed_xml.py
#   new file mode 100644, index 00000000..9350e9a4
"""
Basic PubMed/MEDLINE XML parser.

Reads a PubMed XML file, converts every <PubmedArticle> element into a
release-entity dict, and prints each one as a line of JSON on stdout.

Usage: python parse_pubmed_xml.py <pubmed-file.xml>
"""

import sys
import json
import datetime

from bs4 import BeautifulSoup
from bs4.element import NavigableString

# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
# Commented-out entries are publication types with no release_type mapping.
PUBMED_RELEASE_TYPE_MAP = {
    #Adaptive Clinical Trial
    "Address": "speech",
    "Autobiography": "book",
    #Bibliography
    "Biography": "book",
    #Case Reports
    "Classical Article": "article-journal",
    #Clinical Conference
    #Clinical Study
    #Clinical Trial
    #Clinical Trial, Phase I
    #Clinical Trial, Phase II
    #Clinical Trial, Phase III
    #Clinical Trial, Phase IV
    #Clinical Trial Protocol
    #Clinical Trial, Veterinary
    #Collected Works
    #Comparative Study
    #Congress
    #Consensus Development Conference
    #Consensus Development Conference, NIH
    #Controlled Clinical Trial
    "Dataset": "dataset",
    #Dictionary
    #Directory
    #Duplicate Publication
    "Editorial": "editorial",
    #English Abstract   # doesn't indicate that this is abstract-only
    #Equivalence Trial
    #Evaluation Studies
    #Expression of Concern
    #Festschrift
    #Government Document
    #Guideline
    "Historical Article": "article-journal",
    #Interactive Tutorial
    "Interview": "interview",
    "Introductory Journal Article": "article-journal",
    "Journal Article": "article-journal",
    "Lecture": "speech",
    "Legal Case": "legal_case",
    "Legislation": "legislation",
    "Letter": "letter",
    #Meta-Analysis
    #Multicenter Study
    #News
    "Newspaper Article": "article-newspaper",
    #Observational Study
    #Observational Study, Veterinary
    #Overall
    #Patient Education Handout
    #Periodical Index
    #Personal Narrative
    #Portrait
    #Practice Guideline
    #Pragmatic Clinical Trial
    #Publication Components
    #Publication Formats
    #Publication Type Category
    #Randomized Controlled Trial
    #Research Support, American Recovery and Reinvestment Act
    #Research Support, N.I.H., Extramural
    #Research Support, N.I.H., Intramural
    #Research Support, Non-U.S. Gov't
    #Research Support, U.S. Gov't, Non-P.H.S.
    #Research Support, U.S. Gov't, P.H.S.
    #Review     # in the "literature review" sense, not "product review"
    #Scientific Integrity Review
    #Study Characteristics
    #Support of Research
    #Systematic Review
    "Technical Report": "report",
    #Twin Study
    #Validation Studies
    #Video-Audio Media
    #Webcasts
}

# Maps both the three-letter abbreviation and the zero-padded number of a
# month to its integer value.
MONTH_ABBR_MAP = {
    "Jan":  1, "01":  1,
    "Feb":  2, "02":  2,
    "Mar":  3, "03":  3,
    "Apr":  4, "04":  4,
    "May":  5, "05":  5,
    "Jun":  6, "06":  6,
    "Jul":  7, "07":  7,
    "Aug":  8, "08":  8,
    "Sep":  9, "09":  9,
    "Oct": 10, "10": 10,
    "Nov": 11, "11": 11,
    "Dec": 12, "12": 12,
}

_ORCID_URL_PREFIXES = ("http://orcid.org/", "https://orcid.org/")


def _clean_title(title):
    """
    Normalize a raw title string.

    Drops one trailing '.', then strips a surrounding '[...]' pair. Returns
    None when nothing is left.
    """
    if title.endswith('.'):
        title = title[:-1]
    # this hides some "special" titles, but the vast majority are
    # translations; translations don't always include the original_title
    if title.startswith('[') and title.endswith(']'):
        title = title[1:-1]
    return title or None


def _normalize_orcid(raw):
    """
    Return an ORCID in dashed form, without any orcid.org URL prefix.

    A bare 16-character identifier (eg, "0000000179841889") is re-formatted
    to "0000-0001-7984-1889"; anything else is returned as-is after the
    prefix is removed.
    """
    orcid = raw.strip()
    for prefix in _ORCID_URL_PREFIXES:
        if orcid.startswith(prefix):
            orcid = orcid[len(prefix):]
            break
    # only re-format when the value has exactly the expected length;
    # slicing a shorter string would yield empty groups
    if '-' not in orcid and len(orcid) == 16:
        orcid = "-".join(orcid[i:i + 4] for i in range(0, 16, 4))
    return orcid


def _parse_pub_date(pub_date):
    """
    Extract (release_year, release_date) from a <PubDate> element.

    release_date is an ISO-format string and is only set when Year, Month
    and Day are all present and form a valid date; otherwise it is None.
    For a <MedlineDate> only the leading four characters of the first token
    are read as the year (None if they are not a number).
    """
    if pub_date.find("MedlineDate"):
        tokens = pub_date.MedlineDate.get_text().split()
        try:
            release_year = int(tokens[0][:4]) if tokens else None
        except ValueError:
            release_year = None
        return release_year, None

    release_year = int(pub_date.Year.string)
    release_date = None
    if pub_date.find("Day") and pub_date.find("Month"):
        month = MONTH_ABBR_MAP.get(pub_date.Month.string)
        if month is not None:
            try:
                release_date = datetime.date(
                    release_year,
                    month,
                    int(pub_date.Day.string)).isoformat()
            except ValueError:
                # invalid day for the month, or non-numeric day: keep the
                # year and leave the full date unset
                release_date = None
    return release_year, release_date


def _parse_abstracts(medline):
    """
    Build the list of abstract dicts (content, mimetype, lang) found under
    *medline*, or None when there are none.

    When the first <AbstractText> carries an NlmCategory attribute, all
    sections are joined with newlines into a single plain-text abstract.
    Otherwise every <AbstractText> becomes its own plain-text abstract, plus
    a MathML copy when it contains a <math> element.
    """
    # "All abstracts are in English"
    abstracts = []
    sections = medline.find_all("AbstractText")
    if sections and sections[0].get('NlmCategory'):
        joined = "\n".join([m.get_text() for m in sections])
        abstracts.append(dict(
            content=joined,
            mimetype="text/plain",
            lang="en",
        ))
    else:
        for abstract in sections:
            abstracts.append(dict(
                content=abstract.get_text().strip(),
                mimetype="text/plain",
                lang="en",
            ))
            if abstract.find('math'):
                abstracts.append(dict(
                    # inner markup only, without the <AbstractText> tags
                    # (works whether or not the tag carries attributes)
                    content=abstract.decode_contents(),
                    mimetype="application/mathml+xml",
                    lang="en",
                ))
    return abstracts or None


def _parse_contribs(medline):
    """
    Build the list of contributor dicts from the <AuthorList> under
    *medline*, or None when there are none.

    Each entry has role "author" and, when available, raw_name,
    raw_affiliation and an extra dict (orcid, equal_contrib). An "et al."
    entry is appended when the list is marked CompleteYN="N".
    """
    contribs = []
    author_list = medline.AuthorList
    if author_list is not None:
        for author in author_list.find_all("Author"):
            contrib = dict(
                role="author",
            )
            if author.ForeName and author.LastName:
                contrib['raw_name'] = "{} {}".format(
                    author.ForeName.string, author.LastName.string)
            elif author.LastName:
                contrib['raw_name'] = author.LastName.string
            elif author.CollectiveName:
                # group/consortium authors have no personal name parts
                contrib['raw_name'] = author.CollectiveName.get_text()
            contrib_extra = dict()
            orcid = author.find("Identifier", Source="ORCID")
            if orcid is not None and orcid.get_text().strip():
                # needs re-formatting from, eg, "0000000179841889"
                contrib_extra['orcid'] = _normalize_orcid(orcid.get_text())
            affiliation = author.find("Affiliation")
            if affiliation:
                contrib['raw_affiliation'] = affiliation.string
            if author.find("EqualContrib"):
                # TODO: schema for this?
                contrib_extra['equal_contrib'] = True
            if contrib_extra:
                contrib['extra'] = contrib_extra
            contribs.append(contrib)

        # .get(): the attribute may be left out of the XML
        if author_list.get('CompleteYN') == 'N':
            contribs.append(dict(raw_name="et al."))
    return contribs or None


def _parse_refs(pubmed):
    """
    Build the list of reference dicts from the <ReferenceList> under
    *pubmed* (may be None), or None when there are none.

    Each reference is a dict whose optional 'extra' holds the cited 'pmid'
    and the 'raw' citation string.
    """
    refs = []
    if pubmed is not None and pubmed.ReferenceList:
        for ref in pubmed.ReferenceList.find_all('Reference'):
            ref_obj = dict()
            ref_extra = dict()
            ref_pmid = ref.find("ArticleId", IdType="pubmed")
            if ref_pmid:
                ref_extra['pmid'] = ref_pmid.string
            ref_raw = ref.Citation
            if ref_raw:
                ref_extra['raw'] = ref_raw.string
            if ref_extra:
                ref_obj['extra'] = ref_extra
            refs.append(ref_obj)
    return refs or None


class PubMedParser():
    """
    Converts PubMed/MEDLINE XML into a release entity (which can dump as JSON)

    TODO: MEDLINE doesn't include PMC/OA license; could include in importer?
    TODO: clean (ftfy) title, original title, etc
    """

    def __init__(self):
        pass

    def parse_file(self, handle):
        """
        Parse the XML document readable from *handle* and print one JSON
        object per <PubmedArticle> to stdout.
        """

        # 1. open with beautiful soup
        soup = BeautifulSoup(handle, "xml")

        # 2. iterate over articles, call parse_article on each
        for article in soup.find_all("PubmedArticle"):
            resp = self.parse_article(article)
            print(json.dumps(resp))

    def parse_article(self, a):
        """
        Convert a single <PubmedArticle> element *a* into a release dict.

        Returns a JSON-serializable dict with the keys work_id, title,
        original_title, release_type, release_status, release_date,
        release_year, doi, pmid, pmcid, volume, issue, pages, language,
        abstracts, contribs, refs, container and extra. Fields that cannot
        be found are None.
        """

        medline = a.MedlineCitation
        # PubmedData isn't required by DTD, but seems to always be present
        pubmed = a.PubmedData
        extra = dict()
        extra_pubmed = dict()

        doi = None
        pmcid = None
        identifiers = pubmed.ArticleIdList if pubmed is not None else None
        if identifiers is not None:
            doi = identifiers.find("ArticleId", IdType="doi")
            if doi:
                doi = doi.string.lower()

            pmcid = identifiers.find("ArticleId", IdType="pmc")
            if pmcid:
                pmcid = pmcid.string

        pub_types = medline.Article.PublicationTypeList
        release_type = None
        for pub_type in pub_types.find_all("PublicationType"):
            if pub_type.string in PUBMED_RELEASE_TYPE_MAP:
                release_type = PUBMED_RELEASE_TYPE_MAP[pub_type.string]
                break
        if pub_types.find(string="Retraction of Publication"):
            release_type = "retraction"
            retraction_of = medline.find("CommentsCorrections", RefType="RetractionOf")
            if retraction_of:
                extra_pubmed['retraction_of_raw'] = retraction_of.RefSource.string
                # guard the lookup: a missing PMID child would be None
                if retraction_of.PMID:
                    extra_pubmed['retraction_of_pmid'] = retraction_of.PMID.string

        # everything in medline is published
        release_status = "published"
        if pub_types.find(string="Corrected and Republished Article"):
            release_status = "updated"
        if pub_types.find(string="Retracted Publication"):
            release_status = "retracted"

        pages = medline.find('MedlinePgn')
        if pages:
            pages = pages.string

        # ArticleTitle is always present; get_text() (unlike .string) also
        # works when the title contains nested markup
        title = _clean_title(medline.Article.ArticleTitle.get_text())

        original_title = medline.Article.find("VernacularTitle", recursive=False)
        if original_title:
            original_title = original_title.get_text()
            if original_title.endswith('.'):
                original_title = original_title[:-1]

        # TODO: happening in alpha order, not handling multi-language well.
        # also need to convert lang codes: https://www.nlm.nih.gov/bsd/language_table.html
        language = medline.Article.Language
        if language:
            language = language.string
            # TODO: map to two-letter
            if language in ("und", "un"):
                # "undetermined"
                language = None

        ### Journal/Issue Metadata
        # MedlineJournalInfo is always present
        container = dict()
        container_extra = dict()
        mji = medline.MedlineJournalInfo
        if mji.find("Country"):
            container_extra['country_name'] = mji.Country.string
        if mji.find("ISSNLinking"):
            container['issnl'] = mji.ISSNLinking.string

        journal = medline.Article.Journal
        issnp = journal.find("ISSN", IssnType="Print")
        if issnp:
            container_extra['issnp'] = issnp.string

        release_year, release_date = _parse_pub_date(journal.PubDate)

        ji = journal.JournalIssue
        volume = None
        if ji.find("Volume"):
            volume = ji.Volume.string
        issue = None
        if ji.find("Issue"):
            issue = ji.Issue.string
        if journal.find("Title"):
            container['name'] = journal.Title.string

        # attach the collected extra fields; they were previously built
        # but never included in the output
        if container_extra:
            container['extra'] = container_extra

        if extra_pubmed:
            extra['pubmed'] = extra_pubmed
        if not extra:
            extra = None

        release = dict(
            work_id=None,
            title=title,
            original_title=original_title,
            release_type=release_type,
            release_status=release_status,
            release_date=release_date,
            release_year=release_year,
            doi=doi,
            pmid=int(medline.PMID.string), # always present
            pmcid=pmcid,
            #isbn13     # never in Article
            volume=volume,
            issue=issue,
            pages=pages,
            #publisher  # not included?
            language=language,
            #license_slug   # not in MEDLINE

            # content, mimetype, lang
            abstracts=_parse_abstracts(medline),

            # raw_name, role, raw_affiliation, extra
            contribs=_parse_contribs(medline),

            #   key, year, container_name, title, locator
            #   extra: volume, authors, issue, publisher, identifiers
            refs=_parse_refs(pubmed),

            #   name, type, publisher, issnl
            #   extra: issnp, issne, original_name, languages, country
            container=container,

            # extra:
            #   withdrawn_date
            #   translation_of
            #   subtitle
            #   aliases
            #   container_name
            #   group-title
            #   pubmed: retraction refs
            extra=extra,
        )

        return release


def main():
    """Parse the PubMed XML file named by the first command-line argument."""
    parser = PubMedParser()
    # binary mode lets the XML parser apply the document's own encoding
    # declaration; `with` guarantees the file is closed
    with open(sys.argv[1], 'rb') as handle:
        parser.parse_file(handle)


if __name__ == '__main__':
    main()