diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-15 22:36:01 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-21 11:41:29 -0700 |
commit | 300665927f578151321b0d91b28f8aadffcf227d (patch) | |
tree | 5df52bf64004adc52f8ebde5f75f549237d02a5c /python/parse_pubmed_xml.py | |
parent | e27e3f443ea35b145dd07c252cdc8619d7c2ab15 (diff) | |
download | fatcat-300665927f578151321b0d91b28f8aadffcf227d.tar.gz fatcat-300665927f578151321b0d91b28f8aadffcf227d.zip |
initial pubmed importer
Diffstat (limited to 'python/parse_pubmed_xml.py')
-rw-r--r-- | python/parse_pubmed_xml.py | 372 |
1 files changed, 0 insertions, 372 deletions
# python/parse_pubmed_xml.py
#
# Content of the file as removed by commit
# 300665927f578151321b0d91b28f8aadffcf227d ("initial pubmed importer");
# reconstructed from the deletion diff (deleted file mode 100644,
# index 413333cc..00000000).

import sys
import json
import datetime
import re
from bs4 import BeautifulSoup
from bs4.element import NavigableString

# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
# Publication types left commented out have no release_type mapping (yet).
PUBMED_RELEASE_TYPE_MAP = {
    #Adaptive Clinical Trial
    "Address": "speech",
    "Autobiography": "book",
    #Bibliography
    "Biography": "book",
    #Case Reports
    "Classical Article": "article-journal",
    #Clinical Conference
    #Clinical Study
    #Clinical Trial
    #Clinical Trial, Phase I
    #Clinical Trial, Phase II
    #Clinical Trial, Phase III
    #Clinical Trial, Phase IV
    #Clinical Trial Protocol
    #Clinical Trial, Veterinary
    #Collected Works
    #Comparative Study
    #Congress
    #Consensus Development Conference
    #Consensus Development Conference, NIH
    #Controlled Clinical Trial
    "Dataset": "dataset",
    #Dictionary
    #Directory
    #Duplicate Publication
    "Editorial": "editorial",
    #English Abstract   # doesn't indicate that this is abstract-only
    #Equivalence Trial
    #Evaluation Studies
    #Expression of Concern
    #Festschrift
    #Government Document
    #Guideline
    "Historical Article": "article-journal",
    #Interactive Tutorial
    "Interview": "interview",
    "Introductory Journal Article": "article-journal",
    "Journal Article": "article-journal",
    "Lecture": "speech",
    "Legal Case": "legal_case",
    "Legislation": "legislation",
    "Letter": "letter",
    #Meta-Analysis
    #Multicenter Study
    #News
    "Newspaper Article": "article-newspaper",
    #Observational Study
    #Observational Study, Veterinary
    #Overall
    #Patient Education Handout
    #Periodical Index
    #Personal Narrative
    #Portrait
    #Practice Guideline
    #Pragmatic Clinical Trial
    #Publication Components
    #Publication Formats
    #Publication Type Category
    #Randomized Controlled Trial
    #Research Support, American Recovery and Reinvestment Act
    #Research Support, N.I.H., Extramural
    #Research Support, N.I.H., Intramural
    #Research Support, Non-U.S. Gov't
    #Research Support, U.S. Gov't, Non-P.H.S.
    #Research Support, U.S. Gov't, P.H.S.
    #Review     # in the "literature review" sense, not "product review"
    #Scientific Integrity Review
    #Study Characteristics
    #Support of Research
    #Systematic Review
    "Technical Report": "report",
    #Twin Study
    #Validation Studies
    #Video-Audio Media
    #Webcasts
}

# Accepts both three-letter English abbreviations and zero-padded numbers.
MONTH_ABBR_MAP = {
    "Jan":  1, "01":  1,
    "Feb":  2, "02":  2,
    "Mar":  3, "03":  3,
    "Apr":  4, "04":  4,
    "May":  5, "05":  5,
    "Jun":  6, "06":  6,
    "Jul":  7, "07":  7,
    "Aug":  8, "08":  8,
    "Sep":  9, "09":  9,
    "Oct": 10, "10": 10,
    "Nov": 11, "11": 11,
    "Dec": 12, "12": 12,
}

_ORCID_URL_PREFIXES = ("http://orcid.org/", "https://orcid.org/")
_FOUR_DIGITS = re.compile(r"\d{4}")


def _clean_title(title):
    """Strip one trailing '.' and one enclosing '[...]' pair from a title.

    Returns None when `title` is empty or None.
    """
    if not title:
        # TODO: will filter out later
        return None
    if title.endswith('.'):
        title = title[:-1]
    # this hides some "special" titles, but the vast majority are
    # translations; translations don't always include the original_title
    if title.startswith('[') and title.endswith(']'):
        title = title[1:-1]
    return title


def _normalize_orcid(raw):
    """Return an ORCID in hyphenated form, or None if `raw` is empty.

    Handles http(s)://orcid.org/ URLs and bare 16-character identifiers
    (eg, "0000000179841889"); anything already containing '-' is passed
    through unchanged.
    """
    if not raw:
        return None
    for prefix in _ORCID_URL_PREFIXES:
        if raw.startswith(prefix):
            return raw[len(prefix):]
    if '-' not in raw:
        return "{}-{}-{}-{}".format(raw[0:4], raw[4:8], raw[8:12], raw[12:16])
    return raw


def _medline_date_year(text):
    """Extract a year from free-form MedlineDate text.

    Tries the first four characters of the first whitespace-separated token
    (eg, "1998 Dec-1999 Jan" -> 1998), then falls back to the first run of
    four digits anywhere in the text. Returns None if no year is found.
    """
    tokens = (text or "").split()
    if tokens:
        try:
            return int(tokens[0][:4])
        except ValueError:
            pass
    match = _FOUR_DIGITS.search(text or "")
    return int(match.group(0)) if match else None


def _parse_pub_date(pub_date):
    """Return (release_year, release_date) for a PubDate element.

    release_date is an ISO-8601 string and is only set when Year, Month and
    Day are all present and form a valid calendar date; otherwise it is None.
    """
    if pub_date is None:
        return None, None

    medline_date = pub_date.find("MedlineDate")
    if medline_date:
        return _medline_date_year(medline_date.get_text()), None

    year_tag = pub_date.find("Year")
    if not year_tag:
        return None, None
    release_year = int(year_tag.string)

    release_date = None
    month_tag = pub_date.find("Month")
    day_tag = pub_date.find("Day")
    if month_tag and day_tag:
        month = MONTH_ABBR_MAP.get(month_tag.string)
        if month is not None:
            try:
                release_date = datetime.date(
                    release_year, month, int(day_tag.string)).isoformat()
            except (TypeError, ValueError):
                # unparseable day, or a day that doesn't exist in that
                # month: keep the year, drop the full date
                release_date = None
    return release_year, release_date


class PubMedParser():
    """
    Converts PubMed/MEDLINE XML into a release entity (which can be dumped as
    JSON)

    TODO: MEDLINE doesn't include PMC/OA license; could include in importer?
    TODO: clean (ftfy) title, original title, etc
    """

    def __init__(self):
        pass

    def parse_file(self, handle):
        """Parse every PubmedArticle in `handle` and print one JSON object
        per article to stdout.

        `handle` is passed straight to BeautifulSoup (XML mode), so it can be
        an open file object or a string of markup.
        """

        # 1. open with beautiful soup
        soup = BeautifulSoup(handle, "xml")

        # 2. iterate over articles, call parse_article on each
        for article in soup.find_all("PubmedArticle"):
            resp = self.parse_article(article)
            print(json.dumps(resp))

    def parse_article(self, a):
        """Convert a single PubmedArticle element into a release dict.

        `a` is the BeautifulSoup tag for one PubmedArticle. The returned dict
        contains only JSON-serializable values (parse_file passes it to
        json.dumps); fields with no data are None.
        """

        medline = a.MedlineCitation
        # PubmedData isn't required by DTD, but seems to always be present
        pubmed = a.PubmedData
        extra = dict()
        extra_pubmed = dict()

        identifiers = pubmed.ArticleIdList
        doi = identifiers.find("ArticleId", IdType="doi")
        if doi:
            doi = doi.string.lower()

        pmcid = identifiers.find("ArticleId", IdType="pmc")
        if pmcid:
            pmcid = pmcid.string

        pub_types = medline.Article.PublicationTypeList
        release_type = None
        # first publication type with a known mapping wins
        for pub_type in pub_types.find_all("PublicationType"):
            if pub_type.string in PUBMED_RELEASE_TYPE_MAP:
                release_type = PUBMED_RELEASE_TYPE_MAP[pub_type.string]
                break
        if pub_types.find(string="Retraction of Publication"):
            release_type = "retraction"
            retraction_of = medline.find("CommentsCorrections", RefType="RetractionOf")
            if retraction_of:
                extra_pubmed['retraction_of_raw'] = retraction_of.RefSource.string
                extra_pubmed['retraction_of_pmid'] = retraction_of.PMID.string

        # everything in medline is published
        release_status = "published"
        if pub_types.find(string="Corrected and Republished Article"):
            release_status = "updated"
        if pub_types.find(string="Retracted Publication"):
            release_status = "retracted"

        pages = medline.find('MedlinePgn')
        if pages:
            pages = pages.string

        title_tag = medline.Article.ArticleTitle   # always present
        # .string is None when the title contains inline markup (more than
        # one child node); fall back to the concatenated text in that case
        # instead of discarding the title.
        title = _clean_title(title_tag.string or title_tag.get_text())

        # bs4's keyword is `recursive`; any other keyword is interpreted as
        # an attribute filter.
        original_title = medline.Article.find("VernacularTitle", recursive=False)
        if original_title:
            original_title = original_title.string or None
        if original_title and original_title.endswith('.'):
            original_title = original_title[:-1]

        # TODO: happening in alpha order, not handling multi-language well.
        # also need to convert lang codes: https://www.nlm.nih.gov/bsd/language_table.html
        language = medline.Article.Language
        if language:
            language = language.string
            # TODO: map to two-letter
            if language in ("und", "un"):
                # "undetermined"
                language = None

        ### Journal/Issue Metadata
        # MedlineJournalInfo is always present
        container = dict()
        container_extra = dict()
        mji = medline.MedlineJournalInfo
        if mji.find("Country"):
            container_extra['country_name'] = mji.Country.string
        if mji.find("ISSNLinking"):
            container['issnl'] = mji.ISSNLinking.string

        journal = medline.Article.Journal
        issnp = journal.find("ISSN", IssnType="Print")
        if issnp:
            container_extra['issnp'] = issnp.string

        release_year, release_date = _parse_pub_date(journal.PubDate)

        ji = journal.JournalIssue
        volume = None
        if ji.find("Volume"):
            volume = ji.Volume.string
        issue = None
        if ji.find("Issue"):
            issue = ji.Issue.string
        if journal.find("Title"):
            container['name'] = journal.Title.string

        # container_extra was previously collected and then dropped
        if container_extra:
            container['extra'] = container_extra

        if extra_pubmed:
            extra['pubmed'] = extra_pubmed
        if not extra:
            extra = None

        ### Abstracts
        # "All abstracts are in English"
        abstracts = []
        first_abstract = medline.find("AbstractText")
        if first_abstract and first_abstract.get('NlmCategory'):
            # sectioned abstract: merge all sections into a single text
            joined = "\n".join([m.get_text() for m in medline.find_all("AbstractText")])
            abstracts.append(dict(
                content=joined,
                mimetype="text/plain",
                lang="en",
            ))
        else:
            for abstract in medline.find_all("AbstractText"):
                abstracts.append(dict(
                    content=abstract.get_text().strip(),
                    mimetype="text/plain",
                    lang="en",
                ))
                if abstract.find('math'):
                    abstracts.append(dict(
                        # inner markup only, without the enclosing
                        # <AbstractText> tag (works whether or not the tag
                        # carries attributes)
                        content=abstract.decode_contents(),
                        mimetype="application/mathml+xml",
                        lang="en",
                    ))
        if not abstracts:
            abstracts = None

        ### Contribs
        contribs = []
        author_list = medline.AuthorList
        if author_list:
            for author in author_list.find_all("Author"):
                contrib = dict(
                    role="author",
                )
                if author.ForeName:
                    contrib['raw_name'] = "{} {}".format(author.ForeName.string, author.LastName.string)
                elif author.LastName:
                    contrib['raw_name'] = author.LastName.string
                elif author.CollectiveName:
                    # group authors have neither ForeName nor LastName
                    contrib['raw_name'] = author.CollectiveName.string
                contrib_extra = dict()
                orcid = author.find("Identifier", Source="ORCID")
                if orcid:
                    # needs re-formatting from, eg, "0000000179841889"
                    orcid = _normalize_orcid(orcid.string)
                    if orcid:
                        contrib_extra['orcid'] = orcid
                affiliation = author.find("Affiliation")
                if affiliation:
                    contrib['raw_affiliation'] = affiliation.string
                if author.find("EqualContrib"):
                    # TODO: schema for this?
                    contrib_extra['equal_contrib'] = True
                if contrib_extra:
                    contrib['extra'] = contrib_extra
                contribs.append(contrib)

            # .get(): the attribute may be omitted from the XML
            if author_list.get('CompleteYN') == 'N':
                contribs.append(dict(raw_name="et al."))
        if not contribs:
            contribs = None

        ### References
        refs = []
        if pubmed.ReferenceList:
            for ref in pubmed.ReferenceList.find_all('Reference'):
                ref_obj = dict()
                ref_extra = dict()
                ref_pmid = ref.find("ArticleId", IdType="pubmed")
                if ref_pmid:
                    ref_extra['pmid'] = ref_pmid.string
                ref_raw = ref.Citation
                if ref_raw:
                    ref_extra['raw'] = ref_raw.string
                if ref_extra:
                    ref_obj['extra'] = ref_extra
                refs.append(ref_obj)
        if not refs:
            refs = None

        # named `release` rather than `re` to avoid shadowing the stdlib module
        release = dict(
            work_id=None,
            title=title,
            original_title=original_title,
            release_type=release_type,
            release_status=release_status,
            release_date=release_date,
            release_year=release_year,
            doi=doi,
            pmid=int(medline.PMID.string), # always present
            pmcid=pmcid,
            #isbn13     # never in Article
            volume=volume,
            issue=issue,
            pages=pages,
            #publisher  # not included?
            language=language,
            #license_slug   # not in MEDLINE

            # content, mimetype, lang
            abstracts=abstracts,

            # raw_name, role, raw_affiliation, extra
            contribs=contribs,

            #   key, year, container_name, title, locator
            #   extra: volume, authors, issue, publisher, identifiers
            refs=refs,

            #   name, type, publisher, issnl
            #   extra: issnp, issne, original_name, languages, country
            container=container,

            # extra:
            #   withdrawn_date
            #   translation_of
            #   subtitle
            #   aliases
            #   container_name
            #   group-title
            #   pubmed: retraction refs
            extra=extra,
        )

        return release

if __name__=='__main__':
    parser = PubMedParser()
    # Binary mode lets the XML parser pick the encoding from the document
    # itself instead of the platform default; `with` closes the file.
    with open(sys.argv[1], "rb") as handle:
        parser.parse_file(handle)