"""Importer for PubMed/MEDLINE XML metadata into fatcat release entities."""

import sys
import json
import sqlite3
import datetime
import warnings

from bs4 import BeautifulSoup
from bs4.element import NavigableString

import fatcat_client
from .common import EntityImporter, clean, LANG_MAP_MARC

# Maps PubMed publication types to fatcat release types. Commented-out
# entries are publication types that intentionally have no mapping.
# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
PUBMED_RELEASE_TYPE_MAP = {
    #Adaptive Clinical Trial
    "Address": "speech",
    "Autobiography": "book",
    #Bibliography
    "Biography": "book",
    #Case Reports
    "Classical Article": "article-journal",
    #Clinical Conference
    #Clinical Study
    #Clinical Trial
    #Clinical Trial, Phase I
    #Clinical Trial, Phase II
    #Clinical Trial, Phase III
    #Clinical Trial, Phase IV
    #Clinical Trial Protocol
    #Clinical Trial, Veterinary
    #Collected Works
    #Comparative Study
    #Congress
    #Consensus Development Conference
    #Consensus Development Conference, NIH
    #Controlled Clinical Trial
    "Dataset": "dataset",
    #Dictionary
    #Directory
    #Duplicate Publication
    "Editorial": "editorial",
    #English Abstract   # doesn't indicate that this is abstract-only
    #Equivalence Trial
    #Evaluation Studies
    #Expression of Concern
    #Festschrift
    #Government Document
    #Guideline
    "Historical Article": "article-journal",
    #Interactive Tutorial
    "Interview": "interview",
    "Introductory Journal Article": "article-journal",
    "Journal Article": "article-journal",
    "Lecture": "speech",
    "Legal Case": "legal_case",
    "Legislation": "legislation",
    "Letter": "letter",
    #Meta-Analysis
    #Multicenter Study
    #News
    "Newspaper Article": "article-newspaper",
    #Observational Study
    #Observational Study, Veterinary
    #Overall
    #Patient Education Handout
    #Periodical Index
    #Personal Narrative
    #Portrait
    #Practice Guideline
    #Pragmatic Clinical Trial
    #Publication Components
    #Publication Formats
    #Publication Type Category
    #Randomized Controlled Trial
    #Research Support, American Recovery and Reinvestment Act
    #Research Support, N.I.H., Extramural
    #Research Support, N.I.H., Intramural
    #Research Support, Non-U.S. Gov't
    #Research Support, U.S. Gov't, Non-P.H.S.
    #Research Support, U.S. Gov't, P.H.S.
    #Review  # in the "literature review" sense, not "product review"
    #Scientific Integrity Review
    #Study Characteristics
    #Support of Research
    #Systematic Review
    "Technical Report": "report",
    #Twin Study
    #Validation Studies
    #Video-Audio Media
    #Webcasts
}

# Month lookup accepting both three-letter abbreviations and zero-padded
# numeric strings, as either may appear in a PubDate/Month element.
MONTH_ABBR_MAP = {
    "Jan":  1, "01":  1,
    "Feb":  2, "02":  2,
    "Mar":  3, "03":  3,
    "Apr":  4, "04":  4,
    "May":  5, "05":  5,
    "Jun":  6, "06":  6,
    "Jul":  7, "07":  7,
    "Aug":  8, "08":  8,
    "Sep":  9, "09":  9,
    "Oct": 10, "10": 10,
    "Nov": 11, "11": 11,
    "Dec": 12, "12": 12,
}


class PubmedImporter(EntityImporter):
    """
    Importer for PubMed/MEDLINE XML metadata.

    TODO: MEDLINE doesn't include PMC/OA license; could include in importer?
    TODO: clean (ftfy) title, original title, etc
    XXX: withdrawn
    XXX: full author names
    """

    def __init__(self, api, issn_map_file, **kwargs):
        """
        Set up the importer.

        Recognized keyword arguments (all optional):
          editgroup_description: description for created editgroups
          editgroup_extra: dict of extra editgroup metadata; an 'agent' key
              is filled in if not already present
          extid_map_file: path to a sqlite3 database of external identifiers
          create_containers: if truthy, create missing container entities

        Any remaining keyword arguments are forwarded to EntityImporter.
        """
        # pop (not get) so these are not passed to super().__init__() a
        # second time via **kwargs, which would raise TypeError
        eg_desc = kwargs.pop('editgroup_description',
            "Automated import of PubMed/MEDLINE XML metadata")
        eg_extra = kwargs.pop('editgroup_extra', None) or dict()
        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.PubmedImporter')
        super().__init__(api,
            issn_map_file=issn_map_file,
            editgroup_description=eg_desc,
            editgroup_extra=eg_extra,
            **kwargs)

        extid_map_file = kwargs.get('extid_map_file')
        self.extid_map_db = None
        if extid_map_file:
            # open the sqlite database read-only via a file: URI
            db_uri = "file:{}?mode=ro".format(extid_map_file)
            print("Using external ID map: {}".format(db_uri))
            self.extid_map_db = sqlite3.connect(db_uri, uri=True)
        else:
            print("Not using external ID map")

        self.create_containers = kwargs.get('create_containers')
        self.read_issn_map_file(issn_map_file)

    def lookup_ext_ids(self, pmid):
        """
        Look up other external identifiers for a PMID in the extid map.

        Returns a dict of identifier name to string (or None). When no map
        database is configured, or the PMID has no row, every value is None.
        """
        if self.extid_map_db is None:
            return dict(doi=None, core_id=None, pmid=None, pmcid=None,
                wikidata_qid=None, arxiv_id=None, jstor_id=None)
        row = self.extid_map_db.execute(
            "SELECT core, doi, pmcid, wikidata FROM ids WHERE pmid=? LIMIT 1",
            [pmid]).fetchone()
        if row is None:
            return dict(doi=None, core_id=None, pmid=None, pmcid=None,
                wikidata_qid=None, arxiv_id=None, jstor_id=None)
        # normalize every cell to a non-empty string, or None
        row = [str(cell or '') or None for cell in row]
        return dict(
            core_id=row[0],
            doi=row[1],
            pmcid=row[2],
            wikidata_qid=row[3],
            # TODO:
            arxiv_id=None,
            jstor_id=None,
        )

    def want(self, obj):
        """Accept every record; no pre-filtering is done."""
        return True

    def parse_record(self, a):
        """
        Convert one parsed <PubmedArticle> element into a ReleaseEntity.

        `a` is a BeautifulSoup Tag for the article. May create a container
        entity as a side effect when `create_containers` is set and the
        journal's ISSN-L is known but has no existing container.
        """
        medline = a.MedlineCitation
        # PubmedData isn't required by DTD, but seems to always be present
        pubmed = a.PubmedData
        extra = dict()
        extra_pubmed = dict()

        ### Identifiers
        identifiers = pubmed.ArticleIdList
        pmid = medline.PMID.string.strip()
        doi = identifiers.find("ArticleId", IdType="doi")
        if doi:
            doi = doi.string.lower()

        pmcid = identifiers.find("ArticleId", IdType="pmc")
        if pmcid:
            # XXX: strip the version part? or retain?
            pmcid = pmcid.string.split('.')[0]

        ### Release type and stage
        pub_type_list = medline.Article.PublicationTypeList
        release_type = None
        pub_types = []
        for pub_type in pub_type_list.find_all("PublicationType"):
            pub_types.append(pub_type.string)
            if pub_type.string in PUBMED_RELEASE_TYPE_MAP:
                release_type = PUBMED_RELEASE_TYPE_MAP[pub_type.string]
                # NOTE: stops at the first mapped type, so pub_types only
                # holds the types seen up to and including that one
                break
        if pub_types:
            extra_pubmed['pub_types'] = pub_types
        if pub_type_list.find(string="Retraction of Publication"):
            release_type = "retraction"
            retraction_of = medline.find("CommentsCorrections", RefType="RetractionOf")
            if retraction_of:
                # either child may be missing from the record; only copy
                # what is actually there
                if retraction_of.RefSource:
                    extra_pubmed['retraction_of_raw'] = retraction_of.RefSource.string
                if retraction_of.PMID:
                    extra_pubmed['retraction_of_pmid'] = retraction_of.PMID.string

        # everything in medline is published
        release_stage = "published"
        if pub_type_list.find(string="Corrected and Republished Article"):
            release_stage = "updated"
        if pub_type_list.find(string="Retraction of Publication"):
            release_stage = "retraction"
        # NOTE(review): withdrawn_status is computed but not yet attached to
        # the release (see "XXX: withdrawn" in the class docstring)
        withdrawn_status = None
        if pub_type_list.find(string="Retracted Publication"):
            withdrawn_status = "retracted"

        pages = medline.find('MedlinePgn')
        if pages:
            pages = pages.string

        ### Titles
        title = medline.Article.ArticleTitle.string # always present
        if title:
            if title.endswith('.'):
                title = title[:-1]
            # this hides some "special" titles, but the vast majority are
            # translations; translations don't always include the original_title
            if title.startswith('[') and title.endswith(']'):
                title = title[1:-1]
        else:
            # TODO: will filter out later
            title = None

        # `recursive` is the bs4 keyword for restricting to direct children
        original_title = medline.Article.find("VernacularTitle", recursive=False)
        if original_title:
            original_title = original_title.string or None
            if original_title and original_title.endswith('.'):
                original_title = original_title[:-1]

        # TODO: happening in alpha order, not handling multi-language well.
        # also need to convert lang codes: https://www.nlm.nih.gov/bsd/language_table.html
        language = medline.Article.Language
        if language:
            language = language.string
            # TODO: map to two-letter
            if language in ("und", "un"):
                # "undetermined"
                language = None
            else:
                language = LANG_MAP_MARC.get(language)
                if not language:
                    warnings.warn("MISSING MARC LANG: {}".format(medline.Article.Language.string))

        ### Journal/Issue Metadata
        # MedlineJournalInfo is always present
        issnl = None
        container_id = None
        container_name = None
        container_extra = dict()
        mji = medline.MedlineJournalInfo
        if mji.find("Country"):
            container_extra['country_name'] = mji.Country.string
        if mji.find("ISSNLinking"):
            issnl = mji.ISSNLinking.string

        journal = medline.Article.Journal
        issnp = journal.find("ISSN", IssnType="Print")
        if issnp:
            container_extra['issnp'] = issnp.string
            if not issnl:
                # fall back to resolving ISSN-L from the print ISSN text
                issnl = self.issn2issnl(issnp.string)

        if issnl:
            container_id = self.lookup_issnl(issnl)

        pub_date = journal.PubDate
        release_date = None
        release_year = None
        if pub_date.Year:
            release_year = int(pub_date.Year.string)
            if pub_date.find("Day") and pub_date.find("Month"):
                release_date = datetime.date(
                    release_year,
                    MONTH_ABBR_MAP[pub_date.Month.string],
                    int(pub_date.Day.string))
                release_date = release_date.isoformat()
        elif pub_date.find("MedlineDate") and False: #XXX more/better date parsing?
            # deliberately disabled branch (note the `and False`)
            release_year = int(pub_date.MedlineDate.string.split()[0][:4])

        if journal.find("Title"):
            container_name = journal.Title.string

        if (container_id is None and self.create_containers and (issnl is not None)
                and container_name):
            # name, type, publisher, issnl
            # extra: issnp, issne, original_name, languages, country
            ce = fatcat_client.ContainerEntity(
                name=container_name,
                container_type='journal',
                #XXX: publisher not included?
                issnl=issnl,
                extra=(container_extra or None))
            ce_edit = self.create_container(ce)
            container_id = ce_edit.ident

        ji = journal.JournalIssue
        volume = None
        if ji.find("Volume"):
            volume = ji.Volume.string
        issue = None
        if ji.find("Issue"):
            issue = ji.Issue.string

        ### Abstracts
        # "All abstracts are in English"
        abstracts = []
        first_abstract = medline.find("AbstractText")
        if first_abstract and first_abstract.get('NlmCategory'):
            # structured abstract: join all the sections into a single text
            joined = "\n".join([m.get_text() for m in medline.find_all("AbstractText")])
            abstracts.append(fatcat_client.ReleaseAbstract(
                content=joined,
                mimetype="text/plain",
                lang="en",
            ))
        else:
            for abstract in medline.find_all("AbstractText"):
                abstracts.append(fatcat_client.ReleaseAbstract(
                    content=abstract.get_text().strip(),
                    mimetype="text/plain",
                    lang="en",
                ))
                if abstract.find('math'):
                    abstracts.append(fatcat_client.ReleaseAbstract(
                        # strip the tags; the fixed offsets are the lengths
                        # of "<AbstractText>" and "</AbstractText>", so this
                        # assumes the opening tag carries no attributes
                        content=str(abstract)[14:-15],
                        mimetype="application/mathml+xml",
                        lang="en",
                    ))
        if not abstracts:
            abstracts = None

        ### Contribs
        contribs = []
        if medline.AuthorList:
            for author in medline.AuthorList.find_all("Author"):
                given_name = None
                surname = None
                raw_name = None
                if author.ForeName:
                    given_name = author.ForeName.string
                if author.LastName:
                    surname = author.LastName.string
                if given_name and surname:
                    raw_name = "{} {}".format(given_name, surname)
                elif surname:
                    raw_name = surname
                contrib_extra = dict()
                orcid = author.find("Identifier", Source="ORCID")
                if orcid:
                    # needs re-formatting from, eg, "0000000179841889"
                    orcid = orcid.string
                    if orcid.startswith("http://orcid.org/"):
                        orcid = orcid.replace("http://orcid.org/", "")
                    elif orcid.startswith("https://orcid.org/"):
                        orcid = orcid.replace("https://orcid.org/", "")
                    elif '-' not in orcid:
                        orcid = "{}-{}-{}-{}".format(
                            orcid[0:4],
                            orcid[4:8],
                            orcid[8:12],
                            orcid[12:16],
                        )
                    # XXX: do lookup by ORCID
                    #contrib_extra['orcid'] = orcid
                affiliation = author.find("Affiliation")
                raw_affiliation = None
                if affiliation:
                    raw_affiliation = affiliation.string
                if author.find("EqualContrib"):
                    # TODO: schema for this?
                    contrib_extra['equal_contrib'] = True
                contribs.append(fatcat_client.ReleaseContrib(
                    raw_name=raw_name,
                    given_name=given_name,
                    surname=surname,
                    role="author",
                    raw_affiliation=raw_affiliation,
                    extra=contrib_extra,
                ))

            # .get(): the attribute may be absent from the parsed XML
            if medline.AuthorList.get('CompleteYN') == 'N':
                contribs.append(fatcat_client.ReleaseContrib(raw_name="et al."))
        if not contribs:
            contribs = None

        ### References
        refs = []
        if pubmed.ReferenceList:
            for ref in pubmed.ReferenceList.find_all('Reference'):
                ref_extra = dict()
                ref_pmid = ref.find("ArticleId", IdType="pubmed")
                if ref_pmid:
                    ref_extra['pmid'] = ref_pmid.string
                # TODO: do reference lookups here based on PMID/DOI
                ref_raw = ref.Citation
                if ref_raw:
                    ref_extra['unstructured'] = ref_raw.string
                refs.append(fatcat_client.ReleaseRef(
                    extra=(ref_extra or None),
                ))
        if not refs:
            refs = None

        # extra:
        #   withdrawn_date
        #   translation_of
        #   subtitle
        #   aliases
        #   container_name
        #   group-title
        #   pubmed: retraction refs
        if extra_pubmed:
            extra['pubmed'] = extra_pubmed
        if not extra:
            extra = None

        release = fatcat_client.ReleaseEntity(
            work_id=None,
            title=clean(title),
            original_title=clean(original_title),
            release_type=release_type,
            release_stage=release_stage,
            release_date=release_date,
            release_year=release_year,
            ext_ids=fatcat_client.ReleaseExtIds(
                doi=doi,
                pmid=pmid,
                pmcid=pmcid,
                #isbn13     # never in Article
            ),
            volume=volume,
            issue=issue,
            pages=pages,
            #publisher  # not included?
            language=language,
            #license_slug   # not in MEDLINE
            abstracts=abstracts,
            contribs=contribs,
            refs=refs,
            container_id=container_id,
            extra=extra,
        )
        return release

    def try_update(self, re):
        """
        Decide whether release `re` should be inserted as a new entity.

        Looks up an existing release by PMID, then by DOI. Returns True when
        no existing release was found (caller should insert). Returns False
        when the release already exists, was updated in place, or the DOI
        belongs to a release with a different PMID.
        """
        # first, lookup existing by PMID (which must be defined)
        existing = None
        try:
            existing = self.api.lookup_release(pmid=re.ext_ids.pmid)
        except fatcat_client.rest.ApiException as err:
            # 404 just means "not found"; anything else is a real error
            if err.status != 404:
                raise

        # then try DOI lookup if there is one
        if not existing and re.ext_ids.doi:
            try:
                existing = self.api.lookup_release(doi=re.ext_ids.doi)
            except fatcat_client.rest.ApiException as err:
                if err.status != 404:
                    raise

        if existing and existing.ext_ids.pmid and existing.ext_ids.pmid != re.ext_ids.pmid:
            warnings.warn("PMID/DOI mismatch: release {}, pmid {} != {}".format(
                existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid))
            self.counts['exists-pmid-doi-mismatch'] += 1
            return False

        if existing and existing.ext_ids.pmid and (existing.refs or not re.refs):
            # TODO: any other reasons to do an update?
            # don't update if it already has PMID
            self.counts['exists'] += 1
            return False
        elif existing:
            # but do update if only DOI was set
            existing.ext_ids.doi = existing.ext_ids.doi or re.ext_ids.doi
            existing.ext_ids.pmid = existing.ext_ids.pmid or re.ext_ids.pmid
            existing.ext_ids.pmcid = existing.ext_ids.pmcid or re.ext_ids.pmcid
            existing.refs = existing.refs or re.refs
            # parse_record() emits extra=None when there is nothing to
            # record, and the existing release may have no extra either
            if re.extra and re.extra.get('pubmed'):
                if existing.extra is None:
                    existing.extra = dict()
                existing.extra['pubmed'] = re.extra['pubmed']
            self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
            self.counts['update'] += 1
            return False

        return True

    def insert_batch(self, batch):
        """Create all releases in `batch` under a single new editgroup."""
        self.api.create_release_auto_batch(fatcat_client.ReleaseAutoBatch(
            editgroup=fatcat_client.Editgroup(
                description=self.editgroup_description,
                extra=self.editgroup_extra),
            entity_list=batch))

    def parse_file(self, handle):
        """
        Debug helper: parse every PubmedArticle in `handle` and print it.

        `handle` is an open file (or string) of PubMed XML.
        """
        # 1. open with beautiful soup
        soup = BeautifulSoup(handle, "xml")

        # 2. iterate over articles, call parse_record on each
        for article in soup.find_all("PubmedArticle"):
            resp = self.parse_record(article)
            # NOTE(review): resp is a ReleaseEntity; json.dumps() may need
            # it converted to a plain dict first -- confirm against the
            # fatcat_client model API
            print(json.dumps(resp))


if __name__ == '__main__':
    parser = PubmedImporter(None, None)
    with open(sys.argv[1]) as xml_file:
        parser.parse_file(xml_file)