From 300665927f578151321b0d91b28f8aadffcf227d Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 15 May 2019 22:36:01 -0700 Subject: initial pubmed importer --- python/fatcat_tools/importers/__init__.py | 5 +- python/fatcat_tools/importers/pubmed.py | 512 ++++++++++++++++++++++++++++++ python/parse_pubmed_xml.py | 372 ---------------------- python/tests/import_pubmed.py | 80 +++++ 4 files changed, 595 insertions(+), 374 deletions(-) create mode 100644 python/fatcat_tools/importers/pubmed.py delete mode 100644 python/parse_pubmed_xml.py create mode 100644 python/tests/import_pubmed.py diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index 8ec219f8..6f8849d6 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -12,11 +12,12 @@ To run an import you combine two classes; one each of: """ -from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk -from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP +from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk, LANG_MAP_MARC +from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP, lookup_license_slug from .jalc import JalcImporter from .jstor import JstorImporter from .arxiv import ArxivRawImporter +from .pubmed import PubmedImporter from .grobid_metadata import GrobidMetadataImporter from .journal_metadata import JournalMetadataImporter from .matched import MatchedImporter diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py new file mode 100644 index 00000000..1feb41cd --- /dev/null +++ b/python/fatcat_tools/importers/pubmed.py @@ -0,0 +1,512 @@ + +import sys +import json +import sqlite3 +import datetime +import warnings +from bs4 import BeautifulSoup +from bs4.element import NavigableString + +import fatcat_client +from .common import EntityImporter, clean, LANG_MAP_MARC + +# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly +PUBMED_RELEASE_TYPE_MAP = { + #Adaptive Clinical Trial + "Address": "speech", + "Autobiography": "book", + #Bibliography + "Biography": "book", + #Case Reports + "Classical Article": "article-journal", + #Clinical Conference + #Clinical Study + #Clinical Trial + #Clinical Trial, Phase I + #Clinical Trial, Phase II + #Clinical Trial, Phase III + #Clinical Trial, Phase IV + #Clinical Trial Protocol + #Clinical Trial, Veterinary + #Collected Works + #Comparative Study + #Congress + #Consensus Development Conference + #Consensus Development Conference, NIH + #Controlled Clinical Trial + "Dataset": "dataset", + #Dictionary + #Directory + #Duplicate Publication + "Editorial": "editorial", + #English Abstract # doesn't indicate that this is abstract-only + #Equivalence Trial + #Evaluation Studies + #Expression of Concern + #Festschrift + #Government Document + #Guideline + "Historical Article": "article-journal", + #Interactive Tutorial + "Interview": "interview", + "Introductory Journal Article": "article-journal", + "Journal Article": "article-journal", + "Lecture": "speech", + "Legal Case": "legal_case", + "Legislation": "legislation", + "Letter": "letter", + #Meta-Analysis + #Multicenter Study + #News + "Newspaper Article": "article-newspaper", + #Observational Study + #Observational Study, Veterinary + #Overall + #Patient Education Handout + #Periodical Index + #Personal Narrative + #Portrait + #Practice Guideline + #Pragmatic Clinical Trial + #Publication Components + #Publication Formats + #Publication Type Category + #Randomized Controlled Trial + #Research Support, American Recovery and Reinvestment Act + #Research Support, N.I.H., Extramural + #Research Support, N.I.H., Intramural + #Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S. + #Research Support, U.S. Gov't, P.H.S. + #Review # in the "literature review" sense, not "product review" + #Scientific Integrity Review + #Study Characteristics + #Support of Research + #Systematic Review + "Technical Report": "report", + #Twin Study + #Validation Studies + #Video-Audio Media + #Webcasts +} + +MONTH_ABBR_MAP = { + "Jan": 1, "01": 1, + "Feb": 2, "02": 2, + "Mar": 3, "03": 3, + "Apr": 4, "04": 4, + "May": 5, "05": 5, + "Jun": 6, "06": 6, + "Jul": 7, "07": 7, + "Aug": 8, "08": 8, + "Sep": 9, "09": 9, + "Oct": 10, "10": 10, + "Nov": 11, "11": 11, + "Dec": 12, "12": 12, +} + + +class PubmedImporter(EntityImporter): + """ + Importer for PubMed/MEDLINE XML metadata. + + TODO: MEDLINE doesn't include PMC/OA license; could include in importer? + TODO: clean (ftfy) title, original title, etc + XXX: withdrawn + XXX: full author names + """ + + def __init__(self): + pass + + def __init__(self, api, issn_map_file, **kwargs): + + eg_desc = kwargs.get('editgroup_description', + "Automated import of PubMed/MEDLINE XML metadata") + eg_extra = kwargs.get('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.PubmedImporter') + super().__init__(api, + issn_map_file=issn_map_file, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs) + + extid_map_file = kwargs.get('extid_map_file') + self.extid_map_db = None + if extid_map_file: + db_uri = "file:{}?mode=ro".format(extid_map_file) + print("Using external ID map: {}".format(db_uri)) + self.extid_map_db = sqlite3.connect(db_uri, uri=True) + else: + print("Not using external ID map") + + self.create_containers = kwargs.get('create_containers') + self.read_issn_map_file(issn_map_file) + + def lookup_ext_ids(self, pmid): + if self.extid_map_db is None: + return dict(doi=None, core_id=None, pmid=None, pmcid=None, + wikidata_qid=None, arxiv_id=None, jstor_id=None) + row = self.extid_map_db.execute("SELECT core, doi, pmcid, wikidata FROM ids WHERE pmid=? LIMIT 1", + [pmid]).fetchone() + if row is None: + return dict(doi=None, core_id=None, pmid=None, pmcid=None, + wikidata_qid=None, arxiv_id=None, jstor_id=None) + row = [str(cell or '') or None for cell in row] + return dict( + core_id=row[0], + doi=row[1], + pmcid=row[2], + wikidata_qid=row[3], + # TODO: + arxiv_id=None, + jstor_id=None, + ) + + def want(self, obj): + return True + + def parse_record(self, a): + + medline = a.MedlineCitation + # PubmedData isn't required by DTD, but seems to always be present + pubmed = a.PubmedData + extra = dict() + extra_pubmed = dict() + + identifiers = pubmed.ArticleIdList + pmid = medline.PMID.string.strip() + doi = identifiers.find("ArticleId", IdType="doi") + if doi: + doi = doi.string.lower() + + pmcid = identifiers.find("ArticleId", IdType="pmc") + if pmcid: + pmcid = pmcid.string + + release_type = None + pub_types = [] + for pub_type in medline.Article.PublicationTypeList.find_all("PublicationType"): + pub_types.append(pub_type.string) + if pub_type.string in PUBMED_RELEASE_TYPE_MAP: + release_type = PUBMED_RELEASE_TYPE_MAP[pub_type.string] + break + if pub_types: + extra_pubmed['pub_types'] = pub_types + if medline.Article.PublicationTypeList.find(string="Retraction of Publication"): + release_type = "retraction" + retraction_of = medline.find("CommentsCorrections", RefType="RetractionOf") + if retraction_of: + extra_pubmed['retraction_of_raw'] = retraction_of.RefSource.string + extra_pubmed['retraction_of_pmid'] = retraction_of.PMID.string + + # everything in medline is published + release_stage = "published" + if medline.Article.PublicationTypeList.find(string="Corrected and Republished Article"): + release_stage = "updated" + if medline.Article.PublicationTypeList.find(string="Retraction of Publication"): + release_stage = "retraction" + if medline.Article.PublicationTypeList.find(string="Retracted Publication"): + withdrawn_status = "retracted" + + pages = medline.find('MedlinePgn') + if pages: + pages = pages.string + + title = medline.Article.ArticleTitle.string # always present + if title: + if title.endswith('.'): + title = title[:-1] + # this hides some "special" titles, but the vast majority are + # translations; translations don't always include the original_title + if title.startswith('[') and title.endswith(']'): + title = title[1:-1] + else: + # TODO: will filter out later + title = None + + original_title = medline.Article.find("VernacularTitle", recurse=False) + if original_title: + original_title = original_title.string or None + if original_title and original_title.endswith('.'): + original_title = original_title[:-1] + + # TODO: happening in alpha order, not handling multi-language well. + # also need to convert lang codes: https://www.nlm.nih.gov/bsd/language_table.html + language = medline.Article.Language + if language: + language = language.string + # TODO: map to two-letter + if language in ("und", "un"): + # "undetermined" + language = None + else: + language = LANG_MAP_MARC.get(language) + if not language: + warnings.warn("MISSING MARC LANG: {}".format(medline.Article.Language.string)) + + ### Journal/Issue Metadata + # MedlineJournalInfo is always present + issnl = None + container_id = None + container_name = None + container_extra = dict() + mji = medline.MedlineJournalInfo + if mji.find("Country"): + container_extra['country_name'] = mji.Country.string + if mji.find("ISSNLinking"): + issnl = mji.ISSNLinking.string + + journal = medline.Article.Journal + issnp = journal.find("ISSN", IssnType="Print") + if issnp: + container_extra['issnp'] = issnp.string + if not issnl: + issnll = self.issn2issnl(issnp) + + if issnl: + container_id = self.lookup_issnl(issnl) + + pub_date = journal.PubDate + release_date = None + release_year = None + if pub_date.Year: + release_year = int(pub_date.Year.string) + if pub_date.find("Day") and pub_date.find("Month"): + release_date = datetime.date( + release_year, + MONTH_ABBR_MAP[pub_date.Month.string], + int(pub_date.Day.string)) + release_date = release_date.isoformat() + elif pub_date.find("MedlineDate") and False: #XXX more/better date parsing? + release_year = int(pub_date.MedlineDate.string.split()[0][:4]) + + if journal.find("Title"): + container_name = journal.Title.string + + if (container_id is None and self.create_containers and (issnl is not None) + and container_name): + # name, type, publisher, issnl + # extra: issnp, issne, original_name, languages, country + ce = fatcat_client.ContainerEntity( + name=container_name, + container_type='journal', + #XXX: publisher not included? + issnl=issnl, + extra=(container_extra or None)) + ce_edit = self.create_container(ce) + container_id = ce_edit.ident + + ji = journal.JournalIssue + volume = None + if ji.find("Volume"): + volume = ji.Volume.string + issue = None + if ji.find("Issue"): + issue = ji.Issue.string + + ### Abstracts + # "All abstracts are in English" + abstracts = [] + first_abstract = medline.find("AbstractText") + if first_abstract and first_abstract.get('NlmCategory'): + joined = "\n".join([m.get_text() for m in medline.find_all("AbstractText")]) + abstracts.append(fatcat_client.ReleaseAbstract( + content=joined, + mimetype="text/plain", + lang="en", + )) + else: + for abstract in medline.find_all("AbstractText"): + abstracts.append(fatcat_client.ReleaseAbstract( + content=abstract.get_text().strip(), + mimetype="text/plain", + lang="en", + )) + if abstract.find('math'): + abstracts.append(fatcat_client.ReleaseAbstract( + # strip the tags + content=str(abstract)[14:-15], + mimetype="application/mathml+xml", + lang="en", + )) + if not abstracts: + abstracts = None + + ### Contribs + contribs = [] + if medline.AuthorList: + for author in medline.AuthorList.find_all("Author"): + given_name = None + surname = None + raw_name = None + if author.ForeName: + given_name = author.ForeName.string + if author.LastName: + surname = author.LastName.string + if given_name and surname: + raw_name = "{} {}".format(given_name, surname) + elif surname: + raw_name = surname + contrib_extra = dict() + orcid = author.find("Identifier", Source="ORCID") + if orcid: + # needs re-formatting from, eg, "0000000179841889" + orcid = orcid.string + if orcid.startswith("http://orcid.org/"): + orcid = orcid.replace("http://orcid.org/", "") + elif orcid.startswith("https://orcid.org/"): + orcid = orcid.replace("https://orcid.org/", "") + elif not '-' in orcid: + orcid = "{}-{}-{}-{}".format( + orcid[0:4], + orcid[4:8], + orcid[8:12], + orcid[12:16], + ) + # XXX: do lookup by ORCID + #contrib_extra['orcid'] = orcid + affiliation = author.find("Affiliation") + raw_affiliation = None + if affiliation: + raw_affiliation = affiliation.string + if author.find("EqualContrib"): + # TODO: schema for this? + contrib_extra['equal_contrib'] = True + contribs.append(fatcat_client.ReleaseContrib( + raw_name=raw_name, + given_name=given_name, + surname=surname, + role="author", + raw_affiliation=raw_affiliation, + extra=contrib_extra, + )) + + if medline.AuthorList['CompleteYN'] == 'N': + contribs.append(fatcat_client.ReleaseContrib(raw_name="et al.")) + if not contribs: + contribs = None + + ### References + refs = [] + if pubmed.ReferenceList: + for ref in pubmed.ReferenceList.find_all('Reference'): + ref_obj = dict() + ref_extra = dict() + ref_pmid = ref.find("ArticleId", IdType="pubmed") + if ref_pmid: + ref_extra['pmid'] = ref_pmid.string + # TODO: do reference lookups here based on PMID/DOI + ref_raw = ref.Citation + if ref_raw: + ref_extra['unstructured'] = ref_raw.string + if ref_extra: + ref_obj['extra'] = ref_extra + refs.append(fatcat_client.ReleaseRef( + extra=ref_obj.get('extra'), + )) + if not refs: + refs = None + + # extra: + # withdrawn_date + # translation_of + # subtitle + # aliases + # container_name + # group-title + # pubmed: retraction refs + if extra_pubmed: + extra['pubmed'] = extra_pubmed + if not extra: + extra = None + + re = fatcat_client.ReleaseEntity( + work_id=None, + title=clean(title), + original_title=clean(original_title), + release_type=release_type, + release_stage=release_stage, + release_date=release_date, + release_year=release_year, + ext_ids=fatcat_client.ReleaseExtIds( + doi=doi, + pmid=pmid, + pmcid=pmcid, + #isbn13 # never in Article + ), + volume=volume, + issue=issue, + pages=pages, + #publisher # not included? + language=language, + #license_slug # not in MEDLINE + abstracts=abstracts, + contribs=contribs, + refs=refs, + container_id=container_id, + extra=extra, + ) + return re + + def try_update(self, re): + + # first, lookup existing by PMID (which must be defined) + existing = None + try: + existing = self.api.lookup_release(pmid=re.ext_ids.pmid) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + + # then try DOI lookup if there is one + if not existing and re.ext_ids.doi: + try: + existing = self.api.lookup_release(doi=re.ext_ids.doi) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + if existing and existing.ext_ids.pmid and existing.ext_ids.pmid != re.ext_ids.pmid: + warnings.warn("PMID/DOI mismatch: release {}, pmid {} != {}".format( + existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid)) + self.counts['exists-pmid-doi-mismatch'] += 1 + return False + + if existing and existing.ext_ids.pmid and existing.refs: + # TODO: any other reasons to do an update? + # don't update if it already has PMID + self.counts['exists'] += 1 + return False + elif existing: + # but do update if only DOI was set + existing.ext_ids.doi = existing.ext_ids.doi or re.ext_ids.doi + existing.ext_ids.pmid = existing.ext_ids.pmid or re.ext_ids.pmid + existing.ext_ids.pmcid = existing.ext_ids.pmcid or re.ext_ids.pmcid + existing.refs = existing.refs or re.refs + existing.extra['pubmed'] = re.extra['pubmed'] + self.api.update_release(self.get_editgroup_id(), existing.ident, existing) + self.counts['update'] += 1 + return False + + return True + + def insert_batch(self, batch): + self.api.create_release_auto_batch(fatcat_client.ReleaseAutoBatch( + editgroup=fatcat_client.Editgroup( + description=self.editgroup_description, + extra=self.editgroup_extra), + entity_list=batch)) + + def parse_file(self, handle): + + # 1. open with beautiful soup + soup = BeautifulSoup(handle, "xml") + + # 2. iterate over articles, call parse_article on each + for article in soup.find_all("PubmedArticle"): + resp = self.parse_article(article) + print(json.dumps(resp)) + #sys.exit(-1) + +if __name__=='__main__': + parser = PubMedParser() + parser.parse_file(open(sys.argv[1])) diff --git a/python/parse_pubmed_xml.py b/python/parse_pubmed_xml.py deleted file mode 100644 index 413333cc..00000000 --- a/python/parse_pubmed_xml.py +++ /dev/null @@ -1,372 +0,0 @@ - -import sys -import json -import datetime -from bs4 import BeautifulSoup -from bs4.element import NavigableString - -# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly -PUBMED_RELEASE_TYPE_MAP = { - #Adaptive Clinical Trial - "Address": "speech", - "Autobiography": "book", - #Bibliography - "Biography": "book", - #Case Reports - "Classical Article": "article-journal", - #Clinical Conference - #Clinical Study - #Clinical Trial - #Clinical Trial, Phase I - #Clinical Trial, Phase II - #Clinical Trial, Phase III - #Clinical Trial, Phase IV - #Clinical Trial Protocol - #Clinical Trial, Veterinary - #Collected Works - #Comparative Study - #Congress - #Consensus Development Conference - #Consensus Development Conference, NIH - #Controlled Clinical Trial - "Dataset": "dataset", - #Dictionary - #Directory - #Duplicate Publication - "Editorial": "editorial", - #English Abstract # doesn't indicate that this is abstract-only - #Equivalence Trial - #Evaluation Studies - #Expression of Concern - #Festschrift - #Government Document - #Guideline - "Historical Article": "article-journal", - #Interactive Tutorial - "Interview": "interview", - "Introductory Journal Article": "article-journal", - "Journal Article": "article-journal", - "Lecture": "speech", - "Legal Case": "legal_case", - "Legislation": "legislation", - "Letter": "letter", - #Meta-Analysis - #Multicenter Study - #News - "Newspaper Article": "article-newspaper", - #Observational Study - #Observational Study, Veterinary - #Overall - #Patient Education Handout - #Periodical Index - #Personal Narrative - #Portrait - #Practice Guideline - #Pragmatic Clinical Trial - #Publication Components - #Publication Formats - #Publication Type Category - #Randomized Controlled Trial - #Research Support, American Recovery and Reinvestment Act - #Research Support, N.I.H., Extramural - #Research Support, N.I.H., Intramural - #Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S. - #Research Support, U.S. Gov't, P.H.S. - #Review # in the "literature review" sense, not "product review" - #Scientific Integrity Review - #Study Characteristics - #Support of Research - #Systematic Review - "Technical Report": "report", - #Twin Study - #Validation Studies - #Video-Audio Media - #Webcasts -} - -MONTH_ABBR_MAP = { - "Jan": 1, "01": 1, - "Feb": 2, "02": 2, - "Mar": 3, "03": 3, - "Apr": 4, "04": 4, - "May": 5, "05": 5, - "Jun": 6, "06": 6, - "Jul": 7, "07": 7, - "Aug": 8, "08": 8, - "Sep": 9, "09": 9, - "Oct": 10, "10": 10, - "Nov": 11, "11": 11, - "Dec": 12, "12": 12, -} - -class PubMedParser(): - """ - Converts PubMed/MEDLINE XML into in release entity (which can dump as JSON) - - TODO: MEDLINE doesn't include PMC/OA license; could include in importer? - TODO: clean (ftfy) title, original title, etc - """ - - def __init__(self): - pass - - def parse_file(self, handle): - - # 1. open with beautiful soup - soup = BeautifulSoup(handle, "xml") - - # 2. iterate over articles, call parse_article on each - for article in soup.find_all("PubmedArticle"): - resp = self.parse_article(article) - print(json.dumps(resp)) - #sys.exit(-1) - - def parse_article(self, a): - - medline = a.MedlineCitation - # PubmedData isn't required by DTD, but seems to always be present - pubmed = a.PubmedData - extra = dict() - extra_pubmed = dict() - - identifiers = pubmed.ArticleIdList - doi = identifiers.find("ArticleId", IdType="doi") - if doi: - doi = doi.string.lower() - - pmcid = identifiers.find("ArticleId", IdType="pmc") - if pmcid: - pmcid = pmcid.string - - release_type = None - for pub_type in medline.Article.PublicationTypeList.find_all("PublicationType"): - if pub_type.string in PUBMED_RELEASE_TYPE_MAP: - release_type = PUBMED_RELEASE_TYPE_MAP[pub_type.string] - break - if medline.Article.PublicationTypeList.find(string="Retraction of Publication"): - release_type = "retraction" - retraction_of = medline.find("CommentsCorrections", RefType="RetractionOf") - if retraction_of: - extra_pubmed['retraction_of_raw'] = retraction_of.RefSource.string - extra_pubmed['retraction_of_pmid'] = retraction_of.PMID.string - - # everything in medline is published - release_status = "published" - if medline.Article.PublicationTypeList.find(string="Corrected and Republished Article"): - release_status = "updated" - if medline.Article.PublicationTypeList.find(string="Retracted Publication"): - release_status = "retracted" - - pages = medline.find('MedlinePgn') - if pages: - pages = pages.string - - title = medline.Article.ArticleTitle.string # always present - if title: - if title.endswith('.'): - title = title[:-1] - # this hides some "special" titles, but the vast majority are - # translations; translations don't always include the original_title - if title.startswith('[') and title.endswith(']'): - title = title[1:-1] - else: - # TODO: will filter out later - title = None - - original_title = medline.Article.find("VernacularTitle", recurse=False) - if original_title: - original_title = original_title.string or None - if original_title and original_title.endswith('.'): - original_title = original_title[:-1] - - # TODO: happening in alpha order, not handling multi-language well. - # also need to convert lang codes: https://www.nlm.nih.gov/bsd/language_table.html - language = medline.Article.Language - if language: - language = language.string - # TODO: map to two-letter - if language in ("und", "un"): - # "undetermined" - language = None - - ### Journal/Issue Metadata - # MedlineJournalInfo is always present - container = dict() - container_extra = dict() - mji = medline.MedlineJournalInfo - if mji.find("Country"): - container_extra['country_name'] = mji.Country.string - if mji.find("ISSNLinking"): - container['issnl'] = mji.ISSNLinking.string - - journal = medline.Article.Journal - issnp = journal.find("ISSN", IssnType="Print") - if issnp: - container_extra['issnp'] = issnp.string - - pub_date = journal.PubDate - release_date = None - if pub_date.find("MedlineDate"): - release_year = int(pub_date.MedlineDate.string.split()[0][:4]) - else: - release_year = int(pub_date.Year.string) - if pub_date.find("Day") and pub_date.find("Month"): - release_date = datetime.date( - release_year, - MONTH_ABBR_MAP[pub_date.Month.string], - int(pub_date.Day.string)) - release_date = release_date.isoformat() - - ji = journal.JournalIssue - volume = None - if ji.find("Volume"): - volume = ji.Volume.string - issue = None - if ji.find("Issue"): - issue = ji.Issue.string - if journal.find("Title"): - container['name'] = journal.Title.string - - if extra_pubmed: - extra['pubmed'] = extra_pubmed - if not extra: - extra = None - - ### Abstracts - # "All abstracts are in English" - abstracts = [] - first_abstract = medline.find("AbstractText") - if first_abstract and first_abstract.get('NlmCategory'): - joined = "\n".join([m.get_text() for m in medline.find_all("AbstractText")]) - abstracts.append(dict( - content=joined, - mimetype="text/plain", - lang="en", - )) - else: - for abstract in medline.find_all("AbstractText"): - abstracts.append(dict( - content=abstract.get_text().strip(), - mimetype="text/plain", - lang="en", - )) - if abstract.find('math'): - abstracts.append(dict( - # strip the tags - content=str(abstract)[14:-15], - mimetype="application/mathml+xml", - lang="en", - )) - if not abstracts: - abstracts = None - - ### Contribs - contribs = [] - if medline.AuthorList: - for author in medline.AuthorList.find_all("Author"): - contrib = dict( - role="author", - ) - if author.ForeName: - contrib['raw_name'] = "{} {}".format(author.ForeName.string, author.LastName.string) - elif author.LastName: - contrib['raw_name'] = author.LastName.string - contrib_extra = dict() - orcid = author.find("Identifier", Source="ORCID") - if orcid: - # needs re-formatting from, eg, "0000000179841889" - orcid = orcid.string - if orcid.startswith("http://orcid.org/"): - orcid = orcid.replace("http://orcid.org/", "") - elif orcid.startswith("https://orcid.org/"): - orcid = orcid.replace("https://orcid.org/", "") - elif not '-' in orcid: - orcid = "{}-{}-{}-{}".format( - orcid[0:4], - orcid[4:8], - orcid[8:12], - orcid[12:16], - ) - contrib_extra['orcid'] = orcid - affiliation = author.find("Affiliation") - if affiliation: - contrib['raw_affiliation'] = affiliation.string - if author.find("EqualContrib"): - # TODO: schema for this? - contrib_extra['equal_contrib'] = True - if contrib_extra: - contrib['extra'] = contrib_extra - contribs.append(contrib) - - if medline.AuthorList['CompleteYN'] == 'N': - contribs.append(dict(raw_name="et al.")) - if not contribs: - contribs = None - - ### References - refs = [] - if pubmed.ReferenceList: - for ref in pubmed.ReferenceList.find_all('Reference'): - ref_obj = dict() - ref_extra = dict() - ref_pmid = ref.find("ArticleId", IdType="pubmed") - if ref_pmid: - ref_extra['pmid'] = ref_pmid.string - ref_raw = ref.Citation - if ref_raw: - ref_extra['raw'] = ref_raw.string - if ref_extra: - ref_obj['extra'] = ref_extra - refs.append(ref_obj) - if not refs: - refs = None - - re = dict( - work_id=None, - title=title, - original_title=original_title, - release_type=release_type, - release_status=release_status, - release_date=release_date, - release_year=release_year, - doi=doi, - pmid=int(medline.PMID.string), # always present - pmcid=pmcid, - #isbn13 # never in Article - volume=volume, - issue=issue, - pages=pages, - #publisher # not included? - language=language, - #license_slug # not in MEDLINE - - # content, mimetype, lang - abstracts=abstracts, - - # raw_name, role, raw_affiliation, extra - contribs=contribs, - - # key, year, container_name, title, locator - # extra: volume, authors, issue, publisher, identifiers - refs=refs, - - # name, type, publisher, issnl - # extra: issnp, issne, original_name, languages, country - container=container, - - # extra: - # withdrawn_date - # translation_of - # subtitle - # aliases - # container_name - # group-title - # pubmed: retraction refs - extra=extra, - ) - - return re - -if __name__=='__main__': - parser = PubMedParser() - parser.parse_file(open(sys.argv[1])) diff --git a/python/tests/import_pubmed.py b/python/tests/import_pubmed.py new file mode 100644 index 00000000..eacc3815 --- /dev/null +++ b/python/tests/import_pubmed.py @@ -0,0 +1,80 @@ + +import json, gzip +import pytest +from fatcat_tools.importers import PubmedImporter, Bs4XmlFilePusher +from fixtures import api +from bs4 import BeautifulSoup + + +@pytest.fixture(scope="function") +def pubmed_importer(api): + with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: + yield PubmedImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=True) + +@pytest.fixture(scope="function") +def pubmed_importer_existing(api): + with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: + yield PubmedImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=False) + +def test_pubmed_importer(pubmed_importer): + last_index = pubmed_importer.api.get_changelog(limit=1)[0].index + with open('tests/files/pubmedsample_2019.xml', 'r') as f: + pubmed_importer.bezerk_mode = True + counts = Bs4XmlFilePusher(pubmed_importer, f, "PubmedArticle").run() + assert counts['insert'] == 1 + assert counts['exists'] == 0 + assert counts['skip'] == 0 + + # fetch most recent editgroup + change = pubmed_importer.api.get_changelog_entry(index=last_index+1) + eg = change.editgroup + assert eg.description + assert "pubmed" in eg.description.lower() + assert eg.extra['git_rev'] + assert "fatcat_tools.PubmedImporter" in eg.extra['agent'] + + last_index = pubmed_importer.api.get_changelog(limit=1)[0].index + with open('tests/files/pubmedsample_2019.xml', 'r') as f: + pubmed_importer.bezerk_mode = False + pubmed_importer.reset() + counts = Bs4XmlFilePusher(pubmed_importer, f, "PubmedArticle").run() + assert counts['insert'] == 0 + assert counts['exists'] == 1 + assert counts['skip'] == 0 + assert last_index == pubmed_importer.api.get_changelog(limit=1)[0].index + +def test_pubmed_xml_parse(pubmed_importer): + with open('tests/files/pubmedsample_2019.xml', 'r') as f: + soup = BeautifulSoup(f, "xml") + r1 = pubmed_importer.parse_record(soup.find_all("PubmedArticle")[0]) + r2 = pubmed_importer.parse_record(soup.find_all("PubmedArticle")[-1]) + + assert r1.title == "Hospital debt management and cost reimbursement" + assert r1.subtitle == None + assert r1.original_title == None + assert r1.publisher == None + assert r1.release_type == "article-journal" + assert r1.release_stage == "published" + assert r1.license_slug == None + assert r1.ext_ids.doi == None + assert r1.ext_ids.pmid == "973217" + assert r1.language == "en" + assert r1.volume == "3" + assert r1.issue == "1" + assert r1.pages == "69-81" + assert r1.release_date == None # not "1976-12-03", which is medline ingest date + assert r1.release_year == 1976 + # matched by ISSN, so shouldn't be in there? + #assert extra['container_name'] == "Abstracts of the Papers Communicated to the Royal Society of London" + assert len(r1.contribs) == 1 + + assert r1.contribs[0].raw_name == "F R Blume" + assert r1.contribs[0].given_name == "F R" + assert r1.contribs[0].surname == "Blume" + + print(r1.extra) + # TODO: assert r1.extra['pubmed']['mesh_topics'] == ['Accounting', 'Economics, Hospital', 'Hospital Administration'] + assert r1.extra['pubmed']['pub_types'] == ['Journal Article'] + assert not r1.refs + + # XXX: r2 tests -- cgit v1.2.3