diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-02 18:14:59 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-02 18:14:59 -0700 |
commit | 31d1a6a713d177990609767d508209ced19ca396 (patch) | |
tree | a628a57bdb373669394a6b520102b1b4b5ffe7da /python/fatcat_tools/importers/pubmed.py | |
parent | 9dc891b8098542bb089c8c47098b60a8beb76a53 (diff) | |
download | fatcat-31d1a6a713d177990609767d508209ced19ca396.tar.gz fatcat-31d1a6a713d177990609767d508209ced19ca396.zip |
fmt (black): fatcat_tools/
Diffstat (limited to 'python/fatcat_tools/importers/pubmed.py')
-rw-r--r-- | python/fatcat_tools/importers/pubmed.py | 355 |
1 files changed, 197 insertions, 158 deletions
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 00ad54d0..cfdafcf7 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -1,4 +1,3 @@ - import datetime import json import sys @@ -13,42 +12,42 @@ from .common import LANG_MAP_MARC, EntityImporter, clean # from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly PUBMED_RELEASE_TYPE_MAP = { - #Adaptive Clinical Trial + # Adaptive Clinical Trial "Address": "speech", "Autobiography": "book", - #Bibliography + # Bibliography "Biography": "book", - #Case Reports + # Case Reports "Classical Article": "article-journal", - #Clinical Conference - #Clinical Study - #Clinical Trial - #Clinical Trial, Phase I - #Clinical Trial, Phase II - #Clinical Trial, Phase III - #Clinical Trial, Phase IV - #Clinical Trial Protocol - #Clinical Trial, Veterinary - #Collected Works - #Comparative Study - #Congress - #Consensus Development Conference - #Consensus Development Conference, NIH - #Controlled Clinical Trial + # Clinical Conference + # Clinical Study + # Clinical Trial + # Clinical Trial, Phase I + # Clinical Trial, Phase II + # Clinical Trial, Phase III + # Clinical Trial, Phase IV + # Clinical Trial Protocol + # Clinical Trial, Veterinary + # Collected Works + # Comparative Study + # Congress + # Consensus Development Conference + # Consensus Development Conference, NIH + # Controlled Clinical Trial "Dataset": "dataset", - #Dictionary - #Directory - #Duplicate Publication + # Dictionary + # Directory + # Duplicate Publication "Editorial": "editorial", - #English Abstract # doesn't indicate that this is abstract-only - #Equivalence Trial - #Evaluation Studies - #Expression of Concern - #Festschrift - #Government Document - #Guideline + # English Abstract # doesn't indicate that this is abstract-only + # Equivalence Trial + # Evaluation Studies + # Expression of Concern + # Festschrift + # Government Document + # Guideline "Historical Article": "article-journal", - #Interactive Tutorial + # Interactive Tutorial "Interview": "interview", "Introductory Journal Article": "article-journal", "Journal Article": "article-journal", @@ -56,53 +55,65 @@ PUBMED_RELEASE_TYPE_MAP = { "Legal Case": "legal_case", "Legislation": "legislation", "Letter": "letter", - #Meta-Analysis - #Multicenter Study - #News + # Meta-Analysis + # Multicenter Study + # News "Newspaper Article": "article-newspaper", - #Observational Study - #Observational Study, Veterinary - #Overall - #Patient Education Handout - #Periodical Index - #Personal Narrative - #Portrait - #Practice Guideline - #Pragmatic Clinical Trial - #Publication Components - #Publication Formats - #Publication Type Category - #Randomized Controlled Trial - #Research Support, American Recovery and Reinvestment Act - #Research Support, N.I.H., Extramural - #Research Support, N.I.H., Intramural - #Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S. - #Research Support, U.S. Gov't, P.H.S. - #Review # in the "literature review" sense, not "product review" - #Scientific Integrity Review - #Study Characteristics - #Support of Research - #Systematic Review + # Observational Study + # Observational Study, Veterinary + # Overall + # Patient Education Handout + # Periodical Index + # Personal Narrative + # Portrait + # Practice Guideline + # Pragmatic Clinical Trial + # Publication Components + # Publication Formats + # Publication Type Category + # Randomized Controlled Trial + # Research Support, American Recovery and Reinvestment Act + # Research Support, N.I.H., Extramural + # Research Support, N.I.H., Intramural + # Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S. + # Research Support, U.S. Gov't, P.H.S. + # Review # in the "literature review" sense, not "product review" + # Scientific Integrity Review + # Study Characteristics + # Support of Research + # Systematic Review "Technical Report": "report", - #Twin Study - #Validation Studies - #Video-Audio Media - #Webcasts + # Twin Study + # Validation Studies + # Video-Audio Media + # Webcasts } MONTH_ABBR_MAP = { - "Jan": 1, "01": 1, - "Feb": 2, "02": 2, - "Mar": 3, "03": 3, - "Apr": 4, "04": 4, - "May": 5, "05": 5, - "Jun": 6, "06": 6, - "Jul": 7, "07": 7, - "Aug": 8, "08": 8, - "Sep": 9, "09": 9, - "Oct": 10, "10": 10, - "Nov": 11, "11": 11, - "Dec": 12, "12": 12, + "Jan": 1, + "01": 1, + "Feb": 2, + "02": 2, + "Mar": 3, + "03": 3, + "Apr": 4, + "04": 4, + "May": 5, + "05": 5, + "Jun": 6, + "06": 6, + "Jul": 7, + "07": 7, + "Aug": 8, + "08": 8, + "Sep": 9, + "09": 9, + "Oct": 10, + "10": 10, + "Nov": 11, + "11": 11, + "Dec": 12, + "12": 12, } # From: https://www.ncbi.nlm.nih.gov/books/NBK7249/ @@ -295,11 +306,10 @@ COUNTRY_NAME_MAP = { "United Kingdom": "gb", "United States": "us", "Uruguay": "uy", - # Additions from running over large files "Bosnia and Herzegovina": "ba", - #"International" - "China (Republic : 1949- )": "tw", # pretty sure this is tw not cn + # "International" + "China (Republic : 1949- )": "tw", # pretty sure this is tw not cn "Russia (Federation)": "ru", "Scotland": "gb", "England": "gb", @@ -320,18 +330,21 @@ class PubmedImporter(EntityImporter): def __init__(self, api, issn_map_file, lookup_refs=True, **kwargs): - eg_desc = kwargs.get('editgroup_description', - "Automated import of PubMed/MEDLINE XML metadata") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.PubmedImporter') - super().__init__(api, + eg_desc = kwargs.get( + "editgroup_description", "Automated import of PubMed/MEDLINE XML metadata" + ) + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.PubmedImporter") + super().__init__( + api, issn_map_file=issn_map_file, editgroup_description=eg_desc, editgroup_extra=eg_extra, - **kwargs) + **kwargs + ) self.lookup_refs = lookup_refs - self.create_containers = kwargs.get('create_containers', True) + self.create_containers = kwargs.get("create_containers", True) self.read_issn_map_file(issn_map_file) def want(self, obj): @@ -365,15 +378,15 @@ class PubmedImporter(EntityImporter): release_type = PUBMED_RELEASE_TYPE_MAP[pub_type.string] break if pub_types: - extra_pubmed['pub_types'] = pub_types + extra_pubmed["pub_types"] = pub_types if medline.Article.PublicationTypeList.find(string="Retraction of Publication"): release_type = "retraction" retraction_of = medline.find("CommentsCorrections", RefType="RetractionOf") if retraction_of: if retraction_of.RefSource: - extra_pubmed['retraction_of_raw'] = retraction_of.RefSource.string + extra_pubmed["retraction_of_raw"] = retraction_of.RefSource.string if retraction_of.PMID: - extra_pubmed['retraction_of_pmid'] = retraction_of.PMID.string + extra_pubmed["retraction_of_pmid"] = retraction_of.PMID.string # everything in medline is published release_stage = "published" @@ -388,18 +401,18 @@ class PubmedImporter(EntityImporter): elif medline.find("CommentsCorrections", RefType="ExpressionOfConcernIn"): withdrawn_status = "concern" - pages = medline.find('MedlinePgn') + pages = medline.find("MedlinePgn") if pages: pages = pages.string - title = medline.Article.ArticleTitle.get_text() # always present + title = medline.Article.ArticleTitle.get_text() # always present if title: - title = title.replace('\n', ' ') - if title.endswith('.'): + title = title.replace("\n", " ") + if title.endswith("."): title = title[:-1] # this hides some "special" titles, but the vast majority are # translations; translations don't always include the original_title - if title.startswith('[') and title.endswith(']'): + if title.startswith("[") and title.endswith("]"): title = title[1:-1] else: # will filter out later @@ -408,8 +421,8 @@ class PubmedImporter(EntityImporter): original_title = medline.Article.find("VernacularTitle", recurse=False) if original_title: original_title = original_title.get_text() or None - original_title = original_title.replace('\n', ' ') - if original_title and original_title.endswith('.'): + original_title = original_title.replace("\n", " ") + if original_title and original_title.endswith("."): original_title = original_title[:-1] if original_title and not title: @@ -428,7 +441,9 @@ class PubmedImporter(EntityImporter): else: language = LANG_MAP_MARC.get(language) if not language and not (medline.Article.Language.get_text() in LANG_MAP_MARC): - warnings.warn("MISSING MARC LANG: {}".format(medline.Article.Language.string)) + warnings.warn( + "MISSING MARC LANG: {}".format(medline.Article.Language.string) + ) ### Journal/Issue Metadata # MedlineJournalInfo is always present @@ -441,9 +456,9 @@ class PubmedImporter(EntityImporter): country_name = mji.Country.string.strip() country_code = COUNTRY_NAME_MAP.get(country_name) if country_code: - container_extra['country'] = country_code + container_extra["country"] = country_code elif country_name: - container_extra['country_name'] = country_name + container_extra["country_name"] = country_name if mji.find("ISSNLinking"): issnl = mji.ISSNLinking.string @@ -462,7 +477,7 @@ class PubmedImporter(EntityImporter): if issnl: container_id = self.lookup_issnl(issnl) - pub_date = medline.Article.find('ArticleDate') + pub_date = medline.Article.find("ArticleDate") if not pub_date: pub_date = journal.PubDate if not pub_date: @@ -476,7 +491,8 @@ class PubmedImporter(EntityImporter): release_date = datetime.date( release_year, MONTH_ABBR_MAP[pub_date.Month.string], - int(pub_date.Day.string)) + int(pub_date.Day.string), + ) release_date = release_date.isoformat() except ValueError as ve: print("bad date, skipping: {}".format(ve), file=sys.stderr) @@ -486,25 +502,35 @@ class PubmedImporter(EntityImporter): if len(medline_date) >= 4 and medline_date[:4].isdigit(): release_year = int(medline_date[:4]) if release_year < 1300 or release_year > 2040: - print("bad medline year, skipping: {}".format(release_year), file=sys.stderr) + print( + "bad medline year, skipping: {}".format(release_year), file=sys.stderr + ) release_year = None else: - print("unparsable medline date, skipping: {}".format(medline_date), file=sys.stderr) + print( + "unparsable medline date, skipping: {}".format(medline_date), + file=sys.stderr, + ) if journal.find("Title"): container_name = journal.Title.get_text() - if (container_id is None and self.create_containers and (issnl is not None) - and container_name): + if ( + container_id is None + and self.create_containers + and (issnl is not None) + and container_name + ): # name, type, publisher, issnl # extra: original_name, languages, country ce = fatcat_openapi_client.ContainerEntity( name=container_name, - container_type='journal', - #NOTE: publisher not included + container_type="journal", + # NOTE: publisher not included issnl=issnl, issnp=issnp, - extra=(container_extra or None)) + extra=(container_extra or None), + ) ce_edit = self.create_container(ce) container_id = ce_edit.ident self._issnl_id_map[issnl] = container_id @@ -521,8 +547,10 @@ class PubmedImporter(EntityImporter): # "All abstracts are in English" abstracts = [] primary_abstract = medline.find("Abstract") - if primary_abstract and primary_abstract.AbstractText.get('NlmCategory'): - joined = "\n".join([m.get_text() for m in primary_abstract.find_all("AbstractText")]) + if primary_abstract and primary_abstract.AbstractText.get("NlmCategory"): + joined = "\n".join( + [m.get_text() for m in primary_abstract.find_all("AbstractText")] + ) abst = fatcat_openapi_client.ReleaseAbstract( content=joined, mimetype="text/plain", @@ -539,7 +567,7 @@ class PubmedImporter(EntityImporter): ) if abst.content: abstracts.append(abst) - if abstract.find('math'): + if abstract.find("math"): abst = fatcat_openapi_client.ReleaseAbstract( # strip the <AbstractText> tags content=str(abstract)[14:-15], @@ -551,8 +579,8 @@ class PubmedImporter(EntityImporter): other_abstracts = medline.find_all("OtherAbstract") for other in other_abstracts: lang = "en" - if other.get('Language'): - lang = LANG_MAP_MARC.get(other['Language']) + if other.get("Language"): + lang = LANG_MAP_MARC.get(other["Language"]) abst = fatcat_openapi_client.ReleaseAbstract( content=other.AbstractText.get_text().strip(), mimetype="text/plain", @@ -572,15 +600,15 @@ class PubmedImporter(EntityImporter): surname = None raw_name = None if author.ForeName: - given_name = author.ForeName.get_text().replace('\n', ' ') + given_name = author.ForeName.get_text().replace("\n", " ") if author.LastName: - surname = author.LastName.get_text().replace('\n', ' ') + surname = author.LastName.get_text().replace("\n", " ") if given_name and surname: raw_name = "{} {}".format(given_name, surname) elif surname: raw_name = surname if not raw_name and author.CollectiveName and author.CollectiveName.get_text(): - raw_name = author.CollectiveName.get_text().replace('\n', ' ') + raw_name = author.CollectiveName.get_text().replace("\n", " ") contrib_extra = dict() orcid = author.find("Identifier", Source="ORCID") if orcid: @@ -590,7 +618,7 @@ class PubmedImporter(EntityImporter): orcid = orcid.replace("http://orcid.org/", "") elif orcid.startswith("https://orcid.org/"): orcid = orcid.replace("https://orcid.org/", "") - elif '-' not in orcid: + elif "-" not in orcid: orcid = "{}-{}-{}-{}".format( orcid[0:4], orcid[4:8], @@ -598,27 +626,31 @@ class PubmedImporter(EntityImporter): orcid[12:16], ) creator_id = self.lookup_orcid(orcid) - contrib_extra['orcid'] = orcid + contrib_extra["orcid"] = orcid affiliations = author.find_all("Affiliation") raw_affiliation = None if affiliations: - raw_affiliation = affiliations[0].get_text().replace('\n', ' ') + raw_affiliation = affiliations[0].get_text().replace("\n", " ") if len(affiliations) > 1: - contrib_extra['more_affiliations'] = [ra.get_text().replace('\n', ' ') for ra in affiliations[1:]] + contrib_extra["more_affiliations"] = [ + ra.get_text().replace("\n", " ") for ra in affiliations[1:] + ] if author.find("EqualContrib"): # TODO: schema for this? - contrib_extra['equal'] = True - contribs.append(fatcat_openapi_client.ReleaseContrib( - raw_name=raw_name, - given_name=given_name, - surname=surname, - role="author", - raw_affiliation=raw_affiliation, - creator_id=creator_id, - extra=contrib_extra, - )) - - if medline.AuthorList['CompleteYN'] == 'N': + contrib_extra["equal"] = True + contribs.append( + fatcat_openapi_client.ReleaseContrib( + raw_name=raw_name, + given_name=given_name, + surname=surname, + role="author", + raw_affiliation=raw_affiliation, + creator_id=creator_id, + extra=contrib_extra, + ) + ) + + if medline.AuthorList["CompleteYN"] == "N": contribs.append(fatcat_openapi_client.ReleaseContrib(raw_name="et al.")) for i, contrib in enumerate(contribs): @@ -633,7 +665,7 @@ class PubmedImporter(EntityImporter): # note that Reference always exists within a ReferenceList, but # that there may be multiple ReferenceList (eg, sometimes one per # Reference) - for ref in pubmed.find_all('Reference'): + for ref in pubmed.find_all("Reference"): ref_extra = dict() ref_doi = ref.find("ArticleId", IdType="doi") if ref_doi: @@ -643,22 +675,24 @@ class PubmedImporter(EntityImporter): ref_pmid = clean_pmid(ref_pmid.string) ref_release_id = None if ref_doi: - ref_extra['doi'] = ref_doi + ref_extra["doi"] = ref_doi if self.lookup_refs: ref_release_id = self.lookup_doi(ref_doi) if ref_pmid: - ref_extra['pmid'] = ref_pmid + ref_extra["pmid"] = ref_pmid if self.lookup_refs: ref_release_id = self.lookup_pmid(ref_pmid) ref_raw = ref.Citation if ref_raw: - ref_extra['unstructured'] = ref_raw.get_text() + ref_extra["unstructured"] = ref_raw.get_text() if not ref_extra: ref_extra = None - refs.append(fatcat_openapi_client.ReleaseRef( - target_release_id=ref_release_id, - extra=ref_extra, - )) + refs.append( + fatcat_openapi_client.ReleaseRef( + target_release_id=ref_release_id, + extra=ref_extra, + ) + ) if not refs: refs = None @@ -669,7 +703,7 @@ class PubmedImporter(EntityImporter): # group-title # pubmed: retraction refs if extra_pubmed: - extra['pubmed'] = extra_pubmed + extra["pubmed"] = extra_pubmed if not extra: extra = None @@ -690,14 +724,14 @@ class PubmedImporter(EntityImporter): doi=doi, pmid=pmid, pmcid=pmcid, - #isbn13 # never in Article + # isbn13 # never in Article ), volume=volume, issue=issue, pages=pages, - #publisher # not included? + # publisher # not included? language=language, - #license_slug # not in MEDLINE + # license_slug # not in MEDLINE abstracts=abstracts, contribs=contribs, refs=refs, @@ -725,21 +759,22 @@ class PubmedImporter(EntityImporter): raise err if existing and existing.ext_ids.pmid and existing.ext_ids.pmid != re.ext_ids.pmid: warn_str = "PMID/DOI mismatch: release {}, pmid {} != {}".format( - existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid) + existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid + ) warnings.warn(warn_str) - self.counts['warn-pmid-doi-mismatch'] += 1 + self.counts["warn-pmid-doi-mismatch"] += 1 # don't clobber DOI, but do group together re.ext_ids.doi = None re.work_id = existing.work_id if existing and not self.do_updates: - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False if existing and existing.ext_ids.pmid and (existing.refs or not re.refs): # TODO: any other reasons to do an update? # don't update if it already has PMID - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False elif existing: # but do update if only DOI was set @@ -750,12 +785,12 @@ class PubmedImporter(EntityImporter): existing.container_id = existing.container_id or re.container_id existing.refs = existing.refs or re.refs existing.abstracts = existing.abstracts or re.abstracts - existing.extra['pubmed'] = re.extra['pubmed'] + existing.extra["pubmed"] = re.extra["pubmed"] # fix stub titles if existing.title in [ - "OUP accepted manuscript", - ]: + "OUP accepted manuscript", + ]: existing.title = re.title existing.original_title = existing.original_title or re.original_title @@ -770,8 +805,8 @@ class PubmedImporter(EntityImporter): existing.language = existing.language or re.language # update subtitle in-place first - if not existing.subtitle and existing.extra.get('subtitle'): - subtitle = existing.extra.pop('subtitle') + if not existing.subtitle and existing.extra.get("subtitle"): + subtitle = existing.extra.pop("subtitle") if type(subtitle) == list: subtitle = subtitle[0] if subtitle: @@ -781,13 +816,13 @@ class PubmedImporter(EntityImporter): try: self.api.update_release(self.get_editgroup_id(), existing.ident, existing) - self.counts['update'] += 1 + self.counts["update"] += 1 except fatcat_openapi_client.rest.ApiException as err: # there is a code path where we try to update the same release # twice in a row; if that happens, just skip # NOTE: API behavior might change in the future? if "release_edit_editgroup_id_ident_id_key" in err.body: - self.counts['skip-update-conflict'] += 1 + self.counts["skip-update-conflict"] += 1 return False else: raise err @@ -797,11 +832,14 @@ class PubmedImporter(EntityImporter): return True def insert_batch(self, batch): - self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_release_auto_batch( + fatcat_openapi_client.ReleaseAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) def parse_file(self, handle): @@ -812,8 +850,9 @@ class PubmedImporter(EntityImporter): for article in soup.find_all("PubmedArticle"): resp = self.parse_record(article) print(json.dumps(resp)) - #sys.exit(-1) + # sys.exit(-1) + -if __name__=='__main__': +if __name__ == "__main__": parser = PubmedImporter(None, None) parser.parse_file(open(sys.argv[1])) |