about | summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_tools/importers/pubmed.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2021-11-02 18:14:59 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2021-11-02 18:14:59 -0700
commit 31d1a6a713d177990609767d508209ced19ca396 (patch)
tree a628a57bdb373669394a6b520102b1b4b5ffe7da /python/fatcat_tools/importers/pubmed.py
parent 9dc891b8098542bb089c8c47098b60a8beb76a53 (diff)
download fatcat-31d1a6a713d177990609767d508209ced19ca396.tar.gz
fatcat-31d1a6a713d177990609767d508209ced19ca396.zip
fmt (black): fatcat_tools/
Diffstat (limited to 'python/fatcat_tools/importers/pubmed.py')
-rw-r--r-- python/fatcat_tools/importers/pubmed.py 355
1 files changed, 197 insertions, 158 deletions
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index 00ad54d0..cfdafcf7 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -1,4 +1,3 @@
-
import datetime
import json
import sys
@@ -13,42 +12,42 @@ from .common import LANG_MAP_MARC, EntityImporter, clean
# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
PUBMED_RELEASE_TYPE_MAP = {
- #Adaptive Clinical Trial
+ # Adaptive Clinical Trial
"Address": "speech",
"Autobiography": "book",
- #Bibliography
+ # Bibliography
"Biography": "book",
- #Case Reports
+ # Case Reports
"Classical Article": "article-journal",
- #Clinical Conference
- #Clinical Study
- #Clinical Trial
- #Clinical Trial, Phase I
- #Clinical Trial, Phase II
- #Clinical Trial, Phase III
- #Clinical Trial, Phase IV
- #Clinical Trial Protocol
- #Clinical Trial, Veterinary
- #Collected Works
- #Comparative Study
- #Congress
- #Consensus Development Conference
- #Consensus Development Conference, NIH
- #Controlled Clinical Trial
+ # Clinical Conference
+ # Clinical Study
+ # Clinical Trial
+ # Clinical Trial, Phase I
+ # Clinical Trial, Phase II
+ # Clinical Trial, Phase III
+ # Clinical Trial, Phase IV
+ # Clinical Trial Protocol
+ # Clinical Trial, Veterinary
+ # Collected Works
+ # Comparative Study
+ # Congress
+ # Consensus Development Conference
+ # Consensus Development Conference, NIH
+ # Controlled Clinical Trial
"Dataset": "dataset",
- #Dictionary
- #Directory
- #Duplicate Publication
+ # Dictionary
+ # Directory
+ # Duplicate Publication
"Editorial": "editorial",
- #English Abstract # doesn't indicate that this is abstract-only
- #Equivalence Trial
- #Evaluation Studies
- #Expression of Concern
- #Festschrift
- #Government Document
- #Guideline
+ # English Abstract # doesn't indicate that this is abstract-only
+ # Equivalence Trial
+ # Evaluation Studies
+ # Expression of Concern
+ # Festschrift
+ # Government Document
+ # Guideline
"Historical Article": "article-journal",
- #Interactive Tutorial
+ # Interactive Tutorial
"Interview": "interview",
"Introductory Journal Article": "article-journal",
"Journal Article": "article-journal",
@@ -56,53 +55,65 @@ PUBMED_RELEASE_TYPE_MAP = {
"Legal Case": "legal_case",
"Legislation": "legislation",
"Letter": "letter",
- #Meta-Analysis
- #Multicenter Study
- #News
+ # Meta-Analysis
+ # Multicenter Study
+ # News
"Newspaper Article": "article-newspaper",
- #Observational Study
- #Observational Study, Veterinary
- #Overall
- #Patient Education Handout
- #Periodical Index
- #Personal Narrative
- #Portrait
- #Practice Guideline
- #Pragmatic Clinical Trial
- #Publication Components
- #Publication Formats
- #Publication Type Category
- #Randomized Controlled Trial
- #Research Support, American Recovery and Reinvestment Act
- #Research Support, N.I.H., Extramural
- #Research Support, N.I.H., Intramural
- #Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S.
- #Research Support, U.S. Gov't, P.H.S.
- #Review # in the "literature review" sense, not "product review"
- #Scientific Integrity Review
- #Study Characteristics
- #Support of Research
- #Systematic Review
+ # Observational Study
+ # Observational Study, Veterinary
+ # Overall
+ # Patient Education Handout
+ # Periodical Index
+ # Personal Narrative
+ # Portrait
+ # Practice Guideline
+ # Pragmatic Clinical Trial
+ # Publication Components
+ # Publication Formats
+ # Publication Type Category
+ # Randomized Controlled Trial
+ # Research Support, American Recovery and Reinvestment Act
+ # Research Support, N.I.H., Extramural
+ # Research Support, N.I.H., Intramural
+ # Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S.
+ # Research Support, U.S. Gov't, P.H.S.
+ # Review # in the "literature review" sense, not "product review"
+ # Scientific Integrity Review
+ # Study Characteristics
+ # Support of Research
+ # Systematic Review
"Technical Report": "report",
- #Twin Study
- #Validation Studies
- #Video-Audio Media
- #Webcasts
+ # Twin Study
+ # Validation Studies
+ # Video-Audio Media
+ # Webcasts
}
MONTH_ABBR_MAP = {
- "Jan": 1, "01": 1,
- "Feb": 2, "02": 2,
- "Mar": 3, "03": 3,
- "Apr": 4, "04": 4,
- "May": 5, "05": 5,
- "Jun": 6, "06": 6,
- "Jul": 7, "07": 7,
- "Aug": 8, "08": 8,
- "Sep": 9, "09": 9,
- "Oct": 10, "10": 10,
- "Nov": 11, "11": 11,
- "Dec": 12, "12": 12,
+ "Jan": 1,
+ "01": 1,
+ "Feb": 2,
+ "02": 2,
+ "Mar": 3,
+ "03": 3,
+ "Apr": 4,
+ "04": 4,
+ "May": 5,
+ "05": 5,
+ "Jun": 6,
+ "06": 6,
+ "Jul": 7,
+ "07": 7,
+ "Aug": 8,
+ "08": 8,
+ "Sep": 9,
+ "09": 9,
+ "Oct": 10,
+ "10": 10,
+ "Nov": 11,
+ "11": 11,
+ "Dec": 12,
+ "12": 12,
}
# From: https://www.ncbi.nlm.nih.gov/books/NBK7249/
@@ -295,11 +306,10 @@ COUNTRY_NAME_MAP = {
"United Kingdom": "gb",
"United States": "us",
"Uruguay": "uy",
-
# Additions from running over large files
"Bosnia and Herzegovina": "ba",
- #"International"
- "China (Republic : 1949- )": "tw", # pretty sure this is tw not cn
+ # "International"
+ "China (Republic : 1949- )": "tw", # pretty sure this is tw not cn
"Russia (Federation)": "ru",
"Scotland": "gb",
"England": "gb",
@@ -320,18 +330,21 @@ class PubmedImporter(EntityImporter):
def __init__(self, api, issn_map_file, lookup_refs=True, **kwargs):
- eg_desc = kwargs.get('editgroup_description',
- "Automated import of PubMed/MEDLINE XML metadata")
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.PubmedImporter')
- super().__init__(api,
+ eg_desc = kwargs.get(
+ "editgroup_description", "Automated import of PubMed/MEDLINE XML metadata"
+ )
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.PubmedImporter")
+ super().__init__(
+ api,
issn_map_file=issn_map_file,
editgroup_description=eg_desc,
editgroup_extra=eg_extra,
- **kwargs)
+ **kwargs
+ )
self.lookup_refs = lookup_refs
- self.create_containers = kwargs.get('create_containers', True)
+ self.create_containers = kwargs.get("create_containers", True)
self.read_issn_map_file(issn_map_file)
def want(self, obj):
@@ -365,15 +378,15 @@ class PubmedImporter(EntityImporter):
release_type = PUBMED_RELEASE_TYPE_MAP[pub_type.string]
break
if pub_types:
- extra_pubmed['pub_types'] = pub_types
+ extra_pubmed["pub_types"] = pub_types
if medline.Article.PublicationTypeList.find(string="Retraction of Publication"):
release_type = "retraction"
retraction_of = medline.find("CommentsCorrections", RefType="RetractionOf")
if retraction_of:
if retraction_of.RefSource:
- extra_pubmed['retraction_of_raw'] = retraction_of.RefSource.string
+ extra_pubmed["retraction_of_raw"] = retraction_of.RefSource.string
if retraction_of.PMID:
- extra_pubmed['retraction_of_pmid'] = retraction_of.PMID.string
+ extra_pubmed["retraction_of_pmid"] = retraction_of.PMID.string
# everything in medline is published
release_stage = "published"
@@ -388,18 +401,18 @@ class PubmedImporter(EntityImporter):
elif medline.find("CommentsCorrections", RefType="ExpressionOfConcernIn"):
withdrawn_status = "concern"
- pages = medline.find('MedlinePgn')
+ pages = medline.find("MedlinePgn")
if pages:
pages = pages.string
- title = medline.Article.ArticleTitle.get_text() # always present
+ title = medline.Article.ArticleTitle.get_text() # always present
if title:
- title = title.replace('\n', ' ')
- if title.endswith('.'):
+ title = title.replace("\n", " ")
+ if title.endswith("."):
title = title[:-1]
# this hides some "special" titles, but the vast majority are
# translations; translations don't always include the original_title
- if title.startswith('[') and title.endswith(']'):
+ if title.startswith("[") and title.endswith("]"):
title = title[1:-1]
else:
# will filter out later
@@ -408,8 +421,8 @@ class PubmedImporter(EntityImporter):
original_title = medline.Article.find("VernacularTitle", recurse=False)
if original_title:
original_title = original_title.get_text() or None
- original_title = original_title.replace('\n', ' ')
- if original_title and original_title.endswith('.'):
+ original_title = original_title.replace("\n", " ")
+ if original_title and original_title.endswith("."):
original_title = original_title[:-1]
if original_title and not title:
@@ -428,7 +441,9 @@ class PubmedImporter(EntityImporter):
else:
language = LANG_MAP_MARC.get(language)
if not language and not (medline.Article.Language.get_text() in LANG_MAP_MARC):
- warnings.warn("MISSING MARC LANG: {}".format(medline.Article.Language.string))
+ warnings.warn(
+ "MISSING MARC LANG: {}".format(medline.Article.Language.string)
+ )
### Journal/Issue Metadata
# MedlineJournalInfo is always present
@@ -441,9 +456,9 @@ class PubmedImporter(EntityImporter):
country_name = mji.Country.string.strip()
country_code = COUNTRY_NAME_MAP.get(country_name)
if country_code:
- container_extra['country'] = country_code
+ container_extra["country"] = country_code
elif country_name:
- container_extra['country_name'] = country_name
+ container_extra["country_name"] = country_name
if mji.find("ISSNLinking"):
issnl = mji.ISSNLinking.string
@@ -462,7 +477,7 @@ class PubmedImporter(EntityImporter):
if issnl:
container_id = self.lookup_issnl(issnl)
- pub_date = medline.Article.find('ArticleDate')
+ pub_date = medline.Article.find("ArticleDate")
if not pub_date:
pub_date = journal.PubDate
if not pub_date:
@@ -476,7 +491,8 @@ class PubmedImporter(EntityImporter):
release_date = datetime.date(
release_year,
MONTH_ABBR_MAP[pub_date.Month.string],
- int(pub_date.Day.string))
+ int(pub_date.Day.string),
+ )
release_date = release_date.isoformat()
except ValueError as ve:
print("bad date, skipping: {}".format(ve), file=sys.stderr)
@@ -486,25 +502,35 @@ class PubmedImporter(EntityImporter):
if len(medline_date) >= 4 and medline_date[:4].isdigit():
release_year = int(medline_date[:4])
if release_year < 1300 or release_year > 2040:
- print("bad medline year, skipping: {}".format(release_year), file=sys.stderr)
+ print(
+ "bad medline year, skipping: {}".format(release_year), file=sys.stderr
+ )
release_year = None
else:
- print("unparsable medline date, skipping: {}".format(medline_date), file=sys.stderr)
+ print(
+ "unparsable medline date, skipping: {}".format(medline_date),
+ file=sys.stderr,
+ )
if journal.find("Title"):
container_name = journal.Title.get_text()
- if (container_id is None and self.create_containers and (issnl is not None)
- and container_name):
+ if (
+ container_id is None
+ and self.create_containers
+ and (issnl is not None)
+ and container_name
+ ):
# name, type, publisher, issnl
# extra: original_name, languages, country
ce = fatcat_openapi_client.ContainerEntity(
name=container_name,
- container_type='journal',
- #NOTE: publisher not included
+ container_type="journal",
+ # NOTE: publisher not included
issnl=issnl,
issnp=issnp,
- extra=(container_extra or None))
+ extra=(container_extra or None),
+ )
ce_edit = self.create_container(ce)
container_id = ce_edit.ident
self._issnl_id_map[issnl] = container_id
@@ -521,8 +547,10 @@ class PubmedImporter(EntityImporter):
# "All abstracts are in English"
abstracts = []
primary_abstract = medline.find("Abstract")
- if primary_abstract and primary_abstract.AbstractText.get('NlmCategory'):
- joined = "\n".join([m.get_text() for m in primary_abstract.find_all("AbstractText")])
+ if primary_abstract and primary_abstract.AbstractText.get("NlmCategory"):
+ joined = "\n".join(
+ [m.get_text() for m in primary_abstract.find_all("AbstractText")]
+ )
abst = fatcat_openapi_client.ReleaseAbstract(
content=joined,
mimetype="text/plain",
@@ -539,7 +567,7 @@ class PubmedImporter(EntityImporter):
)
if abst.content:
abstracts.append(abst)
- if abstract.find('math'):
+ if abstract.find("math"):
abst = fatcat_openapi_client.ReleaseAbstract(
# strip the <AbstractText> tags
content=str(abstract)[14:-15],
@@ -551,8 +579,8 @@ class PubmedImporter(EntityImporter):
other_abstracts = medline.find_all("OtherAbstract")
for other in other_abstracts:
lang = "en"
- if other.get('Language'):
- lang = LANG_MAP_MARC.get(other['Language'])
+ if other.get("Language"):
+ lang = LANG_MAP_MARC.get(other["Language"])
abst = fatcat_openapi_client.ReleaseAbstract(
content=other.AbstractText.get_text().strip(),
mimetype="text/plain",
@@ -572,15 +600,15 @@ class PubmedImporter(EntityImporter):
surname = None
raw_name = None
if author.ForeName:
- given_name = author.ForeName.get_text().replace('\n', ' ')
+ given_name = author.ForeName.get_text().replace("\n", " ")
if author.LastName:
- surname = author.LastName.get_text().replace('\n', ' ')
+ surname = author.LastName.get_text().replace("\n", " ")
if given_name and surname:
raw_name = "{} {}".format(given_name, surname)
elif surname:
raw_name = surname
if not raw_name and author.CollectiveName and author.CollectiveName.get_text():
- raw_name = author.CollectiveName.get_text().replace('\n', ' ')
+ raw_name = author.CollectiveName.get_text().replace("\n", " ")
contrib_extra = dict()
orcid = author.find("Identifier", Source="ORCID")
if orcid:
@@ -590,7 +618,7 @@ class PubmedImporter(EntityImporter):
orcid = orcid.replace("http://orcid.org/", "")
elif orcid.startswith("https://orcid.org/"):
orcid = orcid.replace("https://orcid.org/", "")
- elif '-' not in orcid:
+ elif "-" not in orcid:
orcid = "{}-{}-{}-{}".format(
orcid[0:4],
orcid[4:8],
@@ -598,27 +626,31 @@ class PubmedImporter(EntityImporter):
orcid[12:16],
)
creator_id = self.lookup_orcid(orcid)
- contrib_extra['orcid'] = orcid
+ contrib_extra["orcid"] = orcid
affiliations = author.find_all("Affiliation")
raw_affiliation = None
if affiliations:
- raw_affiliation = affiliations[0].get_text().replace('\n', ' ')
+ raw_affiliation = affiliations[0].get_text().replace("\n", " ")
if len(affiliations) > 1:
- contrib_extra['more_affiliations'] = [ra.get_text().replace('\n', ' ') for ra in affiliations[1:]]
+ contrib_extra["more_affiliations"] = [
+ ra.get_text().replace("\n", " ") for ra in affiliations[1:]
+ ]
if author.find("EqualContrib"):
# TODO: schema for this?
- contrib_extra['equal'] = True
- contribs.append(fatcat_openapi_client.ReleaseContrib(
- raw_name=raw_name,
- given_name=given_name,
- surname=surname,
- role="author",
- raw_affiliation=raw_affiliation,
- creator_id=creator_id,
- extra=contrib_extra,
- ))
-
- if medline.AuthorList['CompleteYN'] == 'N':
+ contrib_extra["equal"] = True
+ contribs.append(
+ fatcat_openapi_client.ReleaseContrib(
+ raw_name=raw_name,
+ given_name=given_name,
+ surname=surname,
+ role="author",
+ raw_affiliation=raw_affiliation,
+ creator_id=creator_id,
+ extra=contrib_extra,
+ )
+ )
+
+ if medline.AuthorList["CompleteYN"] == "N":
contribs.append(fatcat_openapi_client.ReleaseContrib(raw_name="et al."))
for i, contrib in enumerate(contribs):
@@ -633,7 +665,7 @@ class PubmedImporter(EntityImporter):
# note that Reference always exists within a ReferenceList, but
# that there may be multiple ReferenceList (eg, sometimes one per
# Reference)
- for ref in pubmed.find_all('Reference'):
+ for ref in pubmed.find_all("Reference"):
ref_extra = dict()
ref_doi = ref.find("ArticleId", IdType="doi")
if ref_doi:
@@ -643,22 +675,24 @@ class PubmedImporter(EntityImporter):
ref_pmid = clean_pmid(ref_pmid.string)
ref_release_id = None
if ref_doi:
- ref_extra['doi'] = ref_doi
+ ref_extra["doi"] = ref_doi
if self.lookup_refs:
ref_release_id = self.lookup_doi(ref_doi)
if ref_pmid:
- ref_extra['pmid'] = ref_pmid
+ ref_extra["pmid"] = ref_pmid
if self.lookup_refs:
ref_release_id = self.lookup_pmid(ref_pmid)
ref_raw = ref.Citation
if ref_raw:
- ref_extra['unstructured'] = ref_raw.get_text()
+ ref_extra["unstructured"] = ref_raw.get_text()
if not ref_extra:
ref_extra = None
- refs.append(fatcat_openapi_client.ReleaseRef(
- target_release_id=ref_release_id,
- extra=ref_extra,
- ))
+ refs.append(
+ fatcat_openapi_client.ReleaseRef(
+ target_release_id=ref_release_id,
+ extra=ref_extra,
+ )
+ )
if not refs:
refs = None
@@ -669,7 +703,7 @@ class PubmedImporter(EntityImporter):
# group-title
# pubmed: retraction refs
if extra_pubmed:
- extra['pubmed'] = extra_pubmed
+ extra["pubmed"] = extra_pubmed
if not extra:
extra = None
@@ -690,14 +724,14 @@ class PubmedImporter(EntityImporter):
doi=doi,
pmid=pmid,
pmcid=pmcid,
- #isbn13 # never in Article
+ # isbn13 # never in Article
),
volume=volume,
issue=issue,
pages=pages,
- #publisher # not included?
+ # publisher # not included?
language=language,
- #license_slug # not in MEDLINE
+ # license_slug # not in MEDLINE
abstracts=abstracts,
contribs=contribs,
refs=refs,
@@ -725,21 +759,22 @@ class PubmedImporter(EntityImporter):
raise err
if existing and existing.ext_ids.pmid and existing.ext_ids.pmid != re.ext_ids.pmid:
warn_str = "PMID/DOI mismatch: release {}, pmid {} != {}".format(
- existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid)
+ existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid
+ )
warnings.warn(warn_str)
- self.counts['warn-pmid-doi-mismatch'] += 1
+ self.counts["warn-pmid-doi-mismatch"] += 1
# don't clobber DOI, but do group together
re.ext_ids.doi = None
re.work_id = existing.work_id
if existing and not self.do_updates:
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
if existing and existing.ext_ids.pmid and (existing.refs or not re.refs):
# TODO: any other reasons to do an update?
# don't update if it already has PMID
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
elif existing:
# but do update if only DOI was set
@@ -750,12 +785,12 @@ class PubmedImporter(EntityImporter):
existing.container_id = existing.container_id or re.container_id
existing.refs = existing.refs or re.refs
existing.abstracts = existing.abstracts or re.abstracts
- existing.extra['pubmed'] = re.extra['pubmed']
+ existing.extra["pubmed"] = re.extra["pubmed"]
# fix stub titles
if existing.title in [
- "OUP accepted manuscript",
- ]:
+ "OUP accepted manuscript",
+ ]:
existing.title = re.title
existing.original_title = existing.original_title or re.original_title
@@ -770,8 +805,8 @@ class PubmedImporter(EntityImporter):
existing.language = existing.language or re.language
# update subtitle in-place first
- if not existing.subtitle and existing.extra.get('subtitle'):
- subtitle = existing.extra.pop('subtitle')
+ if not existing.subtitle and existing.extra.get("subtitle"):
+ subtitle = existing.extra.pop("subtitle")
if type(subtitle) == list:
subtitle = subtitle[0]
if subtitle:
@@ -781,13 +816,13 @@ class PubmedImporter(EntityImporter):
try:
self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
- self.counts['update'] += 1
+ self.counts["update"] += 1
except fatcat_openapi_client.rest.ApiException as err:
# there is a code path where we try to update the same release
# twice in a row; if that happens, just skip
# NOTE: API behavior might change in the future?
if "release_edit_editgroup_id_ident_id_key" in err.body:
- self.counts['skip-update-conflict'] += 1
+ self.counts["skip-update-conflict"] += 1
return False
else:
raise err
@@ -797,11 +832,14 @@ class PubmedImporter(EntityImporter):
return True
def insert_batch(self, batch):
- self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_release_auto_batch(
+ fatcat_openapi_client.ReleaseAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
def parse_file(self, handle):
@@ -812,8 +850,9 @@ class PubmedImporter(EntityImporter):
for article in soup.find_all("PubmedArticle"):
resp = self.parse_record(article)
print(json.dumps(resp))
- #sys.exit(-1)
+ # sys.exit(-1)
+
-if __name__=='__main__':
+if __name__ == "__main__":
parser = PubmedImporter(None, None)
parser.parse_file(open(sys.argv[1]))