Diffstat (limited to 'python/fatcat_tools/importers/pubmed.py')
-rw-r--r--	python/fatcat_tools/importers/pubmed.py | 92
1 file changed, 40 insertions(+), 52 deletions(-)
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index 80cf986c..3611a299 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -8,6 +8,7 @@ from bs4 import BeautifulSoup
 from bs4.element import NavigableString
 
 import fatcat_openapi_client
+from fatcat_tools.normal import *
 from .common import EntityImporter, clean, LANG_MAP_MARC
 
 # from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
@@ -317,7 +318,7 @@ class PubmedImporter(EntityImporter):
     TODO: MEDLINE doesn't include PMC/OA license; could include in importer?
     """
 
-    def __init__(self, api, issn_map_file, lookup_refs=False, **kwargs):
+    def __init__(self, api, issn_map_file, lookup_refs=True, **kwargs):
 
         eg_desc = kwargs.get('editgroup_description',
             "Automated import of PubMed/MEDLINE XML metadata")
@@ -330,38 +331,9 @@ class PubmedImporter(EntityImporter):
             **kwargs)
 
         self.lookup_refs = lookup_refs
-        extid_map_file = kwargs.get('extid_map_file')
-        self.extid_map_db = None
-        if extid_map_file:
-            db_uri = "file:{}?mode=ro".format(extid_map_file)
-            print("Using external ID map: {}".format(db_uri))
-            self.extid_map_db = sqlite3.connect(db_uri, uri=True)
-        else:
-            print("Not using external ID map")
-
         self.create_containers = kwargs.get('create_containers', True)
         self.read_issn_map_file(issn_map_file)
 
-    def lookup_ext_ids(self, pmid):
-        if self.extid_map_db is None:
-            return dict(doi=None, core_id=None, pmid=None, pmcid=None,
-                wikidata_qid=None, arxiv_id=None, jstor_id=None)
-        row = self.extid_map_db.execute("SELECT core, doi, pmcid, wikidata FROM ids WHERE pmid=? LIMIT 1",
-            [pmid]).fetchone()
-        if row is None:
-            return dict(doi=None, core_id=None, pmid=None, pmcid=None,
-                wikidata_qid=None, arxiv_id=None, jstor_id=None)
-        row = [str(cell or '') or None for cell in row]
-        return dict(
-            core_id=row[0],
-            doi=row[1],
-            pmcid=row[2],
-            wikidata_qid=row[3],
-            # TODO:
-            arxiv_id=None,
-            jstor_id=None,
-        )
-
     def want(self, obj):
         return True
 
@@ -376,20 +348,14 @@ class PubmedImporter(EntityImporter):
         identifiers = pubmed.ArticleIdList
         pmid = medline.PMID.string.strip()
         doi = identifiers.find("ArticleId", IdType="doi")
-        if doi:
-            doi = doi.string.lower().strip()
-            if doi.startswith('doi:'):
-                doi = doi[4:]
-            if not (doi.startswith('10.') and '/' in doi and doi.split('/')[1]) and len(doi.split()) == 1:
-                sys.stderr.write("BOGUS DOI: {}\n".format(doi))
-                doi = None
+        if doi and doi.string:
+            doi = clean_doi(doi.string)
+        else:
+            doi = None
 
         pmcid = identifiers.find("ArticleId", IdType="pmc")
         if pmcid:
-            pmcid = pmcid.string.strip().upper()
-            # got a bunch of weird ones like "wst_2018_399" in the 2019 baseline
-            if not pmcid.startswith("PMC"):
-                pmcid = None
+            pmcid = clean_pmcid(pmcid.string.strip().upper())
 
         release_type = None
         pub_types = []
@@ -486,6 +452,8 @@ class PubmedImporter(EntityImporter):
         pub_date = medline.Article.find('ArticleDate')
         if not pub_date:
            pub_date = journal.PubDate
+        if not pub_date:
+            pub_date = journal.JournalIssue.PubDate
         release_date = None
         release_year = None
         if pub_date.Year:
@@ -498,8 +466,17 @@ class PubmedImporter(EntityImporter):
                         int(pub_date.Day.string))
                     release_date = release_date.isoformat()
                 except ValueError as ve:
-                    sys.stderr.write("bad date, skipping: {}\n".format(ve))
+                    print("bad date, skipping: {}".format(ve), file=sys.stderr)
                     release_date = None
+        elif pub_date.MedlineDate:
+            medline_date = pub_date.MedlineDate.string.strip()
+            if len(medline_date) >= 4 and medline_date[:4].isdigit():
+                release_year = int(medline_date[:4])
+                if release_year < 1300 or release_year > 2040:
+                    print("bad medline year, skipping: {}".format(release_year), file=sys.stderr)
+                    release_year = None
+            else:
+                print("unparsable medline date, skipping: {}".format(medline_date), file=sys.stderr)
 
         if journal.find("Title"):
             container_name = journal.Title.string
@@ -641,19 +618,21 @@ class PubmedImporter(EntityImporter):
         if pubmed.ReferenceList:
             for ref in pubmed.ReferenceList.find_all('Reference'):
                 ref_extra = dict()
-                ref_pmid = ref.find("ArticleId", IdType="pubmed")
                 ref_doi = ref.find("ArticleId", IdType="doi")
-                ref_release_id = None
+                if ref_doi:
+                    ref_doi = clean_doi(ref_doi.string)
+                ref_pmid = ref.find("ArticleId", IdType="pubmed")
                 if ref_pmid:
-                    ref_pmid = ref_pmid.string.strip()
-                    ref_extra['pmid'] = ref_pmid
-                    if self.lookup_refs:
-                        ref_release_id = self.lookup_pmid(ref_pmid)
+                    ref_pmid = clean_pmid(ref_pmid.string)
+                ref_release_id = None
                 if ref_doi:
-                    ref_doi = ref_doi.string.lower().strip()
                     ref_extra['doi'] = ref_doi
                     if self.lookup_refs:
                         ref_release_id = self.lookup_doi(ref_doi)
+                if ref_pmid:
+                    ref_extra['pmid'] = ref_pmid
+                    if self.lookup_refs:
+                        ref_release_id = self.lookup_pmid(ref_pmid)
                 ref_raw = ref.Citation
                 if ref_raw:
                     ref_extra['unstructured'] = ref_raw.string
@@ -668,7 +647,6 @@ class PubmedImporter(EntityImporter):
 
         # extra:
         #   translation_of
-        #   subtitle
         #   aliases
         #   container_name
         #   group-title
@@ -729,8 +707,9 @@ class PubmedImporter(EntityImporter):
             if err.status != 404:
                 raise err
         if existing and existing.ext_ids.pmid and existing.ext_ids.pmid != re.ext_ids.pmid:
-            warnings.warn("PMID/DOI mismatch: release {}, pmid {} != {}".format(
-                existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid))
+            warn_str = "PMID/DOI mismatch: release {}, pmid {} != {}".format(
+                existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid)
+            warnings.warn(warn_str)
             self.counts['warn-pmid-doi-mismatch'] += 1
             # don't clobber DOI, but do group together
             re.ext_ids.doi = None
@@ -748,6 +727,15 @@ class PubmedImporter(EntityImporter):
         existing.ext_ids.pmcid = existing.ext_ids.pmcid or re.ext_ids.pmcid
         existing.refs = existing.refs or re.refs
         existing.extra['pubmed'] = re.extra['pubmed']
+        # update subtitle in-place first
+        if not existing.subtitle and existing.extra.get('subtitle'):
+            subtitle = existing.extra.pop('subtitle')
+            if type(subtitle) == list:
+                subtitle = subtitle[0]
+            if subtitle:
+                existing.subtitle = subtitle
+        if not existing.subtitle:
+            existing.subtitle = re.subtitle
         try:
             self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
             self.counts['update'] += 1
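
Note on the new helpers: the importer now delegates identifier cleanup to clean_doi(), clean_pmid(), and clean_pmcid() from fatcat_tools.normal (wildcard-imported at the top of the file). The snippet below is only a minimal sketch of the contract this diff assumes from those helpers, not the actual fatcat_tools.normal implementation: each takes a raw identifier string and returns a normalized form, or None when the value looks bogus.

# Hypothetical, simplified stand-ins for clean_doi()/clean_pmid()/clean_pmcid(),
# shown only to illustrate the behavior the importer now relies on.
import re
from typing import Optional

def clean_doi(raw: Optional[str]) -> Optional[str]:
    if not raw:
        return None
    raw = raw.strip().lower()
    if raw.startswith("doi:"):
        raw = raw[4:]
    # must look like "10.<registrant>/<suffix>" with no whitespace
    return raw if re.match(r"^10\.\d+/\S+$", raw) else None

def clean_pmid(raw: Optional[str]) -> Optional[str]:
    if not raw:
        return None
    raw = raw.strip()
    return raw if raw.isdigit() else None

def clean_pmcid(raw: Optional[str]) -> Optional[str]:
    if not raw:
        return None
    raw = raw.strip().upper()
    # rejects values like "wst_2018_399" seen in the 2019 baseline
    return raw if re.match(r"^PMC\d+$", raw) else None

assert clean_doi("doi:10.1371/JOURNAL.pone.0029797") == "10.1371/journal.pone.0029797"
assert clean_doi("bogus-doi") is None
assert clean_pmid("12345") == "12345"
assert clean_pmcid("wst_2018_399") is None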