From 0ec3fc58b4394102ffaaf385e6048a6412a9c9b7 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 21 May 2019 11:03:31 -0700 Subject: updates to pubmed importer --- python/fatcat_tools/importers/common.py | 21 +++++++- python/fatcat_tools/importers/pubmed.py | 92 +++++++++++++++++++++------------ python/tests/import_pubmed.py | 49 ++++++++++++++++-- 3 files changed, 125 insertions(+), 37 deletions(-) (limited to 'python') diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index e37d57ec..6e0c5caf 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -263,6 +263,7 @@ class EntityImporter: self._orcid_id_map = dict() self._orcid_regex = re.compile("^\\d{4}-\\d{4}-\\d{4}-\\d{3}[\\dX]$") self._doi_id_map = dict() + self._pmid_id_map = dict() def reset(self): self.counts = Counter({'skip': 0, 'insert': 0, 'update': 0, 'exists': 0}) @@ -410,7 +411,9 @@ class EntityImporter: return doi.startswith("10.") and doi.count("/") >= 1 def lookup_doi(self, doi): - """Caches calls to the doi lookup API endpoint in a local dict""" + """Caches calls to the doi lookup API endpoint in a local dict + + For identifier lookups only (not full object fetches)""" assert self.is_doi(doi) doi = doi.lower() if doi in self._doi_id_map: @@ -425,6 +428,22 @@ class EntityImporter: self._doi_id_map[doi] = release_id # might be None return release_id + def lookup_pmid(self, pmid): + """Caches calls to the pmid lookup API endpoint in a local dict + + For identifier lookups only (not full object fetches)""" + if pmid in self._pmid_id_map: + return self._pmid_id_map[pmid] + release_id = None + try: + rv = self.api.lookup_release(pmid=pmid) + release_id = rv.ident + except ApiException as ae: + # If anything other than a 404 (not found), something is wrong + assert ae.status == 404 + self._pmid_id_map[pmid] = release_id # might be None + return release_id + def is_issnl(self, issnl): return len(issnl) == 9 and 
issnl[4] == '-' diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 7c4c67eb..7c4e8311 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -108,14 +108,13 @@ MONTH_ABBR_MAP = { class PubmedImporter(EntityImporter): """ Importer for PubMed/MEDLINE XML metadata. + + If lookup_refs is true, will do identifier-based lookups for all references. TODO: MEDLINE doesn't include PMC/OA license; could include in importer? - TODO: clean (ftfy) title, original title, etc - XXX: withdrawn - XXX: full author names """ - def __init__(self, api, issn_map_file, **kwargs): + def __init__(self, api, issn_map_file, lookup_refs=False, **kwargs): eg_desc = kwargs.get('editgroup_description', "Automated import of PubMed/MEDLINE XML metadata") @@ -127,6 +126,7 @@ class PubmedImporter(EntityImporter): editgroup_extra=eg_extra, **kwargs) + self.lookup_refs = lookup_refs extid_map_file = kwargs.get('extid_map_file') self.extid_map_db = None if extid_map_file: @@ -178,8 +178,7 @@ class PubmedImporter(EntityImporter): pmcid = identifiers.find("ArticleId", IdType="pmc") if pmcid: - # XXX: strip the version part? or retain? 
- pmcid = pmcid.string.split('.')[0] + pmcid = pmcid.string release_type = None pub_types = [] @@ -203,8 +202,12 @@ class PubmedImporter(EntityImporter): release_stage = "updated" if medline.Article.PublicationTypeList.find(string="Retraction of Publication"): release_stage = "retraction" + + withdrawn_status = None if medline.Article.PublicationTypeList.find(string="Retracted Publication"): withdrawn_status = "retracted" + elif medline.find("CommentsCorrections", RefType="ExpressionOfConcernIn"): + withdrawn_status = "concern" pages = medline.find('MedlinePgn') if pages: @@ -219,7 +222,7 @@ class PubmedImporter(EntityImporter): if title.startswith('[') and title.endswith(']'): title = title[1:-1] else: - # TODO: will filter out later + # will filter out later title = None original_title = medline.Article.find("VernacularTitle", recurse=False) @@ -229,11 +232,9 @@ class PubmedImporter(EntityImporter): original_title = original_title[:-1] # TODO: happening in alpha order, not handling multi-language well. - # also need to convert lang codes: https://www.nlm.nih.gov/bsd/language_table.html language = medline.Article.Language if language: language = language.string - # TODO: map to two-letter if language in ("und", "un"): # "undetermined" language = None @@ -264,7 +265,9 @@ class PubmedImporter(EntityImporter): if issnl: container_id = self.lookup_issnl(issnl) - pub_date = journal.PubDate + pub_date = medline.Article.find('ArticleDate') + if not pub_date: + pub_date = journal.PubDate release_date = None release_year = None if pub_date.Year: @@ -275,8 +278,6 @@ class PubmedImporter(EntityImporter): MONTH_ABBR_MAP[pub_date.Month.string], int(pub_date.Day.string)) release_date = release_date.isoformat() - elif pub_date.find("MedlineDate") and False: #XXX more/better date parsing? 
- release_year = int(pub_date.MedlineDate.string.split()[0][:4]) if journal.find("Title"): container_name = journal.Title.string @@ -288,7 +289,7 @@ class PubmedImporter(EntityImporter): ce = fatcat_client.ContainerEntity( name=container_name, container_type='journal', - #XXX: publisher not included? + #NOTE: publisher not included issnl=issnl, extra=(container_extra or None)) ce_edit = self.create_container(ce) @@ -305,16 +306,16 @@ class PubmedImporter(EntityImporter): ### Abstracts # "All abstracts are in English" abstracts = [] - first_abstract = medline.find("AbstractText") - if first_abstract and first_abstract.get('NlmCategory'): - joined = "\n".join([m.get_text() for m in medline.find_all("AbstractText")]) + primary_abstract = medline.find("Abstract") + if primary_abstract and primary_abstract.AbstractText.get('NlmCategory'): + joined = "\n".join([m.get_text() for m in primary_abstract.find_all("AbstractText")]) abstracts.append(fatcat_client.ReleaseAbstract( content=joined, mimetype="text/plain", lang="en", )) - else: - for abstract in medline.find_all("AbstractText"): + elif primary_abstract: + for abstract in primary_abstract.find_all("AbstractText"): abstracts.append(fatcat_client.ReleaseAbstract( content=abstract.get_text().strip(), mimetype="text/plain", @@ -327,6 +328,16 @@ class PubmedImporter(EntityImporter): mimetype="application/mathml+xml", lang="en", )) + other_abstracts = medline.find_all("OtherAbstract") + for other in other_abstracts: + lang = "en" + if other.get('Language'): + lang = LANG_MAP_MARC.get(other['Language']) + abstracts.append(fatcat_client.ReleaseAbstract( + content=other.AbstractText.get_text().strip(), + mimetype="text/plain", + lang=lang, + )) if not abstracts: abstracts = None @@ -334,6 +345,7 @@ class PubmedImporter(EntityImporter): contribs = [] if medline.AuthorList: for author in medline.AuthorList.find_all("Author"): + creator_id = None given_name = None surname = None raw_name = None @@ -361,21 +373,24 @@ class 
PubmedImporter(EntityImporter): orcid[8:12], orcid[12:16], ) - # XXX: do lookup by ORCID - #contrib_extra['orcid'] = orcid - affiliation = author.find("Affiliation") + creator_id = self.lookup_orcid(orcid) + contrib_extra['orcid'] = orcid + affiliations = author.find_all("Affiliation") raw_affiliation = None - if affiliation: - raw_affiliation = affiliation.string + if affiliations: + raw_affiliation = affiliations[0].string + if len(affiliations) > 1: + contrib_extra['more_affiliations'] = [ra.string for ra in affiliations[1:]] if author.find("EqualContrib"): # TODO: schema for this? - contrib_extra['equal_contrib'] = True + contrib_extra['equal'] = True contribs.append(fatcat_client.ReleaseContrib( raw_name=raw_name, given_name=given_name, surname=surname, role="author", raw_affiliation=raw_affiliation, + creator_id=creator_id, extra=contrib_extra, )) @@ -388,25 +403,33 @@ class PubmedImporter(EntityImporter): refs = [] if pubmed.ReferenceList: for ref in pubmed.ReferenceList.find_all('Reference'): - ref_obj = dict() ref_extra = dict() ref_pmid = ref.find("ArticleId", IdType="pubmed") + ref_doi = ref.find("ArticleId", IdType="doi") + ref_release_id = None if ref_pmid: - ref_extra['pmid'] = ref_pmid.string - # TODO: do reference lookups here based on PMID/DOI + ref_pmid = ref_pmid.string.strip() + ref_extra['pmid'] = ref_pmid + if self.lookup_refs: + ref_release_id = self.lookup_pmid(ref_pmid) + if ref_doi: + ref_doi = ref_doi.string.lower().strip() + ref_extra['doi'] = ref_doi + if self.lookup_refs: + ref_release_id = self.lookup_doi(ref_doi) ref_raw = ref.Citation if ref_raw: ref_extra['unstructured'] = ref_raw.string - if ref_extra: - ref_obj['extra'] = ref_extra + if not ref_extra: + ref_extra = None refs.append(fatcat_client.ReleaseRef( - extra=ref_obj.get('extra'), + target_release_id=ref_release_id, + extra=ref_extra, )) if not refs: refs = None # extra: - # withdrawn_date # translation_of # subtitle # aliases @@ -418,14 +441,19 @@ class 
PubmedImporter(EntityImporter): if not extra: extra = None + title = clean(title) + if not title: + return None + re = fatcat_client.ReleaseEntity( work_id=None, - title=clean(title), + title=title, original_title=clean(original_title), release_type=release_type, release_stage=release_stage, release_date=release_date, release_year=release_year, + withdrawn_status=withdrawn_status, ext_ids=fatcat_client.ReleaseExtIds( doi=doi, pmid=pmid, diff --git a/python/tests/import_pubmed.py b/python/tests/import_pubmed.py index 05a77599..0185c8c4 100644 --- a/python/tests/import_pubmed.py +++ b/python/tests/import_pubmed.py @@ -9,12 +9,12 @@ from bs4 import BeautifulSoup @pytest.fixture(scope="function") def pubmed_importer(api): with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: - yield PubmedImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=True) + yield PubmedImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=True, lookup_refs=True) @pytest.fixture(scope="function") def pubmed_importer_existing(api): with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: - yield PubmedImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=False) + yield PubmedImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=False, lookup_refs=True) def test_pubmed_importer(pubmed_importer): last_index = pubmed_importer.api.get_changelog(limit=1)[0].index @@ -73,8 +73,49 @@ def test_pubmed_xml_parse(pubmed_importer): assert r1.contribs[0].surname == "Blume" print(r1.extra) - # TODO: assert r1.extra['pubmed']['mesh_topics'] == ['Accounting', 'Economics, Hospital', 'Hospital Administration'] assert r1.extra['pubmed']['pub_types'] == ['Journal Article'] assert not r1.refs - # XXX: r2 tests + assert r2.title == "Synthesis and Antibacterial Activity of Metal(loid) Nanostructures by Environmental Multi-Metal(loid) Resistant Bacteria and 
Metal(loid)-Reducing Flavoproteins" + assert r2.subtitle == None + assert r2.original_title == None + assert r2.publisher == None + assert r2.release_type == "article-journal" + assert r2.release_stage == "published" + assert r2.license_slug == None + assert r2.ext_ids.doi == "10.3389/fmicb.2018.00959" + assert r2.ext_ids.pmid == "29869640" + assert r2.ext_ids.pmcid == "PMC5962736" + assert r2.language == "en" + assert r2.volume == "9" + assert r2.issue == None + assert r2.pages == "959" + assert str(r2.release_date) == "2018-05-15" + assert r2.release_year == 2018 + # matched by ISSN, so shouldn't be in there? + #assert extra['container_name'] == "Frontiers in microbiology" + + assert len(r2.contribs) > 3 + assert r2.contribs[0].raw_name == "Maximiliano Figueroa" + assert r2.contribs[0].given_name == "Maximiliano" + assert r2.contribs[0].surname == "Figueroa" + assert r2.contribs[0].raw_affiliation == "Laboratorio Microbiología Molecular, Departamento de Biología, Facultad de Química y Biología, Universidad de Santiago de Chile, Santiago, Chile." + assert r2.contribs[4].surname == "Muñoz-Villagrán" + assert r2.contribs[7].surname == "Latorre" + assert r2.contribs[7].raw_affiliation == "Mathomics, Centro de Modelamiento Matemático, Universidad de Chile, Beauchef, Santiago, Chile." 
+ assert r2.contribs[7].extra['more_affiliations'] == [ + "Fondap-Center of Genome Regulation, Facultad de Ciencias, Universidad de Chile, Santiago, Chile.", + "Laboratorio de Bioinformática y Expresión Génica, INTA, Universidad de Chile, Santiago, Chile.", + "Instituto de Ciencias de la Ingeniería, Universidad de O'Higgins, Rancagua, Chile.", + ] + assert r2.contribs[-1].raw_name == "Felipe Arenas" + + assert r2.abstracts[0].content.startswith("Microbes are suitable candidates to recover and decontaminate different environments from soluble metal ions, either via reduction") + assert r2.abstracts[0].lang == "en" + + print(r2.extra) + assert r2.extra['pubmed']['pub_types'] == ['Journal Article'] + + assert r2.refs[0].extra['unstructured'] == "Microbiology. 2009 Jun;155(Pt 6):1840-6" + assert r2.refs[0].extra['pmid'] == "19383690" + -- cgit v1.2.3