aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2019-05-21 11:03:31 -0700
committerBryan Newbold <bnewbold@robocracy.org>2019-05-21 11:41:29 -0700
commit0ec3fc58b4394102ffaaf385e6048a6412a9c9b7 (patch)
treec83c9266f8fe5599ba95a95d47430b1b96fce247
parent1829eee6e01a4d21604ce1ec6c7a5230467b4b63 (diff)
downloadfatcat-0ec3fc58b4394102ffaaf385e6048a6412a9c9b7.tar.gz
fatcat-0ec3fc58b4394102ffaaf385e6048a6412a9c9b7.zip
updates to pubmed importer
-rw-r--r--python/fatcat_tools/importers/common.py21
-rw-r--r--python/fatcat_tools/importers/pubmed.py92
-rw-r--r--python/tests/import_pubmed.py49
3 files changed, 125 insertions, 37 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index e37d57ec..6e0c5caf 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -263,6 +263,7 @@ class EntityImporter:
self._orcid_id_map = dict()
self._orcid_regex = re.compile("^\\d{4}-\\d{4}-\\d{4}-\\d{3}[\\dX]$")
self._doi_id_map = dict()
+ self._pmid_id_map = dict()
def reset(self):
self.counts = Counter({'skip': 0, 'insert': 0, 'update': 0, 'exists': 0})
@@ -410,7 +411,9 @@ class EntityImporter:
return doi.startswith("10.") and doi.count("/") >= 1
def lookup_doi(self, doi):
- """Caches calls to the doi lookup API endpoint in a local dict"""
+ """Caches calls to the doi lookup API endpoint in a local dict
+
+ For identifier lookups only (not full object fetches)"""
assert self.is_doi(doi)
doi = doi.lower()
if doi in self._doi_id_map:
@@ -425,6 +428,22 @@ class EntityImporter:
self._doi_id_map[doi] = release_id # might be None
return release_id
+ def lookup_pmid(self, pmid):
+ """Caches calls to the pmid lookup API endpoint in a local dict
+
+ For identifier lookups only (not full object fetches)"""
+ if pmid in self._pmid_id_map:
+ return self._pmid_id_map[pmid]
+ release_id = None
+ try:
+ rv = self.api.lookup_release(pmid=pmid)
+ release_id = rv.ident
+ except ApiException as ae:
+ # If anything other than a 404 (not found), something is wrong
+ assert ae.status == 404
+ self._pmid_id_map[pmid] = release_id # might be None
+ return release_id
+
def is_issnl(self, issnl):
return len(issnl) == 9 and issnl[4] == '-'
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index 7c4c67eb..7c4e8311 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -108,14 +108,13 @@ MONTH_ABBR_MAP = {
class PubmedImporter(EntityImporter):
"""
Importer for PubMed/MEDLINE XML metadata.
+
+ If lookup_refs is true, will do identifier-based lookups for all references.
TODO: MEDLINE doesn't include PMC/OA license; could include in importer?
- TODO: clean (ftfy) title, original title, etc
- XXX: withdrawn
- XXX: full author names
"""
- def __init__(self, api, issn_map_file, **kwargs):
+ def __init__(self, api, issn_map_file, lookup_refs=False, **kwargs):
eg_desc = kwargs.get('editgroup_description',
"Automated import of PubMed/MEDLINE XML metadata")
@@ -127,6 +126,7 @@ class PubmedImporter(EntityImporter):
editgroup_extra=eg_extra,
**kwargs)
+ self.lookup_refs = lookup_refs
extid_map_file = kwargs.get('extid_map_file')
self.extid_map_db = None
if extid_map_file:
@@ -178,8 +178,7 @@ class PubmedImporter(EntityImporter):
pmcid = identifiers.find("ArticleId", IdType="pmc")
if pmcid:
- # XXX: strip the version part? or retain?
- pmcid = pmcid.string.split('.')[0]
+ pmcid = pmcid.string
release_type = None
pub_types = []
@@ -203,8 +202,12 @@ class PubmedImporter(EntityImporter):
release_stage = "updated"
if medline.Article.PublicationTypeList.find(string="Retraction of Publication"):
release_stage = "retraction"
+
+ withdrawn_status = None
if medline.Article.PublicationTypeList.find(string="Retracted Publication"):
withdrawn_status = "retracted"
+ elif medline.find("CommentsCorrections", RefType="ExpressionOfConcernIn"):
+ withdrawn_status = "concern"
pages = medline.find('MedlinePgn')
if pages:
@@ -219,7 +222,7 @@ class PubmedImporter(EntityImporter):
if title.startswith('[') and title.endswith(']'):
title = title[1:-1]
else:
- # TODO: will filter out later
+ # will filter out later
title = None
original_title = medline.Article.find("VernacularTitle", recurse=False)
@@ -229,11 +232,9 @@ class PubmedImporter(EntityImporter):
original_title = original_title[:-1]
# TODO: happening in alpha order, not handling multi-language well.
- # also need to convert lang codes: https://www.nlm.nih.gov/bsd/language_table.html
language = medline.Article.Language
if language:
language = language.string
- # TODO: map to two-letter
if language in ("und", "un"):
# "undetermined"
language = None
@@ -264,7 +265,9 @@ class PubmedImporter(EntityImporter):
if issnl:
container_id = self.lookup_issnl(issnl)
- pub_date = journal.PubDate
+ pub_date = medline.Article.find('ArticleDate')
+ if not pub_date:
+ pub_date = journal.PubDate
release_date = None
release_year = None
if pub_date.Year:
@@ -275,8 +278,6 @@ class PubmedImporter(EntityImporter):
MONTH_ABBR_MAP[pub_date.Month.string],
int(pub_date.Day.string))
release_date = release_date.isoformat()
- elif pub_date.find("MedlineDate") and False: #XXX more/better date parsing?
- release_year = int(pub_date.MedlineDate.string.split()[0][:4])
if journal.find("Title"):
container_name = journal.Title.string
@@ -288,7 +289,7 @@ class PubmedImporter(EntityImporter):
ce = fatcat_client.ContainerEntity(
name=container_name,
container_type='journal',
- #XXX: publisher not included?
+ #NOTE: publisher not included
issnl=issnl,
extra=(container_extra or None))
ce_edit = self.create_container(ce)
@@ -305,16 +306,16 @@ class PubmedImporter(EntityImporter):
### Abstracts
# "All abstracts are in English"
abstracts = []
- first_abstract = medline.find("AbstractText")
- if first_abstract and first_abstract.get('NlmCategory'):
- joined = "\n".join([m.get_text() for m in medline.find_all("AbstractText")])
+ primary_abstract = medline.find("Abstract")
+ if primary_abstract and primary_abstract.AbstractText.get('NlmCategory'):
+ joined = "\n".join([m.get_text() for m in primary_abstract.find_all("AbstractText")])
abstracts.append(fatcat_client.ReleaseAbstract(
content=joined,
mimetype="text/plain",
lang="en",
))
- else:
- for abstract in medline.find_all("AbstractText"):
+ elif primary_abstract:
+ for abstract in primary_abstract.find_all("AbstractText"):
abstracts.append(fatcat_client.ReleaseAbstract(
content=abstract.get_text().strip(),
mimetype="text/plain",
@@ -327,6 +328,16 @@ class PubmedImporter(EntityImporter):
mimetype="application/mathml+xml",
lang="en",
))
+ other_abstracts = medline.find_all("OtherAbstract")
+ for other in other_abstracts:
+ lang = "en"
+ if other.get('Language'):
+ lang = LANG_MAP_MARC.get(other['Language'])
+ abstracts.append(fatcat_client.ReleaseAbstract(
+ content=other.AbstractText.get_text().strip(),
+ mimetype="text/plain",
+ lang=lang,
+ ))
if not abstracts:
abstracts = None
@@ -334,6 +345,7 @@ class PubmedImporter(EntityImporter):
contribs = []
if medline.AuthorList:
for author in medline.AuthorList.find_all("Author"):
+ creator_id = None
given_name = None
surname = None
raw_name = None
@@ -361,21 +373,24 @@ class PubmedImporter(EntityImporter):
orcid[8:12],
orcid[12:16],
)
- # XXX: do lookup by ORCID
- #contrib_extra['orcid'] = orcid
- affiliation = author.find("Affiliation")
+ creator_id = self.lookup_orcid(orcid)
+ contrib_extra['orcid'] = orcid
+ affiliations = author.find_all("Affiliation")
raw_affiliation = None
- if affiliation:
- raw_affiliation = affiliation.string
+ if affiliations:
+ raw_affiliation = affiliations[0].string
+ if len(affiliations) > 1:
+ contrib_extra['more_affiliations'] = [ra.string for ra in affiliations[1:]]
if author.find("EqualContrib"):
# TODO: schema for this?
- contrib_extra['equal_contrib'] = True
+ contrib_extra['equal'] = True
contribs.append(fatcat_client.ReleaseContrib(
raw_name=raw_name,
given_name=given_name,
surname=surname,
role="author",
raw_affiliation=raw_affiliation,
+ creator_id=creator_id,
extra=contrib_extra,
))
@@ -388,25 +403,33 @@ class PubmedImporter(EntityImporter):
refs = []
if pubmed.ReferenceList:
for ref in pubmed.ReferenceList.find_all('Reference'):
- ref_obj = dict()
ref_extra = dict()
ref_pmid = ref.find("ArticleId", IdType="pubmed")
+ ref_doi = ref.find("ArticleId", IdType="doi")
+ ref_release_id = None
if ref_pmid:
- ref_extra['pmid'] = ref_pmid.string
- # TODO: do reference lookups here based on PMID/DOI
+ ref_pmid = ref_pmid.string.strip()
+ ref_extra['pmid'] = ref_pmid
+ if self.lookup_refs:
+ ref_release_id = self.lookup_pmid(ref_pmid)
+ if ref_doi:
+ ref_doi = ref_doi.string.lower().strip()
+ ref_extra['doi'] = ref_doi
+ if self.lookup_refs:
+ ref_release_id = self.lookup_doi(ref_doi)
ref_raw = ref.Citation
if ref_raw:
ref_extra['unstructured'] = ref_raw.string
- if ref_extra:
- ref_obj['extra'] = ref_extra
+ if not ref_extra:
+ ref_extra = None
refs.append(fatcat_client.ReleaseRef(
- extra=ref_obj.get('extra'),
+ target_release_id=ref_release_id,
+ extra=ref_extra,
))
if not refs:
refs = None
# extra:
- # withdrawn_date
# translation_of
# subtitle
# aliases
@@ -418,14 +441,19 @@ class PubmedImporter(EntityImporter):
if not extra:
extra = None
+ title = clean(title)
+ if not title:
+ return None
+
re = fatcat_client.ReleaseEntity(
work_id=None,
- title=clean(title),
+ title=title,
original_title=clean(original_title),
release_type=release_type,
release_stage=release_stage,
release_date=release_date,
release_year=release_year,
+ withdrawn_status=withdrawn_status,
ext_ids=fatcat_client.ReleaseExtIds(
doi=doi,
pmid=pmid,
diff --git a/python/tests/import_pubmed.py b/python/tests/import_pubmed.py
index 05a77599..0185c8c4 100644
--- a/python/tests/import_pubmed.py
+++ b/python/tests/import_pubmed.py
@@ -9,12 +9,12 @@ from bs4 import BeautifulSoup
@pytest.fixture(scope="function")
def pubmed_importer(api):
with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
- yield PubmedImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=True)
+ yield PubmedImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=True, lookup_refs=True)
@pytest.fixture(scope="function")
def pubmed_importer_existing(api):
with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
- yield PubmedImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=False)
+ yield PubmedImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=False, lookup_refs=True)
def test_pubmed_importer(pubmed_importer):
last_index = pubmed_importer.api.get_changelog(limit=1)[0].index
@@ -73,8 +73,49 @@ def test_pubmed_xml_parse(pubmed_importer):
assert r1.contribs[0].surname == "Blume"
print(r1.extra)
- # TODO: assert r1.extra['pubmed']['mesh_topics'] == ['Accounting', 'Economics, Hospital', 'Hospital Administration']
assert r1.extra['pubmed']['pub_types'] == ['Journal Article']
assert not r1.refs
- # XXX: r2 tests
+ assert r2.title == "Synthesis and Antibacterial Activity of Metal(loid) Nanostructures by Environmental Multi-Metal(loid) Resistant Bacteria and Metal(loid)-Reducing Flavoproteins"
+ assert r2.subtitle == None
+ assert r2.original_title == None
+ assert r2.publisher == None
+ assert r2.release_type == "article-journal"
+ assert r2.release_stage == "published"
+ assert r2.license_slug == None
+ assert r2.ext_ids.doi == "10.3389/fmicb.2018.00959"
+ assert r2.ext_ids.pmid == "29869640"
+ assert r2.ext_ids.pmcid == "PMC5962736"
+ assert r2.language == "en"
+ assert r2.volume == "9"
+ assert r2.issue == None
+ assert r2.pages == "959"
+ assert str(r2.release_date) == "2018-05-15"
+ assert r2.release_year == 2018
+ # matched by ISSN, so shouldn't be in there?
+ #assert extra['container_name'] == "Frontiers in microbiology"
+
+ assert len(r2.contribs) > 3
+ assert r2.contribs[0].raw_name == "Maximiliano Figueroa"
+ assert r2.contribs[0].given_name == "Maximiliano"
+ assert r2.contribs[0].surname == "Figueroa"
+ assert r2.contribs[0].raw_affiliation == "Laboratorio Microbiología Molecular, Departamento de Biología, Facultad de Química y Biología, Universidad de Santiago de Chile, Santiago, Chile."
+ assert r2.contribs[4].surname == "Muñoz-Villagrán"
+ assert r2.contribs[7].surname == "Latorre"
+ assert r2.contribs[7].raw_affiliation == "Mathomics, Centro de Modelamiento Matemático, Universidad de Chile, Beauchef, Santiago, Chile."
+ assert r2.contribs[7].extra['more_affiliations'] == [
+ "Fondap-Center of Genome Regulation, Facultad de Ciencias, Universidad de Chile, Santiago, Chile.",
+ "Laboratorio de Bioinformática y Expresión Génica, INTA, Universidad de Chile, Santiago, Chile.",
+ "Instituto de Ciencias de la Ingeniería, Universidad de O'Higgins, Rancagua, Chile.",
+ ]
+ assert r2.contribs[-1].raw_name == "Felipe Arenas"
+
+ assert r2.abstracts[0].content.startswith("Microbes are suitable candidates to recover and decontaminate different environments from soluble metal ions, either via reduction")
+ assert r2.abstracts[0].lang == "en"
+
+ print(r2.extra)
+ assert r2.extra['pubmed']['pub_types'] == ['Journal Article']
+
+ assert r2.refs[0].extra['unstructured'] == "Microbiology. 2009 Jun;155(Pt 6):1840-6"
+ assert r2.refs[0].extra['pmid'] == "19383690"
+