author     Bryan Newbold <bnewbold@robocracy.org>  2019-05-15 22:36:01 -0700
committer  Bryan Newbold <bnewbold@robocracy.org>  2019-05-21 11:41:29 -0700
commit     300665927f578151321b0d91b28f8aadffcf227d (patch)
tree       5df52bf64004adc52f8ebde5f75f549237d02a5c
parent     e27e3f443ea35b145dd07c252cdc8619d7c2ab15 (diff)
initial pubmed importer
-rw-r--r--  python/fatcat_tools/importers/__init__.py                                          5
-rw-r--r--  python/fatcat_tools/importers/pubmed.py (renamed from python/parse_pubmed_xml.py)  290
-rw-r--r--  python/tests/import_pubmed.py                                                      80
3 files changed, 298 insertions(+), 77 deletions(-)
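
For orientation, the pieces in this commit combine roughly like this (a minimal sketch based on the test fixtures below, not code from this commit; the DefaultApi setup, file paths, and the create_containers flag are assumptions):

    import fatcat_client
    from fatcat_tools.importers import PubmedImporter, Bs4XmlFilePusher

    # assumed: a swagger-generated client pointed at a running fatcat API
    api = fatcat_client.DefaultApi(fatcat_client.ApiClient())

    with open('ISSN-to-ISSN-L.txt', 'r') as issn_file:
        # the ISSN map is read once during construction
        importer = PubmedImporter(api, issn_file, create_containers=True)

    with open('medline_batch.xml', 'r') as f:
        counts = Bs4XmlFilePusher(importer, f, "PubmedArticle").run()
    print(counts)  # e.g. counts['insert'], counts['exists'], counts['skip']
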
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 8ec219f8..6f8849d6 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -12,11 +12,12 @@ To run an import you combine two classes; one each of:
"""
-from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk
-from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP
+from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk, LANG_MAP_MARC
+from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP, lookup_license_slug
from .jalc import JalcImporter
from .jstor import JstorImporter
from .arxiv import ArxivRawImporter
+from .pubmed import PubmedImporter
from .grobid_metadata import GrobidMetadataImporter
from .journal_metadata import JournalMetadataImporter
from .matched import MatchedImporter
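
The two new re-exports here are used by the Pubmed importer: LANG_MAP_MARC maps the three-letter MARC language codes carried by MEDLINE records to the two-letter codes stored on releases. A quick illustration (the "eng" -> "en" entry is an assumption, consistent with the r1.language == "en" assertion in the tests below):

    from fatcat_tools.importers import LANG_MAP_MARC

    lang = LANG_MAP_MARC.get("eng")     # "en" for English-language articles
    missing = LANG_MAP_MARC.get("zz9")  # None; the importer warns and leaves language unset
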
diff --git a/python/parse_pubmed_xml.py b/python/fatcat_tools/importers/pubmed.py
index 413333cc..1feb41cd 100644
--- a/python/parse_pubmed_xml.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -1,10 +1,15 @@
import sys
import json
+import sqlite3
import datetime
+import warnings
from bs4 import BeautifulSoup
from bs4.element import NavigableString
+import fatcat_client
+from .common import EntityImporter, clean, LANG_MAP_MARC
+
# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
PUBMED_RELEASE_TYPE_MAP = {
#Adaptive Clinical Trial
@@ -99,29 +104,68 @@ MONTH_ABBR_MAP = {
"Dec": 12, "12": 12,
}
-class PubMedParser():
- """
- Converts PubMed/MEDLINE XML into in release entity (which can dump as JSON)
+class PubmedImporter(EntityImporter):
+ """
+ Importer for PubMed/MEDLINE XML metadata.
+
TODO: MEDLINE doesn't include PMC/OA license; could include in importer?
TODO: clean (ftfy) title, original title, etc
+ XXX: withdrawn
+ XXX: full author names
"""
def __init__(self):
pass
- def parse_file(self, handle):
-
- # 1. open with beautiful soup
- soup = BeautifulSoup(handle, "xml")
+ def __init__(self, api, issn_map_file, **kwargs):
+
+ eg_desc = kwargs.get('editgroup_description',
+ "Automated import of PubMed/MEDLINE XML metadata")
+ eg_extra = kwargs.get('editgroup_extra', dict())
+ eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.PubmedImporter')
+ super().__init__(api,
+ issn_map_file=issn_map_file,
+ editgroup_description=eg_desc,
+ editgroup_extra=eg_extra,
+ **kwargs)
+
+ extid_map_file = kwargs.get('extid_map_file')
+ self.extid_map_db = None
+ if extid_map_file:
+ db_uri = "file:{}?mode=ro".format(extid_map_file)
+ print("Using external ID map: {}".format(db_uri))
+ self.extid_map_db = sqlite3.connect(db_uri, uri=True)
+ else:
+ print("Not using external ID map")
+
+ self.create_containers = kwargs.get('create_containers')
+ self.read_issn_map_file(issn_map_file)
+
+ def lookup_ext_ids(self, pmid):
+ if self.extid_map_db is None:
+ return dict(doi=None, core_id=None, pmid=None, pmcid=None,
+ wikidata_qid=None, arxiv_id=None, jstor_id=None)
+ row = self.extid_map_db.execute("SELECT core, doi, pmcid, wikidata FROM ids WHERE pmid=? LIMIT 1",
+ [pmid]).fetchone()
+ if row is None:
+ return dict(doi=None, core_id=None, pmid=None, pmcid=None,
+ wikidata_qid=None, arxiv_id=None, jstor_id=None)
+ row = [str(cell or '') or None for cell in row]
+ return dict(
+ core_id=row[0],
+ doi=row[1],
+ pmcid=row[2],
+ wikidata_qid=row[3],
+ # TODO:
+ arxiv_id=None,
+ jstor_id=None,
+ )
- # 2. iterate over articles, call parse_article on each
- for article in soup.find_all("PubmedArticle"):
- resp = self.parse_article(article)
- print(json.dumps(resp))
- #sys.exit(-1)
+ def want(self, obj):
+ return True
- def parse_article(self, a):
+ def parse_record(self, a):
medline = a.MedlineCitation
# PubmedData isn't required by DTD, but seems to always be present
@@ -130,6 +174,7 @@ class PubMedParser():
extra_pubmed = dict()
identifiers = pubmed.ArticleIdList
+ pmid = medline.PMID.string.strip()
doi = identifiers.find("ArticleId", IdType="doi")
if doi:
doi = doi.string.lower()
@@ -139,10 +184,14 @@ class PubMedParser():
pmcid = pmcid.string
release_type = None
+ pub_types = []
for pub_type in medline.Article.PublicationTypeList.find_all("PublicationType"):
+ pub_types.append(pub_type.string)
if pub_type.string in PUBMED_RELEASE_TYPE_MAP:
release_type = PUBMED_RELEASE_TYPE_MAP[pub_type.string]
- break
+ break
+ if pub_types:
+ extra_pubmed['pub_types'] = pub_types
if medline.Article.PublicationTypeList.find(string="Retraction of Publication"):
release_type = "retraction"
retraction_of = medline.find("CommentsCorrections", RefType="RetractionOf")
@@ -151,11 +200,13 @@ class PubMedParser():
extra_pubmed['retraction_of_pmid'] = retraction_of.PMID.string
# everything in medline is published
- release_status = "published"
+ release_stage = "published"
if medline.Article.PublicationTypeList.find(string="Corrected and Republished Article"):
- release_status = "updated"
+ release_stage = "updated"
+ if medline.Article.PublicationTypeList.find(string="Retraction of Publication"):
+ release_stage = "retraction"
if medline.Article.PublicationTypeList.find(string="Retracted Publication"):
- release_status = "retracted"
+ withdrawn_status = "retracted"
pages = medline.find('MedlinePgn')
if pages:
@@ -188,27 +239,37 @@ class PubMedParser():
if language in ("und", "un"):
# "undetermined"
language = None
+ else:
+ language = LANG_MAP_MARC.get(language)
+ if not language:
+ warnings.warn("MISSING MARC LANG: {}".format(medline.Article.Language.string))
### Journal/Issue Metadata
# MedlineJournalInfo is always present
- container = dict()
+ issnl = None
+ container_id = None
+ container_name = None
container_extra = dict()
mji = medline.MedlineJournalInfo
if mji.find("Country"):
container_extra['country_name'] = mji.Country.string
if mji.find("ISSNLinking"):
- container['issnl'] = mji.ISSNLinking.string
+ issnl = mji.ISSNLinking.string
journal = medline.Article.Journal
issnp = journal.find("ISSN", IssnType="Print")
if issnp:
container_extra['issnp'] = issnp.string
+ if not issnl:
+ issnl = self.issn2issnl(issnp.string)
+
+ if issnl:
+ container_id = self.lookup_issnl(issnl)
pub_date = journal.PubDate
release_date = None
- if pub_date.find("MedlineDate"):
- release_year = int(pub_date.MedlineDate.string.split()[0][:4])
- else:
+ release_year = None
+ if pub_date.Year:
release_year = int(pub_date.Year.string)
if pub_date.find("Day") and pub_date.find("Month"):
release_date = datetime.date(
@@ -216,6 +277,24 @@ class PubMedParser():
MONTH_ABBR_MAP[pub_date.Month.string],
int(pub_date.Day.string))
release_date = release_date.isoformat()
+ elif pub_date.find("MedlineDate") and False: #XXX more/better date parsing?
+ release_year = int(pub_date.MedlineDate.string.split()[0][:4])
+
+ if journal.find("Title"):
+ container_name = journal.Title.string
+
+ if (container_id is None and self.create_containers and (issnl is not None)
+ and container_name):
+ # name, type, publisher, issnl
+ # extra: issnp, issne, original_name, languages, country
+ ce = fatcat_client.ContainerEntity(
+ name=container_name,
+ container_type='journal',
+ #XXX: publisher not included?
+ issnl=issnl,
+ extra=(container_extra or None))
+ ce_edit = self.create_container(ce)
+ container_id = ce_edit.ident
ji = journal.JournalIssue
volume = None
@@ -224,13 +303,6 @@ class PubMedParser():
issue = None
if ji.find("Issue"):
issue = ji.Issue.string
- if journal.find("Title"):
- container['name'] = journal.Title.string
-
- if extra_pubmed:
- extra['pubmed'] = extra_pubmed
- if not extra:
- extra = None
### Abstracts
# "All abstracts are in English"
@@ -238,20 +310,20 @@ class PubMedParser():
first_abstract = medline.find("AbstractText")
if first_abstract and first_abstract.get('NlmCategory'):
joined = "\n".join([m.get_text() for m in medline.find_all("AbstractText")])
- abstracts.append(dict(
+ abstracts.append(fatcat_client.ReleaseAbstract(
content=joined,
mimetype="text/plain",
lang="en",
))
else:
for abstract in medline.find_all("AbstractText"):
- abstracts.append(dict(
+ abstracts.append(fatcat_client.ReleaseAbstract(
content=abstract.get_text().strip(),
mimetype="text/plain",
lang="en",
))
if abstract.find('math'):
- abstracts.append(dict(
+ abstracts.append(fatcat_client.ReleaseAbstract(
# strip the <AbstractText> tags
content=str(abstract)[14:-15],
mimetype="application/mathml+xml",
@@ -264,13 +336,17 @@ class PubMedParser():
contribs = []
if medline.AuthorList:
for author in medline.AuthorList.find_all("Author"):
- contrib = dict(
- role="author",
- )
+ given_name = None
+ surname = None
+ raw_name = None
if author.ForeName:
- contrib['raw_name'] = "{} {}".format(author.ForeName.string, author.LastName.string)
- elif author.LastName:
- contrib['raw_name'] = author.LastName.string
+ given_name = author.ForeName.string
+ if author.LastName:
+ surname = author.LastName.string
+ if given_name and surname:
+ raw_name = "{} {}".format(given_name, surname)
+ elif surname:
+ raw_name = surname
contrib_extra = dict()
orcid = author.find("Identifier", Source="ORCID")
if orcid:
@@ -287,19 +363,26 @@ class PubMedParser():
orcid[8:12],
orcid[12:16],
)
- contrib_extra['orcid'] = orcid
+ # XXX: do lookup by ORCID
+ #contrib_extra['orcid'] = orcid
affiliation = author.find("Affiliation")
+ raw_affiliation = None
if affiliation:
- contrib['raw_affiliation'] = affiliation.string
+ raw_affiliation = affiliation.string
if author.find("EqualContrib"):
# TODO: schema for this?
contrib_extra['equal_contrib'] = True
- if contrib_extra:
- contrib['extra'] = contrib_extra
- contribs.append(contrib)
+ contribs.append(fatcat_client.ReleaseContrib(
+ raw_name=raw_name,
+ given_name=given_name,
+ surname=surname,
+ role="author",
+ raw_affiliation=raw_affiliation,
+ extra=contrib_extra,
+ ))
if medline.AuthorList['CompleteYN'] == 'N':
- contribs.append(dict(raw_name="et al."))
+ contribs.append(fatcat_client.ReleaseContrib(raw_name="et al."))
if not contribs:
contribs = None
@@ -312,61 +395,118 @@ class PubMedParser():
ref_pmid = ref.find("ArticleId", IdType="pubmed")
if ref_pmid:
ref_extra['pmid'] = ref_pmid.string
+ # TODO: do reference lookups here based on PMID/DOI
ref_raw = ref.Citation
if ref_raw:
- ref_extra['raw'] = ref_raw.string
+ ref_extra['unstructured'] = ref_raw.string
if ref_extra:
ref_obj['extra'] = ref_extra
- refs.append(ref_obj)
+ refs.append(fatcat_client.ReleaseRef(
+ extra=ref_obj.get('extra'),
+ ))
if not refs:
refs = None
- re = dict(
+ # extra:
+ # withdrawn_date
+ # translation_of
+ # subtitle
+ # aliases
+ # container_name
+ # group-title
+ # pubmed: retraction refs
+ if extra_pubmed:
+ extra['pubmed'] = extra_pubmed
+ if not extra:
+ extra = None
+
+ re = fatcat_client.ReleaseEntity(
work_id=None,
- title=title,
- original_title=original_title,
+ title=clean(title),
+ original_title=clean(original_title),
release_type=release_type,
- release_status=release_status,
+ release_stage=release_stage,
release_date=release_date,
release_year=release_year,
- doi=doi,
- pmid=int(medline.PMID.string), # always present
- pmcid=pmcid,
- #isbn13 # never in Article
+ ext_ids=fatcat_client.ReleaseExtIds(
+ doi=doi,
+ pmid=pmid,
+ pmcid=pmcid,
+ #isbn13 # never in Article
+ ),
volume=volume,
issue=issue,
pages=pages,
#publisher # not included?
language=language,
#license_slug # not in MEDLINE
-
- # content, mimetype, lang
abstracts=abstracts,
-
- # raw_name, role, raw_affiliation, extra
contribs=contribs,
-
- # key, year, container_name, title, locator
- # extra: volume, authors, issue, publisher, identifiers
refs=refs,
-
- # name, type, publisher, issnl
- # extra: issnp, issne, original_name, languages, country
- container=container,
-
- # extra:
- # withdrawn_date
- # translation_of
- # subtitle
- # aliases
- # container_name
- # group-title
- # pubmed: retraction refs
+ container_id=container_id,
extra=extra,
)
-
return re
+ def try_update(self, re):
+
+ # first, lookup existing by PMID (which must be defined)
+ existing = None
+ try:
+ existing = self.api.lookup_release(pmid=re.ext_ids.pmid)
+ except fatcat_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+
+ # then try DOI lookup if there is one
+ if not existing and re.ext_ids.doi:
+ try:
+ existing = self.api.lookup_release(doi=re.ext_ids.doi)
+ except fatcat_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+ if existing and existing.ext_ids.pmid and existing.ext_ids.pmid != re.ext_ids.pmid:
+ warnings.warn("PMID/DOI mismatch: release {}, pmid {} != {}".format(
+ existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid))
+ self.counts['exists-pmid-doi-mismatch'] += 1
+ return False
+
+ if existing and existing.ext_ids.pmid and existing.refs:
+ # TODO: any other reasons to do an update?
+ # don't update if it already has PMID
+ self.counts['exists'] += 1
+ return False
+ elif existing:
+ # but do update if only DOI was set
+ existing.ext_ids.doi = existing.ext_ids.doi or re.ext_ids.doi
+ existing.ext_ids.pmid = existing.ext_ids.pmid or re.ext_ids.pmid
+ existing.ext_ids.pmcid = existing.ext_ids.pmcid or re.ext_ids.pmcid
+ existing.refs = existing.refs or re.refs
+ existing.extra['pubmed'] = re.extra['pubmed']
+ self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
+ self.counts['update'] += 1
+ return False
+
+ return True
+
+ def insert_batch(self, batch):
+ self.api.create_release_auto_batch(fatcat_client.ReleaseAutoBatch(
+ editgroup=fatcat_client.Editgroup(
+ description=self.editgroup_description,
+ extra=self.editgroup_extra),
+ entity_list=batch))
+
+ def parse_file(self, handle):
+
+ # 1. open with beautiful soup
+ soup = BeautifulSoup(handle, "xml")
+
+ # 2. iterate over articles, call parse_record on each
+ for article in soup.find_all("PubmedArticle"):
+ resp = self.parse_record(article)
+ print(resp)
+ #sys.exit(-1)
+
if __name__=='__main__':
parser = PubMedParser()
parser.parse_file(open(sys.argv[1]))
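
lookup_ext_ids() above expects a read-only SQLite database containing an ids table keyed by PMID. A minimal sketch of building a compatible map file (the table and column names come from the SELECT statement; the column types and example values are assumptions):

    import sqlite3

    db = sqlite3.connect("example_map.sqlite3")
    db.execute("CREATE TABLE IF NOT EXISTS ids "
               "(pmid TEXT PRIMARY KEY, core TEXT, doi TEXT, pmcid TEXT, wikidata TEXT)")
    # hypothetical row; columns left out stay NULL, and lookup_ext_ids()
    # maps empty/NULL cells back to None
    db.execute("INSERT OR REPLACE INTO ids (pmid, doi, pmcid) VALUES (?, ?, ?)",
               ("973217", "10.1234/example-doi", "PMC12345"))
    db.commit()
    db.close()
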
diff --git a/python/tests/import_pubmed.py b/python/tests/import_pubmed.py
new file mode 100644
index 00000000..eacc3815
--- /dev/null
+++ b/python/tests/import_pubmed.py
@@ -0,0 +1,80 @@
+
+import json, gzip
+import pytest
+from fatcat_tools.importers import PubmedImporter, Bs4XmlFilePusher
+from fixtures import api
+from bs4 import BeautifulSoup
+
+
+@pytest.fixture(scope="function")
+def pubmed_importer(api):
+ with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
+ yield PubmedImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=True)
+
+@pytest.fixture(scope="function")
+def pubmed_importer_existing(api):
+ with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
+ yield PubmedImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=False)
+
+def test_pubmed_importer(pubmed_importer):
+ last_index = pubmed_importer.api.get_changelog(limit=1)[0].index
+ with open('tests/files/pubmedsample_2019.xml', 'r') as f:
+ pubmed_importer.bezerk_mode = True
+ counts = Bs4XmlFilePusher(pubmed_importer, f, "PubmedArticle").run()
+ assert counts['insert'] == 1
+ assert counts['exists'] == 0
+ assert counts['skip'] == 0
+
+ # fetch most recent editgroup
+ change = pubmed_importer.api.get_changelog_entry(index=last_index+1)
+ eg = change.editgroup
+ assert eg.description
+ assert "pubmed" in eg.description.lower()
+ assert eg.extra['git_rev']
+ assert "fatcat_tools.PubmedImporter" in eg.extra['agent']
+
+ last_index = pubmed_importer.api.get_changelog(limit=1)[0].index
+ with open('tests/files/pubmedsample_2019.xml', 'r') as f:
+ pubmed_importer.bezerk_mode = False
+ pubmed_importer.reset()
+ counts = Bs4XmlFilePusher(pubmed_importer, f, "PubmedArticle").run()
+ assert counts['insert'] == 0
+ assert counts['exists'] == 1
+ assert counts['skip'] == 0
+ assert last_index == pubmed_importer.api.get_changelog(limit=1)[0].index
+
+def test_pubmed_xml_parse(pubmed_importer):
+ with open('tests/files/pubmedsample_2019.xml', 'r') as f:
+ soup = BeautifulSoup(f, "xml")
+ r1 = pubmed_importer.parse_record(soup.find_all("PubmedArticle")[0])
+ r2 = pubmed_importer.parse_record(soup.find_all("PubmedArticle")[-1])
+
+ assert r1.title == "Hospital debt management and cost reimbursement"
+ assert r1.subtitle == None
+ assert r1.original_title == None
+ assert r1.publisher == None
+ assert r1.release_type == "article-journal"
+ assert r1.release_stage == "published"
+ assert r1.license_slug == None
+ assert r1.ext_ids.doi == None
+ assert r1.ext_ids.pmid == "973217"
+ assert r1.language == "en"
+ assert r1.volume == "3"
+ assert r1.issue == "1"
+ assert r1.pages == "69-81"
+ assert r1.release_date == None # not "1976-12-03", which is medline ingest date
+ assert r1.release_year == 1976
+ # matched by ISSN, so shouldn't be in there?
+ #assert extra['container_name'] == "Abstracts of the Papers Communicated to the Royal Society of London"
+ assert len(r1.contribs) == 1
+
+ assert r1.contribs[0].raw_name == "F R Blume"
+ assert r1.contribs[0].given_name == "F R"
+ assert r1.contribs[0].surname == "Blume"
+
+ print(r1.extra)
+ # TODO: assert r1.extra['pubmed']['mesh_topics'] == ['Accounting', 'Economics, Hospital', 'Hospital Administration']
+ assert r1.extra['pubmed']['pub_types'] == ['Journal Article']
+ assert not r1.refs
+
+ # XXX: r2 tests
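
For reference, the hook contract that PubmedImporter implements is defined by EntityImporter; a simplified per-record sketch of that flow (ignoring batching, edit groups, and error handling, all of which live in fatcat_tools.importers.common):

    def push_record_sketch(importer, record):
        # want() filters records up front; PubmedImporter accepts everything
        if not importer.want(record):
            importer.counts['skip'] += 1
            return
        # parse_record() converts the XML element to a fatcat_client.ReleaseEntity
        entity = importer.parse_record(record)
        # try_update() returns True only when a brand-new entity should be
        # created; it handles the exists/update/mismatch cases itself
        if importer.try_update(entity):
            importer.insert_batch([entity])
            importer.counts['insert'] += 1
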