about | summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_tools
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- python/fatcat_tools/importers/__init__.py 5
-rw-r--r-- python/fatcat_tools/importers/pubmed.py 512
2 files changed, 515 insertions, 2 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 8ec219f8..6f8849d6 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -12,11 +12,12 @@ To run an import you combine two classes; one each of:
"""
-from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk
-from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP
+from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk, LANG_MAP_MARC
+from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP, lookup_license_slug
from .jalc import JalcImporter
from .jstor import JstorImporter
from .arxiv import ArxivRawImporter
+from .pubmed import PubmedImporter
from .grobid_metadata import GrobidMetadataImporter
from .journal_metadata import JournalMetadataImporter
from .matched import MatchedImporter
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
new file mode 100644
index 00000000..1feb41cd
--- /dev/null
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -0,0 +1,512 @@
+
+import sys
+import json
+import sqlite3
+import datetime
+import warnings
+from bs4 import BeautifulSoup
+from bs4.element import NavigableString
+
+import fatcat_client
+from .common import EntityImporter, clean, LANG_MAP_MARC
+
+# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
+#
+# Maps a PubMed <PublicationType> string to the release_type string used for
+# the release entity. parse_record() walks an article's publication types and
+# uses the first one found in this table. Entries that are commented out are
+# publication types from the list above which currently have no mapping.
+PUBMED_RELEASE_TYPE_MAP = {
+ #Adaptive Clinical Trial
+ "Address": "speech",
+ "Autobiography": "book",
+ #Bibliography
+ "Biography": "book",
+ #Case Reports
+ "Classical Article": "article-journal",
+ #Clinical Conference
+ #Clinical Study
+ #Clinical Trial
+ #Clinical Trial, Phase I
+ #Clinical Trial, Phase II
+ #Clinical Trial, Phase III
+ #Clinical Trial, Phase IV
+ #Clinical Trial Protocol
+ #Clinical Trial, Veterinary
+ #Collected Works
+ #Comparative Study
+ #Congress
+ #Consensus Development Conference
+ #Consensus Development Conference, NIH
+ #Controlled Clinical Trial
+ "Dataset": "dataset",
+ #Dictionary
+ #Directory
+ #Duplicate Publication
+ "Editorial": "editorial",
+ #English Abstract # doesn't indicate that this is abstract-only
+ #Equivalence Trial
+ #Evaluation Studies
+ #Expression of Concern
+ #Festschrift
+ #Government Document
+ #Guideline
+ "Historical Article": "article-journal",
+ #Interactive Tutorial
+ "Interview": "interview",
+ "Introductory Journal Article": "article-journal",
+ "Journal Article": "article-journal",
+ "Lecture": "speech",
+ "Legal Case": "legal_case",
+ "Legislation": "legislation",
+ "Letter": "letter",
+ #Meta-Analysis
+ #Multicenter Study
+ #News
+ "Newspaper Article": "article-newspaper",
+ #Observational Study
+ #Observational Study, Veterinary
+ #Overall
+ #Patient Education Handout
+ #Periodical Index
+ #Personal Narrative
+ #Portrait
+ #Practice Guideline
+ #Pragmatic Clinical Trial
+ #Publication Components
+ #Publication Formats
+ #Publication Type Category
+ #Randomized Controlled Trial
+ #Research Support, American Recovery and Reinvestment Act
+ #Research Support, N.I.H., Extramural
+ #Research Support, N.I.H., Intramural
+ #Research Support, Non-U.S. Gov't
+ #Research Support, U.S. Gov't, Non-P.H.S.
+ #Research Support, U.S. Gov't, P.H.S.
+ #Review # in the "literature review" sense, not "product review"
+ #Scientific Integrity Review
+ #Study Characteristics
+ #Support of Research
+ #Systematic Review
+ "Technical Report": "report",
+ #Twin Study
+ #Validation Studies
+ #Video-Audio Media
+ #Webcasts
+}
+
+# Maps the text of a <Month> element to its month number (1-12). Both
+# spellings handled here are listed per month: the three-letter English
+# abbreviation ("Jan") and the zero-padded two-digit form ("01").
+# Any other spelling is not covered by this table.
+MONTH_ABBR_MAP = {
+ "Jan": 1, "01": 1,
+ "Feb": 2, "02": 2,
+ "Mar": 3, "03": 3,
+ "Apr": 4, "04": 4,
+ "May": 5, "05": 5,
+ "Jun": 6, "06": 6,
+ "Jul": 7, "07": 7,
+ "Aug": 8, "08": 8,
+ "Sep": 9, "09": 9,
+ "Oct": 10, "10": 10,
+ "Nov": 11, "11": 11,
+ "Dec": 12, "12": 12,
+}
+
+
+class PubmedImporter(EntityImporter):
+ """
+ Importer for PubMed/MEDLINE XML metadata.
+
+ TODO: MEDLINE doesn't include PMC/OA license; could include in importer?
+ TODO: clean (ftfy) title, original title, etc
+ XXX: withdrawn
+ XXX: full author names
+ """
+
+ def __init__(self):
+ pass
+
+ def __init__(self, api, issn_map_file, **kwargs):
+
+ eg_desc = kwargs.get('editgroup_description',
+ "Automated import of PubMed/MEDLINE XML metadata")
+ eg_extra = kwargs.get('editgroup_extra', dict())
+ eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.PubmedImporter')
+ super().__init__(api,
+ issn_map_file=issn_map_file,
+ editgroup_description=eg_desc,
+ editgroup_extra=eg_extra,
+ **kwargs)
+
+ extid_map_file = kwargs.get('extid_map_file')
+ self.extid_map_db = None
+ if extid_map_file:
+ db_uri = "file:{}?mode=ro".format(extid_map_file)
+ print("Using external ID map: {}".format(db_uri))
+ self.extid_map_db = sqlite3.connect(db_uri, uri=True)
+ else:
+ print("Not using external ID map")
+
+ self.create_containers = kwargs.get('create_containers')
+ self.read_issn_map_file(issn_map_file)
+
+ def lookup_ext_ids(self, pmid):
+ if self.extid_map_db is None:
+ return dict(doi=None, core_id=None, pmid=None, pmcid=None,
+ wikidata_qid=None, arxiv_id=None, jstor_id=None)
+ row = self.extid_map_db.execute("SELECT core, doi, pmcid, wikidata FROM ids WHERE pmid=? LIMIT 1",
+ [pmid]).fetchone()
+ if row is None:
+ return dict(doi=None, core_id=None, pmid=None, pmcid=None,
+ wikidata_qid=None, arxiv_id=None, jstor_id=None)
+ row = [str(cell or '') or None for cell in row]
+ return dict(
+ core_id=row[0],
+ doi=row[1],
+ pmcid=row[2],
+ wikidata_qid=row[3],
+ # TODO:
+ arxiv_id=None,
+ jstor_id=None,
+ )
+
+ def want(self, obj):
+ # Accept every record: this importer does no pre-filtering, so `obj` is
+ # not inspected. Presumably consulted by the base importer before a
+ # record is parsed -- confirm against EntityImporter.
+ return True
+
+ def parse_record(self, a):
+
+ medline = a.MedlineCitation
+ # PubmedData isn't required by DTD, but seems to always be present
+ pubmed = a.PubmedData
+ extra = dict()
+ extra_pubmed = dict()
+
+ identifiers = pubmed.ArticleIdList
+ pmid = medline.PMID.string.strip()
+ doi = identifiers.find("ArticleId", IdType="doi")
+ if doi:
+ doi = doi.string.lower()
+
+ pmcid = identifiers.find("ArticleId", IdType="pmc")
+ if pmcid:
+ pmcid = pmcid.string
+
+ release_type = None
+ pub_types = []
+ for pub_type in medline.Article.PublicationTypeList.find_all("PublicationType"):
+ pub_types.append(pub_type.string)
+ if pub_type.string in PUBMED_RELEASE_TYPE_MAP:
+ release_type = PUBMED_RELEASE_TYPE_MAP[pub_type.string]
+ break
+ if pub_types:
+ extra_pubmed['pub_types'] = pub_types
+ if medline.Article.PublicationTypeList.find(string="Retraction of Publication"):
+ release_type = "retraction"
+ retraction_of = medline.find("CommentsCorrections", RefType="RetractionOf")
+ if retraction_of:
+ extra_pubmed['retraction_of_raw'] = retraction_of.RefSource.string
+ extra_pubmed['retraction_of_pmid'] = retraction_of.PMID.string
+
+ # everything in medline is published
+ release_stage = "published"
+ if medline.Article.PublicationTypeList.find(string="Corrected and Republished Article"):
+ release_stage = "updated"
+ if medline.Article.PublicationTypeList.find(string="Retraction of Publication"):
+ release_stage = "retraction"
+ if medline.Article.PublicationTypeList.find(string="Retracted Publication"):
+ withdrawn_status = "retracted"
+
+ pages = medline.find('MedlinePgn')
+ if pages:
+ pages = pages.string
+
+ title = medline.Article.ArticleTitle.string # always present
+ if title:
+ if title.endswith('.'):
+ title = title[:-1]
+ # this hides some "special" titles, but the vast majority are
+ # translations; translations don't always include the original_title
+ if title.startswith('[') and title.endswith(']'):
+ title = title[1:-1]
+ else:
+ # TODO: will filter out later
+ title = None
+
+ original_title = medline.Article.find("VernacularTitle", recurse=False)
+ if original_title:
+ original_title = original_title.string or None
+ if original_title and original_title.endswith('.'):
+ original_title = original_title[:-1]
+
+ # TODO: happening in alpha order, not handling multi-language well.
+ # also need to convert lang codes: https://www.nlm.nih.gov/bsd/language_table.html
+ language = medline.Article.Language
+ if language:
+ language = language.string
+ # TODO: map to two-letter
+ if language in ("und", "un"):
+ # "undetermined"
+ language = None
+ else:
+ language = LANG_MAP_MARC.get(language)
+ if not language:
+ warnings.warn("MISSING MARC LANG: {}".format(medline.Article.Language.string))
+
+ ### Journal/Issue Metadata
+ # MedlineJournalInfo is always present
+ issnl = None
+ container_id = None
+ container_name = None
+ container_extra = dict()
+ mji = medline.MedlineJournalInfo
+ if mji.find("Country"):
+ container_extra['country_name'] = mji.Country.string
+ if mji.find("ISSNLinking"):
+ issnl = mji.ISSNLinking.string
+
+ journal = medline.Article.Journal
+ issnp = journal.find("ISSN", IssnType="Print")
+ if issnp:
+ container_extra['issnp'] = issnp.string
+ if not issnl:
+ issnll = self.issn2issnl(issnp)
+
+ if issnl:
+ container_id = self.lookup_issnl(issnl)
+
+ pub_date = journal.PubDate
+ release_date = None
+ release_year = None
+ if pub_date.Year:
+ release_year = int(pub_date.Year.string)
+ if pub_date.find("Day") and pub_date.find("Month"):
+ release_date = datetime.date(
+ release_year,
+ MONTH_ABBR_MAP[pub_date.Month.string],
+ int(pub_date.Day.string))
+ release_date = release_date.isoformat()
+ elif pub_date.find("MedlineDate") and False: #XXX more/better date parsing?
+ release_year = int(pub_date.MedlineDate.string.split()[0][:4])
+
+ if journal.find("Title"):
+ container_name = journal.Title.string
+
+ if (container_id is None and self.create_containers and (issnl is not None)
+ and container_name):
+ # name, type, publisher, issnl
+ # extra: issnp, issne, original_name, languages, country
+ ce = fatcat_client.ContainerEntity(
+ name=container_name,
+ container_type='journal',
+ #XXX: publisher not included?
+ issnl=issnl,
+ extra=(container_extra or None))
+ ce_edit = self.create_container(ce)
+ container_id = ce_edit.ident
+
+ ji = journal.JournalIssue
+ volume = None
+ if ji.find("Volume"):
+ volume = ji.Volume.string
+ issue = None
+ if ji.find("Issue"):
+ issue = ji.Issue.string
+
+ ### Abstracts
+ # "All abstracts are in English"
+ abstracts = []
+ first_abstract = medline.find("AbstractText")
+ if first_abstract and first_abstract.get('NlmCategory'):
+ joined = "\n".join([m.get_text() for m in medline.find_all("AbstractText")])
+ abstracts.append(fatcat_client.ReleaseAbstract(
+ content=joined,
+ mimetype="text/plain",
+ lang="en",
+ ))
+ else:
+ for abstract in medline.find_all("AbstractText"):
+ abstracts.append(fatcat_client.ReleaseAbstract(
+ content=abstract.get_text().strip(),
+ mimetype="text/plain",
+ lang="en",
+ ))
+ if abstract.find('math'):
+ abstracts.append(fatcat_client.ReleaseAbstract(
+ # strip the <AbstractText> tags
+ content=str(abstract)[14:-15],
+ mimetype="application/mathml+xml",
+ lang="en",
+ ))
+ if not abstracts:
+ abstracts = None
+
+ ### Contribs
+ contribs = []
+ if medline.AuthorList:
+ for author in medline.AuthorList.find_all("Author"):
+ given_name = None
+ surname = None
+ raw_name = None
+ if author.ForeName:
+ given_name = author.ForeName.string
+ if author.LastName:
+ surname = author.LastName.string
+ if given_name and surname:
+ raw_name = "{} {}".format(given_name, surname)
+ elif surname:
+ raw_name = surname
+ contrib_extra = dict()
+ orcid = author.find("Identifier", Source="ORCID")
+ if orcid:
+ # needs re-formatting from, eg, "0000000179841889"
+ orcid = orcid.string
+ if orcid.startswith("http://orcid.org/"):
+ orcid = orcid.replace("http://orcid.org/", "")
+ elif orcid.startswith("https://orcid.org/"):
+ orcid = orcid.replace("https://orcid.org/", "")
+ elif not '-' in orcid:
+ orcid = "{}-{}-{}-{}".format(
+ orcid[0:4],
+ orcid[4:8],
+ orcid[8:12],
+ orcid[12:16],
+ )
+ # XXX: do lookup by ORCID
+ #contrib_extra['orcid'] = orcid
+ affiliation = author.find("Affiliation")
+ raw_affiliation = None
+ if affiliation:
+ raw_affiliation = affiliation.string
+ if author.find("EqualContrib"):
+ # TODO: schema for this?
+ contrib_extra['equal_contrib'] = True
+ contribs.append(fatcat_client.ReleaseContrib(
+ raw_name=raw_name,
+ given_name=given_name,
+ surname=surname,
+ role="author",
+ raw_affiliation=raw_affiliation,
+ extra=contrib_extra,
+ ))
+
+ if medline.AuthorList['CompleteYN'] == 'N':
+ contribs.append(fatcat_client.ReleaseContrib(raw_name="et al."))
+ if not contribs:
+ contribs = None
+
+ ### References
+ refs = []
+ if pubmed.ReferenceList:
+ for ref in pubmed.ReferenceList.find_all('Reference'):
+ ref_obj = dict()
+ ref_extra = dict()
+ ref_pmid = ref.find("ArticleId", IdType="pubmed")
+ if ref_pmid:
+ ref_extra['pmid'] = ref_pmid.string
+ # TODO: do reference lookups here based on PMID/DOI
+ ref_raw = ref.Citation
+ if ref_raw:
+ ref_extra['unstructured'] = ref_raw.string
+ if ref_extra:
+ ref_obj['extra'] = ref_extra
+ refs.append(fatcat_client.ReleaseRef(
+ extra=ref_obj.get('extra'),
+ ))
+ if not refs:
+ refs = None
+
+ # extra:
+ # withdrawn_date
+ # translation_of
+ # subtitle
+ # aliases
+ # container_name
+ # group-title
+ # pubmed: retraction refs
+ if extra_pubmed:
+ extra['pubmed'] = extra_pubmed
+ if not extra:
+ extra = None
+
+ re = fatcat_client.ReleaseEntity(
+ work_id=None,
+ title=clean(title),
+ original_title=clean(original_title),
+ release_type=release_type,
+ release_stage=release_stage,
+ release_date=release_date,
+ release_year=release_year,
+ ext_ids=fatcat_client.ReleaseExtIds(
+ doi=doi,
+ pmid=pmid,
+ pmcid=pmcid,
+ #isbn13 # never in Article
+ ),
+ volume=volume,
+ issue=issue,
+ pages=pages,
+ #publisher # not included?
+ language=language,
+ #license_slug # not in MEDLINE
+ abstracts=abstracts,
+ contribs=contribs,
+ refs=refs,
+ container_id=container_id,
+ extra=extra,
+ )
+ return re
+
+ def try_update(self, re):
+
+ # first, lookup existing by PMID (which must be defined)
+ existing = None
+ try:
+ existing = self.api.lookup_release(pmid=re.ext_ids.pmid)
+ except fatcat_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+
+ # then try DOI lookup if there is one
+ if not existing and re.ext_ids.doi:
+ try:
+ existing = self.api.lookup_release(doi=re.ext_ids.doi)
+ except fatcat_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+ if existing and existing.ext_ids.pmid and existing.ext_ids.pmid != re.ext_ids.pmid:
+ warnings.warn("PMID/DOI mismatch: release {}, pmid {} != {}".format(
+ existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid))
+ self.counts['exists-pmid-doi-mismatch'] += 1
+ return False
+
+ if existing and existing.ext_ids.pmid and existing.refs:
+ # TODO: any other reasons to do an update?
+ # don't update if it already has PMID
+ self.counts['exists'] += 1
+ return False
+ elif existing:
+ # but do update if only DOI was set
+ existing.ext_ids.doi = existing.ext_ids.doi or re.ext_ids.doi
+ existing.ext_ids.pmid = existing.ext_ids.pmid or re.ext_ids.pmid
+ existing.ext_ids.pmcid = existing.ext_ids.pmcid or re.ext_ids.pmcid
+ existing.refs = existing.refs or re.refs
+ existing.extra['pubmed'] = re.extra['pubmed']
+ self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
+ self.counts['update'] += 1
+ return False
+
+ return True
+
+ def insert_batch(self, batch):
+ self.api.create_release_auto_batch(fatcat_client.ReleaseAutoBatch(
+ editgroup=fatcat_client.Editgroup(
+ description=self.editgroup_description,
+ extra=self.editgroup_extra),
+ entity_list=batch))
+
+ def parse_file(self, handle):
+
+ # 1. open with beautiful soup
+ soup = BeautifulSoup(handle, "xml")
+
+ # 2. iterate over articles, call parse_article on each
+ for article in soup.find_all("PubmedArticle"):
+ resp = self.parse_article(article)
+ print(json.dumps(resp))
+ #sys.exit(-1)
+
+# NOTE(review): ad-hoc debugging entry point which cannot run as written.
+# `PubMedParser` is neither defined nor imported in this module; the class
+# defined here is `PubmedImporter`, whose constructor requires `api` and
+# `issn_map_file`. The file opened from sys.argv[1] is also never closed.
+# TODO: fix or remove.
+if __name__=='__main__':
+ parser = PubMedParser()
+ parser.parse_file(open(sys.argv[1]))