initial pubmed importer

author: Bryan Newbold <bnewbold@robocracy.org> 2019-05-15 22:36:01 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-05-21 11:41:29 -0700
commit: 300665927f578151321b0d91b28f8aadffcf227d (patch)
tree: 5df52bf64004adc52f8ebde5f75f549237d02a5c /python/fatcat_tools/importers
parent: e27e3f443ea35b145dd07c252cdc8619d7c2ab15 (diff)
download: fatcat-300665927f578151321b0d91b28f8aadffcf227d.tar.gz
fatcat-300665927f578151321b0d91b28f8aadffcf227d.zip
2 files changed, 515 insertions, 2 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 8ec219f8..6f8849d6 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -12,11 +12,12 @@ To run an import you combine two classes; one each of:
 
 """
 
-from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk
-from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP
+from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk, LANG_MAP_MARC
+from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP, lookup_license_slug
 from .jalc import JalcImporter
 from .jstor import JstorImporter
 from .arxiv import ArxivRawImporter
+from .pubmed import PubmedImporter
 from .grobid_metadata import GrobidMetadataImporter
 from .journal_metadata import JournalMetadataImporter
 from .matched import MatchedImporter
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
new file mode 100644
index 00000000..1feb41cd
--- /dev/null
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -0,0 +1,512 @@
+
+import sys
+import json
+import sqlite3
+import datetime
+import warnings
+from bs4 import BeautifulSoup
+from bs4.element import NavigableString
+
+import fatcat_client
+from .common import EntityImporter, clean, LANG_MAP_MARC
+
+# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
+PUBMED_RELEASE_TYPE_MAP = {  # PubMed publication type -> release_type; commented-out types are unmapped
+    #Adaptive Clinical Trial
+    "Address": "speech",
+    "Autobiography": "book",
+    #Bibliography
+    "Biography": "book",
+    #Case Reports
+    "Classical Article": "article-journal",
+    #Clinical Conference
+    #Clinical Study
+    #Clinical Trial
+    #Clinical Trial, Phase I
+    #Clinical Trial, Phase II
+    #Clinical Trial, Phase III
+    #Clinical Trial, Phase IV
+    #Clinical Trial Protocol
+    #Clinical Trial, Veterinary
+    #Collected Works
+    #Comparative Study
+    #Congress
+    #Consensus Development Conference
+    #Consensus Development Conference, NIH
+    #Controlled Clinical Trial
+    "Dataset": "dataset",
+    #Dictionary
+    #Directory
+    #Duplicate Publication
+    "Editorial": "editorial",
+    #English Abstract   # doesn't indicate that this is abstract-only
+    #Equivalence Trial
+    #Evaluation Studies
+    #Expression of Concern
+    #Festschrift
+    #Government Document
+    #Guideline
+    "Historical Article": "article-journal",
+    #Interactive Tutorial
+    "Interview": "interview",
+    "Introductory Journal Article": "article-journal",
+    "Journal Article": "article-journal",
+    "Lecture": "speech",
+    "Legal Case": "legal_case",
+    "Legislation": "legislation",
+    "Letter": "letter",
+    #Meta-Analysis
+    #Multicenter Study
+    #News
+    "Newspaper Article": "article-newspaper",
+    #Observational Study
+    #Observational Study, Veterinary
+    #Overall
+    #Patient Education Handout
+    #Periodical Index
+    #Personal Narrative
+    #Portrait
+    #Practice Guideline
+    #Pragmatic Clinical Trial
+    #Publication Components
+    #Publication Formats
+    #Publication Type Category
+    #Randomized Controlled Trial
+    #Research Support, American Recovery and Reinvestment Act
+    #Research Support, N.I.H., Extramural
+    #Research Support, N.I.H., Intramural
+    #Research Support, Non-U.S. Gov't  /  Research Support, U.S. Gov't, Non-P.H.S.   (two separate types)
+    #Research Support, U.S. Gov't, P.H.S.
+    #Review     # in the "literature review" sense, not "product review"
+    #Scientific Integrity Review
+    #Study Characteristics
+    #Support of Research
+    #Systematic Review
+    "Technical Report": "report",
+    #Twin Study
+    #Validation Studies
+    #Video-Audio Media
+    #Webcasts
+}
+
+MONTH_ABBR_MAP = {  # month as three-letter abbreviation or two-digit string -> month number
+    "Jan":  1, "01":  1,
+    "Feb":  2, "02":  2,
+    "Mar":  3, "03":  3,
+    "Apr":  4, "04":  4,
+    "May":  5, "05":  5,
+    "Jun":  6, "06":  6,
+    "Jul":  7, "07":  7,
+    "Aug":  8, "08":  8,
+    "Sep":  9, "09":  9,
+    "Oct": 10, "10": 10,
+    "Nov": 11, "11": 11,
+    "Dec": 12, "12": 12,
+}
+
+
+class PubmedImporter(EntityImporter):
+    """
+    Importer for PubMed/MEDLINE XML metadata.
+    
+    TODO: MEDLINE doesn't include PMC/OA license; could include in importer?
+    TODO: clean (ftfy) title, original title, etc
+    XXX: withdrawn status ("Retracted Publication") is detected but not yet stored
+    XXX: full author names
+    """
+
+    def __init__(self, api, issn_map_file, **kwargs):
+        """Configure editgroup defaults, the optional read-only sqlite external-ID
+        map (`extid_map_file` kwarg) and the `create_containers` flag."""
+        # pop (not get): both are also passed explicitly to super().__init__(),
+        # so leaving them in kwargs would raise a "multiple values" TypeError
+        eg_desc = kwargs.pop('editgroup_description',
+            "Automated import of PubMed/MEDLINE XML metadata")
+        eg_extra = kwargs.pop('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.PubmedImporter')
+        super().__init__(api,
+            issn_map_file=issn_map_file,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            **kwargs)
+
+        extid_map_file = kwargs.get('extid_map_file')
+        self.extid_map_db = None
+        if extid_map_file:
+            db_uri = "file:{}?mode=ro".format(extid_map_file)
+            print("Using external ID map: {}".format(db_uri))
+            self.extid_map_db = sqlite3.connect(db_uri, uri=True)
+        else:
+            print("Not using external ID map")
+
+        self.create_containers = kwargs.get('create_containers')
+        self.read_issn_map_file(issn_map_file)
+
+    def lookup_ext_ids(self, pmid):
+        """Look up other identifiers for `pmid` in the optional sqlite ID map.
+
+        Every return path yields the same seven keys; unknown values are None.
+        """
+        ext_ids = dict(doi=None, core_id=None, pmid=None, pmcid=None,
+            wikidata_qid=None, arxiv_id=None, jstor_id=None)
+        if self.extid_map_db is None:
+            return ext_ids
+        row = self.extid_map_db.execute("SELECT core, doi, pmcid, wikidata FROM ids WHERE pmid=? LIMIT 1",
+            [pmid]).fetchone()
+        if row is None:
+            return ext_ids
+        # falsy cells (NULL, empty string) collapse to None
+        core_id, doi, pmcid, wikidata_qid = [str(cell or '') or None for cell in row]
+        # TODO: arxiv_id and jstor_id are not part of the query yet
+        ext_ids.update(core_id=core_id, doi=doi,
+            pmcid=pmcid, wikidata_qid=wikidata_qid)
+        return ext_ids
+
+    def want(self, obj):
+        return True  # accepts every object unconditionally
+
+    def parse_record(self, a):
+        """Build a fatcat ReleaseEntity from one parsed <PubmedArticle> element `a`."""
+        medline = a.MedlineCitation
+        # PubmedData isn't required by DTD, but seems to always be present
+        pubmed = a.PubmedData
+        extra = dict()
+        extra_pubmed = dict()
+
+        identifiers = pubmed.ArticleIdList
+        pmid = medline.PMID.string.strip()
+        doi = identifiers.find("ArticleId", IdType="doi")
+        if doi:
+            doi = doi.string.lower()
+
+        pmcid = identifiers.find("ArticleId", IdType="pmc")
+        if pmcid:
+            pmcid = pmcid.string
+
+        release_type = None
+        pub_types = []
+        for pub_type in medline.Article.PublicationTypeList.find_all("PublicationType"):
+            pub_types.append(pub_type.string)
+            if pub_type.string in PUBMED_RELEASE_TYPE_MAP:
+                release_type = PUBMED_RELEASE_TYPE_MAP[pub_type.string]
+                break  # first mapped publication type wins
+        if pub_types:
+            extra_pubmed['pub_types'] = pub_types
+        if medline.Article.PublicationTypeList.find(string="Retraction of Publication"):
+            release_type = "retraction"
+            retraction_of = medline.find("CommentsCorrections", RefType="RetractionOf")
+            if retraction_of:
+                extra_pubmed['retraction_of_raw'] = retraction_of.RefSource.string
+                extra_pubmed['retraction_of_pmid'] = retraction_of.PMID.string
+
+        # everything in medline is published
+        release_stage = "published"
+        if medline.Article.PublicationTypeList.find(string="Corrected and Republished Article"):
+            release_stage = "updated"
+        if medline.Article.PublicationTypeList.find(string="Retraction of Publication"):
+            release_stage = "retraction"
+        if medline.Article.PublicationTypeList.find(string="Retracted Publication"):
+            withdrawn_status = "retracted"  # XXX: assigned but not yet passed to ReleaseEntity
+
+        pages = medline.find('MedlinePgn')
+        if pages:
+            pages = pages.string
+
+        title = medline.Article.ArticleTitle.string # always present
+        if title:
+            if title.endswith('.'):
+                title = title[:-1]
+            # this hides some "special" titles, but the vast majority are
+            # translations; translations don't always include the original_title
+            if title.startswith('[') and title.endswith(']'):
+                title = title[1:-1]
+        else:
+            # TODO: will filter out later
+            title = None
+
+        original_title = medline.Article.find("VernacularTitle", recurse=False)
+        if original_title:
+            original_title = original_title.string or None
+            if original_title and original_title.endswith('.'):
+                original_title = original_title[:-1]
+
+        # TODO: happening in alpha order, not handling multi-language well.
+        # also need to convert lang codes: https://www.nlm.nih.gov/bsd/language_table.html
+        language = medline.Article.Language
+        if language:
+            language = language.string
+            # TODO: map to two-letter
+            if language in ("und", "un"):
+                # "undetermined"
+                language = None
+            else:
+                language = LANG_MAP_MARC.get(language)
+                if not language:
+                    warnings.warn("MISSING MARC LANG: {}".format(medline.Article.Language.string))
+
+        ### Journal/Issue Metadata
+        # MedlineJournalInfo is always present
+        issnl = None
+        container_id = None
+        container_name = None
+        container_extra = dict()
+        mji = medline.MedlineJournalInfo
+        if mji.find("Country"):
+            container_extra['country_name'] = mji.Country.string
+        if mji.find("ISSNLinking"):
+            issnl = mji.ISSNLinking.string
+
+        journal = medline.Article.Journal
+        issnp = journal.find("ISSN", IssnType="Print")
+        if issnp:
+            container_extra['issnp'] = issnp.string
+        if not issnl and issnp:
+            issnl = self.issn2issnl(issnp.string)  # fall back to ISSN-L mapped from the print ISSN text
+
+        if issnl:
+            container_id = self.lookup_issnl(issnl)
+
+        pub_date = journal.PubDate
+        release_date = None
+        release_year = None
+        if pub_date.Year:
+            release_year = int(pub_date.Year.string)
+            if pub_date.find("Day") and pub_date.find("Month"):
+                release_date = datetime.date(
+                    release_year,
+                    MONTH_ABBR_MAP[pub_date.Month.string],
+                    int(pub_date.Day.string))
+                release_date = release_date.isoformat()
+        elif pub_date.find("MedlineDate") and False: #XXX more/better date parsing?
+            release_year = int(pub_date.MedlineDate.string.split()[0][:4])
+
+        if journal.find("Title"):
+            container_name = journal.Title.string
+
+        if (container_id is None and self.create_containers and (issnl is not None)
+                and container_name):
+            # name, type, publisher, issnl
+            # extra: issnp, issne, original_name, languages, country
+            ce = fatcat_client.ContainerEntity(
+                name=container_name,
+                container_type='journal',
+                #XXX: publisher not included?
+                issnl=issnl,
+                extra=(container_extra or None))
+            ce_edit = self.create_container(ce)
+            container_id = ce_edit.ident
+
+        ji = journal.JournalIssue
+        volume = None
+        if ji.find("Volume"):
+            volume = ji.Volume.string
+        issue = None
+        if ji.find("Issue"):
+            issue = ji.Issue.string
+
+        ### Abstracts
+        # "All abstracts are in English"
+        abstracts = []
+        first_abstract = medline.find("AbstractText")
+        if first_abstract and first_abstract.get('NlmCategory'):
+            joined = "\n".join([m.get_text() for m in medline.find_all("AbstractText")])
+            abstracts.append(fatcat_client.ReleaseAbstract(
+                content=joined,
+                mimetype="text/plain",
+                lang="en",
+            ))
+        else:
+            for abstract in medline.find_all("AbstractText"):
+                abstracts.append(fatcat_client.ReleaseAbstract(
+                    content=abstract.get_text().strip(),
+                    mimetype="text/plain",
+                    lang="en",
+                ))
+                if abstract.find('math'):
+                    abstracts.append(fatcat_client.ReleaseAbstract(
+                        # strip the <AbstractText> tags
+                        content=str(abstract)[14:-15],
+                        mimetype="application/mathml+xml",
+                        lang="en",
+                    ))
+        if not abstracts:
+            abstracts = None
+
+        ### Contribs
+        contribs = []
+        if medline.AuthorList:
+            for author in medline.AuthorList.find_all("Author"):
+                given_name = None
+                surname = None
+                raw_name = None
+                if author.ForeName:
+                    given_name = author.ForeName.string
+                if author.LastName:
+                    surname = author.LastName.string
+                if given_name and surname:
+                    raw_name = "{} {}".format(given_name, surname)
+                elif surname:
+                    raw_name = surname
+                contrib_extra = dict()
+                orcid = author.find("Identifier", Source="ORCID")
+                if orcid:
+                    # needs re-formatting from, eg, "0000000179841889"
+                    orcid = orcid.string
+                    if orcid.startswith("http://orcid.org/"):
+                        orcid = orcid.replace("http://orcid.org/", "")
+                    elif orcid.startswith("https://orcid.org/"):
+                        orcid = orcid.replace("https://orcid.org/", "")
+                    elif not '-' in orcid:
+                        orcid = "{}-{}-{}-{}".format(
+                            orcid[0:4],
+                            orcid[4:8],
+                            orcid[8:12],
+                            orcid[12:16],
+                        )
+                    # XXX: do lookup by ORCID
+                    #contrib_extra['orcid'] = orcid
+                affiliation = author.find("Affiliation")
+                raw_affiliation = None
+                if affiliation:
+                    raw_affiliation = affiliation.string
+                if author.find("EqualContrib"):
+                    # TODO: schema for this?
+                    contrib_extra['equal_contrib'] = True
+                contribs.append(fatcat_client.ReleaseContrib(
+                    raw_name=raw_name,
+                    given_name=given_name,
+                    surname=surname,
+                    role="author",
+                    raw_affiliation=raw_affiliation,
+                    extra=contrib_extra,
+                ))
+
+            if medline.AuthorList.get('CompleteYN') == 'N':  # attribute may be absent
+                contribs.append(fatcat_client.ReleaseContrib(raw_name="et al."))
+        if not contribs:
+            contribs = None
+
+        ### References
+        refs = []
+        if pubmed.ReferenceList:
+            for ref in pubmed.ReferenceList.find_all('Reference'):
+                ref_obj = dict()
+                ref_extra = dict()
+                ref_pmid = ref.find("ArticleId", IdType="pubmed")
+                if ref_pmid:
+                    ref_extra['pmid'] = ref_pmid.string
+                    # TODO: do reference lookups here based on PMID/DOI
+                ref_raw = ref.Citation
+                if ref_raw:
+                    ref_extra['unstructured'] = ref_raw.string
+                if ref_extra:
+                    ref_obj['extra'] = ref_extra
+                refs.append(fatcat_client.ReleaseRef(
+                    extra=ref_obj.get('extra'),
+                ))
+        if not refs:
+            refs = None
+
+        # extra:
+        #   withdrawn_date
+        #   translation_of
+        #   subtitle
+        #   aliases
+        #   container_name
+        #   group-title
+        #   pubmed: retraction refs
+        if extra_pubmed:
+            extra['pubmed'] = extra_pubmed
+        if not extra:
+            extra = None
+
+        re = fatcat_client.ReleaseEntity(
+            work_id=None,
+            title=clean(title),
+            original_title=clean(original_title),
+            release_type=release_type,
+            release_stage=release_stage,
+            release_date=release_date,
+            release_year=release_year,
+            ext_ids=fatcat_client.ReleaseExtIds(
+                doi=doi,
+                pmid=pmid,
+                pmcid=pmcid,
+                #isbn13     # never in Article
+            ),
+            volume=volume,
+            issue=issue,
+            pages=pages,
+            #publisher  # not included?
+            language=language,
+            #license_slug   # not in MEDLINE
+            abstracts=abstracts,
+            contribs=contribs,
+            refs=refs,
+            container_id=container_id,
+            extra=extra,
+        )
+        return re
+
+    def try_update(self, re):
+        """Return True if `re` should be inserted; existing matches are skipped or updated."""
+        # first, lookup existing by PMID (which must be defined)
+        existing = None
+        try:
+            existing = self.api.lookup_release(pmid=re.ext_ids.pmid)
+        except fatcat_client.rest.ApiException as err:
+            if err.status != 404:
+                raise
+
+        # then try DOI lookup if there is one
+        if not existing and re.ext_ids.doi:
+            try:
+                existing = self.api.lookup_release(doi=re.ext_ids.doi)
+            except fatcat_client.rest.ApiException as err:
+                if err.status != 404:
+                    raise
+            if existing and existing.ext_ids.pmid and existing.ext_ids.pmid != re.ext_ids.pmid:
+                warnings.warn("PMID/DOI mismatch: release {}, pmid {} != {}".format(
+                    existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid))
+                self.counts['exists-pmid-doi-mismatch'] += 1
+                return False
+
+        if existing and existing.ext_ids.pmid and existing.refs:
+            # already has both PMID and refs: nothing to add (TODO: other update reasons?)
+            self.counts['exists'] += 1
+            return False
+        elif existing:
+            # but do update if only DOI was set
+            existing.ext_ids.doi = existing.ext_ids.doi or re.ext_ids.doi
+            existing.ext_ids.pmid = existing.ext_ids.pmid or re.ext_ids.pmid
+            existing.ext_ids.pmcid = existing.ext_ids.pmcid or re.ext_ids.pmcid
+            existing.refs = existing.refs or re.refs
+            if re.extra and re.extra.get('pubmed'):  # either side's extra may be None
+                existing.extra = dict(existing.extra or {}, pubmed=re.extra['pubmed'])
+            self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
+            self.counts['update'] += 1
+            return False
+
+        return True
+
+    def insert_batch(self, batch):
+        """Send `batch` through create_release_auto_batch with this importer's editgroup fields."""
+        editgroup = fatcat_client.Editgroup(
+            description=self.editgroup_description, extra=self.editgroup_extra)
+        auto_batch = fatcat_client.ReleaseAutoBatch(editgroup=editgroup, entity_list=batch)
+        self.api.create_release_auto_batch(auto_batch)
+
+    def parse_file(self, handle):
+        """Debug helper: parse every <PubmedArticle> in `handle` and print it as JSON."""
+        # 1. open with beautiful soup
+        soup = BeautifulSoup(handle, "xml")
+
+        # 2. iterate over articles, call parse_record on each
+        for article in soup.find_all("PubmedArticle"):
+            resp = self.parse_record(article)
+            # entity objects are not JSON-serializable directly; assumes generated-model to_dict()
+            print(json.dumps(resp.to_dict()))
+
+if __name__=='__main__':
+    parser = PubMedParser()  # NOTE(review): PubMedParser is not defined or imported in this module
+    parser.parse_file(open(sys.argv[1]))
author	Bryan Newbold <bnewbold@robocracy.org>	2019-05-15 22:36:01 -0700
committer	Bryan Newbold <bnewbold@robocracy.org>	2019-05-21 11:41:29 -0700
commit	300665927f578151321b0d91b28f8aadffcf227d (patch)
tree	5df52bf64004adc52f8ebde5f75f549237d02a5c /python/fatcat_tools/importers
parent	e27e3f443ea35b145dd07c252cdc8619d7c2ab15 (diff)
download	fatcat-300665927f578151321b0d91b28f8aadffcf227d.tar.gz fatcat-300665927f578151321b0d91b28f8aadffcf227d.zip