Diffstat (limited to 'python')
-rw-r--r--  python/fatcat_tools/importers/__init__.py                                         |   5
-rw-r--r--  python/fatcat_tools/importers/pubmed.py (renamed from python/parse_pubmed_xml.py) | 290
-rw-r--r--  python/tests/import_pubmed.py                                                     |  80
3 files changed, 298 insertions(+), 77 deletions(-)
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 8ec219f8..6f8849d6 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -12,11 +12,12 @@ To run an import you combine two classes; one each of:
 
 """
 
-from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk
-from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP
+from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk, LANG_MAP_MARC
+from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP, lookup_license_slug
 from .jalc import JalcImporter
 from .jstor import JstorImporter
 from .arxiv import ArxivRawImporter
+from .pubmed import PubmedImporter
 from .grobid_metadata import GrobidMetadataImporter
 from .journal_metadata import JournalMetadataImporter
 from .matched import MatchedImporter
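The module docstring above spells out the pattern: an import run combines one pusher with one importer. A minimal sketch of driving the new PubmedImporter that way, modeled on the test fixtures added at the bottom of this diff (`api` is assumed to be an authenticated fatcat API client, not something this commit provides):

```python
from fatcat_tools.importers import PubmedImporter, Bs4XmlFilePusher

# Assumes `api` is an authenticated fatcat API client; the fixture paths
# are the ones shipped with the new test file below.
with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
    importer = PubmedImporter(api, issn_file,
        extid_map_file='tests/files/example_map.sqlite3')
    with open('tests/files/pubmedsample_2019.xml', 'r') as xml_file:
        # the pusher feeds each <PubmedArticle> element to the importer
        counts = Bs4XmlFilePusher(importer, xml_file, "PubmedArticle").run()
print(counts)  # counters like 'insert', 'exists', 'skip'
```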
diff --git a/python/parse_pubmed_xml.py b/python/fatcat_tools/importers/pubmed.py
index 413333cc..1feb41cd 100644
--- a/python/parse_pubmed_xml.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -1,10 +1,15 @@
 
 import sys
 import json
+import sqlite3
 import datetime
+import warnings
 
 from bs4 import BeautifulSoup
 from bs4.element import NavigableString
 
+import fatcat_client
+from .common import EntityImporter, clean, LANG_MAP_MARC
+
 # from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
 PUBMED_RELEASE_TYPE_MAP = {
     #Adaptive Clinical Trial
@@ -99,29 +104,68 @@ MONTH_ABBR_MAP = {
     "Dec": 12, "12": 12,
 }
 
-class PubMedParser():
-    """
-    Converts PubMed/MEDLINE XML into a release entity (which can dump as JSON)
+class PubmedImporter(EntityImporter):
+    """
+    Importer for PubMed/MEDLINE XML metadata.
+
     TODO: MEDLINE doesn't include PMC/OA license; could include in importer?
     TODO: clean (ftfy) title, original title, etc
+    XXX: withdrawn
+    XXX: full author names
     """
 
-    def __init__(self):
-        pass
-
-    def parse_file(self, handle):
-
-        # 1. open with beautiful soup
-        soup = BeautifulSoup(handle, "xml")
+    def __init__(self, api, issn_map_file, **kwargs):
+
+        eg_desc = kwargs.get('editgroup_description',
+            "Automated import of PubMed/MEDLINE XML metadata")
+        eg_extra = kwargs.get('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.PubmedImporter')
+        super().__init__(api,
+            issn_map_file=issn_map_file,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            **kwargs)
+
+        extid_map_file = kwargs.get('extid_map_file')
+        self.extid_map_db = None
+        if extid_map_file:
+            db_uri = "file:{}?mode=ro".format(extid_map_file)
+            print("Using external ID map: {}".format(db_uri))
+            self.extid_map_db = sqlite3.connect(db_uri, uri=True)
+        else:
+            print("Not using external ID map")
+
+        self.create_containers = kwargs.get('create_containers')
+        self.read_issn_map_file(issn_map_file)
+
+    def lookup_ext_ids(self, pmid):
+        if self.extid_map_db is None:
+            return dict(doi=None, core_id=None, pmid=None, pmcid=None,
+                wikidata_qid=None, arxiv_id=None, jstor_id=None)
+        row = self.extid_map_db.execute("SELECT core, doi, pmcid, wikidata FROM ids WHERE pmid=? LIMIT 1",
+            [pmid]).fetchone()
+        if row is None:
+            return dict(doi=None, core_id=None, pmid=None, pmcid=None,
+                wikidata_qid=None, arxiv_id=None, jstor_id=None)
+        row = [str(cell or '') or None for cell in row]
+        return dict(
+            core_id=row[0],
+            doi=row[1],
+            pmcid=row[2],
+            wikidata_qid=row[3],
+            # TODO:
+            arxiv_id=None,
+            jstor_id=None,
+        )
-        # 2. iterate over articles, call parse_article on each
-        for article in soup.find_all("PubmedArticle"):
-            resp = self.parse_article(article)
-            print(json.dumps(resp))
-            #sys.exit(-1)
+    def want(self, obj):
+        return True
 
-    def parse_article(self, a):
+    def parse_record(self, a):
 
         medline = a.MedlineCitation
 
         # PubmedData isn't required by DTD, but seems to always be present
@@ -130,6 +174,7 @@ class PubMedParser():
         extra_pubmed = dict()
 
         identifiers = pubmed.ArticleIdList
+        pmid = medline.PMID.string.strip()
         doi = identifiers.find("ArticleId", IdType="doi")
         if doi:
             doi = doi.string.lower()
@@ -139,10 +184,14 @@
             pmcid = pmcid.string
 
         release_type = None
+        pub_types = []
         for pub_type in medline.Article.PublicationTypeList.find_all("PublicationType"):
+            pub_types.append(pub_type.string)
             if pub_type.string in PUBMED_RELEASE_TYPE_MAP:
                 release_type = PUBMED_RELEASE_TYPE_MAP[pub_type.string]
-            break
+                break
+        if pub_types:
+            extra_pubmed['pub_types'] = pub_types
         if medline.Article.PublicationTypeList.find(string="Retraction of Publication"):
             release_type = "retraction"
             retraction_of = medline.find("CommentsCorrections", RefType="RetractionOf")
@@ -151,11 +200,13 @@
                 extra_pubmed['retraction_of_pmid'] = retraction_of.PMID.string
 
         # everything in medline is published
-        release_status = "published"
+        release_stage = "published"
         if medline.Article.PublicationTypeList.find(string="Corrected and Republished Article"):
-            release_status = "updated"
+            release_stage = "updated"
+        if medline.Article.PublicationTypeList.find(string="Retraction of Publication"):
+            release_stage = "retraction"
         if medline.Article.PublicationTypeList.find(string="Retracted Publication"):
-            release_status = "retracted"
+            withdrawn_status = "retracted"
 
         pages = medline.find('MedlinePgn')
         if pages:
@@ -188,27 +239,37 @@
             if language in ("und", "un"):
                 # "undetermined"
                 language = None
+            else:
+                language = LANG_MAP_MARC.get(language)
+                if not language:
+                    warnings.warn("MISSING MARC LANG: {}".format(medline.Article.Language.string))
 
         ### Journal/Issue Metadata
         # MedlineJournalInfo is always present
-        container = dict()
+        issnl = None
+        container_id = None
+        container_name = None
         container_extra = dict()
         mji = medline.MedlineJournalInfo
         if mji.find("Country"):
             container_extra['country_name'] = mji.Country.string
         if mji.find("ISSNLinking"):
-            container['issnl'] = mji.ISSNLinking.string
+            issnl = mji.ISSNLinking.string
 
         journal = medline.Article.Journal
         issnp = journal.find("ISSN", IssnType="Print")
         if issnp:
             container_extra['issnp'] = issnp.string
+        if not issnl:
+            issnl = self.issn2issnl(issnp)
+
+        if issnl:
+            container_id = self.lookup_issnl(issnl)
 
         pub_date = journal.PubDate
         release_date = None
-        if pub_date.find("MedlineDate"):
-            release_year = int(pub_date.MedlineDate.string.split()[0][:4])
-        else:
+        release_year = None
+        if pub_date.Year:
             release_year = int(pub_date.Year.string)
             if pub_date.find("Day") and pub_date.find("Month"):
                 release_date = datetime.date(
@@ -216,6 +277,24 @@
                     MONTH_ABBR_MAP[pub_date.Month.string],
                     int(pub_date.Day.string))
                 release_date = release_date.isoformat()
+        elif pub_date.find("MedlineDate") and False: #XXX more/better date parsing?
+            release_year = int(pub_date.MedlineDate.string.split()[0][:4])
+
+        if journal.find("Title"):
+            container_name = journal.Title.string
+
+        if (container_id is None and self.create_containers and (issnl is not None)
+                and container_name):
+            # name, type, publisher, issnl
+            # extra: issnp, issne, original_name, languages, country
+            ce = fatcat_client.ContainerEntity(
+                name=container_name,
+                container_type='journal',
+                #XXX: publisher not included?
+                issnl=issnl,
+                extra=(container_extra or None))
+            ce_edit = self.create_container(ce)
+            container_id = ce_edit.ident
 
         ji = journal.JournalIssue
         volume = None
@@ -224,13 +303,6 @@
         issue = None
         if ji.find("Issue"):
             issue = ji.Issue.string
-        if journal.find("Title"):
-            container['name'] = journal.Title.string
-
-        if extra_pubmed:
-            extra['pubmed'] = extra_pubmed
-        if not extra:
-            extra = None
 
         ### Abstracts
         # "All abstracts are in English"
@@ -238,20 +310,20 @@
         first_abstract = medline.find("AbstractText")
         if first_abstract and first_abstract.get('NlmCategory'):
             joined = "\n".join([m.get_text() for m in medline.find_all("AbstractText")])
-            abstracts.append(dict(
+            abstracts.append(fatcat_client.ReleaseAbstract(
                 content=joined,
                 mimetype="text/plain",
                 lang="en",
             ))
         else:
             for abstract in medline.find_all("AbstractText"):
-                abstracts.append(dict(
+                abstracts.append(fatcat_client.ReleaseAbstract(
                     content=abstract.get_text().strip(),
                     mimetype="text/plain",
                     lang="en",
                 ))
                 if abstract.find('math'):
-                    abstracts.append(dict(
+                    abstracts.append(fatcat_client.ReleaseAbstract(
                         # strip the <AbstractText> tags
                         content=str(abstract)[14:-15],
                         mimetype="application/mathml+xml",
@@ -264,13 +336,17 @@
         contribs = []
         if medline.AuthorList:
             for author in medline.AuthorList.find_all("Author"):
-                contrib = dict(
-                    role="author",
-                )
+                given_name = None
+                surname = None
+                raw_name = None
                 if author.ForeName:
-                    contrib['raw_name'] = "{} {}".format(author.ForeName.string, author.LastName.string)
-                elif author.LastName:
-                    contrib['raw_name'] = author.LastName.string
+                    given_name = author.ForeName.string
+                if author.LastName:
+                    surname = author.LastName.string
+                if given_name and surname:
+                    raw_name = "{} {}".format(given_name, surname)
+                elif surname:
+                    raw_name = surname
                 contrib_extra = dict()
                 orcid = author.find("Identifier", Source="ORCID")
                 if orcid:
@@ -287,19 +363,26 @@
                             orcid[8:12],
                             orcid[12:16],
                         )
-                    contrib_extra['orcid'] = orcid
+                    # XXX: do lookup by ORCID
+                    #contrib_extra['orcid'] = orcid
                 affiliation = author.find("Affiliation")
+                raw_affiliation = None
                 if affiliation:
-                    contrib['raw_affiliation'] = affiliation.string
+                    raw_affiliation = affiliation.string
                 if author.find("EqualContrib"):
                     # TODO: schema for this?
                     contrib_extra['equal_contrib'] = True
-                if contrib_extra:
-                    contrib['extra'] = contrib_extra
-                contribs.append(contrib)
+                contribs.append(fatcat_client.ReleaseContrib(
+                    raw_name=raw_name,
+                    given_name=given_name,
+                    surname=surname,
+                    role="author",
+                    raw_affiliation=raw_affiliation,
+                    extra=contrib_extra,
+                ))
 
             if medline.AuthorList['CompleteYN'] == 'N':
-                contribs.append(dict(raw_name="et al."))
+                contribs.append(fatcat_client.ReleaseContrib(raw_name="et al."))
         if not contribs:
             contribs = None
@@ -312,61 +395,118 @@
                 ref_pmid = ref.find("ArticleId", IdType="pubmed")
                 if ref_pmid:
                     ref_extra['pmid'] = ref_pmid.string
+                    # TODO: do reference lookups here based on PMID/DOI
                 ref_raw = ref.Citation
                 if ref_raw:
-                    ref_extra['raw'] = ref_raw.string
+                    ref_extra['unstructured'] = ref_raw.string
                 if ref_extra:
                     ref_obj['extra'] = ref_extra
-                refs.append(ref_obj)
+                refs.append(fatcat_client.ReleaseRef(
+                    extra=ref_obj.get('extra'),
+                ))
         if not refs:
             refs = None
 
-        re = dict(
+        # extra:
+        #   withdrawn_date
+        #   translation_of
+        #   subtitle
+        #   aliases
+        #   container_name
+        #   group-title
+        #   pubmed: retraction refs
+        if extra_pubmed:
+            extra['pubmed'] = extra_pubmed
+        if not extra:
+            extra = None
+
+        re = fatcat_client.ReleaseEntity(
             work_id=None,
-            title=title,
-            original_title=original_title,
+            title=clean(title),
+            original_title=clean(original_title),
             release_type=release_type,
-            release_status=release_status,
+            release_stage=release_stage,
             release_date=release_date,
             release_year=release_year,
-            doi=doi,
-            pmid=int(medline.PMID.string), # always present
-            pmcid=pmcid,
-            #isbn13     # never in Article
+            ext_ids=fatcat_client.ReleaseExtIds(
+                doi=doi,
+                pmid=pmid,
+                pmcid=pmcid,
+                #isbn13     # never in Article
+            ),
             volume=volume,
             issue=issue,
             pages=pages,
             #publisher  # not included?
             language=language,
             #license_slug   # not in MEDLINE
-
-            # content, mimetype, lang
             abstracts=abstracts,
-
-            # raw_name, role, raw_affiliation, extra
             contribs=contribs,
-
-            # key, year, container_name, title, locator
-            # extra: volume, authors, issue, publisher, identifiers
             refs=refs,
-
-            #   name, type, publisher, issnl
-            #   extra: issnp, issne, original_name, languages, country
-            container=container,
-
-            # extra:
-            #   withdrawn_date
-            #   translation_of
-            #   subtitle
-            #   aliases
-            #   container_name
-            #   group-title
-            #   pubmed: retraction refs
+            container_id=container_id,
             extra=extra,
         )
-
         return re
 
+    def try_update(self, re):
+
+        # first, lookup existing by PMID (which must be defined)
+        existing = None
+        try:
+            existing = self.api.lookup_release(pmid=re.ext_ids.pmid)
+        except fatcat_client.rest.ApiException as err:
+            if err.status != 404:
+                raise err
+
+        # then try DOI lookup if there is one
+        if not existing and re.ext_ids.doi:
+            try:
+                existing = self.api.lookup_release(doi=re.ext_ids.doi)
+            except fatcat_client.rest.ApiException as err:
+                if err.status != 404:
+                    raise err
+            if existing and existing.ext_ids.pmid and existing.ext_ids.pmid != re.ext_ids.pmid:
+                warnings.warn("PMID/DOI mismatch: release {}, pmid {} != {}".format(
+                    existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid))
+                self.counts['exists-pmid-doi-mismatch'] += 1
+                return False
+
+        if existing and existing.ext_ids.pmid and existing.refs:
+            # TODO: any other reasons to do an update?
+            # don't update if it already has PMID
+            self.counts['exists'] += 1
+            return False
+        elif existing:
+            # but do update if only DOI was set
+            existing.ext_ids.doi = existing.ext_ids.doi or re.ext_ids.doi
+            existing.ext_ids.pmid = existing.ext_ids.pmid or re.ext_ids.pmid
+            existing.ext_ids.pmcid = existing.ext_ids.pmcid or re.ext_ids.pmcid
+            existing.refs = existing.refs or re.refs
+            existing.extra['pubmed'] = re.extra['pubmed']
+            self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
+            self.counts['update'] += 1
+            return False
+
+        return True
+
+    def insert_batch(self, batch):
+        self.api.create_release_auto_batch(fatcat_client.ReleaseAutoBatch(
+            editgroup=fatcat_client.Editgroup(
+                description=self.editgroup_description,
+                extra=self.editgroup_extra),
+            entity_list=batch))
+
+    def parse_file(self, handle):
+
+        # 1. open with beautiful soup
+        soup = BeautifulSoup(handle, "xml")
+
+        # 2. iterate over articles, call parse_article on each
+        for article in soup.find_all("PubmedArticle"):
+            resp = self.parse_article(article)
+            print(json.dumps(resp))
+            #sys.exit(-1)
+
 if __name__=='__main__':
     parser = PubMedParser()
     parser.parse_file(open(sys.argv[1]))
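The `lookup_ext_ids()` helper above reads a read-only SQLite map keyed by PMID; only its query ("SELECT core, doi, pmcid, wikidata FROM ids WHERE pmid=? LIMIT 1") constrains the schema. A hypothetical helper for building a compatible map file (column types and the key choice are assumptions, not part of this commit):

```python
import sqlite3

def build_extid_map(path, rows):
    """Build a PMID-keyed external-ID map matching the query in
    lookup_ext_ids(); schema details beyond the column names are assumed."""
    db = sqlite3.connect(path)
    db.execute("""CREATE TABLE IF NOT EXISTS ids
                  (pmid TEXT PRIMARY KEY, core TEXT, doi TEXT,
                   pmcid TEXT, wikidata TEXT)""")
    db.executemany("INSERT OR REPLACE INTO ids VALUES (?, ?, ?, ?, ?)", rows)
    db.commit()
    db.close()

# example row: a PMID with a known DOI and PMCID, no CORE/Wikidata entries
build_extid_map("example_map.sqlite3",
    [("973217", None, "10.1000/xyz123", "PMC1234567", None)])
```

The importer opens the file via a `mode=ro` URI, so a map built once offline can be shared read-only across import workers.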
diff --git a/python/tests/import_pubmed.py b/python/tests/import_pubmed.py
new file mode 100644
index 00000000..eacc3815
--- /dev/null
+++ b/python/tests/import_pubmed.py
@@ -0,0 +1,80 @@
+
+import json, gzip
+import pytest
+from fatcat_tools.importers import PubmedImporter, Bs4XmlFilePusher
+from fixtures import api
+from bs4 import BeautifulSoup
+
+
+@pytest.fixture(scope="function")
+def pubmed_importer(api):
+    with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
+        yield PubmedImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=True)
+
+@pytest.fixture(scope="function")
+def pubmed_importer_existing(api):
+    with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
+        yield PubmedImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=False)
+
+def test_pubmed_importer(pubmed_importer):
+    last_index = pubmed_importer.api.get_changelog(limit=1)[0].index
+    with open('tests/files/pubmedsample_2019.xml', 'r') as f:
+        pubmed_importer.bezerk_mode = True
+        counts = Bs4XmlFilePusher(pubmed_importer, f, "PubmedArticle").run()
+    assert counts['insert'] == 1
+    assert counts['exists'] == 0
+    assert counts['skip'] == 0
+
+    # fetch most recent editgroup
+    change = pubmed_importer.api.get_changelog_entry(index=last_index+1)
+    eg = change.editgroup
+    assert eg.description
+    assert "pubmed" in eg.description.lower()
+    assert eg.extra['git_rev']
+    assert "fatcat_tools.PubmedImporter" in eg.extra['agent']
+
+    last_index = pubmed_importer.api.get_changelog(limit=1)[0].index
+    with open('tests/files/pubmedsample_2019.xml', 'r') as f:
+        pubmed_importer.bezerk_mode = False
+        pubmed_importer.reset()
+        counts = Bs4XmlFilePusher(pubmed_importer, f, "PubmedArticle").run()
+    assert counts['insert'] == 0
+    assert counts['exists'] == 1
+    assert counts['skip'] == 0
+    assert last_index == pubmed_importer.api.get_changelog(limit=1)[0].index
+
+def test_pubmed_xml_parse(pubmed_importer):
+    with open('tests/files/pubmedsample_2019.xml', 'r') as f:
+        soup = BeautifulSoup(f, "xml")
+        r1 = pubmed_importer.parse_record(soup.find_all("PubmedArticle")[0])
+        r2 = pubmed_importer.parse_record(soup.find_all("PubmedArticle")[-1])
+
+    assert r1.title == "Hospital debt management and cost reimbursement"
+    assert r1.subtitle == None
+    assert r1.original_title == None
+    assert r1.publisher == None
+    assert r1.release_type == "article-journal"
+    assert r1.release_stage == "published"
+    assert r1.license_slug == None
+    assert r1.ext_ids.doi == None
+    assert r1.ext_ids.pmid == "973217"
+    assert r1.language == "en"
+    assert r1.volume == "3"
+    assert r1.issue == "1"
+    assert r1.pages == "69-81"
+    assert r1.release_date == None # not "1976-12-03", which is medline ingest date
+    assert r1.release_year == 1976
+    # matched by ISSN, so shouldn't be in there?
+    #assert extra['container_name'] == "Abstracts of the Papers Communicated to the Royal Society of London"
+    assert len(r1.contribs) == 1
+
+    assert r1.contribs[0].raw_name == "F R Blume"
+    assert r1.contribs[0].given_name == "F R"
+    assert r1.contribs[0].surname == "Blume"
+
+    print(r1.extra)
+    # TODO: assert r1.extra['pubmed']['mesh_topics'] == ['Accounting', 'Economics, Hospital', 'Hospital Administration']
+    assert r1.extra['pubmed']['pub_types'] == ['Journal Article']
+    assert not r1.refs
+
+    # XXX: r2 tests

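One loose end in parse_record() above: the MedlineDate branch of date parsing is disabled (`and False`) pending better handling. MedlineDate carries free-form values such as "1998 Dec-1999 Jan" when no structured <Year> is present; a sketch of the year heuristic the disabled branch implies (an illustration, not behavior shipped by this commit):

```python
def parse_medline_year(medline_date):
    # mirror the disabled branch: first whitespace token, first four chars,
    # but only accept the token if it actually looks like a year
    token = medline_date.strip().split()[0][:4]
    return int(token) if token.isdigit() else None

assert parse_medline_year("1998 Dec-1999 Jan") == 1998
assert parse_medline_year("Summer 1976") is None  # season-first forms need more handling
```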