From 300665927f578151321b0d91b28f8aadffcf227d Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Wed, 15 May 2019 22:36:01 -0700
Subject: initial pubmed importer

---
 python/fatcat_tools/importers/__init__.py |   5 +-
 python/fatcat_tools/importers/pubmed.py   | 512 ++++++++++++++++++++++++++++++
 python/parse_pubmed_xml.py                | 372 ----------------------
 python/tests/import_pubmed.py             |  80 +++++
 4 files changed, 595 insertions(+), 374 deletions(-)
 create mode 100644 python/fatcat_tools/importers/pubmed.py
 delete mode 100644 python/parse_pubmed_xml.py
 create mode 100644 python/tests/import_pubmed.py

diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 8ec219f8..6f8849d6 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -12,11 +12,12 @@ To run an import you combine two classes; one each of:
 
 """
 
-from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk
-from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP
+from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk, LANG_MAP_MARC
+from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP, lookup_license_slug
 from .jalc import JalcImporter
 from .jstor import JstorImporter
 from .arxiv import ArxivRawImporter
+from .pubmed import PubmedImporter
 from .grobid_metadata import GrobidMetadataImporter
 from .journal_metadata import JournalMetadataImporter
 from .matched import MatchedImporter
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
new file mode 100644
index 00000000..1feb41cd
--- /dev/null
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -0,0 +1,512 @@
+
+import sys
+import json
+import sqlite3
+import datetime
+import warnings
+from bs4 import BeautifulSoup
+from bs4.element import NavigableString
+
+import fatcat_client
+from .common import EntityImporter, clean, LANG_MAP_MARC
+
+# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
+PUBMED_RELEASE_TYPE_MAP = {  # PubMed PublicationType string -> release_type; commented-out types are unmapped
+    #Adaptive Clinical Trial
+    "Address": "speech",
+    "Autobiography": "book",
+    #Bibliography
+    "Biography": "book",
+    #Case Reports
+    "Classical Article": "article-journal",
+    #Clinical Conference
+    #Clinical Study
+    #Clinical Trial
+    #Clinical Trial, Phase I
+    #Clinical Trial, Phase II
+    #Clinical Trial, Phase III
+    #Clinical Trial, Phase IV
+    #Clinical Trial Protocol
+    #Clinical Trial, Veterinary
+    #Collected Works
+    #Comparative Study
+    #Congress
+    #Consensus Development Conference
+    #Consensus Development Conference, NIH
+    #Controlled Clinical Trial
+    "Dataset": "dataset",
+    #Dictionary
+    #Directory
+    #Duplicate Publication
+    "Editorial": "editorial",
+    #English Abstract   # doesn't indicate that this is abstract-only
+    #Equivalence Trial
+    #Evaluation Studies
+    #Expression of Concern
+    #Festschrift
+    #Government Document
+    #Guideline
+    "Historical Article": "article-journal",
+    #Interactive Tutorial
+    "Interview": "interview",
+    "Introductory Journal Article": "article-journal",
+    "Journal Article": "article-journal",
+    "Lecture": "speech",
+    "Legal Case": "legal_case",
+    "Legislation": "legislation",
+    "Letter": "letter",
+    #Meta-Analysis
+    #Multicenter Study
+    #News
+    "Newspaper Article": "article-newspaper",
+    #Observational Study
+    #Observational Study, Veterinary
+    #Overall
+    #Patient Education Handout
+    #Periodical Index
+    #Personal Narrative
+    #Portrait
+    #Practice Guideline
+    #Pragmatic Clinical Trial
+    #Publication Components
+    #Publication Formats
+    #Publication Type Category
+    #Randomized Controlled Trial
+    #Research Support, American Recovery and Reinvestment Act
+    #Research Support, N.I.H., Extramural
+    #Research Support, N.I.H., Intramural
+    #Research Support, Non-U.S. Gov't; Research Support, U.S. Gov't, Non-P.H.S.   # two separate types
+    #Research Support, U.S. Gov't, P.H.S.
+    #Review     # in the "literature review" sense, not "product review"
+    #Scientific Integrity Review
+    #Study Characteristics
+    #Support of Research
+    #Systematic Review
+    "Technical Report": "report",
+    #Twin Study
+    #Validation Studies
+    #Video-Audio Media
+    #Webcasts
+}
+
+MONTH_ABBR_MAP = {  # English month abbreviation or zero-padded month number -> month as int
+    "Jan":  1, "01":  1,
+    "Feb":  2, "02":  2,
+    "Mar":  3, "03":  3,
+    "Apr":  4, "04":  4,
+    "May":  5, "05":  5,
+    "Jun":  6, "06":  6,
+    "Jul":  7, "07":  7,
+    "Aug":  8, "08":  8,
+    "Sep":  9, "09":  9,
+    "Oct": 10, "10": 10,
+    "Nov": 11, "11": 11,
+    "Dec": 12, "12": 12,
+}
+
+
+class PubmedImporter(EntityImporter):
+    """
+    Importer for PubMed/MEDLINE XML metadata.
+    
+    TODO: MEDLINE doesn't include PMC/OA license; could include in importer?
+    TODO: clean (ftfy) title, original title, etc
+    XXX: withdrawn
+    XXX: full author names
+    """
+
+    def __init__(self, api, issn_map_file, **kwargs):
+        """
+        Optional kwargs: editgroup_description, editgroup_extra, create_containers
+        and extid_map_file (sqlite3 database path, opened read-only).
+        """
+        # pop (not get): both are passed explicitly below, so must not remain in **kwargs
+        eg_desc = kwargs.pop('editgroup_description', None) or \
+            "Automated import of PubMed/MEDLINE XML metadata"
+        eg_extra = kwargs.pop('editgroup_extra', None) or dict()
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.PubmedImporter')
+        super().__init__(api,
+            issn_map_file=issn_map_file,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            **kwargs)
+
+        extid_map_file = kwargs.get('extid_map_file')
+        self.extid_map_db = None
+        if extid_map_file:
+            db_uri = "file:{}?mode=ro".format(extid_map_file)
+            print("Using external ID map: {}".format(db_uri))
+            self.extid_map_db = sqlite3.connect(db_uri, uri=True)
+        else:
+            print("Not using external ID map")
+        self.create_containers = kwargs.get('create_containers')
+        self.read_issn_map_file(issn_map_file)
+
+    def lookup_ext_ids(self, pmid):
+        """
+        Looks up other identifiers for `pmid` in the optional sqlite3 map.
+        Always returns a dict with the same keys; unknown values are None.
+        """
+        ext_ids = dict(doi=None, core_id=None, pmid=None, pmcid=None,
+            wikidata_qid=None, arxiv_id=None, jstor_id=None)
+        if self.extid_map_db is None:
+            return ext_ids
+        row = self.extid_map_db.execute("SELECT core, doi, pmcid, wikidata FROM ids WHERE pmid=? LIMIT 1",
+            [pmid]).fetchone()
+        if row is None:
+            return ext_ids
+        # empty and NULL cells become None, everything else a string
+        row = [str(cell or '') or None for cell in row]
+        # TODO: arxiv_id and jstor_id are not part of the map query; they stay None
+        ext_ids.update(core_id=row[0], doi=row[1], pmid=pmid,
+            pmcid=row[2], wikidata_qid=row[3])
+        return ext_ids
+
+    def want(self, obj):
+        return True  # no pre-filtering: every record is accepted
+
+    def parse_record(self, a):
+        """Converts one PubmedArticle element (bs4 Tag) into a fatcat_client.ReleaseEntity."""
+        medline = a.MedlineCitation
+        # PubmedData isn't required by DTD, but seems to always be present
+        pubmed = a.PubmedData
+        extra = dict()
+        extra_pubmed = dict()
+
+        identifiers = pubmed.ArticleIdList
+        pmid = medline.PMID.string.strip()
+        doi = identifiers.find("ArticleId", IdType="doi")
+        if doi:
+            doi = doi.string.lower()
+
+        pmcid = identifiers.find("ArticleId", IdType="pmc")
+        if pmcid:
+            pmcid = pmcid.string
+
+        release_type = None
+        pub_types = []
+        for pub_type in medline.Article.PublicationTypeList.find_all("PublicationType"):
+            pub_types.append(pub_type.string)
+            if pub_type.string in PUBMED_RELEASE_TYPE_MAP:
+                release_type = PUBMED_RELEASE_TYPE_MAP[pub_type.string]
+                break  # NOTE: pub_types therefore only holds types up to the first mapped one
+        if pub_types:
+            extra_pubmed['pub_types'] = pub_types
+        if medline.Article.PublicationTypeList.find(string="Retraction of Publication"):
+            release_type = "retraction"
+            retraction_of = medline.find("CommentsCorrections", RefType="RetractionOf")
+            if retraction_of:
+                extra_pubmed['retraction_of_raw'] = retraction_of.RefSource.string
+                extra_pubmed['retraction_of_pmid'] = retraction_of.PMID.string
+
+        # everything in medline is published
+        release_stage = "published"
+        if medline.Article.PublicationTypeList.find(string="Corrected and Republished Article"):
+            release_stage = "updated"
+        if medline.Article.PublicationTypeList.find(string="Retraction of Publication"):
+            release_stage = "retraction"
+        if medline.Article.PublicationTypeList.find(string="Retracted Publication"):
+            withdrawn_status = "retracted"  # XXX: computed but not yet stored on the release
+
+        pages = medline.find('MedlinePgn')
+        if pages:
+            pages = pages.string
+
+        title = medline.Article.ArticleTitle.string # always present
+        if title:
+            if title.endswith('.'):
+                title = title[:-1]
+            # this hides some "special" titles, but the vast majority are
+            # translations; translations don't always include the original_title
+            if title.startswith('[') and title.endswith(']'):
+                title = title[1:-1]
+        else:
+            # TODO: will filter out later
+            title = None
+
+        original_title = medline.Article.find("VernacularTitle", recurse=False)
+        if original_title:
+            original_title = original_title.string or None
+            if original_title and original_title.endswith('.'):
+                original_title = original_title[:-1]
+
+        # TODO: happening in alpha order, not handling multi-language well.
+        # also need to convert lang codes: https://www.nlm.nih.gov/bsd/language_table.html
+        language = medline.Article.Language
+        if language:
+            language = language.string
+            # TODO: map to two-letter
+            if language in ("und", "un"):
+                # "undetermined"
+                language = None
+            else:
+                language = LANG_MAP_MARC.get(language)
+                if not language:
+                    warnings.warn("MISSING MARC LANG: {}".format(medline.Article.Language.string))
+
+        ### Journal/Issue Metadata
+        # MedlineJournalInfo is always present
+        issnl = None
+        container_id = None
+        container_name = None
+        container_extra = dict()
+        mji = medline.MedlineJournalInfo
+        if mji.find("Country"):
+            container_extra['country_name'] = mji.Country.string
+        if mji.find("ISSNLinking"):
+            issnl = mji.ISSNLinking.string
+
+        journal = medline.Article.Journal
+        issnp = journal.find("ISSN", IssnType="Print")
+        if issnp:
+            container_extra['issnp'] = issnp.string
+        if not issnl and issnp:
+            issnl = self.issn2issnl(issnp.string)  # fall back to mapping the print ISSN
+
+        if issnl:
+            container_id = self.lookup_issnl(issnl)
+
+        pub_date = journal.PubDate
+        release_date = None
+        release_year = None
+        if pub_date.Year:
+            release_year = int(pub_date.Year.string)
+            if pub_date.find("Day") and pub_date.find("Month"):
+                release_date = datetime.date(
+                    release_year,
+                    MONTH_ABBR_MAP[pub_date.Month.string],
+                    int(pub_date.Day.string))
+                release_date = release_date.isoformat()
+        elif pub_date.find("MedlineDate") and False: #XXX more/better date parsing?
+            release_year = int(pub_date.MedlineDate.string.split()[0][:4])
+
+        if journal.find("Title"):
+            container_name = journal.Title.string
+
+        if (container_id is None and self.create_containers and (issnl is not None)
+                and container_name):
+            # name, type, publisher, issnl
+            # extra: issnp, issne, original_name, languages, country
+            ce = fatcat_client.ContainerEntity(
+                name=container_name,
+                container_type='journal',
+                #XXX: publisher not included?
+                issnl=issnl,
+                extra=(container_extra or None))
+            ce_edit = self.create_container(ce)
+            container_id = ce_edit.ident
+
+        ji = journal.JournalIssue
+        volume = None
+        if ji.find("Volume"):
+            volume = ji.Volume.string
+        issue = None
+        if ji.find("Issue"):
+            issue = ji.Issue.string
+
+        ### Abstracts
+        # "All abstracts are in English"
+        abstracts = []
+        first_abstract = medline.find("AbstractText")
+        if first_abstract and first_abstract.get('NlmCategory'):
+            joined = "\n".join([m.get_text() for m in medline.find_all("AbstractText")])
+            abstracts.append(fatcat_client.ReleaseAbstract(
+                content=joined,
+                mimetype="text/plain",
+                lang="en",
+            ))
+        else:
+            for abstract in medline.find_all("AbstractText"):
+                abstracts.append(fatcat_client.ReleaseAbstract(
+                    content=abstract.get_text().strip(),
+                    mimetype="text/plain",
+                    lang="en",
+                ))
+                if abstract.find('math'):
+                    abstracts.append(fatcat_client.ReleaseAbstract(
+                        # inner markup of <AbstractText>, also correct when the tag has attributes
+                        content=abstract.decode_contents(),
+                        mimetype="application/mathml+xml",
+                        lang="en",
+                    ))
+        if not abstracts:
+            abstracts = None
+
+        ### Contribs
+        contribs = []
+        if medline.AuthorList:
+            for author in medline.AuthorList.find_all("Author"):
+                given_name = None
+                surname = None
+                raw_name = None
+                if author.ForeName:
+                    given_name = author.ForeName.string
+                if author.LastName:
+                    surname = author.LastName.string
+                if given_name and surname:
+                    raw_name = "{} {}".format(given_name, surname)
+                elif surname:
+                    raw_name = surname
+                contrib_extra = dict()
+                orcid = author.find("Identifier", Source="ORCID")
+                if orcid:
+                    # needs re-formatting from, eg, "0000000179841889"
+                    orcid = orcid.string
+                    if orcid.startswith("http://orcid.org/"):
+                        orcid = orcid.replace("http://orcid.org/", "")
+                    elif orcid.startswith("https://orcid.org/"):
+                        orcid = orcid.replace("https://orcid.org/", "")
+                    elif '-' not in orcid:
+                        orcid = "{}-{}-{}-{}".format(
+                            orcid[0:4],
+                            orcid[4:8],
+                            orcid[8:12],
+                            orcid[12:16],
+                        )
+                    # XXX: do lookup by ORCID
+                    #contrib_extra['orcid'] = orcid
+                affiliation = author.find("Affiliation")
+                raw_affiliation = None
+                if affiliation:
+                    raw_affiliation = affiliation.string
+                if author.find("EqualContrib"):
+                    # TODO: schema for this?
+                    contrib_extra['equal_contrib'] = True
+                contribs.append(fatcat_client.ReleaseContrib(
+                    raw_name=raw_name,
+                    given_name=given_name,
+                    surname=surname,
+                    role="author",
+                    raw_affiliation=raw_affiliation,
+                    extra=contrib_extra,
+                ))
+
+            if medline.AuthorList.get('CompleteYN') == 'N':
+                contribs.append(fatcat_client.ReleaseContrib(raw_name="et al."))
+        if not contribs:
+            contribs = None
+
+        ### References
+        refs = []
+        if pubmed.ReferenceList:
+            for ref in pubmed.ReferenceList.find_all('Reference'):
+                ref_obj = dict()
+                ref_extra = dict()
+                ref_pmid = ref.find("ArticleId", IdType="pubmed")
+                if ref_pmid:
+                    ref_extra['pmid'] = ref_pmid.string
+                    # TODO: do reference lookups here based on PMID/DOI
+                ref_raw = ref.Citation
+                if ref_raw:
+                    ref_extra['unstructured'] = ref_raw.string
+                if ref_extra:
+                    ref_obj['extra'] = ref_extra
+                refs.append(fatcat_client.ReleaseRef(
+                    extra=ref_obj.get('extra'),
+                ))
+        if not refs:
+            refs = None
+
+        # extra:
+        #   withdrawn_date
+        #   translation_of
+        #   subtitle
+        #   aliases
+        #   container_name
+        #   group-title
+        #   pubmed: retraction refs
+        if extra_pubmed:
+            extra['pubmed'] = extra_pubmed
+        if not extra:
+            extra = None
+
+        re = fatcat_client.ReleaseEntity(
+            work_id=None,
+            title=clean(title),
+            original_title=clean(original_title),
+            release_type=release_type,
+            release_stage=release_stage,
+            release_date=release_date,
+            release_year=release_year,
+            ext_ids=fatcat_client.ReleaseExtIds(
+                doi=doi,
+                pmid=pmid,
+                pmcid=pmcid,
+                #isbn13     # never in Article
+            ),
+            volume=volume,
+            issue=issue,
+            pages=pages,
+            #publisher  # not included?
+            language=language,
+            #license_slug   # not in MEDLINE
+            abstracts=abstracts,
+            contribs=contribs,
+            refs=refs,
+            container_id=container_id,
+            extra=extra,
+        )
+        return re
+
+    def try_update(self, re):
+        """Returns True when `re` should be created; False if it exists, conflicts or was updated in place."""
+        # first, lookup existing by PMID (which must be defined)
+        existing = None
+        try:
+            existing = self.api.lookup_release(pmid=re.ext_ids.pmid)
+        except fatcat_client.rest.ApiException as err:
+            if err.status != 404:
+                raise err
+
+        # then try DOI lookup if there is one
+        if not existing and re.ext_ids.doi:
+            try:
+                existing = self.api.lookup_release(doi=re.ext_ids.doi)
+            except fatcat_client.rest.ApiException as err:
+                if err.status != 404:
+                    raise err
+            if existing and existing.ext_ids.pmid and existing.ext_ids.pmid != re.ext_ids.pmid:
+                warnings.warn("PMID/DOI mismatch: release {}, pmid {} != {}".format(
+                    existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid))
+                self.counts['exists-pmid-doi-mismatch'] += 1
+                return False
+
+        if existing and existing.ext_ids.pmid and existing.refs:
+            # don't update if it already has PMID and refs (TODO: any other reasons to do an update?)
+            self.counts['exists'] += 1
+            return False
+        elif existing:
+            # but do update if only DOI was set
+            existing.ext_ids.doi = existing.ext_ids.doi or re.ext_ids.doi
+            existing.ext_ids.pmid = existing.ext_ids.pmid or re.ext_ids.pmid
+            existing.ext_ids.pmcid = existing.ext_ids.pmcid or re.ext_ids.pmcid
+            existing.refs = existing.refs or re.refs
+            if re.extra and re.extra.get('pubmed'):  # either side's extra may be None
+                existing.extra = dict(existing.extra or {}, pubmed=re.extra['pubmed'])
+            self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
+            self.counts['update'] += 1
+            return False
+
+        return True
+
+    def insert_batch(self, batch):
+        """Creates all releases in `batch` with a single auto-batch API call."""
+        eg = fatcat_client.Editgroup(
+            description=self.editgroup_description, extra=self.editgroup_extra)
+        self.api.create_release_auto_batch(
+            fatcat_client.ReleaseAutoBatch(editgroup=eg, entity_list=batch))
+
+    def parse_file(self, handle):
+        """Parses every PubmedArticle element in `handle` and prints each result as JSON."""
+        # 1. open with beautiful soup
+        soup = BeautifulSoup(handle, "xml")
+
+        # 2. iterate over articles, call parse_record on each
+        for article in soup.find_all("PubmedArticle"):
+            resp = self.parse_record(article)
+            # NOTE(review): json.dumps likely can't serialize a ReleaseEntity directly -- confirm
+            print(json.dumps(resp))
+
+if __name__=='__main__':
+    # PubMedParser (from the removed parse_pubmed_xml.py) no longer exists; exit with a hint, not a NameError
+    sys.exit("not runnable as a script; use Bs4XmlFilePusher(PubmedImporter(api, issn_map_file), f, 'PubmedArticle').run()")
diff --git a/python/parse_pubmed_xml.py b/python/parse_pubmed_xml.py
deleted file mode 100644
index 413333cc..00000000
--- a/python/parse_pubmed_xml.py
+++ /dev/null
@@ -1,372 +0,0 @@
-
-import sys
-import json
-import datetime
-from bs4 import BeautifulSoup
-from bs4.element import NavigableString
-
-# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
-PUBMED_RELEASE_TYPE_MAP = {
-    #Adaptive Clinical Trial
-    "Address": "speech",
-    "Autobiography": "book",
-    #Bibliography
-    "Biography": "book",
-    #Case Reports
-    "Classical Article": "article-journal",
-    #Clinical Conference
-    #Clinical Study
-    #Clinical Trial
-    #Clinical Trial, Phase I
-    #Clinical Trial, Phase II
-    #Clinical Trial, Phase III
-    #Clinical Trial, Phase IV
-    #Clinical Trial Protocol
-    #Clinical Trial, Veterinary
-    #Collected Works
-    #Comparative Study
-    #Congress
-    #Consensus Development Conference
-    #Consensus Development Conference, NIH
-    #Controlled Clinical Trial
-    "Dataset": "dataset",
-    #Dictionary
-    #Directory
-    #Duplicate Publication
-    "Editorial": "editorial",
-    #English Abstract   # doesn't indicate that this is abstract-only
-    #Equivalence Trial
-    #Evaluation Studies
-    #Expression of Concern
-    #Festschrift
-    #Government Document
-    #Guideline
-    "Historical Article": "article-journal",
-    #Interactive Tutorial
-    "Interview": "interview",
-    "Introductory Journal Article": "article-journal",
-    "Journal Article": "article-journal",
-    "Lecture": "speech",
-    "Legal Case": "legal_case",
-    "Legislation": "legislation",
-    "Letter": "letter",
-    #Meta-Analysis
-    #Multicenter Study
-    #News
-    "Newspaper Article": "article-newspaper",
-    #Observational Study
-    #Observational Study, Veterinary
-    #Overall
-    #Patient Education Handout
-    #Periodical Index
-    #Personal Narrative
-    #Portrait
-    #Practice Guideline
-    #Pragmatic Clinical Trial
-    #Publication Components
-    #Publication Formats
-    #Publication Type Category
-    #Randomized Controlled Trial
-    #Research Support, American Recovery and Reinvestment Act
-    #Research Support, N.I.H., Extramural
-    #Research Support, N.I.H., Intramural
-    #Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S.
-    #Research Support, U.S. Gov't, P.H.S.
-    #Review     # in the "literature review" sense, not "product review"
-    #Scientific Integrity Review
-    #Study Characteristics
-    #Support of Research
-    #Systematic Review
-    "Technical Report": "report",
-    #Twin Study
-    #Validation Studies
-    #Video-Audio Media
-    #Webcasts
-}
-
-MONTH_ABBR_MAP = {
-    "Jan":  1, "01":  1,
-    "Feb":  2, "02":  2,
-    "Mar":  3, "03":  3,
-    "Apr":  4, "04":  4,
-    "May":  5, "05":  5,
-    "Jun":  6, "06":  6,
-    "Jul":  7, "07":  7,
-    "Aug":  8, "08":  8,
-    "Sep":  9, "09":  9,
-    "Oct": 10, "10": 10,
-    "Nov": 11, "11": 11,
-    "Dec": 12, "12": 12,
-}
-
-class PubMedParser():
-    """
-    Converts PubMed/MEDLINE XML into in release entity (which can dump as JSON)
-
-    TODO: MEDLINE doesn't include PMC/OA license; could include in importer?
-    TODO: clean (ftfy) title, original title, etc
-    """
-
-    def __init__(self):
-        pass
-
-    def parse_file(self, handle):
-
-        # 1. open with beautiful soup
-        soup = BeautifulSoup(handle, "xml")
-
-        # 2. iterate over articles, call parse_article on each
-        for article in soup.find_all("PubmedArticle"):
-            resp = self.parse_article(article)
-            print(json.dumps(resp))
-            #sys.exit(-1)
-
-    def parse_article(self, a):
-
-        medline = a.MedlineCitation
-        # PubmedData isn't required by DTD, but seems to always be present
-        pubmed = a.PubmedData
-        extra = dict()
-        extra_pubmed = dict()
-
-        identifiers = pubmed.ArticleIdList
-        doi = identifiers.find("ArticleId", IdType="doi")
-        if doi:
-            doi = doi.string.lower()
-
-        pmcid = identifiers.find("ArticleId", IdType="pmc")
-        if pmcid:
-            pmcid = pmcid.string
-
-        release_type = None
-        for pub_type in medline.Article.PublicationTypeList.find_all("PublicationType"):
-            if pub_type.string in PUBMED_RELEASE_TYPE_MAP:
-                release_type = PUBMED_RELEASE_TYPE_MAP[pub_type.string]
-            break
-        if medline.Article.PublicationTypeList.find(string="Retraction of Publication"):
-            release_type = "retraction"
-            retraction_of = medline.find("CommentsCorrections", RefType="RetractionOf")
-            if retraction_of:
-                extra_pubmed['retraction_of_raw'] = retraction_of.RefSource.string
-                extra_pubmed['retraction_of_pmid'] = retraction_of.PMID.string
-
-        # everything in medline is published
-        release_status = "published"
-        if medline.Article.PublicationTypeList.find(string="Corrected and Republished Article"):
-            release_status = "updated"
-        if medline.Article.PublicationTypeList.find(string="Retracted Publication"):
-            release_status = "retracted"
-
-        pages = medline.find('MedlinePgn')
-        if pages:
-            pages = pages.string
-
-        title = medline.Article.ArticleTitle.string # always present
-        if title:
-            if title.endswith('.'):
-                title = title[:-1]
-            # this hides some "special" titles, but the vast majority are
-            # translations; translations don't always include the original_title
-            if title.startswith('[') and title.endswith(']'):
-                title = title[1:-1]
-        else:
-            # TODO: will filter out later
-            title = None
-
-        original_title = medline.Article.find("VernacularTitle", recurse=False)
-        if original_title:
-            original_title = original_title.string or None
-            if original_title and original_title.endswith('.'):
-                original_title = original_title[:-1]
-
-        # TODO: happening in alpha order, not handling multi-language well.
-        # also need to convert lang codes: https://www.nlm.nih.gov/bsd/language_table.html
-        language = medline.Article.Language
-        if language:
-            language = language.string
-            # TODO: map to two-letter
-            if language in ("und", "un"):
-                # "undetermined"
-                language = None
-
-        ### Journal/Issue Metadata
-        # MedlineJournalInfo is always present
-        container = dict()
-        container_extra = dict()
-        mji = medline.MedlineJournalInfo
-        if mji.find("Country"):
-            container_extra['country_name'] = mji.Country.string
-        if mji.find("ISSNLinking"):
-            container['issnl'] = mji.ISSNLinking.string
-
-        journal = medline.Article.Journal
-        issnp = journal.find("ISSN", IssnType="Print")
-        if issnp:
-            container_extra['issnp'] = issnp.string
-
-        pub_date = journal.PubDate
-        release_date = None
-        if pub_date.find("MedlineDate"):
-            release_year = int(pub_date.MedlineDate.string.split()[0][:4])
-        else:
-            release_year = int(pub_date.Year.string)
-            if pub_date.find("Day") and pub_date.find("Month"):
-                release_date = datetime.date(
-                    release_year,
-                    MONTH_ABBR_MAP[pub_date.Month.string],
-                    int(pub_date.Day.string))
-                release_date = release_date.isoformat()
-       
-        ji = journal.JournalIssue
-        volume = None
-        if ji.find("Volume"):
-            volume = ji.Volume.string
-        issue = None
-        if ji.find("Issue"):
-            issue = ji.Issue.string
-        if journal.find("Title"):
-            container['name'] = journal.Title.string
-
-        if extra_pubmed:
-            extra['pubmed'] = extra_pubmed
-        if not extra:
-            extra = None
-
-        ### Abstracts
-        # "All abstracts are in English"
-        abstracts = []
-        first_abstract = medline.find("AbstractText")
-        if first_abstract and first_abstract.get('NlmCategory'):
-            joined = "\n".join([m.get_text() for m in medline.find_all("AbstractText")])
-            abstracts.append(dict(
-                content=joined,
-                mimetype="text/plain",
-                lang="en",
-            ))
-        else:
-            for abstract in medline.find_all("AbstractText"):
-                abstracts.append(dict(
-                    content=abstract.get_text().strip(),
-                    mimetype="text/plain",
-                    lang="en",
-                ))
-                if abstract.find('math'):
-                    abstracts.append(dict(
-                        # strip the <AbstractText> tags
-                        content=str(abstract)[14:-15],
-                        mimetype="application/mathml+xml",
-                        lang="en",
-                    ))
-        if not abstracts:
-            abstracts = None
-
-        ### Contribs
-        contribs = []
-        if medline.AuthorList:
-            for author in medline.AuthorList.find_all("Author"):
-                contrib = dict(
-                    role="author",
-                )
-                if author.ForeName:
-                    contrib['raw_name'] = "{} {}".format(author.ForeName.string, author.LastName.string)
-                elif author.LastName:
-                    contrib['raw_name'] = author.LastName.string
-                contrib_extra = dict()
-                orcid = author.find("Identifier", Source="ORCID")
-                if orcid:
-                    # needs re-formatting from, eg, "0000000179841889"
-                    orcid = orcid.string
-                    if orcid.startswith("http://orcid.org/"):
-                        orcid = orcid.replace("http://orcid.org/", "")
-                    elif orcid.startswith("https://orcid.org/"):
-                        orcid = orcid.replace("https://orcid.org/", "")
-                    elif not '-' in orcid:
-                        orcid = "{}-{}-{}-{}".format(
-                            orcid[0:4],
-                            orcid[4:8],
-                            orcid[8:12],
-                            orcid[12:16],
-                        )
-                    contrib_extra['orcid'] = orcid
-                affiliation = author.find("Affiliation")
-                if affiliation:
-                    contrib['raw_affiliation'] = affiliation.string
-                if author.find("EqualContrib"):
-                    # TODO: schema for this?
-                    contrib_extra['equal_contrib'] = True
-                if contrib_extra:
-                    contrib['extra'] = contrib_extra
-                contribs.append(contrib)
-
-            if medline.AuthorList['CompleteYN'] == 'N':
-                contribs.append(dict(raw_name="et al."))
-        if not contribs:
-            contribs = None
-
-        ### References
-        refs = []
-        if pubmed.ReferenceList:
-            for ref in pubmed.ReferenceList.find_all('Reference'):
-                ref_obj = dict()
-                ref_extra = dict()
-                ref_pmid = ref.find("ArticleId", IdType="pubmed")
-                if ref_pmid:
-                    ref_extra['pmid'] = ref_pmid.string
-                ref_raw = ref.Citation
-                if ref_raw:
-                    ref_extra['raw'] = ref_raw.string
-                if ref_extra:
-                    ref_obj['extra'] = ref_extra
-                refs.append(ref_obj)
-        if not refs:
-            refs = None
-
-        re = dict(
-            work_id=None,
-            title=title,
-            original_title=original_title,
-            release_type=release_type,
-            release_status=release_status,
-            release_date=release_date,
-            release_year=release_year,
-            doi=doi,
-            pmid=int(medline.PMID.string), # always present
-            pmcid=pmcid,
-            #isbn13     # never in Article
-            volume=volume,
-            issue=issue,
-            pages=pages,
-            #publisher  # not included?
-            language=language,
-            #license_slug   # not in MEDLINE
-
-            # content, mimetype, lang
-            abstracts=abstracts,
-
-            # raw_name, role, raw_affiliation, extra
-            contribs=contribs,
-
-            # key, year, container_name, title, locator
-            # extra: volume, authors, issue, publisher, identifiers
-            refs=refs,
-
-            #   name, type, publisher, issnl
-            #   extra: issnp, issne, original_name, languages, country
-            container=container,
-
-            # extra:
-            #   withdrawn_date
-            #   translation_of
-            #   subtitle
-            #   aliases
-            #   container_name
-            #   group-title
-            #   pubmed: retraction refs
-            extra=extra,
-        )
-
-        return re
-
-if __name__=='__main__':
-    parser = PubMedParser()
-    parser.parse_file(open(sys.argv[1]))
diff --git a/python/tests/import_pubmed.py b/python/tests/import_pubmed.py
new file mode 100644
index 00000000..eacc3815
--- /dev/null
+++ b/python/tests/import_pubmed.py
@@ -0,0 +1,80 @@
+
+import json, gzip
+import pytest
+from fatcat_tools.importers import PubmedImporter, Bs4XmlFilePusher
+from fixtures import api
+from bs4 import BeautifulSoup
+
+
+@pytest.fixture(scope="function")
+def pubmed_importer(api):  # yields a PubmedImporter with bezerk_mode=True; the ISSN file stays open while in use
+    with open('tests/files/ISSN-to-ISSN-L.snip.txt') as issn_map:
+        yield PubmedImporter(api, issn_map, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=True)
+
+@pytest.fixture(scope="function")
+def pubmed_importer_existing(api):  # bezerk_mode=False variant; no test in this file requests it yet
+    with open('tests/files/ISSN-to-ISSN-L.snip.txt') as issn_map:
+        yield PubmedImporter(api, issn_map, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=False)
+
+def test_pubmed_importer(pubmed_importer):
+    """Import the sample XML twice: the bezerk-mode run inserts, the re-run reports 'exists'."""
+    index_before = pubmed_importer.api.get_changelog(limit=1)[0].index
+    with open('tests/files/pubmedsample_2019.xml', 'r') as xml_file:
+        pubmed_importer.bezerk_mode = True
+        first_pass = Bs4XmlFilePusher(pubmed_importer, xml_file, "PubmedArticle").run()
+    assert first_pass['insert'] == 1
+    assert first_pass['exists'] == 0
+    assert first_pass['skip'] == 0
+
+    # the changelog entry right after the starting index carries the editgroup checked below
+    editgroup = pubmed_importer.api.get_changelog_entry(index=index_before + 1).editgroup
+    assert editgroup.description
+    assert "pubmed" in editgroup.description.lower()
+    assert editgroup.extra['git_rev']
+    assert "fatcat_tools.PubmedImporter" in editgroup.extra['agent']
+
+    index_before = pubmed_importer.api.get_changelog(limit=1)[0].index
+    with open('tests/files/pubmedsample_2019.xml', 'r') as xml_file:
+        pubmed_importer.bezerk_mode = False
+        pubmed_importer.reset()
+        second_pass = Bs4XmlFilePusher(pubmed_importer, xml_file, "PubmedArticle").run()
+    assert second_pass['insert'] == 0
+    assert second_pass['exists'] == 1
+    assert second_pass['skip'] == 0
+    assert pubmed_importer.api.get_changelog(limit=1)[0].index == index_before
+
+def test_pubmed_xml_parse(pubmed_importer):
+    """Parse the first and last PubmedArticle of the sample XML and check the first record's fields."""
+    with open('tests/files/pubmedsample_2019.xml', 'r') as f:
+        soup = BeautifulSoup(f, "xml")
+        r1 = pubmed_importer.parse_record(soup.find_all("PubmedArticle")[0])
+        r2 = pubmed_importer.parse_record(soup.find_all("PubmedArticle")[-1])
+
+    assert r1.title == "Hospital debt management and cost reimbursement"
+    assert r1.subtitle is None
+    assert r1.original_title is None
+    assert r1.publisher is None
+    assert r1.release_type == "article-journal"
+    assert r1.release_stage == "published"
+    assert r1.license_slug is None
+    assert r1.ext_ids.doi is None
+    assert r1.ext_ids.pmid == "973217"
+    assert r1.language == "en"
+    assert r1.volume == "3"
+    assert r1.issue == "1"
+    assert r1.pages == "69-81"
+    assert r1.release_date is None  # not "1976-12-03", which is medline ingest date
+    assert r1.release_year == 1976
+    # NOTE(review): extra['container_name'] is not asserted; presumably it is omitted when the container is matched by ISSN
+    assert len(r1.contribs) == 1
+
+    assert r1.contribs[0].raw_name == "F R Blume"
+    assert r1.contribs[0].given_name == "F R"
+    assert r1.contribs[0].surname == "Blume"
+
+    print(r1.extra)
+    # TODO: assert r1.extra['pubmed']['mesh_topics'] == ['Accounting', 'Economics, Hospital', 'Hospital Administration']
+    assert r1.extra['pubmed']['pub_types'] == ['Journal Article']
+    assert not r1.refs
+
+    # XXX: r2 is parsed above (so parsing the last record must not raise) but has no assertions yet
-- 
cgit v1.2.3