import datetime
import json
import sys
import warnings
from typing import Any, Dict, List, Optional, Sequence

import fatcat_openapi_client
from bs4 import BeautifulSoup
from fatcat_openapi_client import ApiClient, ReleaseEntity

from fatcat_tools.normal import (
    LANG_MAP_MARC,
    clean_doi,
    clean_issn,
    clean_pmcid,
    clean_pmid,
    clean_str,
)

from .common import EntityImporter

# from: https://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.publication_types/?report=objectonly
PUBMED_RELEASE_TYPE_MAP = {
    # Adaptive Clinical Trial
    "Address": "speech",
    "Autobiography": "book",
    # Bibliography
    "Biography": "book",
    # Case Reports
    "Classical Article": "article-journal",
    # Clinical Conference
    # Clinical Study
    # Clinical Trial
    # Clinical Trial, Phase I
    # Clinical Trial, Phase II
    # Clinical Trial, Phase III
    # Clinical Trial, Phase IV
    # Clinical Trial Protocol
    # Clinical Trial, Veterinary
    # Collected Works
    # Comparative Study
    # Congress
    # Consensus Development Conference
    # Consensus Development Conference, NIH
    # Controlled Clinical Trial
    "Dataset": "dataset",
    # Dictionary
    # Directory
    # Duplicate Publication
    "Editorial": "editorial",
    # English Abstract  # doesn't indicate that this is abstract-only
    # Equivalence Trial
    # Evaluation Studies
    # Expression of Concern
    # Festschrift
    # Government Document
    # Guideline
    "Historical Article": "article-journal",
    # Interactive Tutorial
    "Interview": "interview",
    "Introductory Journal Article": "article-journal",
    "Journal Article": "article-journal",
    "Lecture": "speech",
    "Legal Case": "legal_case",
    "Legislation": "legislation",
    "Letter": "letter",
    # Meta-Analysis
    # Multicenter Study
    # News
    "Newspaper Article": "article-newspaper",
    # Observational Study
    # Observational Study, Veterinary
    # Overall
    # Patient Education Handout
    # Periodical Index
    # Personal Narrative
    # Portrait
    # Practice Guideline
    # Pragmatic Clinical Trial
    # Publication Components
    # Publication Formats
    # Publication Type Category
    # Randomized Controlled Trial
    # Research Support, American Recovery and Reinvestment Act
    # Research Support, N.I.H., Extramural
    # Research Support, N.I.H., Intramural
    # Research Support, Non-U.S. Gov't
    # Research Support, U.S. Gov't, Non-P.H.S.
    # Research Support, U.S. Gov't, P.H.S.
    # Review  # in the "literature review" sense, not "product review"
    # Scientific Integrity Review
    # Study Characteristics
    # Support of Research
    # Systematic Review
    "Technical Report": "report",
    # Twin Study
    # Validation Studies
    # Video-Audio Media
    # Webcasts
}
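
# For example, a citation typed "Journal Article" maps to release_type
# "article-journal", while the types commented out above (e.g. "Meta-Analysis",
# "Review") are still recorded in extra['pubmed']['pub_types'] but deliberately
# map to no release_type.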

MONTH_ABBR_MAP = {
    "Jan": 1,
    "01": 1,
    "Feb": 2,
    "02": 2,
    "Mar": 3,
    "03": 3,
    "Apr": 4,
    "04": 4,
    "May": 5,
    "05": 5,
    "Jun": 6,
    "06": 6,
    "Jul": 7,
    "07": 7,
    "Aug": 8,
    "08": 8,
    "Sep": 9,
    "09": 9,
    "Oct": 10,
    "10": 10,
    "Nov": 11,
    "11": 11,
    "Dec": 12,
    "12": 12,
}

# From: https://www.ncbi.nlm.nih.gov/books/NBK7249/
COUNTRY_NAME_MAP = {
    "Afghanistan": "af",
    "Albania": "al",
    "Algeria": "dz",
    "Andorra": "ad",
    "Angola": "ao",
    "Antigua and Barbuda": "ag",
    "Argentina": "ar",
    "Armenia": "am",
    "Australia": "au",
    "Austria": "at",
    "Azerbaijan": "az",
    "Bahamas": "bs",
    "Bahrain": "bh",
    "Bangladesh": "bd",
    "Barbados": "bb",
    "Belarus": "by",
    "Belgium": "be",
    "Belize": "bz",
    "Benin": "bj",
    "Bhutan": "bt",
    "Bolivia": "bo",
    "Bosnia and Herzegowina": "ba",
    "Botswana": "bw",
    "Brazil": "br",
    "Brunei Darussalam": "bn",
    "Bulgaria": "bg",
    "Burkina Faso": "bf",
    "Burundi": "bi",
    "Cambodia": "kh",
    "Cameroon": "cm",
    "Canada": "ca",
    "Cape Verde": "cv",
    "Central African Republic": "cf",
    "Chad": "td",
    "Chile": "cl",
    "China": "cn",
    "Colombia": "co",
    "Comoros": "km",
    "Congo, Democratic Republic": "cd",
    "Congo, People’s Republic": "cg",
    "Costa Rica": "cr",
    "Cote d'Ivoire": "ci",
    "Croatia (Local Name: Hrvatska)": "hr",
    "Cuba": "cu",
    "Cyprus": "cy",
    "Czech Republic": "cz",
    "Denmark": "dk",
    "Djibouti": "dj",
    "Dominica": "dm",
    "Dominican Republic": "do",
    "East Timor": "tl",
    "Ecuador": "ec",
    "El Salvador": "sv",
    "Equatorial Guinea": "gq",
    "Eritrea": "er",
    "Estonia": "ee",
    "Ethiopia": "et",
    "Fiji": "fj",
    "Finland": "fi",
    "France": "fr",
    "Gabon": "ga",
    "Gambia": "gm",
    "Georgia": "ge",
    "Germany": "de",
    "Ghana": "gh",
    "Greece": "gr",
    "Greenland": "gl",
    "Grenada": "gd",
    "Guatemala": "gt",
    "Guinea": "gn",
    "Guinea-Bissau": "gw",
    "Guyana": "gy",
    "Haiti": "ht",
    "Honduras": "hn",
    "Hong Kong": "hk",
    "Hungary": "hu",
    "Iceland": "is",
    "India": "in",
    "Indonesia": "id",
    "Iran": "ir",
    "Iraq": "iq",
    "Ireland": "ie",
    "Israel": "il",
    "Italy": "it",
    "Jamaica": "jm",
    "Japan": "jp",
    "Jordan": "jo",
    "Kazakhstan": "kz",
    "Kenya": "ke",
    "Kiribati": "ki",
    "Korea, Democratic People's Republic": "kp",
    "Korea, Republic": "kr",
    "Kuwait": "kw",
    "Kyrgyzstan": "kg",
    "Laos": "la",
    "Latvia": "lv",
    "Lebanon": "lb",
    "Lesotho": "ls",
    "Liberia": "lr",
    "Libya": "ly",
    "Liechtenstein": "li",
    "Lithuania": "lt",
    "Luxembourg": "lu",
    "Macedonia": "mk",
    "Madagascar": "mg",
    "Malawi": "mw",
    "Malaysia": "my",
    "Maldives": "mv",
    "Mali": "ml",
    "Malta": "mt",
    "Marshall Islands": "mh",
    "Mauritania": "mr",
    "Mauritius": "mu",
    "Mexico": "mx",
    "Micronesia": "fm",
    "Moldova": "md",
    "Monaco": "mc",
    "Mongolia": "mn",
    "Morocco": "ma",
    "Mozambique": "mz",
    "Myanmar": "mm",
    "Namibia": "na",
    "Nauru": "nr",
    "Nepal": "np",
    "Netherlands": "nl",
    "New Zealand": "nz",
    "Nicaragua": "ni",
    "Niger": "ne",
    "Nigeria": "ng",
    "Norway": "no",
    "Oman": "om",
    "Pakistan": "pk",
    "Palau": "pw",
    "Panama": "pa",
    "Papua New Guinea": "pg",
    "Paraguay": "py",
    "Peru": "pe",
    "Philippines": "ph",
    "Poland": "pl",
    "Portugal": "pt",
    "Puerto Rico": "pr",
    "Qatar": "qa",
    "Romania": "ro",
    "Russian Federation": "ru",
    "Rwanda": "rw",
    "Saint Kitts and Nevis": "kn",
    "Saint Lucia": "lc",
    "Saint Vincent and the Grenadines": "vc",
    "Samoa": "ws",
    "San Marino": "sm",
    "Sao Tome and Príncipe": "st",
    "Saudi Arabia": "sa",
    "Senegal": "sn",
    "Serbia and Montenegro": "cs",
    "Seychelles": "sc",
    "Sierra Leone": "sl",
    "Singapore": "sg",
    "Slovakia (Slovak Republic)": "sk",
    "Slovenia": "si",
    "Solomon Islands": "sb",
    "Somalia": "so",
    "South Africa": "za",
    "Spain": "es",
    "Sri Lanka": "lk",
    "Sudan": "sd",
    "Suriname": "sr",
    "Swaziland": "sz",
    "Sweden": "se",
    "Switzerland": "ch",
    "Syrian Arab Republic": "sy",
    "Taiwan": "tw",
    "Tajikistan": "tj",
    "Tanzania": "tz",
    "Thailand": "th",
    "Togo": "tg",
    "Tonga": "to",
    "Trinidad and Tobago": "tt",
    "Tunisia": "tn",
    "Turkey": "tr",
    "Turkmenistan": "tm",
    "Tuvalu": "tv",
    "Uganda": "ug",
    "Ukraine": "ua",
    "United Arab Emirates": "ae",
    "United Kingdom": "gb",
    "United States": "us",
    "Uruguay": "uy",
    # Additions from running over large files
    "Bosnia and Herzegovina": "ba",
    # "International"
    "China (Republic : 1949- )": "tw",  # pretty sure this is tw not cn
    "Russia (Federation)": "ru",
    "Scotland": "gb",
    "England": "gb",
    "Korea (South)": "kr",
    "Georgia (Republic)": "ge",
    "Egypt": "eg",
}
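
# Keys above are the free-text <Country> values as they appear in
# MedlineJournalInfo; values are ISO 3166-1 alpha-2 codes. Several historical
# or regional variants normalize to a single code (e.g. "United Kingdom",
# "Scotland", and "England" all map to "gb").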


class PubmedImporter(EntityImporter):
    """
    Importer for PubMed/MEDLINE XML metadata.

    If lookup_refs is true, will do identifier-based lookups for all
    references.

    TODO: MEDLINE doesn't include PMC/OA license; could include in importer?
    """

    def __init__(
        self, api: ApiClient, issn_map_file: Sequence, lookup_refs: bool = True, **kwargs
    ):
        eg_desc = kwargs.get(
            "editgroup_description", "Automated import of PubMed/MEDLINE XML metadata"
        )
        eg_extra = kwargs.get("editgroup_extra", dict())
        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.PubmedImporter")
        super().__init__(
            api,
            issn_map_file=issn_map_file,
            editgroup_description=eg_desc,
            editgroup_extra=eg_extra,
            **kwargs
        )

        self.lookup_refs = lookup_refs
        self.create_containers = kwargs.get("create_containers", True)
        self.read_issn_map_file(issn_map_file)

    def want(self, raw_record: BeautifulSoup) -> bool:
        return True

    # TODO: mypy annotations partially skipped on this function ('Any' instead
    # of 'BeautifulSoup') for now because XML parsing annotations are large
    # and complex
    def parse_record(self, a: Any) -> Optional[ReleaseEntity]:
        medline = a.MedlineCitation
        # PubmedData isn't required by the DTD, but seems to always be present
        pubmed = a.PubmedData
        extra = dict()
        extra_pubmed = dict()

        identifiers = pubmed.ArticleIdList
        pmid = medline.PMID.string.strip()
        doi = identifiers.find("ArticleId", IdType="doi")
        if doi and doi.string:
            doi = clean_doi(doi.string)
        else:
            doi = None

        pmcid = identifiers.find("ArticleId", IdType="pmc")
        if pmcid:
            pmcid = clean_pmcid(pmcid.string.strip().upper())

        release_type = None
        pub_types = []
        for pub_type in medline.Article.PublicationTypeList.find_all("PublicationType"):
            pub_types.append(pub_type.string)
            if pub_type.string in PUBMED_RELEASE_TYPE_MAP:
                release_type = PUBMED_RELEASE_TYPE_MAP[pub_type.string]
                break
        if pub_types:
            extra_pubmed["pub_types"] = pub_types
        if medline.Article.PublicationTypeList.find(string="Retraction of Publication"):
            release_type = "retraction"
            retraction_of = medline.find("CommentsCorrections", RefType="RetractionOf")
            if retraction_of:
                if retraction_of.RefSource:
                    extra_pubmed["retraction_of_raw"] = retraction_of.RefSource.string
                if retraction_of.PMID:
                    extra_pubmed["retraction_of_pmid"] = retraction_of.PMID.string

        # everything in medline is published
        release_stage = "published"
        if medline.Article.PublicationTypeList.find(string="Corrected and Republished Article"):
            release_stage = "updated"
        if medline.Article.PublicationTypeList.find(string="Retraction of Publication"):
            release_stage = "retraction"

        withdrawn_status = None
        if medline.Article.PublicationTypeList.find(string="Retracted Publication"):
            withdrawn_status = "retracted"
        elif medline.find("CommentsCorrections", RefType="ExpressionOfConcernIn"):
            withdrawn_status = "concern"
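
        # at this point release_stage and withdrawn_status capture any
        # retraction/correction/concern markers; e.g. a record typed both
        # "Journal Article" and "Retracted Publication" stays
        # release_stage="published" with withdrawn_status="retracted"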

        pages = medline.find("MedlinePgn")
        if pages:
            pages = pages.string

        title = medline.Article.ArticleTitle.get_text()  # always present
        if title:
            title = title.replace("\n", " ")
            if title.endswith("."):
                title = title[:-1]
            # this hides some "special" titles, but the vast majority are
            # translations; translations don't always include the original_title
            if title.startswith("[") and title.endswith("]"):
                title = title[1:-1]
        else:
            # will filter out later
            title = None

        original_title = medline.Article.find("VernacularTitle", recurse=False)
        if original_title:
            original_title = original_title.get_text() or None
            if original_title:
                original_title = original_title.replace("\n", " ")
            if original_title and original_title.endswith("."):
                original_title = original_title[:-1]

        if original_title and not title:
            # if we only have an "original" title, but not translated/english
            # title, sub in the original title so the entity can be created
            title = original_title
            original_title = None

        # TODO: happening in alpha order, not handling multi-language well.
        language = medline.Article.Language
        if language:
            language = language.get_text()
            if language in ("und", "un"):
                # "undetermined"
                language = None
            else:
                language = LANG_MAP_MARC.get(language)
                if not language and medline.Article.Language.get_text() not in LANG_MAP_MARC:
                    warnings.warn(
                        "MISSING MARC LANG: {}".format(medline.Article.Language.string)
                    )

        ### Journal/Issue Metadata
        # MedlineJournalInfo is always present
        issnl = None
        container_id = None
        container_name = None
        container_extra = dict()
        mji = medline.MedlineJournalInfo
        if mji.find("Country"):
            country_name = mji.Country.string.strip()
            country_code = COUNTRY_NAME_MAP.get(country_name)
            if country_code:
                container_extra["country"] = country_code
            elif country_name:
                container_extra["country_name"] = country_name
        if mji.find("ISSNLinking"):
            issnl = mji.ISSNLinking.string

        journal = medline.Article.Journal
        issnp = journal.find("ISSN", IssnType="Print")
        if issnp:
            issnp = clean_issn(issnp.string)
        else:
            issnp = None
        if not issnl and issnp:
            issnl = self.issn2issnl(issnp)
        if issnl:
            container_id = self.lookup_issnl(issnl)

        pub_date = medline.Article.find("ArticleDate")
        if not pub_date:
            pub_date = journal.PubDate
        if not pub_date:
            pub_date = journal.JournalIssue.PubDate
        release_date: Optional[str] = None
        release_year: Optional[int] = None
        if pub_date.Year:
            release_year = int(pub_date.Year.string)
            if pub_date.find("Day") and pub_date.find("Month"):
                try:
                    release_date_date = datetime.date(
                        release_year,
                        MONTH_ABBR_MAP[pub_date.Month.string],
                        int(pub_date.Day.string),
                    )
                    release_date = release_date_date.isoformat()
                except ValueError as ve:
                    print("bad date, skipping: {}".format(ve), file=sys.stderr)
                    release_date = None
        elif pub_date.MedlineDate:
            medline_date = pub_date.MedlineDate.string.strip()
            if len(medline_date) >= 4 and medline_date[:4].isdigit():
                release_year = int(medline_date[:4])
                if release_year < 1300 or release_year > 2040:
                    print(
                        "bad medline year, skipping: {}".format(release_year),
                        file=sys.stderr,
                    )
                    release_year = None
            else:
                print(
                    "unparsable medline date, skipping: {}".format(medline_date),
                    file=sys.stderr,
                )

        if journal.find("Title"):
            container_name = journal.Title.get_text()

        if (
            container_id is None
            and self.create_containers
            and (issnl is not None)
            and container_name
        ):
            # name, type, publisher, issnl
            # extra: original_name, languages, country
            ce = fatcat_openapi_client.ContainerEntity(
                name=container_name,
                container_type="journal",
                # NOTE: publisher not included
                issnl=issnl,
                issnp=issnp,
                extra=(container_extra or None),
            )
            ce_edit = self.create_container(ce)
            container_id = ce_edit.ident
            self._issnl_id_map[issnl] = container_id

        ji = journal.JournalIssue
        volume = None
        if ji.find("Volume"):
            volume = ji.Volume.string
        issue = None
        if ji.find("Issue"):
            issue = ji.Issue.string
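
        # note the date fallback chain above (ArticleDate, then Journal
        # PubDate, then JournalIssue PubDate); free-form <MedlineDate> values
        # like "1998 Dec-1999 Jan" only ever yield a release_year, never a
        # full release_date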

        ### Abstracts
        # "All abstracts are in English"
        abstracts = []
        primary_abstract = medline.find("Abstract")
        if primary_abstract and primary_abstract.AbstractText.get("NlmCategory"):
            joined = "\n".join(
                [m.get_text() for m in primary_abstract.find_all("AbstractText")]
            )
            abst = fatcat_openapi_client.ReleaseAbstract(
                content=joined,
                mimetype="text/plain",
                lang="en",
            )
            if abst.content:
                abstracts.append(abst)
        elif primary_abstract:
            for abstract in primary_abstract.find_all("AbstractText"):
                abst = fatcat_openapi_client.ReleaseAbstract(
                    content=abstract.get_text().strip(),
                    mimetype="text/plain",
                    lang="en",
                )
                if abst.content:
                    abstracts.append(abst)
                if abstract.find("math"):
                    abst = fatcat_openapi_client.ReleaseAbstract(
                        # strip the <AbstractText> tags
                        content=str(abstract)[14:-15],
                        mimetype="application/mathml+xml",
                        lang="en",
                    )
                    if abst.content:
                        abstracts.append(abst)
        other_abstracts = medline.find_all("OtherAbstract")
        for other in other_abstracts:
            lang: Optional[str] = "en"
            if other.get("Language"):
                lang = LANG_MAP_MARC.get(other["Language"])
            abst = fatcat_openapi_client.ReleaseAbstract(
                content=other.AbstractText.get_text().strip(),
                mimetype="text/plain",
                lang=lang,
            )
            if abst.content:
                abstracts.append(abst)

        ### Contribs
        contribs = []
        if medline.AuthorList:
            for author in medline.AuthorList.find_all("Author"):
                creator_id = None
                given_name = None
                surname = None
                raw_name = None
                if author.ForeName:
                    given_name = author.ForeName.get_text().replace("\n", " ")
                if author.LastName:
                    surname = author.LastName.get_text().replace("\n", " ")
                if given_name and surname:
                    raw_name = "{} {}".format(given_name, surname)
                elif surname:
                    raw_name = surname
                if not raw_name and author.CollectiveName and author.CollectiveName.get_text():
                    raw_name = author.CollectiveName.get_text().replace("\n", " ")
                contrib_extra = dict()
                orcid = author.find("Identifier", Source="ORCID")
                if orcid:
                    # needs re-formatting from, eg, "0000000179841889"
                    orcid = orcid.string
                    if orcid.startswith("http://orcid.org/"):
                        orcid = orcid.replace("http://orcid.org/", "")
                    elif orcid.startswith("https://orcid.org/"):
                        orcid = orcid.replace("https://orcid.org/", "")
                    elif "-" not in orcid:
                        orcid = "{}-{}-{}-{}".format(
                            orcid[0:4],
                            orcid[4:8],
                            orcid[8:12],
                            orcid[12:16],
                        )
                    creator_id = self.lookup_orcid(orcid)
                    contrib_extra["orcid"] = orcid
                affiliations = author.find_all("Affiliation")
                raw_affiliation = None
                if affiliations:
                    raw_affiliation = affiliations[0].get_text().replace("\n", " ")
                    if len(affiliations) > 1:
                        contrib_extra["more_affiliations"] = [
                            ra.get_text().replace("\n", " ") for ra in affiliations[1:]
                        ]
                if author.find("EqualContrib"):
                    # TODO: schema for this?
                    contrib_extra["equal"] = True
                contribs.append(
                    fatcat_openapi_client.ReleaseContrib(
                        raw_name=raw_name,
                        given_name=given_name,
                        surname=surname,
                        role="author",
                        raw_affiliation=raw_affiliation,
                        creator_id=creator_id,
                        extra=contrib_extra,
                    )
                )

            if medline.AuthorList["CompleteYN"] == "N":
                contribs.append(fatcat_openapi_client.ReleaseContrib(raw_name="et al."))

        for i, contrib in enumerate(contribs):
            if contrib.raw_name != "et al.":
                contrib.index = i
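
        # contribs get their document-order index here; the synthetic
        # "et al." entry appended for truncated author lists (CompleteYN="N")
        # is deliberately left un-indexed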

        ### References
        refs = []
        if pubmed.ReferenceList:
            # note that Reference always exists within a ReferenceList, but
            # that there may be multiple ReferenceList (eg, sometimes one per
            # Reference)
            for ref in pubmed.find_all("Reference"):
                ref_extra: Dict[str, Any] = dict()
                ref_doi = ref.find("ArticleId", IdType="doi")
                if ref_doi:
                    ref_doi = clean_doi(ref_doi.string)
                ref_pmid = ref.find("ArticleId", IdType="pubmed")
                if ref_pmid:
                    ref_pmid = clean_pmid(ref_pmid.string)
                ref_release_id = None
                if ref_doi:
                    ref_extra["doi"] = ref_doi
                    if self.lookup_refs:
                        ref_release_id = self.lookup_doi(ref_doi)
                if ref_pmid:
                    ref_extra["pmid"] = ref_pmid
                    if self.lookup_refs:
                        ref_release_id = self.lookup_pmid(ref_pmid)
                ref_raw = ref.Citation
                if ref_raw:
                    ref_extra["unstructured"] = ref_raw.get_text()
                refs.append(
                    fatcat_openapi_client.ReleaseRef(
                        target_release_id=ref_release_id,
                        extra=ref_extra or None,
                    )
                )

        # extra:
        #   translation_of
        #   aliases
        #   container_name
        #   group-title
        #   pubmed: retraction refs
        if extra_pubmed:
            extra["pubmed"] = extra_pubmed

        title = clean_str(title)
        if not title:
            return None

        re = fatcat_openapi_client.ReleaseEntity(
            work_id=None,
            title=title,
            original_title=clean_str(original_title),
            release_type=release_type,
            release_stage=release_stage,
            release_date=release_date,
            release_year=release_year,
            withdrawn_status=withdrawn_status,
            ext_ids=fatcat_openapi_client.ReleaseExtIds(
                doi=doi,
                pmid=pmid,
                pmcid=pmcid,
                # isbn13  # never in Article
            ),
            volume=volume,
            issue=issue,
            pages=pages,
            # publisher  # not included?
            language=language,
            # license_slug  # not in MEDLINE
            abstracts=abstracts or None,
            contribs=contribs or None,
            refs=refs or None,
            container_id=container_id,
            extra=extra or None,
        )
        return re
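
    # try_update() decision summary: no existing release -> insert; existing
    # release that already has this PMID (and refs) -> count as "exists" and
    # skip; existing release found only by DOI -> merge the richer PubMed
    # metadata into it in-place rather than inserting a duplicate.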

    def try_update(self, re: ReleaseEntity) -> bool:
        # first, lookup existing by PMID (which must be defined)
        existing = None
        try:
            existing = self.api.lookup_release(pmid=re.ext_ids.pmid)
        except fatcat_openapi_client.rest.ApiException as err:
            if err.status != 404:
                raise err

        # then try DOI lookup if there is one
        if not existing and re.ext_ids.doi:
            try:
                existing = self.api.lookup_release(doi=re.ext_ids.doi)
            except fatcat_openapi_client.rest.ApiException as err:
                if err.status != 404:
                    raise err

        if existing and existing.ext_ids.pmid and existing.ext_ids.pmid != re.ext_ids.pmid:
            warn_str = "PMID/DOI mismatch: release {}, pmid {} != {}".format(
                existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid
            )
            warnings.warn(warn_str)
            self.counts["warn-pmid-doi-mismatch"] += 1
            # don't clobber DOI, but do group together
            re.ext_ids.doi = None
            re.work_id = existing.work_id

        if existing and not self.do_updates:
            self.counts["exists"] += 1
            return False

        if existing and existing.ext_ids.pmid and (existing.refs or not re.refs):
            # TODO: any other reasons to do an update?
            # don't update if it already has PMID
            self.counts["exists"] += 1
            return False
        elif existing:
            # but do update if only DOI was set
            existing.ext_ids.doi = existing.ext_ids.doi or re.ext_ids.doi
            existing.ext_ids.pmid = existing.ext_ids.pmid or re.ext_ids.pmid
            existing.ext_ids.pmcid = existing.ext_ids.pmcid or re.ext_ids.pmcid

            existing.container_id = existing.container_id or re.container_id
            existing.refs = existing.refs or re.refs
            existing.abstracts = existing.abstracts or re.abstracts
            existing.extra["pubmed"] = re.extra["pubmed"]

            # fix stub titles
            if existing.title in [
                "OUP accepted manuscript",
            ]:
                existing.title = re.title

            existing.original_title = existing.original_title or re.original_title
            existing.release_type = existing.release_type or re.release_type
            existing.release_stage = existing.release_stage or re.release_stage
            existing.release_date = existing.release_date or re.release_date
            existing.release_year = existing.release_year or re.release_year
            existing.withdrawn_status = existing.withdrawn_status or re.withdrawn_status
            existing.volume = existing.volume or re.volume
            existing.issue = existing.issue or re.issue
            existing.pages = existing.pages or re.pages
            existing.language = existing.language or re.language

            # update subtitle in-place first
            if not existing.subtitle and existing.extra.get("subtitle"):
                subtitle = existing.extra.pop("subtitle")
                if isinstance(subtitle, list):
                    subtitle = subtitle[0]
                if subtitle:
                    existing.subtitle = subtitle
            if not existing.subtitle:
                existing.subtitle = re.subtitle

            try:
                self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
                self.counts["update"] += 1
            except fatcat_openapi_client.rest.ApiException as err:
                # there is a code path where we try to update the same release
                # twice in a row; if that happens, just skip
                # NOTE: API behavior might change in the future?
                if "release_edit_editgroup_id_ident_id_key" in err.body:
                    self.counts["skip-update-conflict"] += 1
                else:
                    raise err
            # updated (or skipped as a conflict); either way, not an insert
            return False

        return True

    def insert_batch(self, batch: List[ReleaseEntity]) -> None:
        self.api.create_release_auto_batch(
            fatcat_openapi_client.ReleaseAutoBatch(
                editgroup=fatcat_openapi_client.Editgroup(
                    description=self.editgroup_description, extra=self.editgroup_extra
                ),
                entity_list=batch,
            )
        )

    def parse_file(self, handle: Any) -> None:
        # 1. open with beautiful soup
        soup = BeautifulSoup(handle, "xml")

        # 2. iterate over articles, call parse_record on each
        for article in soup.find_all("PubmedArticle"):
            resp = self.parse_record(article)
            # entities aren't directly JSON-serializable; dump the dict form
            # (parse_record returns None for records with no usable title)
            if resp:
                print(json.dumps(resp.to_dict(), default=str))
            # sys.exit(-1)
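
# A minimal usage sketch (illustrative only; the API client configuration,
# ISSN-L map file, and XML sample paths below are hypothetical):
#
#     api = fatcat_openapi_client.ApiClient()
#     with open("ISSN-to-ISSN-L.txt") as issn_file:
#         importer = PubmedImporter(api, issn_map_file=issn_file, lookup_refs=False)
#     with open("medline_sample.xml") as xml_file:
#         importer.parse_file(xml_file)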