Diffstat (limited to 'python/fatcat_tools/transforms')
-rw-r--r--   python/fatcat_tools/transforms/elasticsearch.py   318
-rw-r--r--   python/fatcat_tools/transforms/ingest.py            12
2 files changed, 193 insertions, 137 deletions
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index dfb5f799..ad4b7722 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -1,10 +1,13 @@
 import datetime
+from typing import Optional

 import tldextract

+from fatcat_openapi_client import ReleaseEntity, ContainerEntity

-def check_kbart(year, archive):
+
+def check_kbart(year: int, archive: dict) -> Optional[bool]:
     if not archive or not archive.get('year_spans'):
         return None
     for span in archive['year_spans']:
@@ -12,7 +15,7 @@ def check_kbart(year, archive):
             return True
     return False

-def test_check_kbart():
+def test_check_kbart() -> None:
     assert check_kbart(1990, dict()) is None
     assert check_kbart(1990, dict(year_spans=[[2000, 2000]])) is False
@@ -21,10 +24,13 @@ def test_check_kbart():
     assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1930, 2000]])) is True


-def release_to_elasticsearch(entity, force_bool=True):
+def release_to_elasticsearch(entity: ReleaseEntity, force_bool: bool = True) -> dict:
     """
     Converts from an entity model/schema to elasticsearch oriented schema.

+    This is a large/complex transform, so subsets are split out into helper
+    functions.
+
     Returns: dict
     Raises exception on error (never returns None)
     """
@@ -68,16 +74,18 @@ def release_to_elasticsearch(entity, force_bool=True):
         mag_id = release.ext_ids.mag,
     )

-    is_oa = None
-    is_preserved = None
-    is_longtail_oa = None
-    in_kbart = None
-    in_jstor = False
-    in_web = False
-    in_dweb = False
-    in_ia = False
-    in_ia_sim = False
-    in_shadows = False
+    t.update(dict(
+        is_oa = None,
+        is_longtail_oa = None,
+        is_preserved = None,
+        in_web = False,
+        in_dweb = False,
+        in_ia = False,
+        in_ia_sim = False,
+        in_kbart = None,
+        in_jstor = False,
+        in_shadows = False,
+    ))

     release_year = release.release_year
     if release.release_date:
@@ -116,55 +124,8 @@ def release_to_elasticsearch(entity, force_bool=True):
     # TODO: mapping... probably by lookup?
     t['affiliation_rors'] = None

-    this_year = datetime.date.today().year
-    container = release.container
-    if container:
-        t['publisher'] = container.publisher
-        t['container_name'] = container.name
-        # this is container.ident, not release.container_id, because there may
-        # be a redirect involved
-        t['container_id'] = container.ident
-        t['container_issnl'] = container.issnl
-        t['container_type'] = container.container_type
-        if container.extra:
-            c_extra = container.extra
-            if c_extra.get('kbart') and release_year:
-                in_jstor = check_kbart(release_year, c_extra['kbart'].get('jstor'))
-                in_kbart = in_jstor
-                for archive in ('portico', 'lockss', 'clockss', 'pkp_pln',
-                        'hathitrust', 'scholarsportal', 'cariniana'):
-                    in_kbart = in_kbart or check_kbart(release_year, c_extra['kbart'].get(archive))
-                    # recent KBART coverage is often not updated for the
-                    # current year. So for current-year publications, consider
-                    # coverage from *last* year to also be included in the
-                    # Keeper
-                    if not in_kbart and release_year == this_year:
-                        in_kbart = check_kbart(this_year - 1, c_extra['kbart'].get(archive))
-
-            if c_extra.get('ia'):
-                if c_extra['ia'].get('sim') and release_year:
-                    in_ia_sim = check_kbart(release_year, c_extra['ia']['sim'])
-                if c_extra['ia'].get('longtail_oa'):
-                    is_longtail_oa = True
-            if c_extra.get('sherpa_romeo'):
-                if c_extra['sherpa_romeo'].get('color') == 'white':
-                    is_oa = False
-            if c_extra.get('default_license') and c_extra.get('default_license').startswith('CC-'):
-                is_oa = True
-            if c_extra.get('doaj'):
-                if c_extra['doaj'].get('as_of'):
-                    is_oa = True
-            if c_extra.get('road'):
-                if c_extra['road'].get('as_of'):
-                    is_oa = True
-            if c_extra.get('szczepanski'):
-                if c_extra['szczepanski'].get('as_of'):
-                    is_oa = True
-            if c_extra.get('country'):
-                t['country_code'] = c_extra['country']
-                t['country_code_upper'] = c_extra['country'].upper()
-            if c_extra.get('publisher_type'):
-                t['publisher_type'] = c_extra['publisher_type']
+    if release.container:
+        t.update(_rte_container_helper(release.container, release_year))

     # fall back to release-level container metadata if container not linked or
     # missing context
@@ -174,67 +135,36 @@ def release_to_elasticsearch(entity, force_bool=True):
         t['container_name'] = release.extra.get('container_name')

     if release.ext_ids.jstor or (release.ext_ids.doi and release.ext_ids.doi.startswith('10.2307/')):
-        in_jstor = True
+        t['in_jstor'] = True

-    files = release.files or []
-    t['file_count'] = len(files)
-    t['fileset_count'] = len(release.filesets or [])
-    t['webcapture_count'] = len(release.webcaptures or [])
-    any_pdf_url = None
-    good_pdf_url = None
-    best_pdf_url = None
-    ia_pdf_url = None
-    for f in files:
-        if f.extra and f.extra.get('shadows'):
-            # TODO: shadow check goes here
-            in_shadows = True
-        is_pdf = 'pdf' in (f.mimetype or '')
-        for release_url in (f.urls or []):
-            if not f.mimetype and 'pdf' in release_url.url.lower():
-                is_pdf = True
-            if release_url.url.lower().startswith('http'):
-                in_web = True
-            if release_url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'):
-                # not sure what rel will be for this stuff
-                in_dweb = True
-            if is_pdf:
-                any_pdf_url = release_url.url
-            if is_pdf and release_url.rel in ('webarchive', 'repository') and is_pdf:
-                is_preserved = True
-                good_pdf_url = release_url.url
-            if '//www.jstor.org/' in release_url.url:
-                in_jstor = True
-            if '//web.archive.org/' in release_url.url or '//archive.org/' in release_url.url:
-                in_ia = True
-                if is_pdf:
-                    best_pdf_url = release_url.url
-                    ia_pdf_url = release_url.url
-    # here is where we bake-in priority; IA-specific
-    t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url
-    t['ia_pdf_url'] = ia_pdf_url
+    # transform file/fileset/webcapture related fields
+    t.update(_rte_content_helper(release))
+
+    if release.ext_ids.doaj:
+        t['is_oa'] = True

     if release.license_slug:
         # TODO: more/better checks here, particularly strict *not* OA licenses
         if release.license_slug.startswith("CC-"):
-            is_oa = True
+            t['is_oa'] = True
         if release.license_slug.startswith("ARXIV-"):
-            is_oa = True
+            t['is_oa'] = True

     extra = release.extra or dict()
     if extra:
         if extra.get('is_oa'):
             # NOTE: not actually setting this anywhere... but could
-            is_oa = True
+            t['is_oa'] = True
         if extra.get('longtail_oa'):
             # sometimes set by GROBID/matcher
-            is_oa = True
-            is_longtail_oa = True
+            t['is_oa'] = True
+            t['is_longtail_oa'] = True
         if not t.get('container_name'):
             t['container_name'] = extra.get('container_name')
         if extra.get('crossref'):
             if extra['crossref'].get('archive'):
                 # all crossref archives are KBART, I believe
-                in_kbart = True
+                t['in_kbart'] = True
         # backwards compatible subtitle fetching
         if not t['subtitle'] and extra.get('subtitle'):
             if type(extra['subtitle']) == list:
@@ -251,7 +181,7 @@ def release_to_elasticsearch(entity, force_bool=True):
     # TODO: non-numerical first pages

     t['ia_microfilm_url'] = None
-    if in_ia_sim:
+    if t['in_ia_sim']:
         # TODO: determine URL somehow? I think this is in flux. Will probably
         # need extra metadata in the container extra field.
         # special case as a demo for now.
@@ -277,42 +207,168 @@
     if t['doi']:
         t['doi_prefix'] = t['doi'].split('/')[0]

-    if is_longtail_oa:
-        is_oa = True
+    if t['is_longtail_oa']:
+        t['is_oa'] = True

+    # optionally coerce all flags from Optional[bool] to bool
     if force_bool:
-        t['is_oa'] = bool(is_oa)
-        t['is_longtail_oa'] = bool(is_longtail_oa)
-        t['in_kbart'] = bool(in_kbart)
-        t['in_ia_sim'] = bool(in_ia_sim)
-        t['in_jstor'] = bool(in_jstor)
-        t['in_web'] = bool(in_web)
-        t['in_dweb'] = bool(in_dweb)
-        t['in_shadows'] = bool(in_shadows)
-    else:
-        t['is_oa'] = is_oa
-        t['is_longtail_oa'] = is_longtail_oa
-        t['in_kbart'] = in_kbart
-        t['in_ia_sim'] = in_ia_sim
-        t['in_jstor'] = in_jstor
-        t['in_web'] = in_web
-        t['in_dweb'] = in_dweb
-        t['in_shadows'] = in_shadows
-
-    t['in_ia'] = bool(in_ia)
-    t['is_preserved'] = bool(is_preserved or in_ia or in_kbart or in_jstor or t.get('pmcid') or t.get('arxiv_id'))
+        for k in ('is_oa', 'is_longtail_oa', 'in_kbart', 'in_ia_sim',
+                'in_jstor', 'in_web', 'in_dweb', 'in_shadows'):
+            t[k] = bool(t[k])
+
+    t['in_ia'] = bool(t['in_ia'])
+    t['is_preserved'] = (
+        bool(t['is_preserved'])
+        or t['in_ia']
+        or t['in_kbart']
+        or t['in_jstor']
+        or t.get('pmcid')
+        or t.get('arxiv_id')
+    )

-    if in_ia:
+    if t['in_ia']:
         t['preservation'] = 'bright'
-    elif in_kbart or in_jstor or t.get('pmcid') or t.get('arxiv_id'):
+    elif t['is_preserved']:
         t['preservation'] = 'dark'
-    elif in_shadows:
+    elif t['in_shadows']:
         t['preservation'] = 'shadows_only'
     else:
         t['preservation'] = 'none'

     return t

+def _rte_container_helper(container: ContainerEntity, release_year: Optional[int]) -> dict:
+    """
+    Container metadata sub-section of release_to_elasticsearch()
+    """
+    this_year = datetime.date.today().year
+    t = dict()
+    t['publisher'] = container.publisher
+    t['container_name'] = container.name
+    # this is container.ident, not release.container_id, because there may
+    # be a redirect involved
+    t['container_id'] = container.ident
+    t['container_issnl'] = container.issnl
+    t['container_type'] = container.container_type
+    if container.extra:
+        c_extra = container.extra
+        if c_extra.get('kbart') and release_year:
+            if check_kbart(release_year, c_extra['kbart'].get('jstor')):
+                t['in_jstor'] = True
+            if t.get('in_kbart') or t.get('in_jstor'):
+                t['in_kbart'] = True
+            for archive in ('portico', 'lockss', 'clockss', 'pkp_pln',
+                    'hathitrust', 'scholarsportal', 'cariniana'):
+                t['in_kbart'] = t.get('in_kbart') or check_kbart(release_year, c_extra['kbart'].get(archive))
+                # recent KBART coverage is often not updated for the
+                # current year. So for current-year publications, consider
+                # coverage from *last* year to also be included in the
+                # Keeper
+                if not t.get('in_kbart') and release_year == this_year:
+                    t['in_kbart'] = check_kbart(this_year - 1, c_extra['kbart'].get(archive))
+
+        if c_extra.get('ia'):
+            if c_extra['ia'].get('sim') and release_year:
+                t['in_ia_sim'] = check_kbart(release_year, c_extra['ia']['sim'])
+            if c_extra['ia'].get('longtail_oa'):
+                t['is_longtail_oa'] = True
+        if c_extra.get('sherpa_romeo'):
+            if c_extra['sherpa_romeo'].get('color') == 'white':
+                t['is_oa'] = False
+        if c_extra.get('default_license') and c_extra.get('default_license').startswith('CC-'):
+            t['is_oa'] = True
+        if c_extra.get('doaj'):
+            if c_extra['doaj'].get('as_of'):
+                t['is_oa'] = True
+        if c_extra.get('road'):
+            if c_extra['road'].get('as_of'):
+                t['is_oa'] = True
+        if c_extra.get('szczepanski'):
+            if c_extra['szczepanski'].get('as_of'):
+                t['is_oa'] = True
+        if c_extra.get('country'):
+            t['country_code'] = c_extra['country']
+            t['country_code_upper'] = c_extra['country'].upper()
+        if c_extra.get('publisher_type'):
+            t['publisher_type'] = c_extra['publisher_type']
+    return t
+
+def _rte_content_helper(release: ReleaseEntity) -> dict:
+    """
+    File/FileSet/WebCapture sub-section of release_to_elasticsearch()
+
+    The current priority order for "best_pdf_url" is:
+    - internet archive urls (archive.org or web.archive.org)
+    - other webarchive or repository URLs
+    - any other URL
+    """
+    t = dict(
+        file_count = len(release.files or []),
+        fileset_count = len(release.filesets or []),
+        webcapture_count = len(release.webcaptures or []),
+    )
+
+    any_pdf_url = None
+    good_pdf_url = None
+    best_pdf_url = None
+    ia_pdf_url = None
+
+    for f in release.files or []:
+        if f.extra and f.extra.get('shadows'):
+            t['in_shadows'] = True
+        is_pdf = 'pdf' in (f.mimetype or '')
+        for release_url in (f.urls or []):
+            # first generic flags
+            t.update(_rte_url_helper(release_url))
+
+            # then PDF specific stuff (for generating "best URL" fields)
+            if not f.mimetype and 'pdf' in release_url.url.lower():
+                is_pdf = True
+            if is_pdf:
+                any_pdf_url = release_url.url
+                if release_url.rel in ('webarchive', 'repository', 'repo'):
+                    good_pdf_url = release_url.url
+                if '//web.archive.org/' in release_url.url or '//archive.org/' in release_url.url:
+                    best_pdf_url = release_url.url
+                    ia_pdf_url = release_url.url
+
+    # here is where we bake-in PDF url priority; IA-specific
+    t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url
+    t['ia_pdf_url'] = ia_pdf_url
+
+    for fs in release.filesets or []:
+        for url_obj in (fs.urls or []):
+            t.update(_rte_url_helper(url_obj))
+
+    for wc in release.webcaptures or []:
+        for url_obj in (wc.archive_urls or []):
+            t.update(_rte_url_helper(url_obj))
+
+    return t
+
+def _rte_url_helper(url_obj) -> dict:
+    """
+    Takes a location URL ('url' and 'rel' keys) and returns generic preservation status.
+
+    Designed to work with file, webcapture, or fileset URLs.
+
+    Returns a dict; should *not* include non-True values for any keys because
+    these will be iteratively update() into the overall object.
+    """
+    t = dict()
+    if url_obj.rel in ('webarchive', 'repository', 'archive', 'repo'):
+        t['is_preserved'] = True
+    if '//web.archive.org/' in url_obj.url or '//archive.org/' in url_obj.url:
+        t['in_ia'] = True
+    if url_obj.url.lower().startswith('http') or url_obj.url.lower().startswith('ftp'):
+        t['in_web'] = True
+    if url_obj.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'):
+        # not sure what rel will be for this stuff
+        t['in_dweb'] = True
+    if '//www.jstor.org/' in url_obj.url:
+        t['in_jstor'] = True
+    return t
+
 def container_to_elasticsearch(entity, force_bool=True):
     """
diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py
index 2f4e2271..59831017 100644
--- a/python/fatcat_tools/transforms/ingest.py
+++ b/python/fatcat_tools/transforms/ingest.py
@@ -15,15 +15,19 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type=
     if release.state != 'active':
         return None

+    # TODO: infer ingest type based on release_type or container metadata?
+    if not ingest_type:
+        ingest_type = 'pdf'
+
     # generate a URL where we expect to find fulltext
     url = None
     link_source = None
     link_source_id = None
-    if release.ext_ids.arxiv:
+    if release.ext_ids.arxiv and ingest_type == "pdf":
         url = "https://arxiv.org/pdf/{}.pdf".format(release.ext_ids.arxiv)
         link_source = "arxiv"
         link_source_id = release.ext_ids.arxiv
-    elif release.ext_ids.pmcid:
+    elif release.ext_ids.pmcid and ingest_type == "pdf":
         # TODO: how to tell if an author manuscript in PMC vs. published?
         #url = "https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf/".format(release.ext_ids.pmcid)
         url = "http://europepmc.org/backend/ptpmcrender.fcgi?accid={}&blobtype=pdf".format(release.ext_ids.pmcid)
@@ -40,10 +44,6 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type=
     ext_ids = release.ext_ids.to_dict()
     ext_ids = dict([(k, v) for (k, v) in ext_ids.items() if v])

-    # TODO: infer ingest type based on release_type or container metadata?
-    if not ingest_type:
-        ingest_type = 'pdf'
-
     ingest_request = {
         'ingest_type': ingest_type,
         'ingest_request_source': ingest_request_source,