Merge branch 'bnewbold-es-transform-html' into 'master'

Elasticsearch release transform updates: handle webcaptures better, and refactoring. See merge request webgroup/fatcat!91
author: Martin Czygan <martin@archive.org> 2020-12-17 18:10:36 +0000
committer: Martin Czygan <martin@archive.org> 2020-12-17 18:10:36 +0000
commit: 37508f3d7cbbbb2cfdf6bcb5bd50a00cf3b0311f (patch)
tree: d4de1d1d529776205568f55ad7f724e398e442c9 /python
parent: f7a75a019c9dee35542e6f92ec37937df36ff756 (diff)
parent: f60ba0ea04081ac0095c12d8ecbaa48b3da74aee (diff)
download: fatcat-37508f3d7cbbbb2cfdf6bcb5bd50a00cf3b0311f.tar.gz
fatcat-37508f3d7cbbbb2cfdf6bcb5bd50a00cf3b0311f.zip
5 files changed, 296 insertions, 146 deletions
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 96a5b96b..ad4b7722 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -1,10 +1,13 @@
 
 import datetime
+from typing import Optional
 
 import tldextract
 
+from fatcat_openapi_client import ReleaseEntity, ContainerEntity
 
-def check_kbart(year, archive):
+
+def check_kbart(year: int, archive: dict) -> Optional[bool]:
     if not archive or not archive.get('year_spans'):
         return None
     for span in archive['year_spans']:
@@ -12,7 +15,7 @@ def check_kbart(year, archive):
             return True
     return False
 
-def test_check_kbart():
+def test_check_kbart() -> None:
 
     assert check_kbart(1990, dict()) is None
     assert check_kbart(1990, dict(year_spans=[[2000, 2000]])) is False
@@ -21,10 +24,13 @@ def test_check_kbart():
     assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1930, 2000]])) is True
 
 
-def release_to_elasticsearch(entity, force_bool=True):
+def release_to_elasticsearch(entity: ReleaseEntity, force_bool: bool = True) -> dict:
     """
     Converts from an entity model/schema to elasticsearch oriented schema.
 
+    This is a large/complex transform, so subsets are split out into helper
+    functions.
+
     Returns: dict
     Raises exception on error (never returns None)
     """
@@ -68,16 +74,18 @@ def release_to_elasticsearch(entity, force_bool=True):
         mag_id = release.ext_ids.mag,
     )
 
-    is_oa = None
-    is_preserved = None
-    is_longtail_oa = None
-    in_kbart = None
-    in_jstor = False
-    in_web = False
-    in_dweb = False
-    in_ia = False
-    in_ia_sim = False
-    in_shadows = False
+    t.update(dict(
+        is_oa = None,
+        is_longtail_oa = None,
+        is_preserved = None,
+        in_web = False,
+        in_dweb = False,
+        in_ia = False,
+        in_ia_sim = False,
+        in_kbart = None,
+        in_jstor = False,
+        in_shadows = False,
+    ))
 
     release_year = release.release_year
     if release.release_date:
@@ -116,55 +124,8 @@ def release_to_elasticsearch(entity, force_bool=True):
     # TODO: mapping... probably by lookup?
     t['affiliation_rors'] = None
 
-    this_year = datetime.date.today().year
-    container = release.container
-    if container:
-        t['publisher'] = container.publisher
-        t['container_name'] = container.name
-        # this is container.ident, not release.container_id, because there may
-        # be a redirect involved
-        t['container_id'] = container.ident
-        t['container_issnl'] = container.issnl
-        t['container_type'] = container.container_type
-        if container.extra:
-            c_extra = container.extra
-            if c_extra.get('kbart') and release_year:
-                in_jstor = check_kbart(release_year, c_extra['kbart'].get('jstor'))
-                in_kbart = in_jstor
-                for archive in ('portico', 'lockss', 'clockss', 'pkp_pln',
-                                'hathitrust', 'scholarsportal', 'cariniana'):
-                    in_kbart = in_kbart or check_kbart(release_year, c_extra['kbart'].get(archive))
-                    # recent KBART coverage is often not updated for the
-                    # current year. So for current-year publications, consider
-                    # coverage from *last* year to also be included in the
-                    # Keeper
-                    if not in_kbart and release_year == this_year:
-                        in_kbart = check_kbart(this_year - 1, c_extra['kbart'].get(archive))
-
-            if c_extra.get('ia'):
-                if c_extra['ia'].get('sim') and release_year:
-                    in_ia_sim = check_kbart(release_year, c_extra['ia']['sim'])
-                if c_extra['ia'].get('longtail_oa'):
-                    is_longtail_oa = True
-            if c_extra.get('sherpa_romeo'):
-                if c_extra['sherpa_romeo'].get('color') == 'white':
-                    is_oa = False
-            if c_extra.get('default_license') and c_extra.get('default_license').startswith('CC-'):
-                is_oa = True
-            if c_extra.get('doaj'):
-                if c_extra['doaj'].get('as_of'):
-                    is_oa = True
-            if c_extra.get('road'):
-                if c_extra['road'].get('as_of'):
-                    is_oa = True
-            if c_extra.get('szczepanski'):
-                if c_extra['szczepanski'].get('as_of'):
-                    is_oa = True
-            if c_extra.get('country'):
-                t['country_code'] = c_extra['country']
-                t['country_code_upper'] = c_extra['country'].upper()
-            if c_extra.get('publisher_type'):
-                t['publisher_type'] = c_extra['publisher_type']
+    if release.container:
+        t.update(_rte_container_helper(release.container, release_year))
 
     # fall back to release-level container metadata if container not linked or
     # missing context
@@ -174,70 +135,36 @@ def release_to_elasticsearch(entity, force_bool=True):
         t['container_name'] = release.extra.get('container_name')
 
     if release.ext_ids.jstor or (release.ext_ids.doi and release.ext_ids.doi.startswith('10.2307/')):
-        in_jstor = True
+        t['in_jstor'] = True
 
-    files = release.files or []
-    t['file_count'] = len(files)
-    t['fileset_count'] = len(release.filesets or [])
-    t['webcapture_count'] = len(release.webcaptures or [])
-    any_pdf_url = None
-    good_pdf_url = None
-    best_pdf_url = None
-    ia_pdf_url = None
-    for f in files:
-        if f.extra and f.extra.get('shadows'):
-            # TODO: shadow check goes here
-            in_shadows = True
-        is_pdf = 'pdf' in (f.mimetype or '')
-        for release_url in (f.urls or []):
-            if not f.mimetype and 'pdf' in release_url.url.lower():
-                is_pdf = True
-            if release_url.url.lower().startswith('http'):
-                in_web = True
-            if release_url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'):
-                # not sure what rel will be for this stuff
-                in_dweb = True
-            if is_pdf:
-                any_pdf_url = release_url.url
-            if is_pdf and release_url.rel in ('webarchive', 'repository') and is_pdf:
-                is_preserved = True
-                good_pdf_url = release_url.url
-            if '//www.jstor.org/' in release_url.url:
-                in_jstor = True
-            if '//web.archive.org/' in release_url.url or '//archive.org/' in release_url.url:
-                in_ia = True
-                if is_pdf:
-                    best_pdf_url = release_url.url
-                    ia_pdf_url = release_url.url
-    # here is where we bake-in priority; IA-specific
-    t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url
-    t['ia_pdf_url'] = ia_pdf_url
+    # transform file/fileset/webcapture related fields
+    t.update(_rte_content_helper(release))
 
     if release.ext_ids.doaj:
-        is_oa = True
+        t['is_oa'] = True
 
     if release.license_slug:
         # TODO: more/better checks here, particularly strict *not* OA licenses
         if release.license_slug.startswith("CC-"):
-            is_oa = True
+            t['is_oa'] = True
         if release.license_slug.startswith("ARXIV-"):
-            is_oa = True
+            t['is_oa'] = True
 
     extra = release.extra or dict()
     if extra:
         if extra.get('is_oa'):
             # NOTE: not actually setting this anywhere... but could
-            is_oa = True
+            t['is_oa'] = True
         if extra.get('longtail_oa'):
             # sometimes set by GROBID/matcher
-            is_oa = True
-            is_longtail_oa = True
+            t['is_oa'] = True
+            t['is_longtail_oa'] = True
         if not t.get('container_name'):
             t['container_name'] = extra.get('container_name')
         if extra.get('crossref'):
             if extra['crossref'].get('archive'):
                 # all crossref archives are KBART, I believe
-                in_kbart = True
+                t['in_kbart'] = True
         # backwards compatible subtitle fetching
         if not t['subtitle'] and extra.get('subtitle'):
             if type(extra['subtitle']) == list:
@@ -254,7 +181,7 @@ def release_to_elasticsearch(entity, force_bool=True):
         # TODO: non-numerical first pages
 
     t['ia_microfilm_url'] = None
-    if in_ia_sim:
+    if t['in_ia_sim']:
         # TODO: determine URL somehow? I think this is in flux. Will probably
         # need extra metadata in the container extra field.
         # special case as a demo for now.
@@ -280,42 +207,168 @@ def release_to_elasticsearch(entity, force_bool=True):
     if t['doi']:
         t['doi_prefix'] = t['doi'].split('/')[0]
 
-    if is_longtail_oa:
-        is_oa = True
+    if t['is_longtail_oa']:
+        t['is_oa'] = True
 
+    # optionally coerce all flags from Optional[bool] to bool
     if force_bool:
-        t['is_oa'] = bool(is_oa)
-        t['is_longtail_oa'] = bool(is_longtail_oa)
-        t['in_kbart'] = bool(in_kbart)
-        t['in_ia_sim'] = bool(in_ia_sim)
-        t['in_jstor'] = bool(in_jstor)
-        t['in_web'] = bool(in_web)
-        t['in_dweb'] = bool(in_dweb)
-        t['in_shadows'] = bool(in_shadows)
-    else:
-        t['is_oa'] = is_oa
-        t['is_longtail_oa'] = is_longtail_oa
-        t['in_kbart'] = in_kbart
-        t['in_ia_sim'] = in_ia_sim
-        t['in_jstor'] = in_jstor
-        t['in_web'] = in_web
-        t['in_dweb'] = in_dweb
-        t['in_shadows'] = in_shadows
-
-    t['in_ia'] = bool(in_ia)
-    t['is_preserved'] = bool(is_preserved or in_ia or in_kbart or in_jstor or t.get('pmcid') or t.get('arxiv_id'))
+        for k in ('is_oa', 'is_longtail_oa', 'in_kbart', 'in_ia_sim',
+                  'in_jstor', 'in_web', 'in_dweb', 'in_shadows'):
+            t[k] = bool(t[k])
+
+    t['in_ia'] = bool(t['in_ia'])
+    t['is_preserved'] = (
+        bool(t['is_preserved'])
+        or t['in_ia']
+        or t['in_kbart']
+        or t['in_jstor']
+        or t.get('pmcid')
+        or t.get('arxiv_id')
+    )
 
-    if in_ia:
+    if t['in_ia']:
         t['preservation'] = 'bright'
-    elif in_kbart or in_jstor or t.get('pmcid') or t.get('arxiv_id'):
+    elif t['is_preserved']:
         t['preservation'] = 'dark'
-    elif in_shadows:
+    elif t['in_shadows']:
         t['preservation'] = 'shadows_only'
     else:
         t['preservation'] = 'none'
 
     return t
 
+def _rte_container_helper(container: ContainerEntity, release_year: Optional[int]) -> dict:
+    """
+    Container metadata sub-section of release_to_elasticsearch()
+
+    Returns a dict of container fields and flags, meant to be update()-ed into
+    the release document. Flag keys (in_kbart, in_jstor, in_ia_sim, is_oa,
+    is_longtail_oa) are only set when the relevant container 'extra' metadata
+    is present.
+    """
+    this_year = datetime.date.today().year
+    t = dict()
+    t['publisher'] = container.publisher
+    t['container_name'] = container.name
+    # this is container.ident, not release.container_id, because there may
+    # be a redirect involved
+    t['container_id'] = container.ident
+    t['container_issnl'] = container.issnl
+    t['container_type'] = container.container_type
+    if container.extra:
+        c_extra = container.extra
+        if c_extra.get('kbart') and release_year:
+            if check_kbart(release_year, c_extra['kbart'].get('jstor')):
+                # JSTOR coverage also counts as KBART coverage
+                t['in_jstor'] = True
+                t['in_kbart'] = True
+            for archive in ('portico', 'lockss', 'clockss', 'pkp_pln',
+                            'hathitrust', 'scholarsportal', 'cariniana'):
+                t['in_kbart'] = t.get('in_kbart') or check_kbart(release_year, c_extra['kbart'].get(archive))
+                # recent KBART coverage is often not updated for the
+                # current year. So for current-year publications, consider
+                # coverage from *last* year to also be included in the
+                # Keeper
+                if not t.get('in_kbart') and release_year == this_year:
+                    t['in_kbart'] = check_kbart(this_year - 1, c_extra['kbart'].get(archive))
+
+        if c_extra.get('ia'):
+            if c_extra['ia'].get('sim') and release_year:
+                t['in_ia_sim'] = check_kbart(release_year, c_extra['ia']['sim'])
+            if c_extra['ia'].get('longtail_oa'):
+                t['is_longtail_oa'] = True
+        if c_extra.get('sherpa_romeo'):
+            if c_extra['sherpa_romeo'].get('color') == 'white':
+                t['is_oa'] = False
+        if c_extra.get('default_license') and c_extra.get('default_license').startswith('CC-'):
+            t['is_oa'] = True
+        # an 'as_of' entry under any of these extra keys marks the container as OA
+        for source in ('doaj', 'road', 'szczepanski'):
+            if c_extra.get(source) and c_extra[source].get('as_of'):
+                t['is_oa'] = True
+        if c_extra.get('country'):
+            t['country_code'] = c_extra['country']
+            t['country_code_upper'] = c_extra['country'].upper()
+        if c_extra.get('publisher_type'):
+            t['publisher_type'] = c_extra['publisher_type']
+    return t
+
+def _rte_content_helper(release: ReleaseEntity) -> dict:
+    """
+    File/FileSet/WebCapture sub-section of release_to_elasticsearch(); returns a dict to update() into the release document
+
+    The current priority order for "best_pdf_url" is:
+    - internet archive urls (archive.org or web.archive.org)
+    - other URLs with a 'webarchive', 'repository', or 'repo' rel
+    - any other PDF URL
+    """
+    t = dict(
+        file_count = len(release.files or []),
+        fileset_count = len(release.filesets or []),
+        webcapture_count = len(release.webcaptures or []),
+    )
+
+    any_pdf_url = None
+    good_pdf_url = None
+    best_pdf_url = None
+    ia_pdf_url = None
+
+    for f in release.files or []:
+        if f.extra and f.extra.get('shadows'):
+            t['in_shadows'] = True
+        is_pdf = 'pdf' in (f.mimetype or '')
+        for release_url in (f.urls or []):
+            # first generic flags (same helper is used for fileset and webcapture URLs below)
+            t.update(_rte_url_helper(release_url))
+
+            # then PDF specific stuff (for "best URL" fields); once set, is_pdf stays set for this file's later URLs
+            if not f.mimetype and 'pdf' in release_url.url.lower():
+                is_pdf = True
+            if is_pdf:
+                any_pdf_url = release_url.url
+                if release_url.rel in ('webarchive', 'repository', 'repo'):
+                    good_pdf_url = release_url.url
+                if '//web.archive.org/' in release_url.url or '//archive.org/' in release_url.url:
+                    best_pdf_url = release_url.url
+                    ia_pdf_url = release_url.url
+
+    # here is where we bake-in PDF url priority; IA-specific (within each tier the last matching URL wins)
+    t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url
+    t['ia_pdf_url'] = ia_pdf_url
+
+    for fs in release.filesets or []:
+        for url_obj in (fs.urls or []):
+            t.update(_rte_url_helper(url_obj))
+
+    for wc in release.webcaptures or []:
+        for url_obj in (wc.archive_urls or []):
+            t.update(_rte_url_helper(url_obj))
+
+    return t
+
+def _rte_url_helper(url_obj) -> dict:
+    """
+    Takes a location URL object ('url' and 'rel' attributes) and returns generic preservation status.
+
+    Designed to work with file, webcapture, or fileset URLs.
+
+    Returns a dict; should *not* include non-True values for any keys because
+    the result is repeatedly update()-ed into the overall object.
+    """
+    t = dict()
+    if url_obj.rel in ('webarchive', 'repository', 'archive', 'repo'):
+        t['is_preserved'] = True
+    if '//web.archive.org/' in url_obj.url or '//archive.org/' in url_obj.url:
+        t['in_ia'] = True
+    if url_obj.url.lower().startswith('http') or url_obj.url.lower().startswith('ftp'):
+        t['in_web'] = True
+    if url_obj.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'):
+        # not sure what rel will be for this stuff
+        t['in_dweb'] = True
+    if '//www.jstor.org/' in url_obj.url:
+        t['in_jstor'] = True
+    return t
+
 
 def container_to_elasticsearch(entity, force_bool=True):
     """
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index 2111a20d..94791770 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -326,6 +326,8 @@ class EntityUpdatesWorker(FatcatWorker):
             release_ids = []
             new_release_ids = []
             file_ids = []
+            fileset_ids = []
+            webcapture_ids = []
             container_ids = []
             work_ids = []
             release_edits = cle['editgroup']['edits']['releases']
@@ -337,6 +339,12 @@ class EntityUpdatesWorker(FatcatWorker):
             file_edits = cle['editgroup']['edits']['files']
             for e in file_edits:
                 file_ids.append(e['ident'])
+            fileset_edits = cle['editgroup']['edits']['filesets']
+            for e in fileset_edits:
+                fileset_ids.append(e['ident'])
+            webcapture_edits = cle['editgroup']['edits']['webcaptures']
+            for e in webcapture_edits:
+                webcapture_ids.append(e['ident'])
             container_edits = cle['editgroup']['edits']['containers']
             for e in container_edits:
                 container_ids.append(e['ident'])
@@ -348,8 +356,8 @@ class EntityUpdatesWorker(FatcatWorker):
             for ident in set(file_ids):
                 file_entity = self.api.get_file(ident, expand=None)
                 # update release when a file changes
-                # TODO: fetch old revision as well, and only update
-                # releases for which list changed
+                # TODO: also fetch old version of file and update any *removed*
+                # release idents (and same for filesets, webcapture updates)
                 release_ids.extend(file_entity.release_ids or [])
                 file_dict = self.api.api_client.sanitize_for_serialization(file_entity)
                 producer.produce(
@@ -358,6 +366,19 @@ class EntityUpdatesWorker(FatcatWorker):
                     key=ident.encode('utf-8'),
                     on_delivery=fail_fast,
                 )
+
+            # TODO: topic for fileset updates
+            for ident in set(fileset_ids):
+                fileset_entity = self.api.get_fileset(ident, expand=None)
+                # update release when a fileset changes
+                release_ids.extend(fileset_entity.release_ids or [])
+
+            # TODO: topic for webcapture updates
+            for ident in set(webcapture_ids):
+                webcapture_entity = self.api.get_webcapture(ident, expand=None)
+                # update release when a webcapture changes
+                release_ids.extend(webcapture_entity.release_ids or [])
+
             for ident in set(container_ids):
                 container = self.api.get_container(ident)
                 container_dict = self.api.api_client.sanitize_for_serialization(container)
@@ -367,6 +388,7 @@ class EntityUpdatesWorker(FatcatWorker):
                     key=ident.encode('utf-8'),
                     on_delivery=fail_fast,
                 )
+
             for ident in set(release_ids):
                 release = self.api.get_release(ident, expand="files,filesets,webcaptures,container")
                 if release.work_id:
@@ -378,7 +400,7 @@ class EntityUpdatesWorker(FatcatWorker):
                     key=ident.encode('utf-8'),
                     on_delivery=fail_fast,
                 )
-                # filter to "new" active releases with no matched files
+                # for ingest requests, filter to "new" active releases with no matched files
                 if release.ident in new_release_ids:
                     ir = release_ingest_request(release, ingest_request_source='fatcat-changelog')
                     if ir and not release.files and self.want_live_ingest(release, ir):
diff --git a/python/tests/files/release_3mssw2qnlnblbk7oqyv2dafgey.json b/python/tests/files/release_3mssw2qnlnblbk7oqyv2dafgey.json
new file mode 100644
index 00000000..1c559509
--- /dev/null
+++ b/python/tests/files/release_3mssw2qnlnblbk7oqyv2dafgey.json
@@ -0,0 +1 @@
+{"abstracts":[{"sha1":"b2523f13fc2aa730a2e2336f27d448644074e24f","content":"<p>Jakobshavn Isbræ, West Greenland, which holds a 0.6-m sea level volume equivalent, has been speeding up and retreating since the late 1990s. Interpretation of its retreat has been hindered by difficulties in measuring its ice thickness with airborne radar depth sounders. Here, we employ high-resolution, helicopter-borne gravity data from 2012 to reconstruct its bed elevation within 50 km of the ocean margin using a three-dimensional inversion constrained by fjord bathymetry data offshore and a mass conservation algorithm inland. We find the glacier trough to be asymmetric and several 100 m deeper than estimated previously in the lower part. From 1996-2016, the grounding line migrated at 0.6 km/yr from 700 m to 1,100 m depth. Upstream, the bed drops to 1,600 m over 10 km then slowly climbs to 1,200 m depth in 40 km. Jakobshavn Isbræ will continue to retreat along a retrograde slope for decades to come.\n\nAn L., E. Rignot, S.H.P. Elieff, M. Morlighem, R. Millan, J. Mouginot, D.M. Holland, D. Holland, and J. Paden (2017), Bed elevation of Jakobshavn Isbræ, West Greenland, from high-resolution airborne gravity and other data, Geophys. Res. 
Lett., 44, doi:10.1002/2017GL073245.\n\n</p>","mimetype":"text/html"}],"refs":[],"contribs":[{"raw_name":"Lu An","role":"author","raw_affiliation":"University of California, Irvine"}],"license_slug":"CC-BY","publisher":"UC Irvine","ext_ids":{"doi":"10.7280/d1j37z"},"release_year":2018,"release_type":"dataset","webcaptures":[],"filesets":[{"release_ids":["3mssw2qnlnblbk7oqyv2dafgey"],"urls":[{"url":"https://merritt.cdlib.org/u/ark%3A%2F13030%2Fm5rg0r8q/1","rel":"repo-bundle"},{"url":"https://merritt.cdlib.org/d/ark%3A%2F13030%2Fm5rg0r8q/1/","rel":"repo"},{"url":"dat://77e94744aa5f967e6ed7e3990bfc29f141dbf2c0fff572eb1212b3bd706882f4/files/","rel":"dweb"}],"manifest":[{"path":"JKS_BedElevation_An_etal_2017.nc","size":736484,"md5":"af738fa325833a56bf947622958fd504","sha1":"443f1867b3a56132905e8d611ad03445d8134d3c","sha256":"52438ef0035b391027e989f00208de5c16ab8f9ff619aa7f45e998d6214a452f","extra":{"mimetype":"application/x-netcdf"}}],"state":"active","ident":"ho376wmdanckpp66iwfs7g22ne","revision":"e07ab7b0-bc0e-4da2-9121-542263e84e2d","extra":{"cdl_dash":{"version":1}}}],"files":[],"work_id":"pbf2dmuu5jf4dac2k22gxsjk6y","title":"Jakobshavn Glacier Bed Elevation","state":"active","ident":"3mssw2qnlnblbk7oqyv2dafgey","revision":"23040a75-2aa6-49f2-af3c-a5c12dcceffe","extra":{"ark_id":"ark:/13030/m5rg0r8q","cdl_dash":{"version":1}}}
+\ No newline at end of file
diff --git a/python/tests/files/release_mjtqtuyhwfdr7j2c3l36uor7uy.json b/python/tests/files/release_mjtqtuyhwfdr7j2c3l36uor7uy.json
new file mode 100644
index 00000000..3bfe8564
--- /dev/null
+++ b/python/tests/files/release_mjtqtuyhwfdr7j2c3l36uor7uy.json
@@ -0,0 +1 @@
+{"abstracts":[],"refs":[],"contribs":[{"index":0,"raw_name":"Catherine C. Marshall","role":"author","extra":{"seq":"first"}}],"language":"en","publisher":"CNRI Acct","issue":"3/4","volume":"14","ext_ids":{"doi":"10.1045/march2008-marshall-pt1"},"release_year":2008,"release_stage":"published","release_type":"article-journal","container_id":"ugbiirfvufgcjkx33r3cmemcuu","webcaptures":[{"release_ids":["mjtqtuyhwfdr7j2c3l36uor7uy"],"timestamp":"2019-01-06T18:58:12Z","original_url":"http://www.dlib.org/dlib/march08/marshall/03marshall-pt1.html","archive_urls":[{"url":"https://web.archive.org/web/","rel":"wayback"}],"cdx":[{"surt":"org,dlib)/dlib/march08/images/spacer00.gif","timestamp":"2019-01-06T19:50:55Z","url":"http://www.dlib.org/dlib/march08/images/spacer00.gif","mimetype":"image/gif","status_code":200,"sha1":"0e75513436e6b01963759f6a88282445ff2e5b3a","sha256":"7455bacb03f7ef04d79010638db14d8434cf7a349914c2ee99eb5d4220338675"},{"surt":"org,dlib)/dlib/march08/marshall/marshall-part1-fig1.png","timestamp":"2019-01-06T19:51:01Z","url":"http://www.dlib.org/dlib/march08/marshall/marshall-part1-fig1.png","mimetype":"image/png","status_code":200,"sha1":"89cee41b938a1d2cdc51688b4be1c72366ae8102","sha256":"d63abfb99c9c48e1e6e3e37bbc5f01c0d37429f0ac0a404ae6aadc1a7d187b60"},{"surt":"org,dlib)/dlib/march08/images/redline00.gif","timestamp":"2019-01-06T19:50:55Z","url":"http://www.dlib.org/dlib/march08/images/redline00.gif","mimetype":"image/gif","status_code":200,"sha1":"3a902e1d6075e37962ab37afc1567819bc3a164e","sha256":"3279d6916807f9e244beb23c91d58cd238509f77a26c06b14314f276b77b9c06"},{"surt":"org,dlib)/dlib/march08/images/commentary00.gif","timestamp":"2019-01-06T19:50:55Z","url":"http://www.dlib.org/dlib/march08/images/commentary00.gif","mimetype":"image/gif","status_code":200,"sha1":"cdbf8804daa2627ef915db725b29cce9eaa9cd68","sha256":"8d8956e992a7f3004ccbbaaebe585ee4c2b1256ad418507d7c33f94b290d0b04"},{"surt":"org,dlib)/dlib/march08/style/main.css","timestamp":"2019-01-06
T19:50:55Z","url":"http://www.dlib.org/dlib/march08/style/main.css","mimetype":"text/css","status_code":200,"sha1":"425f00efb41156f03d5c139c1b24acfcbdd611cb","sha256":"ff811660270fc847b5efc3ff9d62967244c924f91a5e4796ac2e6fc8058440ff"},{"surt":"org,dlib)/dlib/march08/marshall/03marshall-pt1.html","timestamp":"2018-12-06T13:16:33Z","url":"http://www.dlib.org/dlib/march08/marshall/03marshall-pt1.html","mimetype":"text/html","status_code":200,"sha1":"8443a044aa1f4571dd1e5561d59150e34eff0dd2","sha256":"0e9c76cdf20db60b93f0d129e5336e5344aae8bd03c5dbd75a5eea8f5d1820da"}],"revision":"6019e2a1-3503-4e91-97ec-5fba3abc70af","ident":"z7uaeatyvfgwdpuxtrdu4okqii","state":"active"}],"filesets":[],"files":[],"container":{"wikidata_qid":"Q5203268","issnl":"1082-9873","publisher":"Corporation for National Research Initiatives","name":"D-Lib Magazine","extra":{"abbrev":"Dlib Mag","country":"us","issne":"1082-9873","road":{"as_of":"2018-01-24"},"szczepanski":{"as_of":"2018"},"urls":["http://www.dlib.org/"]},"revision":"3957936f-d418-4006-b830-71341068121c","ident":"ugbiirfvufgcjkx33r3cmemcuu","state":"active"},"work_id":"kqi27ogvjvcrtnritxwumkebya","title":"Rethinking Personal Digital Archiving, Part 1","state":"active","ident":"mjtqtuyhwfdr7j2c3l36uor7uy","revision":"74270e11-c961-47f7-a682-1f6ad5927205","extra":{"crossref":{"type":"journal-article"},"subtitle":["Four Challenges from the Field"]}}
diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py
index 0d96e139..b5f23e76 100644
--- a/python/tests/transform_elasticsearch.py
+++ b/python/tests/transform_elasticsearch.py
@@ -43,7 +43,7 @@ def test_rich_elasticsearch_convert():
                     "year_spans": [[1200, 1300]],
                 },
                 "jstor": {
-                    "year_spans": [[1950, 1960], [1980, 2005]],
+                    "year_spans": [[1000, 1300], [1950, 1960], [1980, 2005]],
                 },
             },
             "sherpa_romeo": {"color": "blue"},
@@ -63,17 +63,23 @@ def test_rich_elasticsearch_convert():
     )]
     es = release_to_elasticsearch(r)
     assert es['release_year'] == r.release_year
-    assert es['in_ia'] == True
-    assert es['in_jstor'] == False
-    assert es['in_ia_sim'] == False
-    assert es['in_ia'] == True
-    assert es['in_web'] == True
-    assert es['in_dweb'] == True
-    assert es['is_oa'] == True
-    assert es['is_longtail_oa'] == False
+    assert es['file_count'] == 1
+    assert es['fileset_count'] == 0
+    assert es['webcapture_count'] == 0
     assert es['ref_count'] == 2
     assert es['ref_linked_count'] == 1
 
+    assert es['preservation'] == "bright"
+    assert es['is_oa'] == True
+    assert es['is_longtail_oa'] == False
+    assert es['is_preserved'] == True
+    assert es['in_web'] == True
+    assert es['in_dweb'] == True
+    assert es['in_ia'] == True
+    assert es['in_ia_sim'] == False
+    assert es['in_kbart'] == True
+    assert es['in_jstor'] == True
+
 def test_elasticsearch_release_from_json():
     r = entity_from_json(open('./tests/files/release_etodop5banbndg3faecnfm6ozi.json', 'r').read(), ReleaseEntity)
     es = release_to_elasticsearch(r)
@@ -85,8 +91,59 @@ def test_elasticsearch_release_from_json():
     assert es['issue'] == "11"
     assert es['volume'] == "118"
     assert es['number'] == None
+
+    assert es['preservation'] == "dark"
+    assert es['is_oa'] == False
+    assert es['is_longtail_oa'] == False
+    assert es['is_preserved'] == True
+    assert es['in_web'] == False
+    assert es['in_dweb'] == False
+    assert es['in_ia'] == False
     assert es['in_ia_sim'] == True
     assert es['in_kbart'] == True
+    assert es['in_jstor'] == False
+
+    # this release has a fileset, and no file
+    r = entity_from_json(open('./tests/files/release_3mssw2qnlnblbk7oqyv2dafgey.json', 'r').read(), ReleaseEntity)
+    es = release_to_elasticsearch(r)
+
+    assert es['title'] == "Jakobshavn Glacier Bed Elevation"
+    assert es['ident'] == "3mssw2qnlnblbk7oqyv2dafgey"
+    assert es['file_count'] == 0
+    assert es['fileset_count'] == 1
+    assert es['webcapture_count'] == 0
+
+    assert es['preservation'] == "dark"
+    assert es['is_oa'] == True
+    assert es['is_longtail_oa'] == False
+    assert es['is_preserved'] == True
+    assert es['in_web'] == True
+    assert es['in_dweb'] == True
+    assert es['in_ia'] == False
+    assert es['in_ia_sim'] == False
+    assert es['in_kbart'] == False
+    assert es['in_jstor'] == False
+
+    # this release has a web capture, and no file (edited the JSON to remove file)
+    r = entity_from_json(open('./tests/files/release_mjtqtuyhwfdr7j2c3l36uor7uy.json', 'r').read(), ReleaseEntity)
+    es = release_to_elasticsearch(r)
+
+    assert es['title'] == "Rethinking Personal Digital Archiving, Part 1"
+    assert es['ident'] == "mjtqtuyhwfdr7j2c3l36uor7uy"
+    assert es['file_count'] == 0
+    assert es['fileset_count'] == 0
+    assert es['webcapture_count'] == 1
+
+    assert es['preservation'] == "bright"
+    assert es['is_oa'] == True
+    assert es['is_longtail_oa'] == False
+    assert es['is_preserved'] == True
+    assert es['in_web'] == True
+    assert es['in_dweb'] == False
+    assert es['in_ia'] == True
+    assert es['in_ia_sim'] == False
+    assert es['in_kbart'] == False
+    assert es['in_jstor'] == False
 
 def test_elasticsearch_container_transform(journal_metadata_importer):
     with open('tests/files/journal_metadata.sample.json', 'r') as f:
@@ -164,9 +221,17 @@ def test_elasticsearch_release_kbart_year():
     )
     es = release_to_elasticsearch(r)
     assert es['release_year'] == this_year
+
+    assert es['preservation'] == "none"
+    assert es['is_oa'] == True
+    assert es['is_longtail_oa'] == False
+    assert es['is_preserved'] == None
+    assert es['in_web'] == False
+    assert es['in_dweb'] == False
     assert es['in_ia'] == False
+    assert es['in_ia_sim'] == False
     assert es['in_kbart'] == False
-    assert es['preservation'] == "none"
+    assert es['in_jstor'] == False
 
     r.container = ContainerEntity(
         name="dummy journal",
@@ -180,6 +245,14 @@ def test_elasticsearch_release_kbart_year():
     )
     es = release_to_elasticsearch(r)
     assert es['release_year'] == this_year
+
+    assert es['preservation'] == "dark"
+    assert es['is_oa'] == True
+    assert es['is_longtail_oa'] == False
+    assert es['is_preserved'] == True
+    assert es['in_web'] == False
+    assert es['in_dweb'] == False
     assert es['in_ia'] == False
+    assert es['in_ia_sim'] == False
     assert es['in_kbart'] == True
-    assert es['preservation'] == "dark"
+    assert es['in_jstor'] == False
author	Martin Czygan <martin@archive.org>	2020-12-17 18:10:36 +0000
committer	Martin Czygan <martin@archive.org>	2020-12-17 18:10:36 +0000
commit	37508f3d7cbbbb2cfdf6bcb5bd50a00cf3b0311f (patch)
tree	d4de1d1d529776205568f55ad7f724e398e442c9 /python
parent	f7a75a019c9dee35542e6f92ec37937df36ff756 (diff)
parent	f60ba0ea04081ac0095c12d8ecbaa48b3da74aee (diff)
download	fatcat-37508f3d7cbbbb2cfdf6bcb5bd50a00cf3b0311f.tar.gz fatcat-37508f3d7cbbbb2cfdf6bcb5bd50a00cf3b0311f.zip