diff options
Diffstat (limited to 'python')
| -rw-r--r-- | python/fatcat_tools/transforms/elasticsearch.py | 279 | 
1 files changed, 148 insertions, 131 deletions
| diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index 96a5b96b..b0139751 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -1,10 +1,13 @@  import datetime +from typing import Optional  import tldextract +from fatcat_openapi_client import ReleaseEntity, ContainerEntity -def check_kbart(year, archive): + +def check_kbart(year: int, archive: dict) -> Optional[bool]:      if not archive or not archive.get('year_spans'):          return None      for span in archive['year_spans']: @@ -12,7 +15,7 @@ def check_kbart(year, archive):              return True      return False -def test_check_kbart(): +def test_check_kbart() -> None:      assert check_kbart(1990, dict()) is None      assert check_kbart(1990, dict(year_spans=[[2000, 2000]])) is False @@ -21,10 +24,13 @@ def test_check_kbart():      assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1930, 2000]])) is True -def release_to_elasticsearch(entity, force_bool=True): +def release_to_elasticsearch(entity: ReleaseEntity, force_bool: bool = True) -> dict:      """      Converts from an entity model/schema to elasticsearch oriented schema. +    This is a large/complex transform, so subsets are split out into helper +    functions. +      Returns: dict      Raises exception on error (never returns None)      """ @@ -68,16 +74,18 @@ def release_to_elasticsearch(entity, force_bool=True):          mag_id = release.ext_ids.mag,      ) -    is_oa = None -    is_preserved = None -    is_longtail_oa = None -    in_kbart = None -    in_jstor = False -    in_web = False -    in_dweb = False -    in_ia = False -    in_ia_sim = False -    in_shadows = False +    t.update(dict( +        is_oa = None, +        is_preserved = None, +        is_longtail_oa = None, +        in_kbart = None, +        in_jstor = False, +        in_web = False, +        in_dweb = False, +        in_ia = False, +        in_ia_sim = False, +        in_shadows = False, +    ))      release_year = release.release_year      if release.release_date: @@ -116,55 +124,8 @@ def release_to_elasticsearch(entity, force_bool=True):      # TODO: mapping... probably by lookup?      t['affiliation_rors'] = None -    this_year = datetime.date.today().year -    container = release.container -    if container: -        t['publisher'] = container.publisher -        t['container_name'] = container.name -        # this is container.ident, not release.container_id, because there may -        # be a redirect involved -        t['container_id'] = container.ident -        t['container_issnl'] = container.issnl -        t['container_type'] = container.container_type -        if container.extra: -            c_extra = container.extra -            if c_extra.get('kbart') and release_year: -                in_jstor = check_kbart(release_year, c_extra['kbart'].get('jstor')) -                in_kbart = in_jstor -                for archive in ('portico', 'lockss', 'clockss', 'pkp_pln', -                                'hathitrust', 'scholarsportal', 'cariniana'): -                    in_kbart = in_kbart or check_kbart(release_year, c_extra['kbart'].get(archive)) -                    # recent KBART coverage is often not updated for the -                    # current year. So for current-year publications, consider -                    # coverage from *last* year to also be included in the -                    # Keeper -                    if not in_kbart and release_year == this_year: -                        in_kbart = check_kbart(this_year - 1, c_extra['kbart'].get(archive)) - -            if c_extra.get('ia'): -                if c_extra['ia'].get('sim') and release_year: -                    in_ia_sim = check_kbart(release_year, c_extra['ia']['sim']) -                if c_extra['ia'].get('longtail_oa'): -                    is_longtail_oa = True -            if c_extra.get('sherpa_romeo'): -                if c_extra['sherpa_romeo'].get('color') == 'white': -                    is_oa = False -            if c_extra.get('default_license') and c_extra.get('default_license').startswith('CC-'): -                is_oa = True -            if c_extra.get('doaj'): -                if c_extra['doaj'].get('as_of'): -                    is_oa = True -            if c_extra.get('road'): -                if c_extra['road'].get('as_of'): -                    is_oa = True -            if c_extra.get('szczepanski'): -                if c_extra['szczepanski'].get('as_of'): -                    is_oa = True -            if c_extra.get('country'): -                t['country_code'] = c_extra['country'] -                t['country_code_upper'] = c_extra['country'].upper() -            if c_extra.get('publisher_type'): -                t['publisher_type'] = c_extra['publisher_type'] +    if release.container: +        t.update(_rte_container_helper(release.container, release_year))      # fall back to release-level container metadata if container not linked or      # missing context @@ -174,70 +135,36 @@ def release_to_elasticsearch(entity, force_bool=True):          t['container_name'] = release.extra.get('container_name')      if release.ext_ids.jstor or (release.ext_ids.doi and release.ext_ids.doi.startswith('10.2307/')): -        in_jstor = True +        t['in_jstor'] = True -    files = release.files or [] -    t['file_count'] = len(files) -    t['fileset_count'] = len(release.filesets or []) -    t['webcapture_count'] = len(release.webcaptures or []) -    any_pdf_url = None -    good_pdf_url = None -    best_pdf_url = None -    ia_pdf_url = None -    for f in files: -        if f.extra and f.extra.get('shadows'): -            # TODO: shadow check goes here -            in_shadows = True -        is_pdf = 'pdf' in (f.mimetype or '') -        for release_url in (f.urls or []): -            if not f.mimetype and 'pdf' in release_url.url.lower(): -                is_pdf = True -            if release_url.url.lower().startswith('http'): -                in_web = True -            if release_url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'): -                # not sure what rel will be for this stuff -                in_dweb = True -            if is_pdf: -                any_pdf_url = release_url.url -            if is_pdf and release_url.rel in ('webarchive', 'repository') and is_pdf: -                is_preserved = True -                good_pdf_url = release_url.url -            if '//www.jstor.org/' in release_url.url: -                in_jstor = True -            if '//web.archive.org/' in release_url.url or '//archive.org/' in release_url.url: -                in_ia = True -                if is_pdf: -                    best_pdf_url = release_url.url -                    ia_pdf_url = release_url.url -    # here is where we bake-in priority; IA-specific -    t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url -    t['ia_pdf_url'] = ia_pdf_url +    # transform file/fileset/webcapture related fields +    t.update(_rte_content_helper(release))      if release.ext_ids.doaj: -        is_oa = True +        t['is_oa'] = True      if release.license_slug:          # TODO: more/better checks here, particularly strict *not* OA licenses          if release.license_slug.startswith("CC-"): -            is_oa = True +            t['is_oa'] = True          if release.license_slug.startswith("ARXIV-"): -            is_oa = True +            t['is_oa'] = True      extra = release.extra or dict()      if extra:          if extra.get('is_oa'):              # NOTE: not actually setting this anywhere... but could -            is_oa = True +            t['is_oa'] = True          if extra.get('longtail_oa'):              # sometimes set by GROBID/matcher -            is_oa = True -            is_longtail_oa = True +            t['is_oa'] = True +            t['is_longtail_oa'] = True          if not t.get('container_name'):              t['container_name'] = extra.get('container_name')          if extra.get('crossref'):              if extra['crossref'].get('archive'):                  # all crossref archives are KBART, I believe -                in_kbart = True +                t['in_kbart'] = True          # backwards compatible subtitle fetching          if not t['subtitle'] and extra.get('subtitle'):              if type(extra['subtitle']) == list: @@ -254,7 +181,7 @@ def release_to_elasticsearch(entity, force_bool=True):          # TODO: non-numerical first pages      t['ia_microfilm_url'] = None -    if in_ia_sim: +    if t['in_ia_sim']:          # TODO: determine URL somehow? I think this is in flux. Will probably          # need extra metadata in the container extra field.          # special case as a demo for now. @@ -280,42 +207,132 @@ def release_to_elasticsearch(entity, force_bool=True):      if t['doi']:          t['doi_prefix'] = t['doi'].split('/')[0] -    if is_longtail_oa: -        is_oa = True +    if t['is_longtail_oa']: +        t['is_oa'] = True +    # optionally coerce all flags from Optional[bool] to bool      if force_bool: -        t['is_oa'] = bool(is_oa) -        t['is_longtail_oa'] = bool(is_longtail_oa) -        t['in_kbart'] = bool(in_kbart) -        t['in_ia_sim'] = bool(in_ia_sim) -        t['in_jstor'] = bool(in_jstor) -        t['in_web'] = bool(in_web) -        t['in_dweb'] = bool(in_dweb) -        t['in_shadows'] = bool(in_shadows) -    else: -        t['is_oa'] = is_oa -        t['is_longtail_oa'] = is_longtail_oa -        t['in_kbart'] = in_kbart -        t['in_ia_sim'] = in_ia_sim -        t['in_jstor'] = in_jstor -        t['in_web'] = in_web -        t['in_dweb'] = in_dweb -        t['in_shadows'] = in_shadows +        for k in ('is_oa', 'is_longtail_oa', 'in_kbart', 'in_ia_sim', +                  'in_jstor', 'in_web', 'in_dweb', 'in_shadows'): +            t[k] = bool(t[k]) -    t['in_ia'] = bool(in_ia) -    t['is_preserved'] = bool(is_preserved or in_ia or in_kbart or in_jstor or t.get('pmcid') or t.get('arxiv_id')) +    t['in_ia'] = bool(t['in_ia']) +    t['is_preserved'] = bool(t['is_preserved'] or t['in_ia'] or t['in_kbart'] or t['in_jstor'] or t.get('pmcid') or t.get('arxiv_id')) -    if in_ia: +    if t['in_ia']:          t['preservation'] = 'bright' -    elif in_kbart or in_jstor or t.get('pmcid') or t.get('arxiv_id'): +    # XXX: simplify: elif t['is_preserved'] +    elif t['in_kbart'] or t['in_jstor'] or t.get('pmcid') or t.get('arxiv_id'):          t['preservation'] = 'dark' -    elif in_shadows: +    elif t['in_shadows']:          t['preservation'] = 'shadows_only'      else:          t['preservation'] = 'none'      return t +def _rte_container_helper(container: ContainerEntity, release_year: Optional[int]) -> dict: +    """ +    Container metadata sub-section of release_to_elasticsearch() +    """ +    this_year = datetime.date.today().year +    t = dict() +    t['publisher'] = container.publisher +    t['container_name'] = container.name +    # this is container.ident, not release.container_id, because there may +    # be a redirect involved +    t['container_id'] = container.ident +    t['container_issnl'] = container.issnl +    t['container_type'] = container.container_type +    if container.extra: +        c_extra = container.extra +        if c_extra.get('kbart') and release_year: +            t['in_jstor'] = check_kbart(release_year, c_extra['kbart'].get('jstor')) +            # XXX: +            t['in_kbart'] = t['in_jstor'] +            for archive in ('portico', 'lockss', 'clockss', 'pkp_pln', +                            'hathitrust', 'scholarsportal', 'cariniana'): +                t['in_kbart'] = t['in_kbart'] or check_kbart(release_year, c_extra['kbart'].get(archive)) +                # recent KBART coverage is often not updated for the +                # current year. So for current-year publications, consider +                # coverage from *last* year to also be included in the +                # Keeper +                if not t['in_kbart'] and release_year == this_year: +                    t['in_kbart'] = check_kbart(this_year - 1, c_extra['kbart'].get(archive)) + +        if c_extra.get('ia'): +            if c_extra['ia'].get('sim') and release_year: +                t['in_ia_sim'] = check_kbart(release_year, c_extra['ia']['sim']) +            if c_extra['ia'].get('longtail_oa'): +                t['is_longtail_oa'] = True +        if c_extra.get('sherpa_romeo'): +            if c_extra['sherpa_romeo'].get('color') == 'white': +                t['is_oa'] = False +        if c_extra.get('default_license') and c_extra.get('default_license').startswith('CC-'): +            t['is_oa'] = True +        if c_extra.get('doaj'): +            if c_extra['doaj'].get('as_of'): +                t['is_oa'] = True +        if c_extra.get('road'): +            if c_extra['road'].get('as_of'): +                t['is_oa'] = True +        if c_extra.get('szczepanski'): +            if c_extra['szczepanski'].get('as_of'): +                t['is_oa'] = True +        if c_extra.get('country'): +            t['country_code'] = c_extra['country'] +            t['country_code_upper'] = c_extra['country'].upper() +        if c_extra.get('publisher_type'): +            t['publisher_type'] = c_extra['publisher_type'] +    return t + +def _rte_content_helper(release: ReleaseEntity) -> dict: +    """ +    File/FileSet/WebCapture sub-section of release_to_elasticsearch() +    """ +    files = release.files or [] +    t = dict( +        file_count = len(release.files or []), +        fileset_count = len(release.filesets or []), +        webcapture_count = len(release.webcaptures or []), +    ) + +    any_pdf_url = None +    good_pdf_url = None +    best_pdf_url = None +    ia_pdf_url = None + +    for f in files: +        if f.extra and f.extra.get('shadows'): +            t['in_shadows'] = True +        is_pdf = 'pdf' in (f.mimetype or '') +        for release_url in (f.urls or []): +            if not f.mimetype and 'pdf' in release_url.url.lower(): +                is_pdf = True +            if release_url.url.lower().startswith('http'): +                # XXX: also startswith('ftp') +                t['in_web'] = True +            if release_url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'): +                # not sure what rel will be for this stuff +                t['in_dweb'] = True +            if is_pdf: +                any_pdf_url = release_url.url +            if is_pdf and release_url.rel in ('webarchive', 'repository') and is_pdf: +                t['is_preserved'] = True +                good_pdf_url = release_url.url +            if '//www.jstor.org/' in release_url.url: +                t['in_jstor'] = True +            if '//web.archive.org/' in release_url.url or '//archive.org/' in release_url.url: +                t['in_ia'] = True +                if is_pdf: +                    best_pdf_url = release_url.url +                    ia_pdf_url = release_url.url + +    # here is where we bake-in priority; IA-specific +    t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url +    t['ia_pdf_url'] = ia_pdf_url +    return t +  def container_to_elasticsearch(entity, force_bool=True):      """ | 
