diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2020-12-16 14:34:26 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-12-16 14:34:28 -0800 | 
| commit | 486bbd7ea65fa50b3a839e5d371f04b8655a00c8 (patch) | |
| tree | 49b8274b834e134720a0ae414e32f3860257f42f /python | |
| parent | ebcc86561dabf3974ca11151445e66c0df4431f1 (diff) | |
| download | fatcat-486bbd7ea65fa50b3a839e5d371f04b8655a00c8.tar.gz fatcat-486bbd7ea65fa50b3a839e5d371f04b8655a00c8.zip | |
have release elasticsearch transform count webcaptures and filesets towards preservation
These are simple/partial changes to have webcaptures and filesets show
up in 'preservation', 'in_ia', and 'in_web' ES schema flags. A
longer-term TODO is to update the ES schema to have more granular
analytic flags.
Also includes a small generalization refactor for URL object parsing
into preservation status, shared across file+fileset+webcapture entity
types (all have similar URL objects with url+rel fields).
Diffstat (limited to 'python')
| -rw-r--r-- | python/fatcat_tools/transforms/elasticsearch.py | 83 | 
1 files changed, 57 insertions, 26 deletions
| diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index c2ab5369..e23495ea 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -76,14 +76,14 @@ def release_to_elasticsearch(entity: ReleaseEntity, force_bool: bool = True) ->      t.update(dict(          is_oa = None, -        is_preserved = None,          is_longtail_oa = None, -        in_kbart = None, -        in_jstor = False, +        is_preserved = None,          in_web = False,          in_dweb = False,          in_ia = False,          in_ia_sim = False, +        in_kbart = None, +        in_jstor = False,          in_shadows = False,      )) @@ -250,20 +250,21 @@ def _rte_container_helper(container: ContainerEntity, release_year: Optional[int      t['container_id'] = container.ident      t['container_issnl'] = container.issnl      t['container_type'] = container.container_type -    t['in_kbart'] = None      if container.extra:          c_extra = container.extra          if c_extra.get('kbart') and release_year: -            t['in_jstor'] = check_kbart(release_year, c_extra['kbart'].get('jstor')) -            t['in_kbart'] = t['in_kbart'] or t['in_jstor'] +            if check_kbart(release_year, c_extra['kbart'].get('jstor')): +                t['in_jstor'] = True +            if t.get('in_kbart') or t.get('in_jstor'): +                t['in_kbart'] = True              for archive in ('portico', 'lockss', 'clockss', 'pkp_pln',                              'hathitrust', 'scholarsportal', 'cariniana'): -                t['in_kbart'] = t['in_kbart'] or check_kbart(release_year, c_extra['kbart'].get(archive)) +                t['in_kbart'] = t.get('in_kbart') or check_kbart(release_year, c_extra['kbart'].get(archive))                  # recent KBART coverage is often not updated for the                  # current year. So for current-year publications, consider                  # coverage from *last* year to also be included in the                  # Keeper -                if not t['in_kbart'] and release_year == this_year: +                if not t.get('in_kbart') and release_year == this_year:                      t['in_kbart'] = check_kbart(this_year - 1, c_extra['kbart'].get(archive))          if c_extra.get('ia'): @@ -295,8 +296,12 @@ def _rte_container_helper(container: ContainerEntity, release_year: Optional[int  def _rte_content_helper(release: ReleaseEntity) -> dict:      """      File/FileSet/WebCapture sub-section of release_to_elasticsearch() + +    The current priority order for "best_pdf_url" is: +    - internet archive urls (archive.org or web.archive.org) +    - other webarchive or repository URLs +    - any other URL      """ -    files = release.files or []      t = dict(          file_count = len(release.files or []),          fileset_count = len(release.filesets or []), @@ -308,34 +313,60 @@ def _rte_content_helper(release: ReleaseEntity) -> dict:      best_pdf_url = None      ia_pdf_url = None -    for f in files: +    for f in release.files or []:          if f.extra and f.extra.get('shadows'):              t['in_shadows'] = True          is_pdf = 'pdf' in (f.mimetype or '')          for release_url in (f.urls or []): +            # first generic flags +            t.update(_rte_url_helper(release_url)) + +            # then PDF specific stuff (for generating "best URL" fields)              if not f.mimetype and 'pdf' in release_url.url.lower():                  is_pdf = True -            if release_url.url.lower().startswith('http') or release_url.url.lower().startswith('ftp'): -                t['in_web'] = True -            if release_url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'): -                # not sure what rel will be for this stuff -                t['in_dweb'] = True              if is_pdf:                  any_pdf_url = release_url.url -            if is_pdf and release_url.rel in ('webarchive', 'repository') and is_pdf: -                t['is_preserved'] = True -                good_pdf_url = release_url.url -            if '//www.jstor.org/' in release_url.url: -                t['in_jstor'] = True -            if '//web.archive.org/' in release_url.url or '//archive.org/' in release_url.url: -                t['in_ia'] = True -                if is_pdf: -                    best_pdf_url = release_url.url -                    ia_pdf_url = release_url.url +                if release_url.rel in ('webarchive', 'repository', 'repo'): +                    good_pdf_url = release_url.url +                if '//web.archive.org/' in release_url.url or '//archive.org/' in release_url.url: +                        best_pdf_url = release_url.url +                        ia_pdf_url = release_url.url -    # here is where we bake-in priority; IA-specific +    # here is where we bake-in PDF url priority; IA-specific      t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url      t['ia_pdf_url'] = ia_pdf_url + +    for fs in release.filesets or []: +        for url_obj in (fs.urls or []): +            t.update(_rte_url_helper(url_obj)) + +    for wc in release.webcaptures or []: +        for url_obj in (wc.archive_urls or []): +            t.update(_rte_url_helper(url_obj)) + +    return t + +def _rte_url_helper(url_obj) -> dict: +    """ +    Takes a location URL ('url' and 'rel' keys) and returns generic preservation status. + +    Designed to work with file, webcapture, or fileset URLs. + +    Returns a dict; should *not* include non-True values for any keys because +    these will be iteratively update() into the overal object. +    """ +    t = dict() +    if url_obj.rel in ('webarchive', 'repository', 'archive', 'repo'): +        t['is_preserved'] = True +    if '//web.archive.org/' in url_obj.url or '//archive.org/' in url_obj.url: +        t['in_ia'] = True +    if url_obj.url.lower().startswith('http') or url_obj.url.lower().startswith('ftp'): +        t['in_web'] = True +    if url_obj.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'): +        # not sure what rel will be for this stuff +        t['in_dweb'] = True +    if '//www.jstor.org/' in url_obj.url: +        t['in_jstor'] = True      return t | 
