diff options
Diffstat (limited to 'python/fatcat_tools')
| -rw-r--r-- | python/fatcat_tools/transforms.py | 120 | 
1 files changed, 90 insertions, 30 deletions
| diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms.py index b1fd7e68..2493b1ab 100644 --- a/python/fatcat_tools/transforms.py +++ b/python/fatcat_tools/transforms.py @@ -1,4 +1,5 @@ +  import collections  from fatcat_client import ReleaseEntity, ApiClient @@ -40,17 +41,29 @@ def release_to_elasticsearch(release):          state = release.state,          revision = release.revision,          title = release.title, +        original_title = release.original_title,          release_type = release.release_type,          release_status = release.release_status,          language = release.language, +        license = release.license_slug,          doi = release.doi,          pmid = release.pmid,          pmcid = release.pmcid,          isbn13 = release.isbn13, +        wikidata_qid = release.wikidata_qid,          core_id = release.core_id, -        wikidata_qid = release.wikidata_qid +        arxiv_id = release.core_id, +        jstor_id = release.jstor_id,      ) +    is_oa = None +    is_longtail_oa = None +    in_kbart = None +    in_web = False +    in_dweb = False +    in_ia = False +    in_shadow = False +      if release.release_date:          # .isoformat() results in, eg, '2010-10-22' (YYYY-MM-DD)          t['release_date'] = release.release_date.isoformat() @@ -59,52 +72,99 @@ def release_to_elasticsearch(release):      if release.release_year is not None:          t['release_year'] = release.release_year +    t['any_abstract'] = len(release.abstracts) > 0 +    t['ref_count'] = len(release.refs or []) +    t['contrib_count'] = len(release.contribs or []) +    contrib_names = [] +    for c in (release.contribs or []): +        if c.raw_name: +            contrib_names.append(c.raw_name) +    t['contrib_names'] = contrib_names +      container = release.container -    container_is_kept = False      if container:          t['publisher'] = container.publisher          t['container_name'] = container.name          t['container_issnl'] = container.issnl -        container_extra = container.extra -        if container_extra: -            t['container_is_oa'] = container_extra.get('is_oa') -            container_is_kept = container_extra.get('is_kept', False) -            t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa') +        t['container_type'] = container.container_type +        if container.extra: +            if container.extra.get('is_oa') or container.extra.get('in_doaj'): +                is_oa = True +            if container.extra.get('in_kbart'): +                # TODO: better KBART check goes here +                in_kbart = True +            if container.extra.get('ia'): +                # TODO: container longtail check goes here +                # TODO: sim/microfilm check goes here +                pass +            # TODO: SHERPA/Romeo goes here      else:          t['publisher'] = release.publisher      files = release.files or []      t['file_count'] = len(files) -    in_wa = False -    in_ia = False -    t['file_pdf_url'] = None +    t['fileset_count'] = len(release.filesets or []) +    t['webcapture_count'] = len(release.webcaptures or []) +    any_pdf_url = None +    good_pdf_url = None +    best_pdf_url = None +    ia_pdf_url = None      for f in files: +        if f.extra and f.extra.get('shadows'): +            # TODO: shadow check goes here +            in_shadows = True          is_pdf = 'pdf' in (f.mimetype or '')          for url in (f.urls or []): -            if url.rel == 'webarchive': -                in_wa = True -            if '//web.archive.org/' in (url.url or '') or '//archive.org/' in (url.url or ''): +            if url.url.lower().startswith('http'): +                in_web = True +            if url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'): +                # TODO: not sure what rel will be +                in_dweb = True +            if is_pdf: +                any_pdf_url = url.url +            if is_pdf and url.rel in ('webarchive', 'repository') and is_pdf: +                is_preserved = True +                good_pdf_url = url.url +            if '//web.archive.org/' in url.url or '//archive.org/' in url.url:                  in_ia = True                  if is_pdf: -                    t['file_pdf_url'] = url.url -            if not t['file_pdf_url'] and is_pdf: -                t['file_pdf_url'] = url.url -    t['file_in_webarchive'] = in_wa -    t['file_in_ia'] = in_ia +                    best_pdf_url = url.url +                    ia_pdf_url = url.url +    # here is where we bake-in priority; IA-specific +    t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url +    t['ia_pdf_url'] = ia_pdf_url + +    if release.license_slug: +        # TODO: more/better checks here, particularly strict *not* OA licenses +        if release.license_slug.startswith("CC-"): +            is_oa = True      extra = release.extra or dict()      if extra: -        t['in_shadow'] = extra.get('in_shadow') -        if extra.get('grobid') and extra['grobid'].get('is_longtail_oa'): -            t['container_is_longtail_oa'] = True -    t['any_abstract'] = len(release.abstracts) > 0 -    t['is_kept'] = container_is_kept or extra.get('is_kept', False) +        # TODO: longtail OA check from GROBID here +        if extra.get('in_kbart'): +            # NOTE: not actually setting this anywhere +            in_kbart = True +        if extra.get('is_oa'): +            # NOTE: not actually setting this anywhere +            is_oa = True +        if extra.get('grobid'): +            if not t.get('container_name'): +                t['container_name'] = extra['grobid'].get('container_name') +            if extra['grobid'].get('longtail_oa'): +                is_longtail_oa = True +        if extra.get('crossref'): +            if extra['crossref'].get('archive'): +                # all crossref archives are KBART, I believe +                in_kbart = True -    t['ref_count'] = len(release.refs or []) -    t['contrib_count'] = len(release.contribs or []) -    contrib_names = [] -    for c in (release.contribs or []): -        if c.raw_name: -            contrib_names.append(c.raw_name) -    t['contrib_names'] = contrib_names +    if is_longtail_oa: +        is_oa = True +    t['is_oa'] = is_oa +    t['is_longtail_oa'] = is_longtail_oa +    t['in_kbart'] = in_kbart +    t['in_web'] = in_web +    t['in_dweb'] = in_dweb +    t['in_ia'] = in_ia +    t['is_preserved'] = in_ia or in_kbart      return t | 
