Diffstat (limited to 'python/fatcat_tools/transforms')
-rw-r--r--   python/fatcat_tools/transforms/elasticsearch.py   318
-rw-r--r--   python/fatcat_tools/transforms/ingest.py            12
2 files changed, 193 insertions, 137 deletions
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index dfb5f799..ad4b7722 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -1,10 +1,13 @@
 import datetime
+from typing import Optional

 import tldextract

+from fatcat_openapi_client import ReleaseEntity, ContainerEntity

-def check_kbart(year, archive):
+
+def check_kbart(year: int, archive: dict) -> Optional[bool]:
     if not archive or not archive.get('year_spans'):
         return None
     for span in archive['year_spans']:
@@ -12,7 +15,7 @@ def check_kbart(year, archive):
             return True
     return False

-def test_check_kbart():
+def test_check_kbart() -> None:
     assert check_kbart(1990, dict()) is None
     assert check_kbart(1990, dict(year_spans=[[2000, 2000]])) is False
@@ -21,10 +24,13 @@ def test_check_kbart():
     assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1930, 2000]])) is True


-def release_to_elasticsearch(entity, force_bool=True):
+def release_to_elasticsearch(entity: ReleaseEntity, force_bool: bool = True) -> dict:
     """
     Converts from an entity model/schema to elasticsearch oriented schema.

+    This is a large/complex transform, so subsets are split out into helper
+    functions.
+
     Returns: dict
     Raises exception on error (never returns None)
     """
@@ -68,16 +74,18 @@ def release_to_elasticsearch(entity, force_bool=True):
         mag_id = release.ext_ids.mag,
     )

-    is_oa = None
-    is_preserved = None
-    is_longtail_oa = None
-    in_kbart = None
-    in_jstor = False
-    in_web = False
-    in_dweb = False
-    in_ia = False
-    in_ia_sim = False
-    in_shadows = False
+    t.update(dict(
+        is_oa = None,
+        is_longtail_oa = None,
+        is_preserved = None,
+        in_web = False,
+        in_dweb = False,
+        in_ia = False,
+        in_ia_sim = False,
+        in_kbart = None,
+        in_jstor = False,
+        in_shadows = False,
+    ))

     release_year = release.release_year
     if release.release_date:
@@ -116,55 +124,8 @@ def release_to_elasticsearch(entity, force_bool=True):
     # TODO: mapping... probably by lookup?
     t['affiliation_rors'] = None

-    this_year = datetime.date.today().year
-    container = release.container
-    if container:
-        t['publisher'] = container.publisher
-        t['container_name'] = container.name
-        # this is container.ident, not release.container_id, because there may
-        # be a redirect involved
-        t['container_id'] = container.ident
-        t['container_issnl'] = container.issnl
-        t['container_type'] = container.container_type
-        if container.extra:
-            c_extra = container.extra
-            if c_extra.get('kbart') and release_year:
-                in_jstor = check_kbart(release_year, c_extra['kbart'].get('jstor'))
-                in_kbart = in_jstor
-                for archive in ('portico', 'lockss', 'clockss', 'pkp_pln',
-                        'hathitrust', 'scholarsportal', 'cariniana'):
-                    in_kbart = in_kbart or check_kbart(release_year, c_extra['kbart'].get(archive))
-                    # recent KBART coverage is often not updated for the
-                    # current year. So for current-year publications, consider
-                    # coverage from *last* year to also be included in the
-                    # Keeper
-                    if not in_kbart and release_year == this_year:
-                        in_kbart = check_kbart(this_year - 1, c_extra['kbart'].get(archive))
-
-            if c_extra.get('ia'):
-                if c_extra['ia'].get('sim') and release_year:
-                    in_ia_sim = check_kbart(release_year, c_extra['ia']['sim'])
-                if c_extra['ia'].get('longtail_oa'):
-                    is_longtail_oa = True
-            if c_extra.get('sherpa_romeo'):
-                if c_extra['sherpa_romeo'].get('color') == 'white':
-                    is_oa = False
-            if c_extra.get('default_license') and c_extra.get('default_license').startswith('CC-'):
-                is_oa = True
-            if c_extra.get('doaj'):
-                if c_extra['doaj'].get('as_of'):
-                    is_oa = True
-            if c_extra.get('road'):
-                if c_extra['road'].get('as_of'):
-                    is_oa = True
-            if c_extra.get('szczepanski'):
-                if c_extra['szczepanski'].get('as_of'):
-                    is_oa = True
-            if c_extra.get('country'):
-                t['country_code'] = c_extra['country']
-                t['country_code_upper'] = c_extra['country'].upper()
-            if c_extra.get('publisher_type'):
-                t['publisher_type'] = c_extra['publisher_type']
+    if release.container:
+        t.update(_rte_container_helper(release.container, release_year))

     # fall back to release-level container metadata if container not linked or
     # missing context
@@ -174,67 +135,36 @@ def release_to_elasticsearch(entity, force_bool=True):
         t['container_name'] = release.extra.get('container_name')

     if release.ext_ids.jstor or (release.ext_ids.doi and release.ext_ids.doi.startswith('10.2307/')):
-        in_jstor = True
+        t['in_jstor'] = True

-    files = release.files or []
-    t['file_count'] = len(files)
-    t['fileset_count'] = len(release.filesets or [])
-    t['webcapture_count'] = len(release.webcaptures or [])
-    any_pdf_url = None
-    good_pdf_url = None
-    best_pdf_url = None
-    ia_pdf_url = None
-    for f in files:
-        if f.extra and f.extra.get('shadows'):
-            # TODO: shadow check goes here
-            in_shadows = True
-        is_pdf = 'pdf' in (f.mimetype or '')
-        for release_url in (f.urls or []):
-            if not f.mimetype and 'pdf' in release_url.url.lower():
-                is_pdf = True
-            if release_url.url.lower().startswith('http'):
-                in_web = True
-            if release_url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'):
-                # not sure what rel will be for this stuff
-                in_dweb = True
-            if is_pdf:
-                any_pdf_url = release_url.url
-            if is_pdf and release_url.rel in ('webarchive', 'repository') and is_pdf:
-                is_preserved = True
-                good_pdf_url = release_url.url
-            if '//www.jstor.org/' in release_url.url:
-                in_jstor = True
-            if '//web.archive.org/' in release_url.url or '//archive.org/' in release_url.url:
-                in_ia = True
-                if is_pdf:
-                    best_pdf_url = release_url.url
-                    ia_pdf_url = release_url.url
-    # here is where we bake-in priority; IA-specific
-    t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url
-    t['ia_pdf_url'] = ia_pdf_url
+    # transform file/fileset/webcapture related fields
+    t.update(_rte_content_helper(release))
+
+    if release.ext_ids.doaj:
+        t['is_oa'] = True

     if release.license_slug:
         # TODO: more/better checks here, particularly strict *not* OA licenses
         if release.license_slug.startswith("CC-"):
-            is_oa = True
+            t['is_oa'] = True
         if release.license_slug.startswith("ARXIV-"):
-            is_oa = True
+            t['is_oa'] = True

     extra = release.extra or dict()
     if extra:
         if extra.get('is_oa'):
             # NOTE: not actually setting this anywhere... but could
-            is_oa = True
+            t['is_oa'] = True
         if extra.get('longtail_oa'):
             # sometimes set by GROBID/matcher
-            is_oa = True
-            is_longtail_oa = True
+            t['is_oa'] = True
+            t['is_longtail_oa'] = True
         if not t.get('container_name'):
             t['container_name'] = extra.get('container_name')
         if extra.get('crossref'):
             if extra['crossref'].get('archive'):
                 # all crossref archives are KBART, I believe
-                in_kbart = True
+                t['in_kbart'] = True
         # backwards compatible subtitle fetching
         if not t['subtitle'] and extra.get('subtitle'):
             if type(extra['subtitle']) == list:
@@ -251,7 +181,7 @@ def release_to_elasticsearch(entity, force_bool=True):
     # TODO: non-numerical first pages

     t['ia_microfilm_url'] = None
-    if in_ia_sim:
+    if t['in_ia_sim']:
         # TODO: determine URL somehow? I think this is in flux. Will probably
         # need extra metadata in the container extra field.
         # special case as a demo for now.
@@ -277,42 +207,168 @@
     if t['doi']:
         t['doi_prefix'] = t['doi'].split('/')[0]

-    if is_longtail_oa:
-        is_oa = True
+    if t['is_longtail_oa']:
+        t['is_oa'] = True

+    # optionally coerce all flags from Optional[bool] to bool
     if force_bool:
-        t['is_oa'] = bool(is_oa)
-        t['is_longtail_oa'] = bool(is_longtail_oa)
-        t['in_kbart'] = bool(in_kbart)
-        t['in_ia_sim'] = bool(in_ia_sim)
-        t['in_jstor'] = bool(in_jstor)
-        t['in_web'] = bool(in_web)
-        t['in_dweb'] = bool(in_dweb)
-        t['in_shadows'] = bool(in_shadows)
-    else:
-        t['is_oa'] = is_oa
-        t['is_longtail_oa'] = is_longtail_oa
-        t['in_kbart'] = in_kbart
-        t['in_ia_sim'] = in_ia_sim
-        t['in_jstor'] = in_jstor
-        t['in_web'] = in_web
-        t['in_dweb'] = in_dweb
-        t['in_shadows'] = in_shadows
-
-    t['in_ia'] = bool(in_ia)
-    t['is_preserved'] = bool(is_preserved or in_ia or in_kbart or in_jstor or t.get('pmcid') or t.get('arxiv_id'))
+        for k in ('is_oa', 'is_longtail_oa', 'in_kbart', 'in_ia_sim',
+                'in_jstor', 'in_web', 'in_dweb', 'in_shadows'):
+            t[k] = bool(t[k])
+
+    t['in_ia'] = bool(t['in_ia'])
+    t['is_preserved'] = (
+        bool(t['is_preserved'])
+        or t['in_ia']
+        or t['in_kbart']
+        or t['in_jstor']
+        or t.get('pmcid')
+        or t.get('arxiv_id')
+    )

-    if in_ia:
+    if t['in_ia']:
         t['preservation'] = 'bright'
-    elif in_kbart or in_jstor or t.get('pmcid') or t.get('arxiv_id'):
+    elif t['is_preserved']:
         t['preservation'] = 'dark'
-    elif in_shadows:
+    elif t['in_shadows']:
         t['preservation'] = 'shadows_only'
     else:
         t['preservation'] = 'none'

     return t

+def _rte_container_helper(container: ContainerEntity, release_year: Optional[int]) -> dict:
+    """
+    Container metadata sub-section of release_to_elasticsearch()
+    """
+    this_year = datetime.date.today().year
+    t = dict()
+    t['publisher'] = container.publisher
+    t['container_name'] = container.name
+    # this is container.ident, not release.container_id, because there may
+    # be a redirect involved
+    t['container_id'] = container.ident
+    t['container_issnl'] = container.issnl
+    t['container_type'] = container.container_type
+    if container.extra:
+        c_extra = container.extra
+        if c_extra.get('kbart') and release_year:
+            if check_kbart(release_year, c_extra['kbart'].get('jstor')):
+                t['in_jstor'] = True
+            if t.get('in_kbart') or t.get('in_jstor'):
+                t['in_kbart'] = True
+            for archive in ('portico', 'lockss', 'clockss', 'pkp_pln',
+                    'hathitrust', 'scholarsportal', 'cariniana'):
+                t['in_kbart'] = t.get('in_kbart') or check_kbart(release_year, c_extra['kbart'].get(archive))
+                # recent KBART coverage is often not updated for the
+                # current year. So for current-year publications, consider
+                # coverage from *last* year to also be included in the
+                # Keeper
+                if not t.get('in_kbart') and release_year == this_year:
+                    t['in_kbart'] = check_kbart(this_year - 1, c_extra['kbart'].get(archive))
+
+        if c_extra.get('ia'):
+            if c_extra['ia'].get('sim') and release_year:
+                t['in_ia_sim'] = check_kbart(release_year, c_extra['ia']['sim'])
+            if c_extra['ia'].get('longtail_oa'):
+                t['is_longtail_oa'] = True
+        if c_extra.get('sherpa_romeo'):
+            if c_extra['sherpa_romeo'].get('color') == 'white':
+                t['is_oa'] = False
+        if c_extra.get('default_license') and c_extra.get('default_license').startswith('CC-'):
+            t['is_oa'] = True
+        if c_extra.get('doaj'):
+            if c_extra['doaj'].get('as_of'):
+                t['is_oa'] = True
+        if c_extra.get('road'):
+            if c_extra['road'].get('as_of'):
+                t['is_oa'] = True
+        if c_extra.get('szczepanski'):
+            if c_extra['szczepanski'].get('as_of'):
+                t['is_oa'] = True
+        if c_extra.get('country'):
+            t['country_code'] = c_extra['country']
+            t['country_code_upper'] = c_extra['country'].upper()
+        if c_extra.get('publisher_type'):
+            t['publisher_type'] = c_extra['publisher_type']
+    return t
+
+def _rte_content_helper(release: ReleaseEntity) -> dict:
+    """
+    File/FileSet/WebCapture sub-section of release_to_elasticsearch()
+
+    The current priority order for "best_pdf_url" is:
+    - internet archive urls (archive.org or web.archive.org)
+    - other webarchive or repository URLs
+    - any other URL
+    """
+    t = dict(
+        file_count = len(release.files or []),
+        fileset_count = len(release.filesets or []),
+        webcapture_count = len(release.webcaptures or []),
+    )
+
+    any_pdf_url = None
+    good_pdf_url = None
+    best_pdf_url = None
+    ia_pdf_url = None
+
+    for f in release.files or []:
+        if f.extra and f.extra.get('shadows'):
+            t['in_shadows'] = True
+        is_pdf = 'pdf' in (f.mimetype or '')
+        for release_url in (f.urls or []):
+            # first generic flags
+            t.update(_rte_url_helper(release_url))
+
+            # then PDF specific stuff (for generating "best URL" fields)
+            if not f.mimetype and 'pdf' in release_url.url.lower():
+                is_pdf = True
+            if is_pdf:
+                any_pdf_url = release_url.url
+                if release_url.rel in ('webarchive', 'repository', 'repo'):
+                    good_pdf_url = release_url.url
+                if '//web.archive.org/' in release_url.url or '//archive.org/' in release_url.url:
+                    best_pdf_url = release_url.url
+                    ia_pdf_url = release_url.url
+
+    # here is where we bake-in PDF url priority; IA-specific
+    t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url
+    t['ia_pdf_url'] = ia_pdf_url
+
+    for fs in release.filesets or []:
+        for url_obj in (fs.urls or []):
+            t.update(_rte_url_helper(url_obj))
+
+    for wc in release.webcaptures or []:
+        for url_obj in (wc.archive_urls or []):
+            t.update(_rte_url_helper(url_obj))
+
+    return t
+
+def _rte_url_helper(url_obj) -> dict:
+    """
+    Takes a location URL ('url' and 'rel' keys) and returns generic preservation status.
+
+    Designed to work with file, webcapture, or fileset URLs.
+
+    Returns a dict; should *not* include non-True values for any keys because
+    these will be iteratively update() into the overall object.
+    """
+    t = dict()
+    if url_obj.rel in ('webarchive', 'repository', 'archive', 'repo'):
+        t['is_preserved'] = True
+    if '//web.archive.org/' in url_obj.url or '//archive.org/' in url_obj.url:
+        t['in_ia'] = True
+    if url_obj.url.lower().startswith('http') or url_obj.url.lower().startswith('ftp'):
+        t['in_web'] = True
+    if url_obj.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'):
+        # not sure what rel will be for this stuff
+        t['in_dweb'] = True
+    if '//www.jstor.org/' in url_obj.url:
+        t['in_jstor'] = True
+    return t
+
 def container_to_elasticsearch(entity, force_bool=True):
     """
diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py
index 2f4e2271..59831017 100644
--- a/python/fatcat_tools/transforms/ingest.py
+++ b/python/fatcat_tools/transforms/ingest.py
@@ -15,15 +15,19 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type=
     if release.state != 'active':
         return None

+    # TODO: infer ingest type based on release_type or container metadata?
+    if not ingest_type:
+        ingest_type = 'pdf'
+
     # generate a URL where we expect to find fulltext
     url = None
     link_source = None
     link_source_id = None
-    if release.ext_ids.arxiv:
+    if release.ext_ids.arxiv and ingest_type == "pdf":
         url = "https://arxiv.org/pdf/{}.pdf".format(release.ext_ids.arxiv)
         link_source = "arxiv"
         link_source_id = release.ext_ids.arxiv
-    elif release.ext_ids.pmcid:
+    elif release.ext_ids.pmcid and ingest_type == "pdf":
         # TODO: how to tell if an author manuscript in PMC vs. published?
         #url = "https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf/".format(release.ext_ids.pmcid)
         url = "http://europepmc.org/backend/ptpmcrender.fcgi?accid={}&blobtype=pdf".format(release.ext_ids.pmcid)
@@ -40,10 +44,6 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type=
     ext_ids = release.ext_ids.to_dict()
     ext_ids = dict([(k, v) for (k, v) in ext_ids.items() if v])

-    # TODO: infer ingest type based on release_type or container metadata?
-    if not ingest_type:
-        ingest_type = 'pdf'
-
     ingest_request = {
         'ingest_type': ingest_type,
         'ingest_request_source': ingest_request_source,