diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-12-16 11:29:45 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-12-16 11:31:22 -0800 |
commit | 532a25205f2cd2929c4258dee87bc6c53cd5cdc3 (patch) | |
tree | 7cca6a61dbb76521054014a48cfac59ee688b3c4 /python | |
parent | d6ad61c28ddf5bd7dc57f9766ce57d5b48022d3e (diff) | |
download | fatcat-532a25205f2cd2929c4258dee87bc6c53cd5cdc3.tar.gz fatcat-532a25205f2cd2929c4258dee87bc6c53cd5cdc3.zip |
small release_to_elasticsearch refactors
These should cause almost no change in behavior, but improve code
quality.
The one behavior change is counting ftp URLs as "in_web".
Diffstat (limited to 'python')
-rw-r--r-- | python/fatcat_tools/transforms/elasticsearch.py | 19 |
1 files changed, 12 insertions, 7 deletions
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index b0139751..c2ab5369 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -217,12 +217,18 @@ def release_to_elasticsearch(entity: ReleaseEntity, force_bool: bool = True) -> t[k] = bool(t[k]) t['in_ia'] = bool(t['in_ia']) - t['is_preserved'] = bool(t['is_preserved'] or t['in_ia'] or t['in_kbart'] or t['in_jstor'] or t.get('pmcid') or t.get('arxiv_id')) + t['is_preserved'] = ( + bool(t['is_preserved']) + or t['in_ia'] + or t['in_kbart'] + or t['in_jstor'] + or t.get('pmcid') + or t.get('arxiv_id') + ) if t['in_ia']: t['preservation'] = 'bright' - # XXX: simplify: elif t['is_preserved'] - elif t['in_kbart'] or t['in_jstor'] or t.get('pmcid') or t.get('arxiv_id'): + elif t['is_preserved']: t['preservation'] = 'dark' elif t['in_shadows']: t['preservation'] = 'shadows_only' @@ -244,12 +250,12 @@ def _rte_container_helper(container: ContainerEntity, release_year: Optional[int t['container_id'] = container.ident t['container_issnl'] = container.issnl t['container_type'] = container.container_type + t['in_kbart'] = None if container.extra: c_extra = container.extra if c_extra.get('kbart') and release_year: t['in_jstor'] = check_kbart(release_year, c_extra['kbart'].get('jstor')) - # XXX: - t['in_kbart'] = t['in_jstor'] + t['in_kbart'] = t['in_kbart'] or t['in_jstor'] for archive in ('portico', 'lockss', 'clockss', 'pkp_pln', 'hathitrust', 'scholarsportal', 'cariniana'): t['in_kbart'] = t['in_kbart'] or check_kbart(release_year, c_extra['kbart'].get(archive)) @@ -309,8 +315,7 @@ def _rte_content_helper(release: ReleaseEntity) -> dict: for release_url in (f.urls or []): if not f.mimetype and 'pdf' in release_url.url.lower(): is_pdf = True - if release_url.url.lower().startswith('http'): - # XXX: also startswith('ftp') + if release_url.url.lower().startswith('http') or 
release_url.url.lower().startswith('ftp'): t['in_web'] = True if release_url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'): # not sure what rel will be for this stuff |