about | summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_tools/transforms
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2020-12-16 11:29:45 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2020-12-16 11:31:22 -0800
commit: 532a25205f2cd2929c4258dee87bc6c53cd5cdc3 (patch)
tree: 7cca6a61dbb76521054014a48cfac59ee688b3c4 /python/fatcat_tools/transforms
parent: d6ad61c28ddf5bd7dc57f9766ce57d5b48022d3e (diff)
download: fatcat-532a25205f2cd2929c4258dee87bc6c53cd5cdc3.tar.gz
fatcat-532a25205f2cd2929c4258dee87bc6c53cd5cdc3.zip
small release_to_elasticsearch refactors
These should have almost no change in behavior, but improve code quality. The one behavior change is counting ftp URLs as "in_web".
Diffstat (limited to 'python/fatcat_tools/transforms')
-rw-r--r-- python/fatcat_tools/transforms/elasticsearch.py 19
1 file changed, 12 insertions, 7 deletions
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index b0139751..c2ab5369 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -217,12 +217,18 @@ def release_to_elasticsearch(entity: ReleaseEntity, force_bool: bool = True) ->
t[k] = bool(t[k])
t['in_ia'] = bool(t['in_ia'])
- t['is_preserved'] = bool(t['is_preserved'] or t['in_ia'] or t['in_kbart'] or t['in_jstor'] or t.get('pmcid') or t.get('arxiv_id'))
+ t['is_preserved'] = (
+ bool(t['is_preserved'])
+ or t['in_ia']
+ or t['in_kbart']
+ or t['in_jstor']
+ or t.get('pmcid')
+ or t.get('arxiv_id')
+ )
if t['in_ia']:
t['preservation'] = 'bright'
- # XXX: simplify: elif t['is_preserved']
- elif t['in_kbart'] or t['in_jstor'] or t.get('pmcid') or t.get('arxiv_id'):
+ elif t['is_preserved']:
t['preservation'] = 'dark'
elif t['in_shadows']:
t['preservation'] = 'shadows_only'
@@ -244,12 +250,12 @@ def _rte_container_helper(container: ContainerEntity, release_year: Optional[int
t['container_id'] = container.ident
t['container_issnl'] = container.issnl
t['container_type'] = container.container_type
+ t['in_kbart'] = None
if container.extra:
c_extra = container.extra
if c_extra.get('kbart') and release_year:
t['in_jstor'] = check_kbart(release_year, c_extra['kbart'].get('jstor'))
- # XXX:
- t['in_kbart'] = t['in_jstor']
+ t['in_kbart'] = t['in_kbart'] or t['in_jstor']
for archive in ('portico', 'lockss', 'clockss', 'pkp_pln',
'hathitrust', 'scholarsportal', 'cariniana'):
t['in_kbart'] = t['in_kbart'] or check_kbart(release_year, c_extra['kbart'].get(archive))
@@ -309,8 +315,7 @@ def _rte_content_helper(release: ReleaseEntity) -> dict:
for release_url in (f.urls or []):
if not f.mimetype and 'pdf' in release_url.url.lower():
is_pdf = True
- if release_url.url.lower().startswith('http'):
- # XXX: also startswith('ftp')
+ if release_url.url.lower().startswith('http') or release_url.url.lower().startswith('ftp'):
t['in_web'] = True
if release_url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'):
# not sure what rel will be for this stuff