aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/transforms.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/transforms.py')
-rw-r--r--python/fatcat_tools/transforms.py130
1 files changed, 98 insertions, 32 deletions
diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms.py
index 0f957f9a..2493b1ab 100644
--- a/python/fatcat_tools/transforms.py
+++ b/python/fatcat_tools/transforms.py
@@ -1,4 +1,5 @@
+
import collections
from fatcat_client import ReleaseEntity, ApiClient
@@ -26,25 +27,43 @@ def release_to_elasticsearch(release):
Raises exception on error (never returns None)
"""
- if release.state != 'active':
- raise ValueError("Entity is not 'active'")
+ if release.state in ('redirect', 'deleted'):
+ return dict(
+ ident = release.ident,
+ state = release.state,
+ )
+ elif release.state != 'active':
+ raise ValueError("Unhandled release state: {}".format(release.state))
# First, the easy ones (direct copy)
t = dict(
ident = release.ident,
+ state = release.state,
revision = release.revision,
title = release.title,
+ original_title = release.original_title,
release_type = release.release_type,
release_status = release.release_status,
language = release.language,
+ license = release.license_slug,
doi = release.doi,
pmid = release.pmid,
pmcid = release.pmcid,
isbn13 = release.isbn13,
+ wikidata_qid = release.wikidata_qid,
core_id = release.core_id,
- wikidata_qid = release.wikidata_qid
+ arxiv_id = release.core_id,
+ jstor_id = release.jstor_id,
)
+ is_oa = None
+ is_longtail_oa = None
+ in_kbart = None
+ in_web = False
+ in_dweb = False
+ in_ia = False
+ in_shadow = False
+
if release.release_date:
# .isoformat() results in, eg, '2010-10-22' (YYYY-MM-DD)
t['release_date'] = release.release_date.isoformat()
@@ -53,52 +72,99 @@ def release_to_elasticsearch(release):
if release.release_year is not None:
t['release_year'] = release.release_year
+ t['any_abstract'] = len(release.abstracts) > 0
+ t['ref_count'] = len(release.refs or [])
+ t['contrib_count'] = len(release.contribs or [])
+ contrib_names = []
+ for c in (release.contribs or []):
+ if c.raw_name:
+ contrib_names.append(c.raw_name)
+ t['contrib_names'] = contrib_names
+
container = release.container
- container_is_kept = False
if container:
t['publisher'] = container.publisher
t['container_name'] = container.name
t['container_issnl'] = container.issnl
- container_extra = container.extra
- if container_extra:
- t['container_is_oa'] = container_extra.get('is_oa')
- container_is_kept = container_extra.get('is_kept', False)
- t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa')
+ t['container_type'] = container.container_type
+ if container.extra:
+ if container.extra.get('is_oa') or container.extra.get('in_doaj'):
+ is_oa = True
+ if container.extra.get('in_kbart'):
+ # TODO: better KBART check goes here
+ in_kbart = True
+ if container.extra.get('ia'):
+ # TODO: container longtail check goes here
+ # TODO: sim/microfilm check goes here
+ pass
+ # TODO: SHERPA/Romeo goes here
else:
t['publisher'] = release.publisher
files = release.files or []
t['file_count'] = len(files)
- in_wa = False
- in_ia = False
- t['file_pdf_url'] = None
+ t['fileset_count'] = len(release.filesets or [])
+ t['webcapture_count'] = len(release.webcaptures or [])
+ any_pdf_url = None
+ good_pdf_url = None
+ best_pdf_url = None
+ ia_pdf_url = None
for f in files:
+ if f.extra and f.extra.get('shadows'):
+ # TODO: shadow check goes here
+ in_shadows = True
is_pdf = 'pdf' in (f.mimetype or '')
for url in (f.urls or []):
- if url.rel == 'webarchive':
- in_wa = True
- if '//web.archive.org/' in (url.url or '') or '//archive.org/' in (url.url or ''):
+ if url.url.lower().startswith('http'):
+ in_web = True
+ if url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'):
+ # TODO: not sure what rel will be
+ in_dweb = True
+ if is_pdf:
+ any_pdf_url = url.url
+ if is_pdf and url.rel in ('webarchive', 'repository') and is_pdf:
+ is_preserved = True
+ good_pdf_url = url.url
+ if '//web.archive.org/' in url.url or '//archive.org/' in url.url:
in_ia = True
if is_pdf:
- t['file_pdf_url'] = url.url
- if not t['file_pdf_url'] and is_pdf:
- t['file_pdf_url'] = url.url
- t['file_in_webarchive'] = in_wa
- t['file_in_ia'] = in_ia
+ best_pdf_url = url.url
+ ia_pdf_url = url.url
+ # here is where we bake-in priority; IA-specific
+ t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url
+ t['ia_pdf_url'] = ia_pdf_url
+
+ if release.license_slug:
+ # TODO: more/better checks here, particularly strict *not* OA licenses
+ if release.license_slug.startswith("CC-"):
+ is_oa = True
extra = release.extra or dict()
if extra:
- t['in_shadow'] = extra.get('in_shadow')
- if extra.get('grobid') and extra['grobid'].get('is_longtail_oa'):
- t['container_is_longtail_oa'] = True
- t['any_abstract'] = bool(release.abstracts)
- t['is_kept'] = container_is_kept or extra.get('is_kept', False)
+ # TODO: longtail OA check from GROBID here
+ if extra.get('in_kbart'):
+ # NOTE: not actually setting this anywhere
+ in_kbart = True
+ if extra.get('is_oa'):
+ # NOTE: not actually setting this anywhere
+ is_oa = True
+ if extra.get('grobid'):
+ if not t.get('container_name'):
+ t['container_name'] = extra['grobid'].get('container_name')
+ if extra['grobid'].get('longtail_oa'):
+ is_longtail_oa = True
+ if extra.get('crossref'):
+ if extra['crossref'].get('archive'):
+ # all crossref archives are KBART, I believe
+ in_kbart = True
- t['ref_count'] = len(release.refs or [])
- t['contrib_count'] = len(release.contribs or [])
- contrib_names = []
- for c in (release.contribs or []):
- if c.raw_name:
- contrib_names.append(c.raw_name)
- t['contrib_names'] = contrib_names
+ if is_longtail_oa:
+ is_oa = True
+ t['is_oa'] = is_oa
+ t['is_longtail_oa'] = is_longtail_oa
+ t['in_kbart'] = in_kbart
+ t['in_web'] = in_web
+ t['in_dweb'] = in_dweb
+ t['in_ia'] = in_ia
+ t['is_preserved'] = in_ia or in_kbart
return t