summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/transforms.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/transforms.py')
-rw-r--r--python/fatcat_tools/transforms.py183
1 files changed, 155 insertions, 28 deletions
diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms.py
index 2493b1ab..a85c877c 100644
--- a/python/fatcat_tools/transforms.py
+++ b/python/fatcat_tools/transforms.py
@@ -19,7 +19,22 @@ def entity_from_json(json_str, entity_type):
thing.data = json_str
return ac.deserialize(thing, entity_type)
-def release_to_elasticsearch(release):
def check_kbart(year, archive):
    """Check whether a year is covered by an archive's KBART year spans.

    Returns None when coverage is unknown (no archive info or no
    'year_spans' key), otherwise True/False for span membership
    (span endpoints are inclusive).
    """
    if not archive or not archive.get('year_spans'):
        return None
    return any(first <= year <= last for (first, last) in archive['year_spans'])
+
def test_check_kbart():
    """Unit tests for check_kbart() span-membership logic."""

    # missing/empty archive info means coverage is unknown (None), not False
    assert check_kbart(2000, None) is None
    assert check_kbart(2000, dict()) is None
    assert check_kbart(2000, dict(year_spans=[])) is None

    # single span: miss, and inclusive boundary hit
    # (use 'is' checks so a truthy non-bool can't sneak through)
    assert check_kbart(1990, dict(year_spans=[[2000, 2000]])) is False
    assert check_kbart(2000, dict(year_spans=[[2000, 2000]])) is True

    # multiple spans: gap between spans, and hit in second span
    assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1990, 2000]])) is False
    assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1930, 2000]])) is True
+
+def release_to_elasticsearch(entity):
"""
Converts from an entity model/schema to elasticsearch oriented schema.
@@ -27,15 +42,16 @@ def release_to_elasticsearch(release):
Raises exception on error (never returns None)
"""
- if release.state in ('redirect', 'deleted'):
+ if entity.state in ('redirect', 'deleted'):
return dict(
- ident = release.ident,
- state = release.state,
+ ident = entity.ident,
+ state = entity.state,
)
- elif release.state != 'active':
- raise ValueError("Unhandled release state: {}".format(release.state))
+ elif entity.state != 'active':
+ raise ValueError("Unhandled entity state: {}".format(entity.state))
# First, the easy ones (direct copy)
+ release = entity
t = dict(
ident = release.ident,
state = release.state,
@@ -57,11 +73,14 @@ def release_to_elasticsearch(release):
)
is_oa = None
+ is_preserved = None
is_longtail_oa = None
in_kbart = None
+ in_jstor = False
in_web = False
in_dweb = False
in_ia = False
+ in_ia_sim = False
in_shadow = False
if release.release_date:
@@ -88,19 +107,35 @@ def release_to_elasticsearch(release):
t['container_issnl'] = container.issnl
t['container_type'] = container.container_type
if container.extra:
- if container.extra.get('is_oa') or container.extra.get('in_doaj'):
+ c_extra = container.extra
+ if c_extra.get('kbart') and release.year:
+ in_jstor = check_kbart(release.year, c_extra['kbart'].get('jstor'))
+ in_kbart = in_jstor
+ for archive in ('portico', 'lockss', 'clockss'):
+ in_kbart = in_kbart or check_kbart(release.year, c_extra['kbart'].get(archive))
+
+ if c_extra.get('ia'):
+ if c_extra['ia'].get('sim') and release.year:
+            in_ia_sim = check_kbart(release.year, c_extra['ia']['sim'])
+ if c_extra['ia'].get('longtail_oa'):
+ is_longtail_oa = True
+ if c_extra.get('sherpa_romeo'):
+ if c_extra['sherpa_romeo'].get('color') == 'white':
+ is_oa = False
+ if c_extra.get('default_license') and c_extra.get('default_license').startswith('CC-'):
is_oa = True
- if container.extra.get('in_kbart'):
- # TODO: better KBART check goes here
- in_kbart = True
- if container.extra.get('ia'):
- # TODO: container longtail check goes here
- # TODO: sim/microfilm check goes here
- pass
- # TODO: SHERPA/Romeo goes here
+ if c_extra.get('doaj'):
+ if c_extra['doaj'].get('as_of'):
+ is_oa = True
+ if c_extra.get('road'):
+ if c_extra['road'].get('as_of'):
+ is_oa = True
else:
t['publisher'] = release.publisher
+ if release.jstor_id or (release.doi and release.doi.startswith('10.2307/')):
+ in_jstor = True
+
files = release.files or []
t['file_count'] = len(files)
t['fileset_count'] = len(release.filesets or [])
@@ -118,13 +153,15 @@ def release_to_elasticsearch(release):
if url.url.lower().startswith('http'):
in_web = True
if url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'):
- # TODO: not sure what rel will be
+ # not sure what rel will be for this stuff
in_dweb = True
if is_pdf:
any_pdf_url = url.url
if is_pdf and url.rel in ('webarchive', 'repository') and is_pdf:
is_preserved = True
good_pdf_url = url.url
+ if '//www.jstor.org/' in url.url:
+ in_jstor = True
if '//web.archive.org/' in url.url or '//archive.org/' in url.url:
in_ia = True
if is_pdf:
@@ -141,18 +178,15 @@ def release_to_elasticsearch(release):
extra = release.extra or dict()
if extra:
- # TODO: longtail OA check from GROBID here
- if extra.get('in_kbart'):
- # NOTE: not actually setting this anywhere
- in_kbart = True
if extra.get('is_oa'):
- # NOTE: not actually setting this anywhere
+ # NOTE: not actually setting this anywhere... but could
is_oa = True
- if extra.get('grobid'):
- if not t.get('container_name'):
- t['container_name'] = extra['grobid'].get('container_name')
- if extra['grobid'].get('longtail_oa'):
- is_longtail_oa = True
+ if extra.get('longtail_oa'):
+ # sometimes set by GROBID/matcher
+ is_oa = True
+ is_longtail_oa = True
+ if not t.get('container_name'):
+ t['container_name'] = extra.get('container_name')
if extra.get('crossref'):
if extra['crossref'].get('archive'):
# all crossref archives are KBART, I believe
@@ -163,8 +197,101 @@ def release_to_elasticsearch(release):
t['is_oa'] = is_oa
t['is_longtail_oa'] = is_longtail_oa
t['in_kbart'] = in_kbart
+ t['in_jstor'] = in_jstor
t['in_web'] = in_web
t['in_dweb'] = in_dweb
- t['in_ia'] = in_ia
- t['is_preserved'] = in_ia or in_kbart
+ t['in_ia'] = bool(in_ia)
+ t['is_preserved'] = bool(is_preserved or in_ia or in_kbart or in_jstor)
+ return t
+
def container_to_elasticsearch(entity):
    """
    Converts from an entity model/schema to elasticsearch oriented schema.

    Returns: dict
    Raises exception on error (never returns None)
    """

    if entity.state in ('redirect', 'deleted'):
        return dict(
            ident = entity.ident,
            state = entity.state,
        )
    elif entity.state != 'active':
        raise ValueError("Unhandled entity state: {}".format(entity.state))

    # First, the easy ones (direct copy)
    t = dict(
        ident = entity.ident,
        state = entity.state,
        revision = entity.revision,

        name = entity.name,
        publisher = entity.publisher,
        container_type = entity.container_type,
        issnl = entity.issnl,
        wikidata_qid = entity.wikidata_qid,

        entity_status = entity.entity_status,
        language = entity.language,
        license = entity.license_slug,
        doi = entity.doi,
        pmid = entity.pmid,
        isbn13 = entity.isbn13,
        core_id = entity.core_id,
        # BUGFIX: was copying entity.core_id into arxiv_id (copy/paste slip)
        arxiv_id = entity.arxiv_id,
        jstor_id = entity.jstor_id,
    )

    # BUGFIX: guard against entity.extra being None before any .get() calls
    # (release_to_elasticsearch does the same with `or dict()`)
    extra = entity.extra or dict()

    # TODO: region, discipline
    # TODO: single primary language?
    for key in ('country', 'languages', 'mimetypes', 'first_year', 'last_year'):
        if extra.get(key):
            t[key] = extra[key]

    in_doaj = None
    in_road = None
    # TODO: not currently implemented
    in_doi = None
    # TODO: would be nice to have 'in_doaj_works', or maybe just "any_pid"
    #in_doaj_works = None
    in_sherpa_romeo = None
    is_oa = None
    # TODO: not actually set/stored anywhere?
    is_longtail_oa = None
    any_kbart = None
    any_jstor = None
    any_ia_sim = None

    if extra.get('doaj'):
        if extra['doaj'].get('as_of'):
            in_doaj = True
    if extra.get('road'):
        if extra['road'].get('as_of'):
            in_road = True
    if extra.get('default_license'):
        if extra['default_license'].startswith('CC-'):
            is_oa = True
    if extra.get('sherpa_romeo'):
        in_sherpa_romeo = True
        if extra['sherpa_romeo'].get('color') == 'white':
            # SHERPA/RoMEO "white" means no self-archiving allowed
            is_oa = False
    if extra.get('kbart'):
        any_kbart = True
        if extra['kbart'].get('jstor'):
            any_jstor = True
    if extra.get('ia'):
        if extra['ia'].get('sim'):
            any_ia_sim = True

    # BUGFIX: these assignments used undefined names ('is_doaj', 'is_road',
    # 'ia_oa') and assigned any_ia_sim to the kbart/jstor keys
    t['in_doaj'] = in_doaj
    t['in_road'] = in_road
    t['in_doi'] = in_doi
    t['in_sherpa_romeo'] = in_sherpa_romeo
    t['is_oa'] = in_doaj or in_road or is_longtail_oa or is_oa
    t['is_longtail_oa'] = is_longtail_oa
    t['any_kbart'] = any_kbart
    t['any_jstor'] = any_jstor
    t['any_ia_sim'] = bool(any_ia_sim)
    return t