aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2020-01-29 20:39:22 -0800
committerBryan Newbold <bnewbold@robocracy.org>2020-01-29 21:59:05 -0800
commite047fbe1a9c495e86a6757d44eb32c9109a1b753 (patch)
treed4e1e256248993ea6897dc40055d2a7242ca6526
parent8e8b447a1d142b7815498ffa02263c34207973b4 (diff)
downloadfatcat-e047fbe1a9c495e86a6757d44eb32c9109a1b753.tar.gz
fatcat-e047fbe1a9c495e86a6757d44eb32c9109a1b753.zip
ES release schema updates
-rw-r--r--extra/elasticsearch/release_schema.json69
-rw-r--r--python/fatcat_tools/transforms/elasticsearch.py81
2 files changed, 122 insertions, 28 deletions
diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json
index 85026060..98a1c28e 100644
--- a/extra/elasticsearch/release_schema.json
+++ b/extra/elasticsearch/release_schema.json
@@ -27,48 +27,62 @@
"mappings": {
"release": {
"properties": {
- "ident": { "type": "keyword" },
+ "ident": { "type": "keyword", "doc_values": false },
"state": { "type": "keyword" },
- "revision": { "type": "keyword" },
- "work_id": { "type": "keyword" },
- "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "revision": { "type": "keyword", "doc_values": false },
+ "work_id": { "type": "keyword", "doc_values": false },
+ "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+ "subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+ "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
"release_date": { "type": "date" },
- "release_year": { "type": "integer" },
- "release_type": { "type": "keyword" },
+ "release_year": { "type": "integer", "copy_to": "biblio" },
+ "release_type": { "type": "keyword", "copy_to": "biblio" },
"release_stage": { "type": "keyword" },
- "withdrawn_status": { "type": "keyword" },
+ "withdrawn_status": { "type": "keyword", "copy_to": "biblio" },
"language": { "type": "keyword" },
- "doi": { "type": "keyword" },
- "pmid": { "type": "keyword" },
- "pmcid": { "type": "keyword" },
- "isbn13": { "type": "keyword" },
- "wikidata_qid": { "type": "keyword" },
- "core_id": { "type": "keyword" },
- "axiv_id": { "type": "keyword" },
- "jstor_id": { "type": "keyword" },
- "ark_id": { "type": "keyword" },
- "mag_id": { "type": "keyword" },
+ "volume": { "type": "keyword", "copy_to": "biblio" },
+ "issue": { "type": "keyword", "copy_to": "biblio" },
+ "pages": { "type": "keyword", "copy_to": "biblio" },
+ "first_page": { "type": "keyword" },
+ "number": { "type": "keyword", "copy_to": "biblio" },
+ "doi": { "type": "keyword", "doc_values": false },
+ "doi_prefix": { "type": "keyword" },
+ "doi_registrar": { "type": "keyword" },
+ "pmid": { "type": "keyword", "doc_values": false },
+ "pmcid": { "type": "keyword", "doc_values": false },
+ "isbn13": { "type": "keyword", "doc_values": false },
+ "wikidata_qid": { "type": "keyword", "doc_values": false },
+ "core_id": { "type": "keyword", "doc_values": false },
+ "axiv_id": { "type": "keyword", "doc_values": false },
+ "jstor_id": { "type": "keyword", "doc_values": false },
+ "ark_id": { "type": "keyword", "doc_values": false },
+ "mag_id": { "type": "keyword", "doc_values": false },
"license": { "type": "keyword" },
"publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
"container_id": { "type": "keyword" },
"container_issnl": { "type": "keyword" },
"container_type": { "type": "keyword" },
"contrib_count": { "type": "integer" },
- "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+ "affiliations": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "affiliation_rors": { "type": "keyword" },
"creator_ids": { "type": "keyword" },
"ref_count": { "type": "integer" },
"ref_linked_count": { "type": "integer" },
+ "ref_release_ids": { "type": "keyword" },
"file_count": { "type": "integer" },
"fileset_count": { "type": "integer" },
"webcapture_count": { "type": "integer" },
"any_abstract": { "type": "boolean" },
- "best_pdf_url": { "type": "keyword" },
- "ia_pdf_url": { "type": "keyword" },
+ "biblio": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+
+ "best_pdf_url": { "type": "keyword", "doc_values": false },
+ "ia_pdf_url": { "type": "keyword", "doc_values": false },
+ "ia_microfilm_url": { "type": "keyword", "doc_values": false },
"is_oa": { "type": "boolean" },
+ "oa_color": { "type": "keyword" },
"is_longtail_oa": { "type": "boolean" },
"is_preserved": { "type": "boolean" },
"in_kbart": { "type": "boolean" },
@@ -79,7 +93,13 @@
"in_ia_sim": { "type": "boolean" },
"in_shadows": { "type": "boolean" },
"is_superceded": { "type": "boolean" },
+ "is_retracted": { "type": "boolean" },
+ "preservation": { "type": "keyword" },
+ "affiliation": { "type": "alias", "path": "affiliations" },
+ "ror": { "type": "alias", "path": "affiliation_rors" },
+ "creator_id": { "type": "alias", "path": "creator_ids" },
+ "ref_release_id": { "type": "alias", "path": "ref_release_ids" },
"author": { "type": "alias", "path": "contrib_names" },
"journal": { "type": "alias", "path": "container_name" },
"date": { "type": "alias", "path": "release_date" },
@@ -90,6 +110,9 @@
"lang": { "type": "alias", "path": "language" },
"file_pdf_url": { "type": "alias", "path": "best_pdf_url" },
"release_status": { "type": "alias", "path": "release_stage" },
+ "stage": { "type": "alias", "path": "release_stage" },
+ "type": { "type": "alias", "path": "release_type" },
+ "retracted": { "type": "alias", "path": "is_retracted" },
"is_kept": { "type": "alias", "path": "in_kbart" }
}
}
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index edc68748..b997796d 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -50,6 +50,10 @@ def release_to_elasticsearch(entity, force_bool=True):
release_stage = release.release_stage,
withdrawn_status = release.withdrawn_status,
language = release.language,
+ volume = release.volume,
+ issue = release.issue,
+ pages = release.pages,
+ number = release.number,
license = release.license_slug,
doi = release.ext_ids.doi,
pmid = release.ext_ids.pmid,
@@ -72,7 +76,7 @@ def release_to_elasticsearch(entity, force_bool=True):
in_dweb = False
in_ia = False
in_ia_sim = False
- in_shadow = False
+ in_shadows = False
release_year = release.release_year
if release.release_date:
@@ -85,11 +89,15 @@ def release_to_elasticsearch(entity, force_bool=True):
t['any_abstract'] = len(release.abstracts or []) > 0
t['ref_count'] = len(release.refs or [])
- t['ref_linked_count'] = 0
- if release.refs:
- t['ref_linked_count'] = len([1 for ref in release.refs if ref.target_release_id])
+ ref_release_ids = []
+ for r in (release.refs or []):
+ if r.target_release_id:
+ ref_release_ids.append(r.target_release_id)
+ t['ref_release_ids'] = ref_release_ids
+ t['ref_linked_count'] = len(ref_release_ids)
t['contrib_count'] = len(release.contribs or [])
contrib_names = []
+ contrib_affiliations = []
creator_ids = []
for c in (release.contribs or []):
if c.raw_name:
@@ -98,8 +106,14 @@ def release_to_elasticsearch(entity, force_bool=True):
contrib_names.append(c.surname)
if c.creator_id:
creator_ids.append(c.creator_id)
+ if c.raw_affiliation:
+ contrib_affiliations.append(c.raw_affiliation)
t['contrib_names'] = contrib_names
t['creator_ids'] = creator_ids
+ t['affiliations'] = contrib_affiliations
+
+ # TODO: mapping... probably by lookup?
+ t['affiliation_rors'] = None
container = release.container
if container:
@@ -140,8 +154,13 @@ def release_to_elasticsearch(entity, force_bool=True):
if c_extra.get('szczepanski'):
if c_extra['szczepanski'].get('as_of'):
is_oa = True
- else:
+
+ # fall back to release-level container metadata if container not linked or
+ # missing context
+ if not t.get('publisher'):
t['publisher'] = release.publisher
+ if not t.get('container_name') and release.extra:
+ t['container_name'] = release.extra.get('container_name')
if release.ext_ids.jstor or (release.ext_ids.doi and release.ext_ids.doi.startswith('10.2307/')):
in_jstor = True
@@ -203,6 +222,46 @@ def release_to_elasticsearch(entity, force_bool=True):
if extra['crossref'].get('archive'):
# all crossref archives are KBART, I believe
in_kbart = True
+ # backwards compatible subtitle fetching
+ if not t['subtitle'] and extra.get('subtitle'):
+ if type(extra['subtitle']) == list:
+ t['subtitle'] = extra['subtitle'][0]
+ else:
+ t['subtitle'] = extra['subtitle']
+
+ t['first_page'] = None
+ if release.pages:
+ first = release.pages.split('-')[0]
+ first = first.replace('p', '')
+ if release.pages.isdigit():
+ t['first_page'] = release.pages
+ # TODO: non-numerical first pages
+
+ t['ia_microfilm_url'] = None
+ if in_ia_sim:
+ # TODO: determine URL somehow? I think this is in flux. Will probably
+ # need extra metadata in the container extra field.
+ # special case as a demo for now.
+ if release.container_id == "hl5g6d5msjcl7hlbyyvcsbhc2u" \
+ and release.year in (2011, 2013) \
+ and release.volume.isdigit() \
+ and t['first_page']:
+ t['ia_microfilm_url'] = "https://archive.org/details/sim_bjog_{}-{:02d}/page/n{}".format(
+ release.year,
+ release.volume - 1,
+ t['first_page'],
+ )
+
+ t['doi_registrar'] = None
+ if extra and t['doi']:
+ for k in ('crossref', 'datacite', 'jalc'):
+ if k in extra:
+ t['doi_registrar'] = k
+ if not 'doi_registrar' in t:
+ t['doi_registrar'] = 'crossref'
+
+ if t['doi']:
+ t['doi_prefix'] = t['doi'].split('/')[0]
if is_longtail_oa:
is_oa = True
@@ -215,6 +274,7 @@ def release_to_elasticsearch(entity, force_bool=True):
t['in_jstor'] = bool(in_jstor)
t['in_web'] = bool(in_web)
t['in_dweb'] = bool(in_dweb)
+ t['in_shadows'] = bool(in_shadows)
else:
t['is_oa'] = is_oa
t['is_longtail_oa'] = is_longtail_oa
@@ -223,9 +283,20 @@ def release_to_elasticsearch(entity, force_bool=True):
t['in_jstor'] = in_jstor
t['in_web'] = in_web
t['in_dweb'] = in_dweb
+ t['in_shadows'] = in_shadows
t['in_ia'] = bool(in_ia)
t['is_preserved'] = bool(is_preserved or in_ia or in_kbart or in_jstor)
+
+ if in_ia:
+ t['preservation'] = 'bright'
+ elif in_kbart or in_jstor:
+ t['preservation'] = 'dark_only'
+ elif in_shadows:
+ t['preservation'] = 'shadows_only'
+ else:
+ t['preservation'] = 'none'
+
return t
def container_to_elasticsearch(entity, force_bool=True):