ES release schema updates

author: Bryan Newbold <bnewbold@robocracy.org> 2020-01-29 20:39:22 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2020-01-29 21:59:05 -0800
commit: e047fbe1a9c495e86a6757d44eb32c9109a1b753 (patch)
tree: d4e1e256248993ea6897dc40055d2a7242ca6526
parent: 8e8b447a1d142b7815498ffa02263c34207973b4 (diff)
download: fatcat-e047fbe1a9c495e86a6757d44eb32c9109a1b753.tar.gz
fatcat-e047fbe1a9c495e86a6757d44eb32c9109a1b753.zip
2 files changed, 122 insertions, 28 deletions
diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json
index 85026060..98a1c28e 100644
--- a/extra/elasticsearch/release_schema.json
+++ b/extra/elasticsearch/release_schema.json
@@ -27,48 +27,62 @@
 "mappings": {
     "release": {
         "properties": {
-            "ident":          { "type": "keyword" },
+            "ident":          { "type": "keyword", "doc_values": false },
             "state":          { "type": "keyword" },
-            "revision":       { "type": "keyword" },
-            "work_id":        { "type": "keyword" },
-            "title":          { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
-            "subtitle":       { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
-            "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+            "revision":       { "type": "keyword", "doc_values": false },
+            "work_id":        { "type": "keyword", "doc_values": false },
+            "title":          { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+            "subtitle":       { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+            "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
             "release_date":   { "type": "date" },
-            "release_year":   { "type": "integer" },
-            "release_type":   { "type": "keyword" },
+            "release_year":   { "type": "integer", "copy_to": "biblio" },
+            "release_type":   { "type": "keyword", "copy_to": "biblio" },
             "release_stage":  { "type": "keyword" },
-            "withdrawn_status": { "type": "keyword" },
+            "withdrawn_status": { "type": "keyword", "copy_to": "biblio" },
             "language":       { "type": "keyword" },
-            "doi":            { "type": "keyword" },
-            "pmid":           { "type": "keyword" },
-            "pmcid":          { "type": "keyword" },
-            "isbn13":         { "type": "keyword" },
-            "wikidata_qid":   { "type": "keyword" },
-            "core_id":        { "type": "keyword" },
-            "axiv_id":        { "type": "keyword" },
-            "jstor_id":       { "type": "keyword" },
-            "ark_id":         { "type": "keyword" },
-            "mag_id":         { "type": "keyword" },
+            "volume":         { "type": "keyword", "copy_to": "biblio" },
+            "issue":          { "type": "keyword", "copy_to": "biblio" },
+            "pages":          { "type": "keyword", "copy_to": "biblio" },
+            "first_page":     { "type": "keyword" },
+            "number":         { "type": "keyword", "copy_to": "biblio" },
+            "doi":            { "type": "keyword", "doc_values": false },
+            "doi_prefix":     { "type": "keyword" },
+            "doi_registrar":  { "type": "keyword" },
+            "pmid":           { "type": "keyword", "doc_values": false },
+            "pmcid":          { "type": "keyword", "doc_values": false },
+            "isbn13":         { "type": "keyword", "doc_values": false },
+            "wikidata_qid":   { "type": "keyword", "doc_values": false },
+            "core_id":        { "type": "keyword", "doc_values": false },
+            "axiv_id":        { "type": "keyword", "doc_values": false },
+            "jstor_id":       { "type": "keyword", "doc_values": false },
+            "ark_id":         { "type": "keyword", "doc_values": false },
+            "mag_id":         { "type": "keyword", "doc_values": false },
             "license":        { "type": "keyword" },
             "publisher":            { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
-            "container_name":       { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+            "container_name":       { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
             "container_id":         { "type": "keyword" },
             "container_issnl":      { "type": "keyword" },
             "container_type":       { "type": "keyword" },
             "contrib_count":        { "type": "integer" },
-            "contrib_names":        { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+            "contrib_names":        { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+            "affiliations":         { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+            "affiliation_rors":     { "type": "keyword" },
             "creator_ids":          { "type": "keyword" },
             "ref_count":            { "type": "integer" },
             "ref_linked_count":     { "type": "integer" },
+            "ref_release_ids":      { "type": "keyword" },
             "file_count":           { "type": "integer" },
             "fileset_count":        { "type": "integer" },
             "webcapture_count":     { "type": "integer" },
             "any_abstract":         { "type": "boolean" },
 
-            "best_pdf_url":         { "type": "keyword" },
-            "ia_pdf_url":           { "type": "keyword" },
+            "biblio":               { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+
+            "best_pdf_url":         { "type": "keyword", "doc_values": false },
+            "ia_pdf_url":           { "type": "keyword", "doc_values": false },
+            "ia_microfilm_url":     { "type": "keyword", "doc_values": false },
             "is_oa":                { "type": "boolean" },
+            "oa_color":             { "type": "keyword" },
             "is_longtail_oa":       { "type": "boolean" },
             "is_preserved":         { "type": "boolean" },
             "in_kbart":             { "type": "boolean" },
@@ -79,7 +93,13 @@
             "in_ia_sim":            { "type": "boolean" },
             "in_shadows":           { "type": "boolean" },
             "is_superceded":        { "type": "boolean" },
+            "is_retracted":         { "type": "boolean" },
+            "preservation":         { "type": "keyword" },
 
+            "affiliation":    { "type": "alias", "path": "affiliations" },
+            "ror":            { "type": "alias", "path": "affiliation_rors" },
+            "creator_id":     { "type": "alias", "path": "creator_ids" },
+            "ref_release_id": { "type": "alias", "path": "ref_release_ids" },
             "author":         { "type": "alias", "path": "contrib_names" },
             "journal":        { "type": "alias", "path": "container_name" },
             "date":           { "type": "alias", "path": "release_date" },
@@ -90,6 +110,9 @@
             "lang":           { "type": "alias", "path": "language" },
             "file_pdf_url":   { "type": "alias", "path": "best_pdf_url" },
             "release_status": { "type": "alias", "path": "release_stage" },
+            "stage":          { "type": "alias", "path": "release_stage" },
+            "type":           { "type": "alias", "path": "release_type" },
+            "retracted":      { "type": "alias", "path": "is_retracted" },
             "is_kept":        { "type": "alias", "path": "in_kbart" }
         }
     }
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index edc68748..b997796d 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -50,6 +50,10 @@ def release_to_elasticsearch(entity, force_bool=True):
         release_stage = release.release_stage,
         withdrawn_status = release.withdrawn_status,
         language = release.language,
+        volume = release.volume,
+        issue = release.issue,
+        pages = release.pages,
+        number = release.number,
         license = release.license_slug,
         doi = release.ext_ids.doi,
         pmid = release.ext_ids.pmid,
@@ -72,7 +76,7 @@ def release_to_elasticsearch(entity, force_bool=True):
     in_dweb = False
     in_ia = False
     in_ia_sim = False
-    in_shadow = False
+    in_shadows = False
 
     release_year = release.release_year
     if release.release_date:
@@ -85,11 +89,15 @@ def release_to_elasticsearch(entity, force_bool=True):
 
     t['any_abstract'] = len(release.abstracts or []) > 0
     t['ref_count'] = len(release.refs or [])
-    t['ref_linked_count'] = 0
-    if release.refs:
-        t['ref_linked_count'] = len([1 for ref in release.refs if ref.target_release_id])
+    ref_release_ids = []
+    for r in (release.refs or []):
+        if r.target_release_id:
+            ref_release_ids.append(r.target_release_id)
+    t['ref_release_ids'] = ref_release_ids
+    t['ref_linked_count'] = len(ref_release_ids)
     t['contrib_count'] = len(release.contribs or [])
     contrib_names = []
+    contrib_affiliations = []
     creator_ids = []
     for c in (release.contribs or []):
         if c.raw_name:
@@ -98,8 +106,14 @@ def release_to_elasticsearch(entity, force_bool=True):
             contrib_names.append(c.surname)
         if c.creator_id:
             creator_ids.append(c.creator_id)
+        if c.raw_affiliation:
+            contrib_affiliations.append(c.raw_affiliation)
     t['contrib_names'] = contrib_names
     t['creator_ids'] = creator_ids
+    t['affiliations'] = contrib_affiliations
+
+    # TODO: mapping... probably by lookup?
+    t['affiliation_rors'] = None
 
     container = release.container
     if container:
@@ -140,8 +154,13 @@ def release_to_elasticsearch(entity, force_bool=True):
             if c_extra.get('szczepanski'):
                 if c_extra['szczepanski'].get('as_of'):
                     is_oa = True
-    else:
+
+    # fall back to release-level container metadata if container not linked or
+    # missing context
+    if not t.get('publisher'):
         t['publisher'] = release.publisher
+    if not t.get('container_name') and release.extra:
+        t['container_name'] = release.extra.get('container_name')
 
     if release.ext_ids.jstor or (release.ext_ids.doi and release.ext_ids.doi.startswith('10.2307/')):
         in_jstor = True
@@ -203,6 +222,46 @@ def release_to_elasticsearch(entity, force_bool=True):
             if extra['crossref'].get('archive'):
                 # all crossref archives are KBART, I believe
                 in_kbart = True
+        # backwards compatible subtitle fetching
+        if not t['subtitle'] and extra.get('subtitle'):
+            if type(extra['subtitle']) == list:
+                t['subtitle'] = extra['subtitle'][0]
+            else:
+                t['subtitle'] = extra['subtitle']
+
+    t['first_page'] = None
+    if release.pages:
+        first = release.pages.split('-')[0]
+        first = first.replace('p', '')
+        if release.pages.isdigit():
+            t['first_page'] = release.pages
+        # TODO: non-numerical first pages
+
+    t['ia_microfilm_url'] = None
+    if in_ia_sim:
+        # TODO: determine URL somehow? I think this is in flux. Will probably
+        # need extra metadata in the container extra field.
+        # special case as a demo for now.
+        if release.container_id == "hl5g6d5msjcl7hlbyyvcsbhc2u" \
+                and release.year in (2011, 2013) \
+                and release.volume.isdigit() \
+                and t['first_page']:
+            t['ia_microfilm_url'] = "https://archive.org/details/sim_bjog_{}-{:02d}/page/n{}".format(
+                release.year,
+                release.volume - 1,
+                t['first_page'],
+            )
+
+    t['doi_registrar'] = None
+    if extra and t['doi']:
+        for k in ('crossref', 'datacite', 'jalc'):
+            if k in extra:
+                t['doi_registrar'] = k
+        if not 'doi_registrar' in t:
+            t['doi_registrar'] = 'crossref'
+
+    if t['doi']:
+        t['doi_prefix'] = t['doi'].split('/')[0]
 
     if is_longtail_oa:
         is_oa = True
@@ -215,6 +274,7 @@ def release_to_elasticsearch(entity, force_bool=True):
         t['in_jstor'] = bool(in_jstor)
         t['in_web'] = bool(in_web)
         t['in_dweb'] = bool(in_dweb)
+        t['in_shadows'] = bool(in_shadows)
     else:
         t['is_oa'] = is_oa
         t['is_longtail_oa'] = is_longtail_oa
@@ -223,9 +283,20 @@ def release_to_elasticsearch(entity, force_bool=True):
         t['in_jstor'] = in_jstor
         t['in_web'] = in_web
         t['in_dweb'] = in_dweb
+        t['in_shadows'] = in_shadows
 
     t['in_ia'] = bool(in_ia)
     t['is_preserved'] = bool(is_preserved or in_ia or in_kbart or in_jstor)
+
+    if in_ia:
+        t['preservation'] = 'bright'
+    elif in_kbart or in_jstor:
+        t['preservation'] = 'dark_only'
+    elif in_shadows:
+        t['preservation'] = 'shadows_only'
+    else:
+        t['preservation'] = 'none'
+
     return t
 
 def container_to_elasticsearch(entity, force_bool=True):
author	Bryan Newbold <bnewbold@robocracy.org>	2020-01-29 20:39:22 -0800
committer	Bryan Newbold <bnewbold@robocracy.org>	2020-01-29 21:59:05 -0800
commit	e047fbe1a9c495e86a6757d44eb32c9109a1b753 (patch)
tree	d4e1e256248993ea6897dc40055d2a7242ca6526
parent	8e8b447a1d142b7815498ffa02263c34207973b4 (diff)
download	fatcat-e047fbe1a9c495e86a6757d44eb32c9109a1b753.tar.gz fatcat-e047fbe1a9c495e86a6757d44eb32c9109a1b753.zip