about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- extra/elasticsearch/container_schema.json 33
-rw-r--r-- python/fatcat_tools/transforms/elasticsearch.py 34
2 files changed, 38 insertions, 29 deletions
diff --git a/extra/elasticsearch/container_schema.json b/extra/elasticsearch/container_schema.json
index b0a47e85..3be261a2 100644
--- a/extra/elasticsearch/container_schema.json
+++ b/extra/elasticsearch/container_schema.json
@@ -27,13 +27,17 @@
"mappings": {
"container": {
"properties": {
- "ident": { "type": "keyword" },
+ "ident": { "type": "keyword", "doc_values": false },
"state": { "type": "keyword" },
- "revision": { "type": "keyword" },
- "name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "revision": { "type": "keyword", "doc_values": false },
+ "name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+ "original_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+ "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+ "abbrev": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+ "aliases": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
"container_type": { "type": "keyword" },
"issnl": { "type": "keyword" },
+ "issns": { "type": "keyword" },
"wikidata_qid": { "type": "keyword" },
"country": { "type": "keyword" },
"region": { "type": "keyword" },
@@ -43,15 +47,17 @@
"first_year": { "type": "integer" },
"last_year": { "type": "integer" },
- "in_doaj": { "type": "boolean" },
- "in_road": { "type": "boolean" },
- "in_doi": { "type": "boolean" },
- "in_sherpa_romeo":{ "type": "boolean" },
- "is_oa": { "type": "boolean" },
- "is_longtail_oa": { "type": "boolean" },
- "any_kbart": { "type": "boolean" },
- "any_jstor": { "type": "boolean" },
- "any_ia_sim": { "type": "boolean" },
+
+ "biblio": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+
+ "in_doaj": { "type": "boolean" },
+ "in_road": { "type": "boolean" },
+ "is_oa": { "type": "boolean" },
+ "is_longtail_oa": { "type": "boolean" },
+ "any_kbart": { "type": "boolean" },
+ "any_jstor": { "type": "boolean" },
+ "any_ia_sim": { "type": "boolean" },
+ "sherpa_romeo_color": { "type": "keyword" },
"releases_total": { "type": "integer" },
"releases_kbart": { "type": "integer" },
@@ -64,6 +70,7 @@
"year": { "type": "alias", "path": "first_year" },
"type": { "type": "alias", "path": "container_type" },
+ "issn": { "type": "alias", "path": "issns" },
"oa": { "type": "alias", "path": "is_oa" },
"longtail": { "type": "alias", "path": "is_longtail_oa" }
}
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 8141a8b9..edc68748 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -257,23 +257,24 @@ def container_to_elasticsearch(entity, force_bool=True):
wikidata_qid = entity.wikidata_qid,
)
- # TODO: region, discipline
- # TODO: single primary language?
if not entity.extra:
entity.extra = dict()
- for key in ('country', 'languages', 'mimetypes', 'first_year', 'last_year'):
+ for key in ('country', 'languages', 'mimetypes', 'original_name',
+ 'first_year', 'last_year', 'aliases', 'abbrev', 'region',
+ 'discipline'):
if entity.extra.get(key):
t[key] = entity.extra[key]
+ t['issns'] = []
+ if entity.issnl:
+ t['issns'].append(entity.issnl)
+ for key in ('issnp', 'issne'):
+ if entity.extra.get(key):
+ t['issns'].append(entity.extra[key])
+
in_doaj = None
in_road = None
- # TODO: not currently implemented
- in_doi = None
- # TODO: would be nice to have 'in_doaj_works', or maybe just "any_pid"
- #in_doaj_works = None
- in_sherpa_romeo = None
is_oa = None
- # TODO: not actually set/stored anywhere?
is_longtail_oa = None
any_kbart = None
any_jstor = None
@@ -295,8 +296,9 @@ def container_to_elasticsearch(entity, force_bool=True):
if extra.get('default_license'):
if extra['default_license'].startswith('CC-'):
is_oa = True
+ t['sherpa_romeo_color'] = None
if extra.get('sherpa_romeo'):
- in_sherpa_romeo = True
+ t['sherpa_romeo_color'] = extra['sherpa_romeo'].get('color')
if extra['sherpa_romeo'].get('color') == 'white':
is_oa = False
if extra.get('kbart'):
@@ -306,21 +308,21 @@ def container_to_elasticsearch(entity, force_bool=True):
if extra.get('ia'):
if extra['ia'].get('sim'):
any_ia_sim = True
+ if extra['ia'].get('longtail_oa'):
+ is_longtail_oa = True
t['is_superceded'] = bool(extra.get('superceded'))
t['in_doaj'] = bool(in_doaj)
t['in_road'] = bool(in_road)
- t['in_sherpa_romeo'] = bool(in_sherpa_romeo)
t['any_kbart'] = bool(any_kbart)
- t['is_longtail_oa'] = bool(is_longtail_oa)
if force_bool:
- t['in_doi'] = bool(in_doi)
- t['is_oa'] = bool(in_doaj or in_road or is_longtail_oa or is_oa)
+ t['is_oa'] = bool(in_doaj or in_road or is_oa)
+ t['is_longtail_oa'] = bool(is_longtail_oa)
t['any_jstor'] = bool(any_jstor)
t['any_ia_sim'] = bool(any_ia_sim)
else:
- t['in_doi'] = in_doi
- t['is_oa'] = in_doaj or in_road or is_longtail_oa or is_oa
+ t['is_oa'] = in_doaj or in_road or is_oa
+ t['is_longtail_oa'] = is_longtail_oa
t['any_jstor'] = any_jstor
t['any_ia_sim'] = any_ia_sim
return t