summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-05-16 13:43:04 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-05-16 13:43:04 -0700
commit: b8eec609b60ca00cb6d8d2bb253e11f8dc474b53 (patch)
tree: 87a9e71444932f9aa9269fd9788b3bf79621e1e7
parent: 2d3d0274a23f0e52bff8b786aa7a930cb5b74c99 (diff)
download: fatcat-scholar-b8eec609b60ca00cb6d8d2bb253e11f8dc474b53.tar.gz
          fatcat-scholar-b8eec609b60ca00cb6d8d2bb253e11f8dc474b53.zip
tweak ES schema fields a bit
-rw-r--r--  fatcat_scholar/es_transform.py     7
-rw-r--r--  proposals/work_schema.md          35
-rw-r--r--  schema/scholar_fulltext.v01.json   9
3 files changed, 32 insertions, 19 deletions
diff --git a/fatcat_scholar/es_transform.py b/fatcat_scholar/es_transform.py
index 1f47e2c..089b155 100644
--- a/fatcat_scholar/es_transform.py
+++ b/fatcat_scholar/es_transform.py
@@ -65,6 +65,7 @@ class ScholarBiblio(BaseModel):
container_original_name: Optional[str]
container_ident: Optional[str]
container_issnl: Optional[str]
+ container_wikidata_qid: Optional[str]
issns: List[str]
container_type: Optional[str]
contrib_count: Optional[int]
@@ -112,10 +113,10 @@ class ScholarRelease(BaseModel):
container_type: Optional[str]
class ScholarSim(BaseModel):
- ia_item: str
- ia_collection: str
+ issue_item: str
+ pub_collection: str
+ sim_pubid: str
first_page: Optional[str]
- pub_id: str
class ScholarAbstract(BaseModel):
body: str
diff --git a/proposals/work_schema.md b/proposals/work_schema.md
index 1e0f272..933e750 100644
--- a/proposals/work_schema.md
+++ b/proposals/work_schema.md
@@ -1,19 +1,21 @@
## Top-Level
-- type: _doc
-- key: keyword
-- key_type: keyword (work or page)
-- `work_id`
-- biblio: obj
-- fulltext: obj
-- sim: obj
-- abstracts: nested
+- type: `_doc` (aka, no type, `include_type_name=false`)
+- key: keyword (same as `_id`)
+- `doc_type`: keyword (work or page)
+- `doc_index_ts`: timestamp when document indexed
+- `work_id`: fatcat work ident (optional)
+
+- `biblio`: obj
+- `fulltext`: obj
+- `ia_sim`: obj
+- `abstracts`: nested
body
lang
-- releases: nested (TBD)
-- access
-- tags: array of keywords
+- `releases`: nested (TBD)
+- `access`
+- `tags`: array of keywords
TODO:
- summary fields to index "everything" into?
@@ -50,11 +52,14 @@ NEW:
- `container_name` (etc)
- `container_id`
- `container_issnl`
-- `container_issn` (array)
+- `container_wikidata_qid`
+- `issns` (array)
- `contrib_names`
- `affiliations`
- `creator_ids`
+TODO: should all external identifiers go under `releases` instead of `biblio`? Or some duplicated?
+
## Fulltext
- `status`: web, sim, shadow
@@ -81,6 +86,12 @@ Only index one abstract per language.
Enough details to construct a link or do a lookup or whatever. Note that might
be doing CDL status lookups on SERP pages.
+- `issue_item`: str
+- `pub_collection`: str
+- `sim_pubid`: str
+- `first_page`: str
+
+
Also pass-through archive.org metadata here (collection-level and item-level)
## Access
diff --git a/schema/scholar_fulltext.v01.json b/schema/scholar_fulltext.v01.json
index 613ca1e..e09b00c 100644
--- a/schema/scholar_fulltext.v01.json
+++ b/schema/scholar_fulltext.v01.json
@@ -98,6 +98,7 @@
"container_original_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] },
"container_ident": { "type": "keyword", "normalizer": "default" },
"container_issnl": { "type": "keyword", "normalizer": "default" },
+ "container_wikidata_qid": { "type": "keyword", "normalizer": "default" },
"issns": { "type": "keyword", "normalizer": "default" },
"container_type": { "type": "keyword", "normalizer": "default" },
"contrib_count": { "type": "integer" },
@@ -128,10 +129,10 @@
"type": "object",
"dynamic": false,
"properties": {
- "ia_item": { "type": "keyword", "normalizer": "default" },
- "ia_collection": { "type": "keyword", "normalizer": "default" },
- "first_page": { "type": "keyword", "normalizer": "default" },
- "pub_id": { "type": "keyword", "normalizer": "default" }
+ "issue_item": { "type": "keyword", "normalizer": "default" },
+ "pub_collection": { "type": "keyword", "normalizer": "default" },
+ "sim_pubid": { "type": "keyword", "normalizer": "default" },
+ "first_page": { "type": "keyword", "normalizer": "default" }
}
},