diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-05-16 13:43:04 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-05-16 13:43:04 -0700 |
commit | b8eec609b60ca00cb6d8d2bb253e11f8dc474b53 (patch) | |
tree | 87a9e71444932f9aa9269fd9788b3bf79621e1e7 | |
parent | 2d3d0274a23f0e52bff8b786aa7a930cb5b74c99 (diff) | |
download | fatcat-scholar-b8eec609b60ca00cb6d8d2bb253e11f8dc474b53.tar.gz fatcat-scholar-b8eec609b60ca00cb6d8d2bb253e11f8dc474b53.zip |
tweak ES schema fields a bit
-rw-r--r-- | fatcat_scholar/es_transform.py | 7 | ||||
-rw-r--r-- | proposals/work_schema.md | 35 | ||||
-rw-r--r-- | schema/scholar_fulltext.v01.json | 9 |
3 files changed, 32 insertions, 19 deletions
diff --git a/fatcat_scholar/es_transform.py b/fatcat_scholar/es_transform.py index 1f47e2c..089b155 100644 --- a/fatcat_scholar/es_transform.py +++ b/fatcat_scholar/es_transform.py @@ -65,6 +65,7 @@ class ScholarBiblio(BaseModel): container_original_name: Optional[str] container_ident: Optional[str] container_issnl: Optional[str] + container_wikidata_qid: Optional[str] issns: List[str] container_type: Optional[str] contrib_count: Optional[int] @@ -112,10 +113,10 @@ class ScholarRelease(BaseModel): container_type: Optional[str] class ScholarSim(BaseModel): - ia_item: str - ia_collection: str + issue_item: str + pub_collection: str + sim_pubid: str first_page: Optional[str] - pub_id: str class ScholarAbstract(BaseModel): body: str diff --git a/proposals/work_schema.md b/proposals/work_schema.md index 1e0f272..933e750 100644 --- a/proposals/work_schema.md +++ b/proposals/work_schema.md @@ -1,19 +1,21 @@ ## Top-Level -- type: _doc -- key: keyword -- key_type: keyword (work or page) -- `work_id` -- biblio: obj -- fulltext: obj -- sim: obj -- abstracts: nested +- type: `_doc` (aka, no type, `include_type_name=false`) +- key: keyword (same as `_id`) +- `doc_type`: keyword (work or page) +- `doc_index_ts`: timestamp when document indexed +- `work_id`: fatcat work ident (optional) + +- `biblio`: obj +- `fulltext`: obj +- `ia_sim`: obj +- `abstracts`: nested body lang -- releases: nested (TBD) -- access -- tags: array of keywords +- `releases`: nested (TBD) +- `access` +- `tags`: array of keywords TODO: - summary fields to index "everything" into? @@ -50,11 +52,14 @@ NEW: - `container_name` (etc) - `container_id` - `container_issnl` -- `container_issn` (array) +- `container_wikidata_qid` +- `issns` (array) - `contrib_names` - `affiliations` - `creator_ids` +TODO: should all external identifiers go under `releases` instead of `biblio`? Or some duplicated? + ## Fulltext - `status`: web, sim, shadow @@ -81,6 +86,12 @@ Only index one abstract per language. Enough details to construct a link or do a lookup or whatever. Note that might be doing CDL status lookups on SERP pages. +- `issue_item`: str +- `pub_collection`: str +- `sim_pubid`: str +- `first_page`: str + + Also pass-through archive.org metadata here (collection-level and item-level) ## Access diff --git a/schema/scholar_fulltext.v01.json b/schema/scholar_fulltext.v01.json index 613ca1e..e09b00c 100644 --- a/schema/scholar_fulltext.v01.json +++ b/schema/scholar_fulltext.v01.json @@ -98,6 +98,7 @@ "container_original_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, "container_ident": { "type": "keyword", "normalizer": "default" }, "container_issnl": { "type": "keyword", "normalizer": "default" }, + "container_wikidata_qid": { "type": "keyword", "normalizer": "default" }, "issns": { "type": "keyword", "normalizer": "default" }, "container_type": { "type": "keyword", "normalizer": "default" }, "contrib_count": { "type": "integer" }, @@ -128,10 +129,10 @@ "type": "object", "dynamic": false, "properties": { - "ia_item": { "type": "keyword", "normalizer": "default" }, - "ia_collection": { "type": "keyword", "normalizer": "default" }, - "first_page": { "type": "keyword", "normalizer": "default" }, - "pub_id": { "type": "keyword", "normalizer": "default" } + "issue_item": { "type": "keyword", "normalizer": "default" }, + "pub_collection": { "type": "keyword", "normalizer": "default" }, + "sim_pubid": { "type": "keyword", "normalizer": "default" }, + "first_page": { "type": "keyword", "normalizer": "default" } } }, |