From 901cf998ce7d8f896cf5d609719b1defd96d01d4 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 29 Jan 2020 16:00:03 -0800 Subject: first implementation of ES file schema Includes a trivial test and transform, but not any workers or doc updates. --- extra/elasticsearch/file_schema.json | 46 ++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 extra/elasticsearch/file_schema.json (limited to 'extra/elasticsearch/file_schema.json') diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json new file mode 100644 index 00000000..66d81e0b --- /dev/null +++ b/extra/elasticsearch/file_schema.json @@ -0,0 +1,46 @@ +{ +"settings": { + "index": { + "analysis": { + "analyzer": { + "default": { + "type": "custom", + "tokenizer": "standard", + "filter": [ "lowercase", "asciifolding" ] + } + } + } + } +}, +"mappings": { + "changelog": { + "properties": { + "ident": { "type": "keyword", "doc_values": false }, + "state": { "type": "keyword" }, + "revision": { "type": "keyword", "doc_values": false }, + + "release_ids": { "type": "keyword", "doc_values": false }, + "release_count": { "type": "integer" }, + "mimetype": { "type": "keyword" }, + "size_bytes": { "type": "integer" }, + "sha1": { "type": "keyword", "doc_values": false }, + "sha256": { "type": "keyword", "doc_values": false }, + "md5": { "type": "keyword", "doc_values": false }, + + "domains": { "type": "keyword" }, + "hosts": { "type": "keyword" }, + "rels": { "type": "keyword" }, + "in_ia": { "type": "boolean" }, + + "release_id": { "type": "alias", "path": "release_ids" }, + "sha1hex": { "type": "alias", "path": "sha1hex" }, + "sha256hex": { "type": "alias", "path": "sha256hex" }, + "md5hex": { "type": "alias", "path": "md5hex" }, + "size": { "type": "alias", "path": "size_bytes" }, + "domain": { "type": "alias", "path": "domains" }, + "host": { "type": "alias", "path": "host" }, + "rel": { "type": "alias", "path": "rel" } + } + } +} +} -- cgit v1.2.3 From e98f389a53d886b4fa8f0237b90b086999770f78 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 29 Jan 2020 23:26:58 -0800 Subject: elastic schema fixes --- extra/elasticsearch/file_schema.json | 12 ++++++------ extra/elasticsearch/release_schema.json | 2 +- python/fatcat_tools/transforms/elasticsearch.py | 5 +++++ 3 files changed, 12 insertions(+), 7 deletions(-) (limited to 'extra/elasticsearch/file_schema.json') diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json index 66d81e0b..2a7e5be0 100644 --- a/extra/elasticsearch/file_schema.json +++ b/extra/elasticsearch/file_schema.json @@ -13,7 +13,7 @@ } }, "mappings": { - "changelog": { + "file": { "properties": { "ident": { "type": "keyword", "doc_values": false }, "state": { "type": "keyword" }, @@ -33,13 +33,13 @@ "in_ia": { "type": "boolean" }, "release_id": { "type": "alias", "path": "release_ids" }, - "sha1hex": { "type": "alias", "path": "sha1hex" }, - "sha256hex": { "type": "alias", "path": "sha256hex" }, - "md5hex": { "type": "alias", "path": "md5hex" }, + "sha1hex": { "type": "alias", "path": "sha1" }, + "sha256hex": { "type": "alias", "path": "sha256" }, + "md5hex": { "type": "alias", "path": "md5" }, "size": { "type": "alias", "path": "size_bytes" }, "domain": { "type": "alias", "path": "domains" }, - "host": { "type": "alias", "path": "host" }, - "rel": { "type": "alias", "path": "rel" } + "host": { "type": "alias", "path": "hosts" }, + "rel": { "type": "alias", "path": "rels" } } } } diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index 2b67c5f5..3d301dba 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -99,7 +99,7 @@ "affilation": { "type": "alias", "path": "affiliations" }, "ror": { "type": "alias", "path": "affiliation_rors" }, - "creator_id": { "type": "alias", "path": "creator_id" }, + "creator_id": { "type": "alias", "path": "creator_ids" }, "ref_release_id": { "type": "alias", "path": "ref_release_ids" }, "author": { "type": "alias", "path": "contrib_names" }, "journal": { "type": "alias", "path": "container_name" }, diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index f0146d01..42669bbf 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -416,6 +416,11 @@ def _type_of_edit(edit): def changelog_to_elasticsearch(entity): + """ + Note that this importer requires expanded fill info to work. Calling code + may need to re-fetch editgroup from API to get the 'editor' field. Some of + the old kafka feed content doesn't includes editor in particular. + """ editgroup = entity.editgroup t = dict( -- cgit v1.2.3 From 59912583926077260d99a9bf77a938c2215eb6c8 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 30 Jan 2020 00:20:34 -0800 Subject: tweak file ES archive.org domain tracking --- extra/elasticsearch/file_schema.json | 1 + python/fatcat_tools/transforms/elasticsearch.py | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'extra/elasticsearch/file_schema.json') diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json index 2a7e5be0..a0ac3346 100644 --- a/extra/elasticsearch/file_schema.json +++ b/extra/elasticsearch/file_schema.json @@ -31,6 +31,7 @@ "hosts": { "type": "keyword" }, "rels": { "type": "keyword" }, "in_ia": { "type": "boolean" }, + "in_ia_petabox": { "type": "boolean" }, "release_id": { "type": "alias", "path": "release_ids" }, "sha1hex": { "type": "alias", "path": "sha1" }, diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index e1980d90..9aa3cece 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -505,5 +505,11 @@ def file_to_elasticsearch(entity): t['rels'] = list(set([u.rel for u in entity.urls])) t['in_ia'] = bool('archive.org' in t['domains']) + t['in_ia_petabox'] = bool('archive.org' in t['hosts']) + + # ok, but actually remove archive.org hosts, because they make other + # aggregations hard and are a waste of storage + t['hosts'] = [h for h in t['hosts'] if h not in ('archive.org', 'web.archive.org')] + t['domains'] = [h for h in t['hosts'] if h not in ('archive.org')] return t -- cgit v1.2.3 From b7404fb0f696807db3a92bc2c4c73c2d208e59ef Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 30 Jan 2020 00:51:56 -0800 Subject: ES schemas: make keywords case-insensitive by default But not applying asciifolding; don't see any need to do so? --- extra/elasticsearch/changelog_schema.json | 20 +++++-- extra/elasticsearch/container_schema.json | 38 ++++++++----- extra/elasticsearch/file_schema.json | 34 ++++++++---- extra/elasticsearch/release_schema.json | 89 ++++++++++++++++++------------- 4 files changed, 115 insertions(+), 66 deletions(-) (limited to 'extra/elasticsearch/file_schema.json') diff --git a/extra/elasticsearch/changelog_schema.json b/extra/elasticsearch/changelog_schema.json index 77c77238..d958fed9 100644 --- a/extra/elasticsearch/changelog_schema.json +++ b/extra/elasticsearch/changelog_schema.json @@ -8,6 +8,18 @@ "tokenizer": "standard", "filter": [ "lowercase", "asciifolding" ] } + }, + "normalizer": { + "default": { + "type": "custom", + "char_filter": [], + "filter": ["lowercase"] + }, + "caseSensitive": { + "type": "custom", + "char_filter": [], + "filter": [] + } } } } @@ -16,13 +28,13 @@ "changelog": { "properties": { "index": { "type": "integer" }, - "editgroup_id": { "type": "keyword", "doc_values": false }, + "editgroup_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, "timestamp": { "type": "date" }, - "editor_id": { "type": "keyword" }, - "username": { "type": "keyword" }, + "editor_id": { "type": "keyword", "normalizer": "default" }, + "username": { "type": "keyword", "normalize": "caseSensitive" }, "is_bot": { "type": "boolean" }, "is_admin": { "type": "boolean" }, - "agent": { "type": "keyword" }, + "agent": { "type": "keyword", "normalize": "caseSensitive" }, "containers": { "type": "integer" }, "new_containers": { "type": "integer" }, diff --git a/extra/elasticsearch/container_schema.json b/extra/elasticsearch/container_schema.json index 3be261a2..be3a408e 100644 --- a/extra/elasticsearch/container_schema.json +++ b/extra/elasticsearch/container_schema.json @@ -20,6 +20,18 @@ "char_filter": [ "icu_normalizer" ], "filter": [ "icu_folding" ] } + }, + "normalizer": { + "default": { + "type": "custom", + "char_filter": [], + "filter": ["lowercase"] + }, + "caseSensitive": { + "type": "custom", + "char_filter": [], + "filter": [] + } } } } @@ -27,23 +39,23 @@ "mappings": { "container": { "properties": { - "ident": { "type": "keyword", "doc_values": false }, - "state": { "type": "keyword" }, - "revision": { "type": "keyword", "doc_values": false }, + "ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "state": { "type": "keyword", "normalizer": "default" }, + "revision": { "type": "keyword", "normalizer": "default", "doc_values": false }, "name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "original_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "abbrev": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "aliases": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, - "container_type": { "type": "keyword" }, - "issnl": { "type": "keyword" }, - "issns": { "type": "keyword" }, - "wikidata_qid": { "type": "keyword" }, - "country": { "type": "keyword" }, - "region": { "type": "keyword" }, - "discipline": { "type": "keyword" }, - "languages": { "type": "keyword" }, - "mimetypes": { "type": "keyword" }, + "container_type": { "type": "keyword", "normalizer": "default" }, + "issnl": { "type": "keyword", "normalizer": "default" }, + "issns": { "type": "keyword", "normalizer": "default" }, + "wikidata_qid": { "type": "keyword", "normalizer": "default" }, + "country": { "type": "keyword", "normalizer": "default" }, + "region": { "type": "keyword", "normalizer": "default" }, + "discipline": { "type": "keyword", "normalizer": "default" }, + "languages": { "type": "keyword", "normalizer": "default" }, + "mimetypes": { "type": "keyword", "normalizer": "default" }, "first_year": { "type": "integer" }, "last_year": { "type": "integer" }, @@ -57,7 +69,7 @@ "any_kbart": { "type": "boolean" }, "any_jstor": { "type": "boolean" }, "any_ia_sim": { "type": "boolean" }, - "sherpa_romeo_color": { "type": "keyword" }, + "sherpa_romeo_color": { "type": "keyword", "normalizer": "default" }, "releases_total": { "type": "integer" }, "releases_kbart": { "type": "integer" }, diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json index a0ac3346..9c8ee64c 100644 --- a/extra/elasticsearch/file_schema.json +++ b/extra/elasticsearch/file_schema.json @@ -8,6 +8,18 @@ "tokenizer": "standard", "filter": [ "lowercase", "asciifolding" ] } + }, + "normalizer": { + "default": { + "type": "custom", + "char_filter": [], + "filter": ["lowercase"] + }, + "caseSensitive": { + "type": "custom", + "char_filter": [], + "filter": [] + } } } } @@ -15,21 +27,21 @@ "mappings": { "file": { "properties": { - "ident": { "type": "keyword", "doc_values": false }, - "state": { "type": "keyword" }, - "revision": { "type": "keyword", "doc_values": false }, + "ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "state": { "type": "keyword", "normalizer": "default" }, + "revision": { "type": "keyword", "normalizer": "default", "doc_values": false }, - "release_ids": { "type": "keyword", "doc_values": false }, + "release_ids": { "type": "keyword", "normalizer": "default", "doc_values": false }, "release_count": { "type": "integer" }, - "mimetype": { "type": "keyword" }, + "mimetype": { "type": "keyword", "normalizer": "default" }, "size_bytes": { "type": "integer" }, - "sha1": { "type": "keyword", "doc_values": false }, - "sha256": { "type": "keyword", "doc_values": false }, - "md5": { "type": "keyword", "doc_values": false }, + "sha1": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "sha256": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "md5": { "type": "keyword", "normalizer": "default", "doc_values": false }, - "domains": { "type": "keyword" }, - "hosts": { "type": "keyword" }, - "rels": { "type": "keyword" }, + "domains": { "type": "keyword", "normalizer": "default" }, + "hosts": { "type": "keyword", "normalizer": "default" }, + "rels": { "type": "keyword", "normalizer": "default" }, "in_ia": { "type": "boolean" }, "in_ia_petabox": { "type": "boolean" }, diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index 3d301dba..f983a703 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -20,58 +20,71 @@ "char_filter": [ "icu_normalizer" ], "filter": [ "icu_folding" ] } + }, + "normalizer": { + "default": { + "type": "custom", + "char_filter": [], + "filter": ["lowercase"] + }, + "caseSensitive": { + "type": "custom", + "char_filter": [], + "filter": [] + } } } } +} }, "mappings": { "release": { "properties": { - "ident": { "type": "keyword", "doc_values": false }, - "state": { "type": "keyword" }, - "revision": { "type": "keyword", "doc_values": false }, - "work_id": { "type": "keyword", "doc_values": false }, + "ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "state": { "type": "keyword", "normalizer": "default" }, + "revision": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "work_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "release_date": { "type": "date" }, "release_year": { "type": "integer", "copy_to": "biblio" }, - "release_type": { "type": "keyword", "copy_to": "biblio" }, - "release_stage": { "type": "keyword" }, - "withdrawn_status": { "type": "keyword", "copy_to": "biblio" }, - "language": { "type": "keyword" }, - "country": { "type": "keyword" }, - "volume": { "type": "keyword", "copy_to": "biblio" }, - "issue": { "type": "keyword", "copy_to": "biblio" }, - "pages": { "type": "keyword", "copy_to": "biblio" }, - "first_page": { "type": "keyword" }, - "number": { "type": "keyword", "copy_to": "biblio" }, - "doi": { "type": "keyword", "doc_values": false }, - "doi_prefix": { "type": "keyword" }, - "doi_registrar": { "type": "keyword" }, - "pmid": { "type": "keyword", "doc_values": false }, - "pmcid": { "type": "keyword", "doc_values": false }, - "isbn13": { "type": "keyword", "doc_values": false }, - "wikidata_qid": { "type": "keyword", "doc_values": false }, - "core_id": { "type": "keyword", "doc_values": false }, - "axiv_id": { "type": "keyword", "doc_values": false }, - "jstor_id": { "type": "keyword", "doc_values": false }, - "ark_id": { "type": "keyword", "doc_values": false }, - "mag_id": { "type": "keyword", "doc_values": false }, - "license": { "type": "keyword" }, + "release_type": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "release_stage": { "type": "keyword", "normalizer": "default" }, + "withdrawn_status": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "language": { "type": "keyword", "normalizer": "default" }, + "country": { "type": "keyword", "normalizer": "default" }, + "volume": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "issue": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "pages": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "first_page": { "type": "keyword", "normalizer": "default" }, + "number": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "doi": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "doi_prefix": { "type": "keyword", "normalizer": "default" }, + "doi_registrar": { "type": "keyword", "normalizer": "default" }, + "pmid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "pmcid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "isbn13": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "wikidata_qid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "core_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "axiv_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "jstor_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "ark_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "mag_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "license": { "type": "keyword", "normalizer": "default" }, "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, - "container_id": { "type": "keyword" }, - "container_issnl": { "type": "keyword" }, - "container_type": { "type": "keyword" }, + "container_id": { "type": "keyword", "normalizer": "default" }, + "container_issnl": { "type": "keyword", "normalizer": "default" }, + "container_type": { "type": "keyword", "normalizer": "default" }, "contrib_count": { "type": "integer" }, "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "affiliations": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "affiliation_rors": { "type": "keyword" }, - "creator_ids": { "type": "keyword" }, + "affiliation_rors": { "type": "keyword", "normalizer": "default" }, + "creator_ids": { "type": "keyword", "normalizer": "default" }, "ref_count": { "type": "integer" }, "ref_linked_count": { "type": "integer" }, - "ref_release_ids": { "type": "keyword" }, + "ref_release_ids": { "type": "keyword", "normalizer": "default" }, "file_count": { "type": "integer" }, "fileset_count": { "type": "integer" }, "webcapture_count": { "type": "integer" }, @@ -79,11 +92,11 @@ "biblio": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "best_pdf_url": { "type": "keyword", "doc_values": false }, - "ia_pdf_url": { "type": "keyword", "doc_values": false }, - "ia_microfilm_url": { "type": "keyword", "doc_values": false }, + "best_pdf_url": { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false }, + "ia_pdf_url": { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false }, + "ia_microfilm_url": { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false }, "is_oa": { "type": "boolean" }, - "oa_color": { "type": "keyword" }, + "oa_color": { "type": "keyword", "normalizer": "default" }, "is_longtail_oa": { "type": "boolean" }, "is_preserved": { "type": "boolean" }, "in_kbart": { "type": "boolean" }, @@ -95,7 +108,7 @@ "in_shadows": { "type": "boolean" }, "is_superceded": { "type": "boolean" }, "is_retracted": { "type": "boolean" }, - "preservation": { "type": "keyword" }, + "preservation": { "type": "keyword", "normalizer": "default" }, "affilation": { "type": "alias", "path": "affiliations" }, "ror": { "type": "alias", "path": "affiliation_rors" }, -- cgit v1.2.3