From 901cf998ce7d8f896cf5d609719b1defd96d01d4 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 29 Jan 2020 16:00:03 -0800 Subject: first implementation of ES file schema Includes a trivial test and transform, but not any workers or doc updates. --- extra/elasticsearch/file_schema.json | 46 ++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 extra/elasticsearch/file_schema.json (limited to 'extra') diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json new file mode 100644 index 00000000..66d81e0b --- /dev/null +++ b/extra/elasticsearch/file_schema.json @@ -0,0 +1,46 @@ +{ +"settings": { + "index": { + "analysis": { + "analyzer": { + "default": { + "type": "custom", + "tokenizer": "standard", + "filter": [ "lowercase", "asciifolding" ] + } + } + } + } +}, +"mappings": { + "changelog": { + "properties": { + "ident": { "type": "keyword", "doc_values": false }, + "state": { "type": "keyword" }, + "revision": { "type": "keyword", "doc_values": false }, + + "release_ids": { "type": "keyword", "doc_values": false }, + "release_count": { "type": "integer" }, + "mimetype": { "type": "keyword" }, + "size_bytes": { "type": "integer" }, + "sha1": { "type": "keyword", "doc_values": false }, + "sha256": { "type": "keyword", "doc_values": false }, + "md5": { "type": "keyword", "doc_values": false }, + + "domains": { "type": "keyword" }, + "hosts": { "type": "keyword" }, + "rels": { "type": "keyword" }, + "in_ia": { "type": "boolean" }, + + "release_id": { "type": "alias", "path": "release_ids" }, + "sha1hex": { "type": "alias", "path": "sha1hex" }, + "sha256hex": { "type": "alias", "path": "sha256hex" }, + "md5hex": { "type": "alias", "path": "md5hex" }, + "size": { "type": "alias", "path": "size_bytes" }, + "domain": { "type": "alias", "path": "domains" }, + "host": { "type": "alias", "path": "host" }, + "rel": { "type": "alias", "path": "rel" } + } + } +} +} -- cgit v1.2.3 From 8e8b447a1d142b7815498ffa02263c34207973b4 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 29 Jan 2020 16:21:13 -0800 Subject: container ES schema changes --- extra/elasticsearch/container_schema.json | 33 ++++++++++++++---------- python/fatcat_tools/transforms/elasticsearch.py | 34 +++++++++++++------------ 2 files changed, 38 insertions(+), 29 deletions(-) (limited to 'extra') diff --git a/extra/elasticsearch/container_schema.json b/extra/elasticsearch/container_schema.json index b0a47e85..3be261a2 100644 --- a/extra/elasticsearch/container_schema.json +++ b/extra/elasticsearch/container_schema.json @@ -27,13 +27,17 @@ "mappings": { "container": { "properties": { - "ident": { "type": "keyword" }, + "ident": { "type": "keyword", "doc_values": false }, "state": { "type": "keyword" }, - "revision": { "type": "keyword" }, - "name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "revision": { "type": "keyword", "doc_values": false }, + "name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "original_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "abbrev": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "aliases": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "container_type": { "type": "keyword" }, "issnl": { "type": "keyword" }, + "issns": { "type": "keyword" }, "wikidata_qid": { "type": "keyword" }, "country": { "type": "keyword" }, "region": { "type": "keyword" }, @@ -43,15 +47,17 @@ "first_year": { "type": "integer" }, "last_year": { "type": "integer" }, - "in_doaj": { "type": "boolean" }, - "in_road": { "type": "boolean" }, - "in_doi": { "type": "boolean" }, - "in_sherpa_romeo":{ "type": "boolean" }, - "is_oa": { "type": "boolean" }, - "is_longtail_oa": { "type": "boolean" }, - "any_kbart": { "type": "boolean" }, - "any_jstor": { "type": "boolean" }, - "any_ia_sim": { "type": "boolean" }, + + "biblio": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + + "in_doaj": { "type": "boolean" }, + "in_road": { "type": "boolean" }, + "is_oa": { "type": "boolean" }, + "is_longtail_oa": { "type": "boolean" }, + "any_kbart": { "type": "boolean" }, + "any_jstor": { "type": "boolean" }, + "any_ia_sim": { "type": "boolean" }, + "sherpa_romeo_color": { "type": "keyword" }, "releases_total": { "type": "integer" }, "releases_kbart": { "type": "integer" }, @@ -64,6 +70,7 @@ "year": { "type": "alias", "path": "first_year" }, "type": { "type": "alias", "path": "container_type" }, + "issn": { "type": "alias", "path": "issns" }, "oa": { "type": "alias", "path": "is_oa" }, "longtail": { "type": "alias", "path": "is_longtail_oa" } } diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index 8141a8b9..edc68748 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -257,23 +257,24 @@ def container_to_elasticsearch(entity, force_bool=True): wikidata_qid = entity.wikidata_qid, ) - # TODO: region, discipline - # TODO: single primary language? if not entity.extra: entity.extra = dict() - for key in ('country', 'languages', 'mimetypes', 'first_year', 'last_year'): + for key in ('country', 'languages', 'mimetypes', 'original_name', + 'first_year', 'last_year', 'aliases', 'abbrev', 'region', + 'discipline'): if entity.extra.get(key): t[key] = entity.extra[key] + t['issns'] = [] + if entity.issnl: + t['issns'].append(entity.issnl) + for key in ('issnp', 'issne'): + if entity.extra.get(key): + t['issns'].append(entity.extra[key]) + in_doaj = None in_road = None - # TODO: not currently implemented - in_doi = None - # TODO: would be nice to have 'in_doaj_works', or maybe just "any_pid" - #in_doaj_works = None - in_sherpa_romeo = None is_oa = None - # TODO: not actually set/stored anywhere? is_longtail_oa = None any_kbart = None any_jstor = None @@ -295,8 +296,9 @@ def container_to_elasticsearch(entity, force_bool=True): if extra.get('default_license'): if extra['default_license'].startswith('CC-'): is_oa = True + t['sherpa_romeo_color'] = None if extra.get('sherpa_romeo'): - in_sherpa_romeo = True + t['sherpa_romeo_color'] = extra['sherpa_romeo'].get('color') if extra['sherpa_romeo'].get('color') == 'white': is_oa = False if extra.get('kbart'): @@ -306,21 +308,21 @@ def container_to_elasticsearch(entity, force_bool=True): if extra.get('ia'): if extra['ia'].get('sim'): any_ia_sim = True + if extra['ia'].get('longtail_oa'): + is_longtail_oa = True t['is_superceded'] = bool(extra.get('superceded')) t['in_doaj'] = bool(in_doaj) t['in_road'] = bool(in_road) - t['in_sherpa_romeo'] = bool(in_sherpa_romeo) t['any_kbart'] = bool(any_kbart) - t['is_longtail_oa'] = bool(is_longtail_oa) if force_bool: - t['in_doi'] = bool(in_doi) - t['is_oa'] = bool(in_doaj or in_road or is_longtail_oa or is_oa) + t['is_oa'] = bool(in_doaj or in_road or is_oa) + t['is_longtail_oa'] = bool(is_longtail_oa) t['any_jstor'] = bool(any_jstor) t['any_ia_sim'] = bool(any_ia_sim) else: - t['in_doi'] = in_doi - t['is_oa'] = in_doaj or in_road or is_longtail_oa or is_oa + t['is_oa'] = in_doaj or in_road or is_oa + t['is_longtail_oa'] = is_longtail_oa t['any_jstor'] = any_jstor t['any_ia_sim'] = any_ia_sim return t -- cgit v1.2.3 From e047fbe1a9c495e86a6757d44eb32c9109a1b753 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 29 Jan 2020 20:39:22 -0800 Subject: ES release schema updates --- extra/elasticsearch/release_schema.json | 69 ++++++++++++++------- python/fatcat_tools/transforms/elasticsearch.py | 81 +++++++++++++++++++++++-- 2 files changed, 122 insertions(+), 28 deletions(-) (limited to 'extra') diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index 85026060..98a1c28e 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -27,48 +27,62 @@ "mappings": { "release": { "properties": { - "ident": { "type": "keyword" }, + "ident": { "type": "keyword", "doc_values": false }, "state": { "type": "keyword" }, - "revision": { "type": "keyword" }, - "work_id": { "type": "keyword" }, - "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "revision": { "type": "keyword", "doc_values": false }, + "work_id": { "type": "keyword", "doc_values": false }, + "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "release_date": { "type": "date" }, - "release_year": { "type": "integer" }, - "release_type": { "type": "keyword" }, + "release_year": { "type": "integer", "copy_to": "biblio" }, + "release_type": { "type": "keyword", "copy_to": "biblio" }, "release_stage": { "type": "keyword" }, - "withdrawn_status": { "type": "keyword" }, + "withdrawn_status": { "type": "keyword", "copy_to": "biblio" }, "language": { "type": "keyword" }, - "doi": { "type": "keyword" }, - "pmid": { "type": "keyword" }, - "pmcid": { "type": "keyword" }, - "isbn13": { "type": "keyword" }, - "wikidata_qid": { "type": "keyword" }, - "core_id": { "type": "keyword" }, - "axiv_id": { "type": "keyword" }, - "jstor_id": { "type": "keyword" }, - "ark_id": { "type": "keyword" }, - "mag_id": { "type": "keyword" }, + "volume": { "type": "keyword", "copy_to": "biblio" }, + "issue": { "type": "keyword", "copy_to": "biblio" }, + "pages": { "type": "keyword", "copy_to": "biblio" }, + "first_page": { "type": "keyword" }, + "number": { "type": "keyword", "copy_to": "biblio" }, + "doi": { "type": "keyword", "doc_values": false }, + "doi_prefix": { "type": "keyword" }, + "doi_registrar": { "type": "keyword" }, + "pmid": { "type": "keyword", "doc_values": false }, + "pmcid": { "type": "keyword", "doc_values": false }, + "isbn13": { "type": "keyword", "doc_values": false }, + "wikidata_qid": { "type": "keyword", "doc_values": false }, + "core_id": { "type": "keyword", "doc_values": false }, + "axiv_id": { "type": "keyword", "doc_values": false }, + "jstor_id": { "type": "keyword", "doc_values": false }, + "ark_id": { "type": "keyword", "doc_values": false }, + "mag_id": { "type": "keyword", "doc_values": false }, "license": { "type": "keyword" }, "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "container_id": { "type": "keyword" }, "container_issnl": { "type": "keyword" }, "container_type": { "type": "keyword" }, "contrib_count": { "type": "integer" }, - "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "affiliations": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "affiliation_rors": { "type": "keyword" }, "creator_ids": { "type": "keyword" }, "ref_count": { "type": "integer" }, "ref_linked_count": { "type": "integer" }, + "ref_release_ids": { "type": "keyword" }, "file_count": { "type": "integer" }, "fileset_count": { "type": "integer" }, "webcapture_count": { "type": "integer" }, "any_abstract": { "type": "boolean" }, - "best_pdf_url": { "type": "keyword" }, - "ia_pdf_url": { "type": "keyword" }, + "biblio": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + + "best_pdf_url": { "type": "keyword", "doc_values": false }, + "ia_pdf_url": { "type": "keyword", "doc_values": false }, + "ia_microfilm_url": { "type": "keyword", "doc_values": false }, "is_oa": { "type": "boolean" }, + "oa_color": { "type": "keyword" }, "is_longtail_oa": { "type": "boolean" }, "is_preserved": { "type": "boolean" }, "in_kbart": { "type": "boolean" }, @@ -79,7 +93,13 @@ "in_ia_sim": { "type": "boolean" }, "in_shadows": { "type": "boolean" }, "is_superceded": { "type": "boolean" }, + "is_retracted": { "type": "boolean" }, + "preservation": { "type": "keyword" }, + "affilation": { "type": "alias", "path": "affiliations" }, + "ror": { "type": "alias", "path": "affiliation_rors" }, + "creator_id": { "type": "alias", "path": "creator_id" }, + "ref_release_id": { "type": "alias", "path": "ref_release_ids" }, "author": { "type": "alias", "path": "contrib_names" }, "journal": { "type": "alias", "path": "container_name" }, "date": { "type": "alias", "path": "release_date" }, @@ -90,6 +110,9 @@ "lang": { "type": "alias", "path": "language" }, "file_pdf_url": { "type": "alias", "path": "best_pdf_url" }, "release_status": { "type": "alias", "path": "release_stage" }, + "stage": { "type": "alias", "path": "release_stage" }, + "type": { "type": "alias", "path": "release_type" }, + "retracted": { "type": "alias", "path": "is_retracted" }, "is_kept": { "type": "alias", "path": "in_kbart" } } } diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index edc68748..b997796d 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -50,6 +50,10 @@ def release_to_elasticsearch(entity, force_bool=True): release_stage = release.release_stage, withdrawn_status = release.withdrawn_status, language = release.language, + volume = release.volume, + issue = release.issue, + pages = release.pages, + number = release.number, license = release.license_slug, doi = release.ext_ids.doi, pmid = release.ext_ids.pmid, @@ -72,7 +76,7 @@ def release_to_elasticsearch(entity, force_bool=True): in_dweb = False in_ia = False in_ia_sim = False - in_shadow = False + in_shadows = False release_year = release.release_year if release.release_date: @@ -85,11 +89,15 @@ def release_to_elasticsearch(entity, force_bool=True): t['any_abstract'] = len(release.abstracts or []) > 0 t['ref_count'] = len(release.refs or []) - t['ref_linked_count'] = 0 - if release.refs: - t['ref_linked_count'] = len([1 for ref in release.refs if ref.target_release_id]) + ref_release_ids = [] + for r in (release.refs or []): + if r.target_release_id: + ref_release_ids.append(r.target_release_id) + t['ref_release_ids'] = ref_release_ids + t['ref_linked_count'] = len(ref_release_ids) t['contrib_count'] = len(release.contribs or []) contrib_names = [] + contrib_affiliations = [] creator_ids = [] for c in (release.contribs or []): if c.raw_name: @@ -98,8 +106,14 @@ def release_to_elasticsearch(entity, force_bool=True): contrib_names.append(c.surname) if c.creator_id: creator_ids.append(c.creator_id) + if c.raw_affiliation: + contrib_affiliations.append(c.raw_affiliation) t['contrib_names'] = contrib_names t['creator_ids'] = creator_ids + t['affiliations'] = contrib_affiliations + + # TODO: mapping... probably by lookup? + t['affiliation_rors'] = None container = release.container if container: @@ -140,8 +154,13 @@ def release_to_elasticsearch(entity, force_bool=True): if c_extra.get('szczepanski'): if c_extra['szczepanski'].get('as_of'): is_oa = True - else: + + # fall back to release-level container metadata if container not linked or + # missing context + if not t.get('publisher'): t['publisher'] = release.publisher + if not t.get('container_name') and release.extra: + t['container_name'] = release.extra.get('container_name') if release.ext_ids.jstor or (release.ext_ids.doi and release.ext_ids.doi.startswith('10.2307/')): in_jstor = True @@ -203,6 +222,46 @@ def release_to_elasticsearch(entity, force_bool=True): if extra['crossref'].get('archive'): # all crossref archives are KBART, I believe in_kbart = True + # backwards compatible subtitle fetching + if not t['subtitle'] and extra.get('subtitle'): + if type(extra['subtitle']) == list: + t['subtitle'] = extra['subtitle'][0] + else: + t['subtitle'] = extra['subtitle'] + + t['first_page'] = None + if release.pages: + first = release.pages.split('-')[0] + first = first.replace('p', '') + if release.pages.isdigit(): + t['first_page'] = release.pages + # TODO: non-numerical first pages + + t['ia_microfilm_url'] = None + if in_ia_sim: + # TODO: determine URL somehow? I think this is in flux. Will probably + # need extra metadata in the container extra field. + # special case as a demo for now. + if release.container_id == "hl5g6d5msjcl7hlbyyvcsbhc2u" \ + and release.year in (2011, 2013) \ + and release.volume.isdigit() \ + and t['first_page']: + t['ia_microfilm_url'] = "https://archive.org/details/sim_bjog_{}-{:02d}/page/n{}".format( + release.year, + release.volume - 1, + t['first_page'], + ) + + t['doi_registrar'] = None + if extra and t['doi']: + for k in ('crossref', 'datacite', 'jalc'): + if k in extra: + t['doi_registrar'] = k + if not 'doi_registrar' in t: + t['doi_registrar'] = 'crossref' + + if t['doi']: + t['doi_prefix'] = t['doi'].split('/')[0] if is_longtail_oa: is_oa = True @@ -215,6 +274,7 @@ def release_to_elasticsearch(entity, force_bool=True): t['in_jstor'] = bool(in_jstor) t['in_web'] = bool(in_web) t['in_dweb'] = bool(in_dweb) + t['in_shadows'] = bool(in_shadows) else: t['is_oa'] = is_oa t['is_longtail_oa'] = is_longtail_oa @@ -223,9 +283,20 @@ def release_to_elasticsearch(entity, force_bool=True): t['in_jstor'] = in_jstor t['in_web'] = in_web t['in_dweb'] = in_dweb + t['in_shadows'] = in_shadows t['in_ia'] = bool(in_ia) t['is_preserved'] = bool(is_preserved or in_ia or in_kbart or in_jstor) + + if in_ia: + t['preservation'] = 'bright' + elif in_kbart or in_jstor: + t['preservation'] = 'dark_only' + elif in_shadows: + t['preservation'] = 'shadows_only' + else: + t['preservation'] = 'none' + return t def container_to_elasticsearch(entity, force_bool=True): -- cgit v1.2.3 From d58c3891ac2122dac53ced606568108f543f2d80 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 29 Jan 2020 21:52:58 -0800 Subject: actually implement changelog transform --- extra/elasticsearch/changelog_schema.json | 11 ++++- python/fatcat_tools/transforms/elasticsearch.py | 62 ++++++++++++++++++------- python/tests/transform_elasticsearch.py | 24 +++++++++- 3 files changed, 78 insertions(+), 19 deletions(-) (limited to 'extra') diff --git a/extra/elasticsearch/changelog_schema.json b/extra/elasticsearch/changelog_schema.json index f3211e99..77c77238 100644 --- a/extra/elasticsearch/changelog_schema.json +++ b/extra/elasticsearch/changelog_schema.json @@ -16,20 +16,29 @@ "changelog": { "properties": { "index": { "type": "integer" }, - "editgroup_id": { "type": "keyword" }, + "editgroup_id": { "type": "keyword", "doc_values": false }, "timestamp": { "type": "date" }, "editor_id": { "type": "keyword" }, "username": { "type": "keyword" }, "is_bot": { "type": "boolean" }, "is_admin": { "type": "boolean" }, "agent": { "type": "keyword" }, + "containers": { "type": "integer" }, + "new_containers": { "type": "integer" }, "creators": { "type": "integer" }, + "new_creators": { "type": "integer" }, "files": { "type": "integer" }, + "new_files": { "type": "integer" }, "filessets": { "type": "integer" }, + "new_filessets": { "type": "integer" }, "webcaptures": { "type": "integer" }, + "new_webcaptures": { "type": "integer" }, "releases": { "type": "integer" }, + "new_releases": { "type": "integer" }, "works": { "type": "integer" }, + "new_works": { "type": "integer" }, + "created": { "type": "integer" }, "updated": { "type": "integer" }, "deleted": { "type": "integer" }, diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index 812cd1fd..c8547b27 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -401,36 +401,64 @@ def container_to_elasticsearch(entity, force_bool=True): return t +def _type_of_edit(edit): + if edit.revision == None and edit.redirect_ident == None: + return 'delete' + elif edit.redirect_ident: + # redirect + return 'update' + elif edit.prev_revision == None and edit.redirect_ident == None and edit.revision: + return 'create' + else: + return 'update' + + def changelog_to_elasticsearch(entity): editgroup = entity.editgroup t = dict( index=entity.index, editgroup_id=entity.editgroup_id, - timestamp=entity.timestamp, + timestamp=entity.timestamp.isoformat(), editor_id=editgroup.editor_id, + username=editgroup.editor.username, + is_bot=editgroup.editor.is_bot, + is_admin=editgroup.editor.is_admin, ) extra = editgroup.extra or dict() if extra.get('agent'): t['agent'] = extra['agent'] - t['containers'] = len(editgroup.edits.containers) - t['creators'] = len(editgroup.edits.containers) - t['files'] = len(editgroup.edits.containers) - t['filesets'] = len(editgroup.edits.containers) - t['webcaptures'] = len(editgroup.edits.containers) - t['releases'] = len(editgroup.edits.containers) - t['works'] = len(editgroup.edits.containers) - - # TODO: parse and pull out counts - #created = 0 - #updated = 0 - #deleted = 0 - #t['created'] = created - #t['updated'] = updated - #t['deleted'] = deleted - #t['total'] = created + updated + deleted + containers = [_type_of_edit(e) for e in editgroup.edits.containers] + creators = [_type_of_edit(e) for e in editgroup.edits.creators] + files = [_type_of_edit(e) for e in editgroup.edits.files] + filesets = [_type_of_edit(e) for e in editgroup.edits.filesets] + webcaptures = [_type_of_edit(e) for e in editgroup.edits.webcaptures] + releases = [_type_of_edit(e) for e in editgroup.edits.releases] + works = [_type_of_edit(e) for e in editgroup.edits.works] + + t['containers'] = len(containers) + t['new_containers'] = len([e for e in containers if e == 'create']) + t['creators'] = len(creators) + t['new_creators'] = len([e for e in creators if e == 'create']) + t['files'] = len(files) + t['new_files'] = len([e for e in files if e == 'create']) + t['filesets'] = len(filesets) + t['new_filesets'] = len([e for e in filesets if e == 'create']) + t['webcaptures'] = len(webcaptures) + t['new_webcaptures'] = len([e for e in webcaptures if e == 'create']) + t['releases'] = len(releases) + t['new_releases'] = len([e for e in releases if e == 'create']) + t['works'] = len(works) + t['new_works'] = len([e for e in works if e == 'create']) + + all_edits = containers + creators + files + filesets + webcaptures + releases + works + + t['created'] = len([e for e in all_edits if e == 'create']) + t['updated'] = len([e for e in all_edits if e == 'update']) + t['deleted'] = len([e for e in all_edits if e == 'delete']) + t['total'] = len(all_edits) return t diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py index ab613a0a..89a4eef8 100644 --- a/python/tests/transform_elasticsearch.py +++ b/python/tests/transform_elasticsearch.py @@ -106,9 +106,31 @@ def test_elasticsearch_file_transform(matched_importer): assert es['size_bytes'] == f.size assert es['mimetype'] == f.mimetype assert es['in_ia'] == True - assert 'publisher' in es['rel'] + assert 'web' in es['rel'] # XXX: implement hosts and domain parsing with urlcanon #assert 'journals.plos.org' in es['host'] #assert 'plos.org' in es['domain'] +def test_elasticsearch_changelog_transform(matched_importer): + ce = entity_from_json(open('./tests/files/changelog_3469683.json', 'r').read(), ChangelogEntry) + + es = changelog_to_elasticsearch(ce) + assert es['index'] == 3469683 + # len("2020-01-30T05:04:39") => 19 + assert es['timestamp'][:19] == "2020-01-30T05:04:39.738601Z"[:19] + assert es['editor_id'] == "scmbogxw25evtcesfcab5qaboa" + assert es['username'] == "crawl-bot" + assert es['is_bot'] == True + assert es['is_admin'] == True + assert es['agent'] == "fatcat_tools.IngestFileResultImporter" + + assert es['total'] == 50 + assert es['files'] == 50 + assert es['new_files'] == 50 + assert es['created'] == 50 + + assert es['releases'] == 0 + assert es['new_releases'] == 0 + assert es['updated'] == 0 + assert es['deleted'] == 0 -- cgit v1.2.3 From d5d83762063b8ec7f512c20567f46c03f2e6b542 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 29 Jan 2020 21:57:32 -0800 Subject: update ES docs and proposal --- extra/elasticsearch/README.md | 2 ++ proposals/2020_elasticsearch_schemas.md | 8 ++++---- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'extra') diff --git a/extra/elasticsearch/README.md b/extra/elasticsearch/README.md index 3a48a178..3e0857b4 100644 --- a/extra/elasticsearch/README.md +++ b/extra/elasticsearch/README.md @@ -40,9 +40,11 @@ Drop and rebuild the schema: http delete :9200/fatcat_release http delete :9200/fatcat_container + http delete :9200/fatcat_file http delete :9200/fatcat_changelog http put :9200/fatcat_release < release_schema.json http put :9200/fatcat_container < container_schema.json + http put :9200/fatcat_file < file_schema.json http put :9200/fatcat_changelog < changelog_schema.json Put a single object (good for debugging): diff --git a/proposals/2020_elasticsearch_schemas.md b/proposals/2020_elasticsearch_schemas.md index 83db884f..5fb28d19 100644 --- a/proposals/2020_elasticsearch_schemas.md +++ b/proposals/2020_elasticsearch_schemas.md @@ -14,8 +14,6 @@ Simple additions: - pages - `first_page` (parsed from pages) (?) - number -- `in_shadow` -- OA license slug (?) - `doi_prefix` - `doi_registrar` (based on extra) - `first_author` (surname; for matching) @@ -25,6 +23,8 @@ Simple additions: - referenced releases idents - contrib creator idents +Add affiliations, both as raw strings and ROR identifiers. + ## Preservation Summary Field @@ -128,8 +128,8 @@ hit does not}"). ## Container Fields -- `all_issns` -- `release_count` +- `issn` (all issns) +- `original_name` The `release_count` would not be indexed (left null) by default, and would be "patched" in to entities by a separate script (periodically?). -- cgit v1.2.3 From bf718fd076476c1a54e80ca88cd02ede606ab6f3 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 29 Jan 2020 23:23:39 -0800 Subject: add country to v03b release schema --- extra/elasticsearch/release_schema.json | 1 + python/fatcat_tools/transforms/elasticsearch.py | 2 ++ 2 files changed, 3 insertions(+) (limited to 'extra') diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index 98a1c28e..2b67c5f5 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -40,6 +40,7 @@ "release_stage": { "type": "keyword" }, "withdrawn_status": { "type": "keyword", "copy_to": "biblio" }, "language": { "type": "keyword" }, + "country": { "type": "keyword" }, "volume": { "type": "keyword", "copy_to": "biblio" }, "issue": { "type": "keyword", "copy_to": "biblio" }, "pages": { "type": "keyword", "copy_to": "biblio" }, diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index c8547b27..f0146d01 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -155,6 +155,8 @@ def release_to_elasticsearch(entity, force_bool=True): if c_extra.get('szczepanski'): if c_extra['szczepanski'].get('as_of'): is_oa = True + if c_extra.get('country'): + t['country'] = c_extra['country'] # fall back to release-level container metadata if container not linked or # missing context -- cgit v1.2.3 From e98f389a53d886b4fa8f0237b90b086999770f78 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 29 Jan 2020 23:26:58 -0800 Subject: elastic schema fixes --- extra/elasticsearch/file_schema.json | 12 ++++++------ extra/elasticsearch/release_schema.json | 2 +- python/fatcat_tools/transforms/elasticsearch.py | 5 +++++ 3 files changed, 12 insertions(+), 7 deletions(-) (limited to 'extra') diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json index 66d81e0b..2a7e5be0 100644 --- a/extra/elasticsearch/file_schema.json +++ b/extra/elasticsearch/file_schema.json @@ -13,7 +13,7 @@ } }, "mappings": { - "changelog": { + "file": { "properties": { "ident": { "type": "keyword", "doc_values": false }, "state": { "type": "keyword" }, @@ -33,13 +33,13 @@ "in_ia": { "type": "boolean" }, "release_id": { "type": "alias", "path": "release_ids" }, - "sha1hex": { "type": "alias", "path": "sha1hex" }, - "sha256hex": { "type": "alias", "path": "sha256hex" }, - "md5hex": { "type": "alias", "path": "md5hex" }, + "sha1hex": { "type": "alias", "path": "sha1" }, + "sha256hex": { "type": "alias", "path": "sha256" }, + "md5hex": { "type": "alias", "path": "md5" }, "size": { "type": "alias", "path": "size_bytes" }, "domain": { "type": "alias", "path": "domains" }, - "host": { "type": "alias", "path": "host" }, - "rel": { "type": "alias", "path": "rel" } + "host": { "type": "alias", "path": "hosts" }, + "rel": { "type": "alias", "path": "rels" } } } } diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index 2b67c5f5..3d301dba 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -99,7 +99,7 @@ "affilation": { "type": "alias", "path": "affiliations" }, "ror": { "type": "alias", "path": "affiliation_rors" }, - "creator_id": { "type": "alias", "path": "creator_id" }, + "creator_id": { "type": "alias", "path": "creator_ids" }, "ref_release_id": { "type": "alias", "path": "ref_release_ids" }, "author": { "type": "alias", "path": "contrib_names" }, "journal": { "type": "alias", "path": "container_name" }, diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index f0146d01..42669bbf 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -416,6 +416,11 @@ def _type_of_edit(edit): def changelog_to_elasticsearch(entity): + """ + Note that this importer requires expanded fill info to work. Calling code + may need to re-fetch editgroup from API to get the 'editor' field. Some of + the old kafka feed content doesn't includes editor in particular. + """ editgroup = entity.editgroup t = dict( -- cgit v1.2.3 From 59912583926077260d99a9bf77a938c2215eb6c8 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 30 Jan 2020 00:20:34 -0800 Subject: tweak file ES archive.org domain tracking --- extra/elasticsearch/file_schema.json | 1 + python/fatcat_tools/transforms/elasticsearch.py | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'extra') diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json index 2a7e5be0..a0ac3346 100644 --- a/extra/elasticsearch/file_schema.json +++ b/extra/elasticsearch/file_schema.json @@ -31,6 +31,7 @@ "hosts": { "type": "keyword" }, "rels": { "type": "keyword" }, "in_ia": { "type": "boolean" }, + "in_ia_petabox": { "type": "boolean" }, "release_id": { "type": "alias", "path": "release_ids" }, "sha1hex": { "type": "alias", "path": "sha1" }, diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index e1980d90..9aa3cece 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -505,5 +505,11 @@ def file_to_elasticsearch(entity): t['rels'] = list(set([u.rel for u in entity.urls])) t['in_ia'] = bool('archive.org' in t['domains']) + t['in_ia_petabox'] = bool('archive.org' in t['hosts']) + + # ok, but actually remove archive.org hosts, because they make other + # aggregations hard and are a waste of storage + t['hosts'] = [h for h in t['hosts'] if h not in ('archive.org', 'web.archive.org')] + t['domains'] = [h for h in t['hosts'] if h not in ('archive.org')] return t -- cgit v1.2.3 From b7404fb0f696807db3a92bc2c4c73c2d208e59ef Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 30 Jan 2020 00:51:56 -0800 Subject: ES schemas: make keywords case-insensitive by default But not applying asciifolding; don't see any need to do so? --- extra/elasticsearch/changelog_schema.json | 20 +++++-- extra/elasticsearch/container_schema.json | 38 ++++++++----- extra/elasticsearch/file_schema.json | 34 ++++++++---- extra/elasticsearch/release_schema.json | 89 ++++++++++++++++++------------- 4 files changed, 115 insertions(+), 66 deletions(-) (limited to 'extra') diff --git a/extra/elasticsearch/changelog_schema.json b/extra/elasticsearch/changelog_schema.json index 77c77238..d958fed9 100644 --- a/extra/elasticsearch/changelog_schema.json +++ b/extra/elasticsearch/changelog_schema.json @@ -8,6 +8,18 @@ "tokenizer": "standard", "filter": [ "lowercase", "asciifolding" ] } + }, + "normalizer": { + "default": { + "type": "custom", + "char_filter": [], + "filter": ["lowercase"] + }, + "caseSensitive": { + "type": "custom", + "char_filter": [], + "filter": [] + } } } } @@ -16,13 +28,13 @@ "changelog": { "properties": { "index": { "type": "integer" }, - "editgroup_id": { "type": "keyword", "doc_values": false }, + "editgroup_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, "timestamp": { "type": "date" }, - "editor_id": { "type": "keyword" }, - "username": { "type": "keyword" }, + "editor_id": { "type": "keyword", "normalizer": "default" }, + "username": { "type": "keyword", "normalize": "caseSensitive" }, "is_bot": { "type": "boolean" }, "is_admin": { "type": "boolean" }, - "agent": { "type": "keyword" }, + "agent": { "type": "keyword", "normalize": "caseSensitive" }, "containers": { "type": "integer" }, "new_containers": { "type": "integer" }, diff --git a/extra/elasticsearch/container_schema.json b/extra/elasticsearch/container_schema.json index 3be261a2..be3a408e 100644 --- a/extra/elasticsearch/container_schema.json +++ b/extra/elasticsearch/container_schema.json @@ -20,6 +20,18 @@ "char_filter": [ "icu_normalizer" ], "filter": [ "icu_folding" ] } + }, + "normalizer": { + "default": { + "type": "custom", + "char_filter": [], + "filter": ["lowercase"] + }, + "caseSensitive": { + "type": "custom", + "char_filter": [], + "filter": [] + } } } } @@ -27,23 +39,23 @@ "mappings": { "container": { "properties": { - "ident": { "type": "keyword", "doc_values": false }, - "state": { "type": "keyword" }, - "revision": { "type": "keyword", "doc_values": false }, + "ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "state": { "type": "keyword", "normalizer": "default" }, + "revision": { "type": "keyword", "normalizer": "default", "doc_values": false }, "name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "original_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "abbrev": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "aliases": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, - "container_type": { "type": "keyword" }, - "issnl": { "type": "keyword" }, - "issns": { "type": "keyword" }, - "wikidata_qid": { "type": "keyword" }, - "country": { "type": "keyword" }, - "region": { "type": "keyword" }, - "discipline": { "type": "keyword" }, - "languages": { "type": "keyword" }, - "mimetypes": { "type": "keyword" }, + "container_type": { "type": "keyword", "normalizer": "default" }, + "issnl": { "type": "keyword", "normalizer": "default" }, + "issns": { "type": "keyword", "normalizer": "default" }, + "wikidata_qid": { "type": "keyword", "normalizer": "default" }, + "country": { "type": "keyword", "normalizer": "default" }, + "region": { "type": "keyword", "normalizer": "default" }, + "discipline": { "type": "keyword", "normalizer": "default" }, + "languages": { "type": "keyword", "normalizer": "default" }, + "mimetypes": { "type": "keyword", "normalizer": "default" }, "first_year": { "type": "integer" }, "last_year": { "type": "integer" }, @@ -57,7 +69,7 @@ "any_kbart": { "type": "boolean" }, "any_jstor": { "type": "boolean" }, "any_ia_sim": { "type": "boolean" }, - "sherpa_romeo_color": { "type": "keyword" }, + "sherpa_romeo_color": { "type": "keyword", "normalizer": "default" }, "releases_total": { "type": "integer" }, "releases_kbart": { "type": "integer" }, diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json index a0ac3346..9c8ee64c 100644 --- a/extra/elasticsearch/file_schema.json +++ b/extra/elasticsearch/file_schema.json @@ -8,6 +8,18 @@ "tokenizer": "standard", "filter": [ "lowercase", "asciifolding" ] } + }, + "normalizer": { + "default": { + "type": "custom", + "char_filter": [], + "filter": ["lowercase"] + }, + "caseSensitive": { + "type": "custom", + "char_filter": [], + "filter": [] + } } } } @@ -15,21 +27,21 @@ "mappings": { "file": { "properties": { - "ident": { "type": "keyword", "doc_values": false }, - "state": { "type": "keyword" }, - "revision": { "type": "keyword", "doc_values": false }, + "ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "state": { "type": "keyword", "normalizer": "default" }, + "revision": { "type": "keyword", "normalizer": "default", "doc_values": false }, - "release_ids": { "type": "keyword", "doc_values": false }, + "release_ids": { "type": "keyword", "normalizer": "default", "doc_values": false }, "release_count": { "type": "integer" }, - "mimetype": { "type": "keyword" }, + "mimetype": { "type": "keyword", "normalizer": "default" }, "size_bytes": { "type": "integer" }, - "sha1": { "type": "keyword", "doc_values": false }, - "sha256": { "type": "keyword", "doc_values": false }, - "md5": { "type": "keyword", "doc_values": false }, + "sha1": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "sha256": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "md5": { "type": "keyword", "normalizer": "default", "doc_values": false }, - "domains": { "type": "keyword" }, - "hosts": { "type": "keyword" }, - "rels": { "type": "keyword" }, + "domains": { "type": "keyword", "normalizer": "default" }, + "hosts": { "type": "keyword", "normalizer": "default" }, + "rels": { "type": "keyword", "normalizer": "default" }, "in_ia": { "type": "boolean" }, "in_ia_petabox": { "type": "boolean" }, diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index 3d301dba..f983a703 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -20,58 +20,71 @@ "char_filter": [ "icu_normalizer" ], "filter": [ "icu_folding" ] } + }, + "normalizer": { + "default": { + "type": "custom", + "char_filter": [], + "filter": ["lowercase"] + }, + "caseSensitive": { + "type": "custom", + "char_filter": [], + "filter": [] + } } } } +} }, "mappings": { "release": { "properties": { - "ident": { "type": "keyword", "doc_values": false }, - "state": { "type": "keyword" }, - "revision": { "type": "keyword", "doc_values": false }, - "work_id": { "type": "keyword", "doc_values": false }, + "ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "state": { "type": "keyword", "normalizer": "default" }, + "revision": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "work_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "release_date": { "type": "date" }, "release_year": { "type": "integer", "copy_to": "biblio" }, - "release_type": { "type": "keyword", "copy_to": "biblio" }, - "release_stage": { "type": "keyword" }, - "withdrawn_status": { "type": "keyword", "copy_to": "biblio" }, - "language": { "type": "keyword" }, - "country": { "type": "keyword" }, - "volume": { "type": "keyword", "copy_to": "biblio" }, - "issue": { "type": "keyword", "copy_to": "biblio" }, - "pages": { "type": "keyword", "copy_to": "biblio" }, - "first_page": { "type": "keyword" }, - "number": { "type": "keyword", "copy_to": "biblio" }, - "doi": { "type": "keyword", "doc_values": false }, - "doi_prefix": { "type": "keyword" }, - "doi_registrar": { "type": "keyword" }, - "pmid": { "type": "keyword", "doc_values": false }, - "pmcid": { "type": "keyword", "doc_values": false }, - "isbn13": { "type": "keyword", "doc_values": false }, - "wikidata_qid": { "type": "keyword", "doc_values": false }, - "core_id": { "type": "keyword", "doc_values": false }, - "axiv_id": { "type": "keyword", "doc_values": false }, - "jstor_id": { "type": "keyword", "doc_values": false }, - "ark_id": { "type": "keyword", "doc_values": false }, - "mag_id": { "type": "keyword", "doc_values": false }, - "license": { "type": "keyword" }, + "release_type": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "release_stage": { "type": "keyword", "normalizer": "default" }, + "withdrawn_status": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "language": { "type": "keyword", "normalizer": "default" }, + "country": { "type": "keyword", "normalizer": "default" }, + "volume": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "issue": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "pages": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "first_page": { "type": "keyword", "normalizer": "default" }, + "number": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "doi": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "doi_prefix": { "type": "keyword", "normalizer": "default" }, + "doi_registrar": { "type": "keyword", "normalizer": "default" }, + "pmid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "pmcid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "isbn13": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "wikidata_qid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "core_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "axiv_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "jstor_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "ark_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "mag_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "license": { "type": "keyword", "normalizer": "default" }, "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, - "container_id": { "type": "keyword" }, - "container_issnl": { "type": "keyword" }, - "container_type": { "type": "keyword" }, + "container_id": { "type": "keyword", "normalizer": "default" }, + "container_issnl": { "type": "keyword", "normalizer": "default" }, + "container_type": { "type": "keyword", "normalizer": "default" }, "contrib_count": { "type": "integer" }, "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "affiliations": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "affiliation_rors": { "type": "keyword" }, - "creator_ids": { "type": "keyword" }, + "affiliation_rors": { "type": "keyword", "normalizer": "default" }, + "creator_ids": { "type": "keyword", "normalizer": "default" }, "ref_count": { "type": "integer" }, "ref_linked_count": { "type": "integer" }, - "ref_release_ids": { "type": "keyword" }, + "ref_release_ids": { "type": "keyword", "normalizer": "default" }, "file_count": { "type": "integer" }, "fileset_count": { "type": "integer" }, "webcapture_count": { "type": "integer" }, @@ -79,11 +92,11 @@ "biblio": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "best_pdf_url": { "type": "keyword", "doc_values": false }, - "ia_pdf_url": { "type": "keyword", "doc_values": false }, - "ia_microfilm_url": { "type": "keyword", "doc_values": false }, + "best_pdf_url": { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false }, + "ia_pdf_url": { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false }, + "ia_microfilm_url": { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false }, "is_oa": { "type": "boolean" }, - "oa_color": { "type": "keyword" }, + "oa_color": { "type": "keyword", "normalizer": "default" }, "is_longtail_oa": { "type": "boolean" }, "is_preserved": { "type": "boolean" }, "in_kbart": { "type": "boolean" }, @@ -95,7 +108,7 @@ "in_shadows": { "type": "boolean" }, "is_superceded": { "type": "boolean" }, "is_retracted": { "type": "boolean" }, - "preservation": { "type": "keyword" }, + "preservation": { "type": "keyword", "normalizer": "default" }, "affilation": { "type": "alias", "path": "affiliations" }, "ror": { "type": "alias", "path": "affiliation_rors" }, -- cgit v1.2.3 From ca283a45cc151f3346e403c8d57f55ec75f40672 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 30 Jan 2020 01:00:12 -0800 Subject: JSON typo in release mapping --- extra/elasticsearch/release_schema.json | 1 - 1 file changed, 1 deletion(-) (limited to 'extra') diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index f983a703..07601f36 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -35,7 +35,6 @@ } } } -} }, "mappings": { "release": { -- cgit v1.2.3 From caa588612b91181950697756eace8fda270fd092 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 30 Jan 2020 01:03:08 -0800 Subject: add upper-case work-around from kibana map join --- extra/elasticsearch/release_schema.json | 1 + python/fatcat_tools/transforms/elasticsearch.py | 1 + 2 files changed, 2 insertions(+) (limited to 'extra') diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index 07601f36..c0bbda22 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -53,6 +53,7 @@ "withdrawn_status": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, "language": { "type": "keyword", "normalizer": "default" }, "country": { "type": "keyword", "normalizer": "default" }, + "country_upper": { "type": "keyword", "normalizer": "caseSensitive" }, "volume": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, "issue": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, "pages": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index 9aa3cece..ded239d3 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -157,6 +157,7 @@ def release_to_elasticsearch(entity, force_bool=True): is_oa = True if c_extra.get('country'): t['country'] = c_extra['country'] + t['country_upper'] = c_extra['country'].upper() # fall back to release-level container metadata if container not linked or # missing context -- cgit v1.2.3 From 8aac86c4484f0376c46cdd51c69d5ada478b7f72 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 30 Jan 2020 01:12:51 -0800 Subject: fix json typos in changelog schema --- extra/elasticsearch/changelog_schema.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'extra') diff --git a/extra/elasticsearch/changelog_schema.json b/extra/elasticsearch/changelog_schema.json index d958fed9..d8342549 100644 --- a/extra/elasticsearch/changelog_schema.json +++ b/extra/elasticsearch/changelog_schema.json @@ -31,10 +31,10 @@ "editgroup_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, "timestamp": { "type": "date" }, "editor_id": { "type": "keyword", "normalizer": "default" }, - "username": { "type": "keyword", "normalize": "caseSensitive" }, + "username": { "type": "keyword", "normalizer": "caseSensitive" }, "is_bot": { "type": "boolean" }, "is_admin": { "type": "boolean" }, - "agent": { "type": "keyword", "normalize": "caseSensitive" }, + "agent": { "type": "keyword", "normalizer": "caseSensitive" }, "containers": { "type": "integer" }, "new_containers": { "type": "integer" }, -- cgit v1.2.3 From fbd79c7315cad4789eb0e92c136c59da8f38c4f3 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 31 Jan 2020 13:33:38 -0800 Subject: ES release schema: fix typo --- extra/elasticsearch/release_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'extra') diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index c0bbda22..2cc9169c 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -110,7 +110,7 @@ "is_retracted": { "type": "boolean" }, "preservation": { "type": "keyword", "normalizer": "default" }, - "affilation": { "type": "alias", "path": "affiliations" }, + "affiliation": { "type": "alias", "path": "affiliations" }, "ror": { "type": "alias", "path": "affiliation_rors" }, "creator_id": { "type": "alias", "path": "creator_ids" }, "ref_release_id": { "type": "alias", "path": "ref_release_ids" }, -- cgit v1.2.3 From 8007cdfc4e06753a9bbba56d1fa7f9686775e5e8 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 4 Feb 2020 15:10:26 -0800 Subject: fix axiv/arxiv typo in release schema --- extra/elasticsearch/release_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'extra') diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index 2cc9169c..607bacf1 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -67,7 +67,7 @@ "isbn13": { "type": "keyword", "normalizer": "default", "doc_values": false }, "wikidata_qid": { "type": "keyword", "normalizer": "default", "doc_values": false }, "core_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, - "axiv_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "arxiv_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, "jstor_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, "ark_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, "mag_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, -- cgit v1.2.3 From 3655bbe6c539fdeccfbfaa19b6fc93a4859e0ca7 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 5 Feb 2020 15:42:43 -0800 Subject: ES release: actually do want doc_values for work_id Eg, for fast "unique count" --- extra/elasticsearch/release_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'extra') diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index 607bacf1..b85fc8a4 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -42,7 +42,7 @@ "ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, "state": { "type": "keyword", "normalizer": "default" }, "revision": { "type": "keyword", "normalizer": "default", "doc_values": false }, - "work_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "work_id": { "type": "keyword", "normalizer": "default" }, "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, -- cgit v1.2.3 From 2f8788152ff740d049d11e2e263cac978d526e2a Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 13 Feb 2020 14:22:59 -0800 Subject: release schema: do doc_value on DOIs Because DOIs are pseudo-structured (prefix, and often structure within the publisher-controlled area), I suspect we will in fact be wanting to do analytics over these strings. --- extra/elasticsearch/release_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'extra') diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index b85fc8a4..1b91696c 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -59,7 +59,7 @@ "pages": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, "first_page": { "type": "keyword", "normalizer": "default" }, "number": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, - "doi": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "doi": { "type": "keyword", "normalizer": "default" }, "doi_prefix": { "type": "keyword", "normalizer": "default" }, "doi_registrar": { "type": "keyword", "normalizer": "default" }, "pmid": { "type": "keyword", "normalizer": "default", "doc_values": false }, -- cgit v1.2.3 From 0450f22006c9b991cdc4695458fc3b3e3e97bfbb Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 26 Feb 2020 11:22:30 -0800 Subject: ES release: last minor tweaks --- extra/elasticsearch/release_schema.json | 8 +++++--- python/fatcat_tools/transforms/elasticsearch.py | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'extra') diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index 1b91696c..666a672f 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -52,8 +52,8 @@ "release_stage": { "type": "keyword", "normalizer": "default" }, "withdrawn_status": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, "language": { "type": "keyword", "normalizer": "default" }, - "country": { "type": "keyword", "normalizer": "default" }, - "country_upper": { "type": "keyword", "normalizer": "caseSensitive" }, + "country_code": { "type": "keyword", "normalizer": "default" }, + "country_code_upper": { "type": "keyword", "normalizer": "caseSensitive" }, "volume": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, "issue": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, "pages": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, @@ -71,8 +71,10 @@ "jstor_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, "ark_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, "mag_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "s2_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, "license": { "type": "keyword", "normalizer": "default" }, "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "publisher_type": { "type": "keyword", "normalizer": "default" }, "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "container_id": { "type": "keyword", "normalizer": "default" }, "container_issnl": { "type": "keyword", "normalizer": "default" }, @@ -110,7 +112,7 @@ "is_retracted": { "type": "boolean" }, "preservation": { "type": "keyword", "normalizer": "default" }, - "affiliation": { "type": "alias", "path": "affiliations" }, + "affiliation": { "type": "alias", "path": "affiliations" }, "ror": { "type": "alias", "path": "affiliation_rors" }, "creator_id": { "type": "alias", "path": "creator_ids" }, "ref_release_id": { "type": "alias", "path": "ref_release_ids" }, diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index e00d7830..cbafca7e 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -156,8 +156,8 @@ def release_to_elasticsearch(entity, force_bool=True): if c_extra['szczepanski'].get('as_of'): is_oa = True if c_extra.get('country'): - t['country'] = c_extra['country'] - t['country_upper'] = c_extra['country'].upper() + t['country_code'] = c_extra['country'] + t['country_code_upper'] = c_extra['country'].upper() # fall back to release-level container metadata if container not linked or # missing context -- cgit v1.2.3 From 4e6bc246d01183f4c7ffad7d0d474e683f04c07f Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 26 Feb 2020 11:28:05 -0800 Subject: ES container last tweaks --- extra/elasticsearch/container_schema.json | 7 ++++--- python/fatcat_tools/transforms/elasticsearch.py | 3 +++ 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'extra') diff --git a/extra/elasticsearch/container_schema.json b/extra/elasticsearch/container_schema.json index be3a408e..5cd85b04 100644 --- a/extra/elasticsearch/container_schema.json +++ b/extra/elasticsearch/container_schema.json @@ -47,11 +47,12 @@ "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "abbrev": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "aliases": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "publisher_type": { "type": "keyword", "normalizer": "default" }, "container_type": { "type": "keyword", "normalizer": "default" }, "issnl": { "type": "keyword", "normalizer": "default" }, "issns": { "type": "keyword", "normalizer": "default" }, "wikidata_qid": { "type": "keyword", "normalizer": "default" }, - "country": { "type": "keyword", "normalizer": "default" }, + "country_code": { "type": "keyword", "normalizer": "default" }, "region": { "type": "keyword", "normalizer": "default" }, "discipline": { "type": "keyword", "normalizer": "default" }, "languages": { "type": "keyword", "normalizer": "default" }, @@ -74,8 +75,8 @@ "releases_total": { "type": "integer" }, "releases_kbart": { "type": "integer" }, "releases_ia": { "type": "integer" }, - "releases_sim": { "type": "integer" }, - "releases_shadow": { "type": "integer" }, + "releases_ia_sim": { "type": "integer" }, + "releases_shadows": { "type": "integer" }, "releases_any_file": { "type": "integer" }, "releases_any_fileset": { "type": "integer" }, "releases_any_webcapture": { "type": "integer" }, diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index cbafca7e..8581febd 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -342,6 +342,9 @@ def container_to_elasticsearch(entity, force_bool=True): if entity.extra.get(key): t[key] = entity.extra[key] + if 'country' in t: + t['country_code'] = t.pop('country') + t['issns'] = [] if entity.issnl: t['issns'].append(entity.issnl) -- cgit v1.2.3 From 0ab3f66664fd4cc63cf9040e351d725c6a5c22b9 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 26 Feb 2020 12:27:28 -0800 Subject: update ES transform README - smaller batch sizes to prevent esbulk errors - file transform/index --- extra/elasticsearch/README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'extra') diff --git a/extra/elasticsearch/README.md b/extra/elasticsearch/README.md index 3e0857b4..df4cb918 100644 --- a/extra/elasticsearch/README.md +++ b/extra/elasticsearch/README.md @@ -59,8 +59,9 @@ Bulk insert from a file on disk: Or, in a bulk production live-stream conversion: export LC_ALL=C.UTF-8 - time zcat /srv/fatcat/snapshots/release_export_expanded.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-releases - - | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat_release -type release - time zcat /srv/fatcat/snapshots/container_export.json.gz | pv -l | ./fatcat_transform.py elasticsearch-containers - - | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat_container -type container + time zcat /srv/fatcat/snapshots/release_export_expanded.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-releases - - | esbulk -verbose -size 2000 -id ident -w 8 -index fatcat_release -type release + time zcat /srv/fatcat/snapshots/container_export.json.gz | pv -l | ./fatcat_transform.py elasticsearch-containers - - | esbulk -verbose -size 2000 -id ident -w 8 -index fatcat_container -type container + time zcat /srv/fatcat/snapshots/file_export.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-files - - | esbulk -verbose -size 2000 -id ident -w 8 -index fatcat_file -type file ## Index Aliases -- cgit v1.2.3