From 901cf998ce7d8f896cf5d609719b1defd96d01d4 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 29 Jan 2020 16:00:03 -0800 Subject: first implementation of ES file schema Includes a trivial test and transform, but not any workers or doc updates. --- extra/elasticsearch/file_schema.json | 46 +++++++++++++++++++++++++ python/fatcat_tools/transforms/__init__.py | 2 +- python/fatcat_tools/transforms/elasticsearch.py | 45 ++++++++++++++++++++++++ python/tests/transform_tests.py | 25 ++++++++++++-- 4 files changed, 115 insertions(+), 3 deletions(-) create mode 100644 extra/elasticsearch/file_schema.json diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json new file mode 100644 index 00000000..66d81e0b --- /dev/null +++ b/extra/elasticsearch/file_schema.json @@ -0,0 +1,46 @@ +{ +"settings": { + "index": { + "analysis": { + "analyzer": { + "default": { + "type": "custom", + "tokenizer": "standard", + "filter": [ "lowercase", "asciifolding" ] + } + } + } + } +}, +"mappings": { + "changelog": { + "properties": { + "ident": { "type": "keyword", "doc_values": false }, + "state": { "type": "keyword" }, + "revision": { "type": "keyword", "doc_values": false }, + + "release_ids": { "type": "keyword", "doc_values": false }, + "release_count": { "type": "integer" }, + "mimetype": { "type": "keyword" }, + "size_bytes": { "type": "integer" }, + "sha1": { "type": "keyword", "doc_values": false }, + "sha256": { "type": "keyword", "doc_values": false }, + "md5": { "type": "keyword", "doc_values": false }, + + "domains": { "type": "keyword" }, + "hosts": { "type": "keyword" }, + "rels": { "type": "keyword" }, + "in_ia": { "type": "boolean" }, + + "release_id": { "type": "alias", "path": "release_ids" }, + "sha1hex": { "type": "alias", "path": "sha1hex" }, + "sha256hex": { "type": "alias", "path": "sha256hex" }, + "md5hex": { "type": "alias", "path": "md5hex" }, + "size": { "type": "alias", "path": "size_bytes" }, + "domain": { "type": "alias", 
"path": "domains" }, + "host": { "type": "alias", "path": "host" }, + "rel": { "type": "alias", "path": "rel" } + } + } +} +} diff --git a/python/fatcat_tools/transforms/__init__.py b/python/fatcat_tools/transforms/__init__.py index 6a4b1bba..3f4700ff 100644 --- a/python/fatcat_tools/transforms/__init__.py +++ b/python/fatcat_tools/transforms/__init__.py @@ -1,5 +1,5 @@ from .entities import entity_to_dict, entity_from_json, entity_from_dict -from .elasticsearch import release_to_elasticsearch, container_to_elasticsearch, changelog_to_elasticsearch +from .elasticsearch import release_to_elasticsearch, container_to_elasticsearch, changelog_to_elasticsearch, file_to_elasticsearch from .csl import release_to_csl, citeproc_csl from .ingest import release_ingest_request diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index 3a53db4d..8141a8b9 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -357,3 +357,48 @@ def changelog_to_elasticsearch(entity): #t['deleted'] = deleted #t['total'] = created + updated + deleted return t + + +def file_to_elasticsearch(entity): + """ + Converts from an entity model/schema to elasticsearch oriented schema. 
+ + Returns: dict + Raises exception on error (never returns None) + """ + + if entity.state in ('redirect', 'deleted'): + return dict( + ident = entity.ident, + state = entity.state, + ) + elif entity.state != 'active': + raise ValueError("Unhandled entity state: {}".format(entity.state)) + + # First, the easy ones (direct copy) + t = dict( + ident = entity.ident, + state = entity.state, + revision = entity.revision, + release_ids = entity.release_ids, + release_count = len(entity.release_ids), + mimetype = entity.mimetype, + size_bytes = entity.size, + sha1 = entity.sha1, + sha256 = entity.sha256, + md5 = entity.md5, + rel = [u.rel for u in entity.urls], + ) + + # TODO: domain, hosts (from urls; use proper urlcanon) + t['rel'] = list(set([u.rel for u in entity.urls])) + t['host'] = [] + t['domain'] = [] + + in_ia = False + for u in entity.urls: + if '://archive.org/' in u.url or '://web.archive.org/' in u.url: + in_ia = True + t['in_ia'] = bool(in_ia) + + return t diff --git a/python/tests/transform_tests.py b/python/tests/transform_tests.py index f254e117..7b583ac4 100644 --- a/python/tests/transform_tests.py +++ b/python/tests/transform_tests.py @@ -7,6 +7,7 @@ from fixtures import api from import_journal_metadata import journal_metadata_importer from import_crossref import crossref_importer +from import_matched import matched_importer def test_basic_elasticsearch_convert(crossref_importer): with open('tests/files/crossref-works.single.json', 'r') as f: @@ -72,14 +73,34 @@ def test_rich_elasticsearch_convert(): assert es['ref_count'] == 2 assert es['ref_linked_count'] == 1 -def test_elasticsearch_from_json(): +def test_elasticsearch_release_from_json(): r = entity_from_json(open('./tests/files/math_universe.json', 'r').read(), ReleaseEntity) release_to_elasticsearch(r) -def test_elasticsearch_container_convert(journal_metadata_importer): +def test_elasticsearch_container_transform(journal_metadata_importer): with open('tests/files/journal_metadata.sample.json', 
'r') as f: raw = json.loads(f.readline()) c = journal_metadata_importer.parse_record(raw) c.state = 'active' es = container_to_elasticsearch(c) assert es['publisher'] == c.publisher + +def test_elasticsearch_file_transform(matched_importer): + with open('tests/files/example_matched.json', 'r') as f: + raw = json.loads(f.readline()) + f = matched_importer.parse_record(raw) + + f.state = 'active' + es = file_to_elasticsearch(f) + assert es['sha1'] == f.sha1 + assert es['sha256'] == f.sha256 + assert es['md5'] == f.md5 + assert es['size_bytes'] == f.size + assert es['mimetype'] == f.mimetype + assert es['in_ia'] == True + assert 'publisher' in es['rel'] + + # XXX: implement hosts and domain parsing with urlcanon + #assert 'journals.plos.org' in es['host'] + #assert 'plos.org' in es['domain'] + -- cgit v1.2.3 From 8e8b447a1d142b7815498ffa02263c34207973b4 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 29 Jan 2020 16:21:13 -0800 Subject: container ES schema changes --- extra/elasticsearch/container_schema.json | 33 ++++++++++++++---------- python/fatcat_tools/transforms/elasticsearch.py | 34 +++++++++++++------------ 2 files changed, 38 insertions(+), 29 deletions(-) diff --git a/extra/elasticsearch/container_schema.json b/extra/elasticsearch/container_schema.json index b0a47e85..3be261a2 100644 --- a/extra/elasticsearch/container_schema.json +++ b/extra/elasticsearch/container_schema.json @@ -27,13 +27,17 @@ "mappings": { "container": { "properties": { - "ident": { "type": "keyword" }, + "ident": { "type": "keyword", "doc_values": false }, "state": { "type": "keyword" }, - "revision": { "type": "keyword" }, - "name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "revision": { "type": "keyword", "doc_values": false }, + "name": { "type": "text", "index": true, "analyzer": "textIcu", 
"search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "original_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "abbrev": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "aliases": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "container_type": { "type": "keyword" }, "issnl": { "type": "keyword" }, + "issns": { "type": "keyword" }, "wikidata_qid": { "type": "keyword" }, "country": { "type": "keyword" }, "region": { "type": "keyword" }, @@ -43,15 +47,17 @@ "first_year": { "type": "integer" }, "last_year": { "type": "integer" }, - "in_doaj": { "type": "boolean" }, - "in_road": { "type": "boolean" }, - "in_doi": { "type": "boolean" }, - "in_sherpa_romeo":{ "type": "boolean" }, - "is_oa": { "type": "boolean" }, - "is_longtail_oa": { "type": "boolean" }, - "any_kbart": { "type": "boolean" }, - "any_jstor": { "type": "boolean" }, - "any_ia_sim": { "type": "boolean" }, + + "biblio": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + + "in_doaj": { "type": "boolean" }, + "in_road": { "type": "boolean" }, + "is_oa": { "type": "boolean" }, + "is_longtail_oa": { "type": "boolean" }, + "any_kbart": { "type": "boolean" }, + "any_jstor": { "type": "boolean" }, + "any_ia_sim": { "type": "boolean" }, + "sherpa_romeo_color": { "type": "keyword" }, "releases_total": { "type": "integer" }, "releases_kbart": { "type": "integer" }, @@ -64,6 +70,7 @@ "year": { "type": "alias", "path": "first_year" }, "type": { "type": "alias", "path": "container_type" }, + "issn": { "type": "alias", "path": "issns" }, "oa": { "type": "alias", "path": "is_oa" }, "longtail": { "type": "alias", "path": "is_longtail_oa" } } 
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index 8141a8b9..edc68748 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -257,23 +257,24 @@ def container_to_elasticsearch(entity, force_bool=True): wikidata_qid = entity.wikidata_qid, ) - # TODO: region, discipline - # TODO: single primary language? if not entity.extra: entity.extra = dict() - for key in ('country', 'languages', 'mimetypes', 'first_year', 'last_year'): + for key in ('country', 'languages', 'mimetypes', 'original_name', + 'first_year', 'last_year', 'aliases', 'abbrev', 'region', + 'discipline'): if entity.extra.get(key): t[key] = entity.extra[key] + t['issns'] = [] + if entity.issnl: + t['issns'].append(entity.issnl) + for key in ('issnp', 'issne'): + if entity.extra.get(key): + t['issns'].append(entity.extra[key]) + in_doaj = None in_road = None - # TODO: not currently implemented - in_doi = None - # TODO: would be nice to have 'in_doaj_works', or maybe just "any_pid" - #in_doaj_works = None - in_sherpa_romeo = None is_oa = None - # TODO: not actually set/stored anywhere? 
is_longtail_oa = None any_kbart = None any_jstor = None @@ -295,8 +296,9 @@ def container_to_elasticsearch(entity, force_bool=True): if extra.get('default_license'): if extra['default_license'].startswith('CC-'): is_oa = True + t['sherpa_romeo_color'] = None if extra.get('sherpa_romeo'): - in_sherpa_romeo = True + t['sherpa_romeo_color'] = extra['sherpa_romeo'].get('color') if extra['sherpa_romeo'].get('color') == 'white': is_oa = False if extra.get('kbart'): @@ -306,21 +308,21 @@ def container_to_elasticsearch(entity, force_bool=True): if extra.get('ia'): if extra['ia'].get('sim'): any_ia_sim = True + if extra['ia'].get('longtail_oa'): + is_longtail_oa = True t['is_superceded'] = bool(extra.get('superceded')) t['in_doaj'] = bool(in_doaj) t['in_road'] = bool(in_road) - t['in_sherpa_romeo'] = bool(in_sherpa_romeo) t['any_kbart'] = bool(any_kbart) - t['is_longtail_oa'] = bool(is_longtail_oa) if force_bool: - t['in_doi'] = bool(in_doi) - t['is_oa'] = bool(in_doaj or in_road or is_longtail_oa or is_oa) + t['is_oa'] = bool(in_doaj or in_road or is_oa) + t['is_longtail_oa'] = bool(is_longtail_oa) t['any_jstor'] = bool(any_jstor) t['any_ia_sim'] = bool(any_ia_sim) else: - t['in_doi'] = in_doi - t['is_oa'] = in_doaj or in_road or is_longtail_oa or is_oa + t['is_oa'] = in_doaj or in_road or is_oa + t['is_longtail_oa'] = is_longtail_oa t['any_jstor'] = any_jstor t['any_ia_sim'] = any_ia_sim return t -- cgit v1.2.3 From e047fbe1a9c495e86a6757d44eb32c9109a1b753 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 29 Jan 2020 20:39:22 -0800 Subject: ES release schema updates --- extra/elasticsearch/release_schema.json | 69 ++++++++++++++------- python/fatcat_tools/transforms/elasticsearch.py | 81 +++++++++++++++++++++++-- 2 files changed, 122 insertions(+), 28 deletions(-) diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index 85026060..98a1c28e 100644 --- a/extra/elasticsearch/release_schema.json +++ 
b/extra/elasticsearch/release_schema.json @@ -27,48 +27,62 @@ "mappings": { "release": { "properties": { - "ident": { "type": "keyword" }, + "ident": { "type": "keyword", "doc_values": false }, "state": { "type": "keyword" }, - "revision": { "type": "keyword" }, - "work_id": { "type": "keyword" }, - "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "revision": { "type": "keyword", "doc_values": false }, + "work_id": { "type": "keyword", "doc_values": false }, + "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "release_date": { "type": "date" }, - "release_year": { "type": "integer" }, - "release_type": { "type": "keyword" }, + "release_year": { "type": "integer", "copy_to": "biblio" }, + "release_type": { "type": "keyword", "copy_to": "biblio" }, "release_stage": { "type": "keyword" }, - "withdrawn_status": { "type": "keyword" }, + "withdrawn_status": { "type": "keyword", "copy_to": "biblio" }, "language": { "type": "keyword" }, - "doi": { "type": "keyword" }, - "pmid": { "type": "keyword" }, - "pmcid": { "type": "keyword" }, - "isbn13": { "type": "keyword" }, - "wikidata_qid": { "type": "keyword" }, - "core_id": { "type": "keyword" }, - "axiv_id": { "type": "keyword" }, - "jstor_id": { "type": "keyword" }, - "ark_id": { "type": "keyword" }, - "mag_id": { "type": "keyword" }, + "volume": { "type": "keyword", "copy_to": "biblio" }, + "issue": { "type": "keyword", 
"copy_to": "biblio" }, + "pages": { "type": "keyword", "copy_to": "biblio" }, + "first_page": { "type": "keyword" }, + "number": { "type": "keyword", "copy_to": "biblio" }, + "doi": { "type": "keyword", "doc_values": false }, + "doi_prefix": { "type": "keyword" }, + "doi_registrar": { "type": "keyword" }, + "pmid": { "type": "keyword", "doc_values": false }, + "pmcid": { "type": "keyword", "doc_values": false }, + "isbn13": { "type": "keyword", "doc_values": false }, + "wikidata_qid": { "type": "keyword", "doc_values": false }, + "core_id": { "type": "keyword", "doc_values": false }, + "axiv_id": { "type": "keyword", "doc_values": false }, + "jstor_id": { "type": "keyword", "doc_values": false }, + "ark_id": { "type": "keyword", "doc_values": false }, + "mag_id": { "type": "keyword", "doc_values": false }, "license": { "type": "keyword" }, "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "container_id": { "type": "keyword" }, "container_issnl": { "type": "keyword" }, "container_type": { "type": "keyword" }, "contrib_count": { "type": "integer" }, - "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "affiliations": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "affiliation_rors": { "type": "keyword" }, "creator_ids": { "type": "keyword" }, "ref_count": { "type": "integer" }, "ref_linked_count": { "type": "integer" }, + "ref_release_ids": { "type": "keyword" }, "file_count": { "type": "integer" }, "fileset_count": { "type": 
"integer" }, "webcapture_count": { "type": "integer" }, "any_abstract": { "type": "boolean" }, - "best_pdf_url": { "type": "keyword" }, - "ia_pdf_url": { "type": "keyword" }, + "biblio": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + + "best_pdf_url": { "type": "keyword", "doc_values": false }, + "ia_pdf_url": { "type": "keyword", "doc_values": false }, + "ia_microfilm_url": { "type": "keyword", "doc_values": false }, "is_oa": { "type": "boolean" }, + "oa_color": { "type": "keyword" }, "is_longtail_oa": { "type": "boolean" }, "is_preserved": { "type": "boolean" }, "in_kbart": { "type": "boolean" }, @@ -79,7 +93,13 @@ "in_ia_sim": { "type": "boolean" }, "in_shadows": { "type": "boolean" }, "is_superceded": { "type": "boolean" }, + "is_retracted": { "type": "boolean" }, + "preservation": { "type": "keyword" }, + "affilation": { "type": "alias", "path": "affiliations" }, + "ror": { "type": "alias", "path": "affiliation_rors" }, + "creator_id": { "type": "alias", "path": "creator_id" }, + "ref_release_id": { "type": "alias", "path": "ref_release_ids" }, "author": { "type": "alias", "path": "contrib_names" }, "journal": { "type": "alias", "path": "container_name" }, "date": { "type": "alias", "path": "release_date" }, @@ -90,6 +110,9 @@ "lang": { "type": "alias", "path": "language" }, "file_pdf_url": { "type": "alias", "path": "best_pdf_url" }, "release_status": { "type": "alias", "path": "release_stage" }, + "stage": { "type": "alias", "path": "release_stage" }, + "type": { "type": "alias", "path": "release_type" }, + "retracted": { "type": "alias", "path": "is_retracted" }, "is_kept": { "type": "alias", "path": "in_kbart" } } } diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index edc68748..b997796d 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -50,6 +50,10 @@ def 
release_to_elasticsearch(entity, force_bool=True): release_stage = release.release_stage, withdrawn_status = release.withdrawn_status, language = release.language, + volume = release.volume, + issue = release.issue, + pages = release.pages, + number = release.number, license = release.license_slug, doi = release.ext_ids.doi, pmid = release.ext_ids.pmid, @@ -72,7 +76,7 @@ def release_to_elasticsearch(entity, force_bool=True): in_dweb = False in_ia = False in_ia_sim = False - in_shadow = False + in_shadows = False release_year = release.release_year if release.release_date: @@ -85,11 +89,15 @@ def release_to_elasticsearch(entity, force_bool=True): t['any_abstract'] = len(release.abstracts or []) > 0 t['ref_count'] = len(release.refs or []) - t['ref_linked_count'] = 0 - if release.refs: - t['ref_linked_count'] = len([1 for ref in release.refs if ref.target_release_id]) + ref_release_ids = [] + for r in (release.refs or []): + if r.target_release_id: + ref_release_ids.append(r.target_release_id) + t['ref_release_ids'] = ref_release_ids + t['ref_linked_count'] = len(ref_release_ids) t['contrib_count'] = len(release.contribs or []) contrib_names = [] + contrib_affiliations = [] creator_ids = [] for c in (release.contribs or []): if c.raw_name: @@ -98,8 +106,14 @@ def release_to_elasticsearch(entity, force_bool=True): contrib_names.append(c.surname) if c.creator_id: creator_ids.append(c.creator_id) + if c.raw_affiliation: + contrib_affiliations.append(c.raw_affiliation) t['contrib_names'] = contrib_names t['creator_ids'] = creator_ids + t['affiliations'] = contrib_affiliations + + # TODO: mapping... probably by lookup? 
+ t['affiliation_rors'] = None container = release.container if container: @@ -140,8 +154,13 @@ def release_to_elasticsearch(entity, force_bool=True): if c_extra.get('szczepanski'): if c_extra['szczepanski'].get('as_of'): is_oa = True - else: + + # fall back to release-level container metadata if container not linked or + # missing context + if not t.get('publisher'): t['publisher'] = release.publisher + if not t.get('container_name') and release.extra: + t['container_name'] = release.extra.get('container_name') if release.ext_ids.jstor or (release.ext_ids.doi and release.ext_ids.doi.startswith('10.2307/')): in_jstor = True @@ -203,6 +222,46 @@ def release_to_elasticsearch(entity, force_bool=True): if extra['crossref'].get('archive'): # all crossref archives are KBART, I believe in_kbart = True + # backwards compatible subtitle fetching + if not t['subtitle'] and extra.get('subtitle'): + if type(extra['subtitle']) == list: + t['subtitle'] = extra['subtitle'][0] + else: + t['subtitle'] = extra['subtitle'] + + t['first_page'] = None + if release.pages: + first = release.pages.split('-')[0] + first = first.replace('p', '') + if release.pages.isdigit(): + t['first_page'] = release.pages + # TODO: non-numerical first pages + + t['ia_microfilm_url'] = None + if in_ia_sim: + # TODO: determine URL somehow? I think this is in flux. Will probably + # need extra metadata in the container extra field. + # special case as a demo for now. 
+ if release.container_id == "hl5g6d5msjcl7hlbyyvcsbhc2u" \ + and release.year in (2011, 2013) \ + and release.volume.isdigit() \ + and t['first_page']: + t['ia_microfilm_url'] = "https://archive.org/details/sim_bjog_{}-{:02d}/page/n{}".format( + release.year, + release.volume - 1, + t['first_page'], + ) + + t['doi_registrar'] = None + if extra and t['doi']: + for k in ('crossref', 'datacite', 'jalc'): + if k in extra: + t['doi_registrar'] = k + if not 'doi_registrar' in t: + t['doi_registrar'] = 'crossref' + + if t['doi']: + t['doi_prefix'] = t['doi'].split('/')[0] if is_longtail_oa: is_oa = True @@ -215,6 +274,7 @@ def release_to_elasticsearch(entity, force_bool=True): t['in_jstor'] = bool(in_jstor) t['in_web'] = bool(in_web) t['in_dweb'] = bool(in_dweb) + t['in_shadows'] = bool(in_shadows) else: t['is_oa'] = is_oa t['is_longtail_oa'] = is_longtail_oa @@ -223,9 +283,20 @@ def release_to_elasticsearch(entity, force_bool=True): t['in_jstor'] = in_jstor t['in_web'] = in_web t['in_dweb'] = in_dweb + t['in_shadows'] = in_shadows t['in_ia'] = bool(in_ia) t['is_preserved'] = bool(is_preserved or in_ia or in_kbart or in_jstor) + + if in_ia: + t['preservation'] = 'bright' + elif in_kbart or in_jstor: + t['preservation'] = 'dark_only' + elif in_shadows: + t['preservation'] = 'shadows_only' + else: + t['preservation'] = 'none' + return t def container_to_elasticsearch(entity, force_bool=True): -- cgit v1.2.3 From 5d458a3df7e58e6551d8ec72979e376c62fdd2f7 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 29 Jan 2020 21:52:33 -0800 Subject: fix some transform bugs, add some tests --- python/fatcat_tools/transforms/elasticsearch.py | 14 +-- python/fatcat_transform.py | 26 ++++- python/tests/files/changelog_3469683.json | 1 + .../files/file_bcah4zp5tvdhjl5bqci2c2lgfa.json | 1 + .../files/release_etodop5banbndg3faecnfm6ozi.json | 1 + python/tests/transform_elasticsearch.py | 114 +++++++++++++++++++++ python/tests/transform_tests.py | 106 ------------------- 7 files changed, 
149 insertions(+), 114 deletions(-) create mode 100644 python/tests/files/changelog_3469683.json create mode 100644 python/tests/files/file_bcah4zp5tvdhjl5bqci2c2lgfa.json create mode 100644 python/tests/files/release_etodop5banbndg3faecnfm6ozi.json create mode 100644 python/tests/transform_elasticsearch.py delete mode 100644 python/tests/transform_tests.py diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index b997796d..812cd1fd 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -20,6 +20,7 @@ def test_check_kbart(): assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1990, 2000]])) == False assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1930, 2000]])) == True + def release_to_elasticsearch(entity, force_bool=True): """ Converts from an entity model/schema to elasticsearch oriented schema. @@ -233,8 +234,8 @@ def release_to_elasticsearch(entity, force_bool=True): if release.pages: first = release.pages.split('-')[0] first = first.replace('p', '') - if release.pages.isdigit(): - t['first_page'] = release.pages + if first.isdigit(): + t['first_page'] = first # TODO: non-numerical first pages t['ia_microfilm_url'] = None @@ -243,12 +244,12 @@ def release_to_elasticsearch(entity, force_bool=True): # need extra metadata in the container extra field. # special case as a demo for now. 
if release.container_id == "hl5g6d5msjcl7hlbyyvcsbhc2u" \ - and release.year in (2011, 2013) \ - and release.volume.isdigit() \ + and release.release_year in (2011, 2013) \ + and release.issue.isdigit() \ and t['first_page']: t['ia_microfilm_url'] = "https://archive.org/details/sim_bjog_{}-{:02d}/page/n{}".format( - release.year, - release.volume - 1, + release.release_year, + int(release.issue) - 1, t['first_page'], ) @@ -299,6 +300,7 @@ def release_to_elasticsearch(entity, force_bool=True): return t + def container_to_elasticsearch(entity, force_bool=True): """ Converts from an entity model/schema to elasticsearch oriented schema. diff --git a/python/fatcat_transform.py b/python/fatcat_transform.py index ccb13871..42d2ea99 100755 --- a/python/fatcat_transform.py +++ b/python/fatcat_transform.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 """ +Utility script for doing bulk conversion/tranforms of entity JSON schema to +other formats """ import sys @@ -15,10 +17,11 @@ from citeproc_styles import get_style_filepath import fatcat_openapi_client from fatcat_openapi_client.rest import ApiException -from fatcat_openapi_client import ReleaseEntity, ContainerEntity, ChangelogEntry +from fatcat_openapi_client import ReleaseEntity, ContainerEntity, FileEntity, ChangelogEntry from fatcat_tools import uuid2fcid, entity_from_json, entity_to_dict, \ release_to_elasticsearch, container_to_elasticsearch, \ - changelog_to_elasticsearch, public_api, release_to_csl, citeproc_csl + file_to_elasticsearch, changelog_to_elasticsearch, public_api, \ + release_to_csl, citeproc_csl def run_elasticsearch_releases(args): @@ -39,6 +42,15 @@ def run_elasticsearch_containers(args): args.json_output.write( json.dumps(container_to_elasticsearch(entity)) + '\n') +def run_elasticsearch_files(args): + for line in args.json_input: + line = line.strip() + if not line: + continue + entity = entity_from_json(line, FileEntity, api_client=args.api.api_client) + args.json_output.write( + 
json.dumps(file_to_elasticsearch(entity)) + '\n') + def run_elasticsearch_changelogs(args): for line in args.json_input: line = line.strip() @@ -87,6 +99,16 @@ def main(): help="where to send output", default=sys.stdout, type=argparse.FileType('w')) + sub_elasticsearch_files = subparsers.add_parser('elasticsearch-files', + help="convert fatcat file JSON schema to elasticsearch file schema") + sub_elasticsearch_files.set_defaults(func=run_elasticsearch_files) + sub_elasticsearch_files.add_argument('json_input', + help="JSON-per-line of file entities", + default=sys.stdin, type=argparse.FileType('r')) + sub_elasticsearch_files.add_argument('json_output', + help="where to send output", + default=sys.stdout, type=argparse.FileType('w')) + sub_elasticsearch_changelogs = subparsers.add_parser('elasticsearch-changelogs', help="convert fatcat changelog JSON schema to elasticsearch changelog schema") sub_elasticsearch_changelogs.set_defaults(func=run_elasticsearch_changelogs) diff --git a/python/tests/files/changelog_3469683.json b/python/tests/files/changelog_3469683.json new file mode 100644 index 00000000..7a847b16 --- /dev/null +++ b/python/tests/files/changelog_3469683.json @@ -0,0 +1 @@ +{"index":3469683,"editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","timestamp":"2020-01-30T05:04:39.738601Z","editgroup":{"editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","editor_id":"scmbogxw25evtcesfcab5qaboa","editor":{"editor_id":"scmbogxw25evtcesfcab5qaboa","username":"crawl-bot","is_admin":true,"is_bot":true,"is_active":true},"changelog_index":3469683,"created":"2020-01-30T05:04:39.738601Z","description":"Files crawled from web using sandcrawler ingest 
tool","extra":{"agent":"fatcat_tools.IngestFileResultImporter","git_rev":"v0.3.1-280-ga889f32"},"edits":{"containers":[],"creators":[],"files":[{"edit_id":"ba819a2b-a4d0-43e6-9e5c-505284c8ae42","ident":"e3lmbzqyjjam3a5nqnccc6d654","revision":"7a606095-9d07-41ee-898a-bcf8b6bc0004","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1080/23802359.2020.1715878"}},{"edit_id":"a71c4b91-d599-4422-a7a6-527562161278","ident":"e62h2fa6fba6ve3lukv7n635fq","revision":"1374f1bd-684a-48b9-aaff-65b9e90083b5","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1186/s13071-020-3909-6"}},{"edit_id":"82e4d65d-0335-4a40-b9c9-bf38f9bd7b19","ident":"fam7ii245zasvnesikw7bhmoii","revision":"327e3358-2b2b-4919-9613-449bdbb76c55","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.23917/jjr.v9i1.8294"}},{"edit_id":"49e987dc-fc6f-4391-8b75-33176f03b5cb","ident":"fa6sljsebjapfojqapxd3dj4um","revision":"7f51f4fa-e448-410a-b7d5-7ae9cdf9fcb8","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.33536/jcpe.v4i1.293"}},{"edit_id":"e62ed1ca-3961-423b-ab05-967694e32f70","ident":"fhhzbabf3zcx7p2tor2omqveyq","revision":"d40c3f8c-255f-4913-ace8-06db6af66697","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.5802/smai-jcm.59"}},{"edit_id":"337b8ed6-1248-4872-81da-d16f6db021e6","ident":"fllcoo4smfdyrh5q5lmu72e7cq","revision":"724b26cd-6ab1-46a1-bc88-a87410fdf102","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1080/1343943x.2020.1717970"}},{"edit_id":"43a0c9d5-692e-4c73-b154-d8769854d268","ident":"f
zshcc6sfzegbduum2763o3lgy","revision":"a8f5e34b-fbf4-4d52-987e-887455e6bd50","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.23917/jjr.v9i1.8099"}},{"edit_id":"48767850-1141-47e3-80ef-556605e3588c","ident":"grdztt2vwjd65ovcifeo3ysbam","revision":"12dceb74-6333-4a17-b1e8-4afac9df1888","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.26565/2220-8089-2019-36-02"}},{"edit_id":"4b3b92ac-c250-4b78-9e91-43fc893c935e","ident":"hgpzsozky5amvcts45qb6nhqum","revision":"b5951fd6-dd36-422c-9770-8cdfb7e6d82d","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.33549/physiolres.934410"}},{"edit_id":"dabff0df-6f52-4adb-a63f-1965f30d8bd2","ident":"hpxdh7mykng77jyfoolpixzw2y","revision":"0e06e57c-0756-4ee5-8102-0d96478aa23f","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.26452/ijrps.v11i1.1916"}},{"edit_id":"b757f2b5-5f97-45b8-bfcd-9c9fb08047b8","ident":"htorhznxdfdppbvpf57nrz536q","revision":"034f4640-fbb6-4123-9ad8-f934220ab820","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1039/c9ra09196c"}},{"edit_id":"5662469f-a4d3-4339-88a0-4d27ef3f5f58","ident":"if6a63p7rnaxbjkcr4egtihj74","revision":"08d384cf-396c-41e1-a14d-6bde24b47323","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1192/bjb.2019.92"}},{"edit_id":"540a1ec1-7879-42f4-b749-d3287eb26ef7","ident":"i76ou5g2jndo7gyjrxdhoks3bm","revision":"eddbf701-59c3-4f33-b26b-87ad29297f65","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"
10.33549/physiolres.934355"}},{"edit_id":"04a0b805-b09f-4ff6-80b3-77cb643c72d7","ident":"jl2z2az4sjfpdigdts3xjm24vy","revision":"a541e130-07be-43ae-8fbf-b0295d5b576f","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1080/23802359.2020.1719936"}},{"edit_id":"3e4566b9-7e3f-4e02-aa81-b6299545b150","ident":"jsrgx72devbadbyyiqwm2bl4aa","revision":"f51f164e-0e96-4245-973b-3408250ebc3b","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.24193/ed21.2019.17.09"}},{"edit_id":"80c2f0ca-ba43-4481-b5a5-c15d87d4d7b4","ident":"kijz6wlf25dito3av5snrz345a","revision":"fa1b5279-2221-4063-853e-070ca2d5954b","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1017/s2045796020000037"}},{"edit_id":"c0f44f12-e4c7-45d4-a79a-a3ce2d628e78","ident":"ky3cjfbzejecxaa5tjaaucurb4","revision":"0e227663-f825-42e8-bf57-ba50fea60bf5","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1017/s204579602000013x"}},{"edit_id":"92093947-7ff5-48ad-bc47-b26d4c0959c8","ident":"llfciusk6jf6rd35ofpuufmgfe","revision":"ed89f8b0-98cd-4886-b99e-b5d276b2ac9f","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.33549/physiolres.934349"}},{"edit_id":"4cb0e042-4298-4d0b-98b8-3c0c808642ea","ident":"mdw33cq7svdfnlaevnpih7bsyu","revision":"4a0ed3ce-56f2-4299-861c-051e7c06499d","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.23917/jjr.v9i2.8073"}},{"edit_id":"611fc1e1-79e1-4e10-a702-0420919407f6","ident":"mwghms3u2zecdf2x5zk7tzs4mq","revision":"101d7d0f-6aca-4df3-a08a-f4a3f26d3cd4","editgroup_id":"jebmpiqsjja7jozz
e7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1186/s12864-020-6509-0"}},{"edit_id":"b9a350bd-3497-4c83-97c1-0a20a420a287","ident":"ooejwh3g35cfrmmk4bvjcqxrai","revision":"8a4e8a05-3fb4-4d83-a545-35f152e24f58","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.6018/cpd.340501"}},{"edit_id":"aeab2a58-ec4e-4a2a-82ee-a0d2428eca50","ident":"pepmo2ajfzh7ldtqdugj4p5zvy","revision":"7d13f42d-ce94-4537-8c56-0158d9bf99e9","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.6018/cpd.347981"}},{"edit_id":"f3cd1575-fcfd-4f6b-ab48-28b06592df85","ident":"qcw5h7c5uncbbphtrp3zibn5y4","revision":"fa7b2868-777e-4046-98ca-b146c0056180","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1017/s1742170519000425"}},{"edit_id":"3c0ffa45-7f39-4393-a73c-0615bf9543f6","ident":"qfmufcdlrbfyhcjemvut5om23i","revision":"fac63461-4790-49a8-9b75-b23e9a369529","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.34225/jidc.2019.14.2.57"}},{"edit_id":"4ad40f3a-55f4-4aa8-9016-d04f904b2163","ident":"qrpp45vopfbvzanalh4lmpiz24","revision":"2ac1355f-cba9-49ed-b260-9eea5ec09473","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.24193/ed21.2019.17.19"}},{"edit_id":"1ec3a67b-796c-4393-93c2-ddac4ea66e0f","ident":"q7cv6lezvjalpg5xd4tckkbmsi","revision":"702adc9f-9ecb-4030-a095-936f15839c31","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.33549/physiolres.934359"}},{"edit_id":"a8ab2cf7-8aad-4060-b281-371d41df32cf","ident":"rqcd44zbfvcovipfpnqpowi
q2e","revision":"bd51fd60-438a-42fe-80e9-104705b0580f","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.33549/physiolres.934350"}},{"edit_id":"6d626d5c-1c7c-42e3-8d00-c313a91fc7c1","ident":"ruzbactehngbrlyx4vq3zlaqlu","revision":"5da34289-7da0-42ad-98bb-1c0b21bb6e69","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.5802/smai-jcm.55"}},{"edit_id":"4651ac6b-228c-4918-ac74-0757ce64c031","ident":"rvbhhfvwc5dkhdpxekutlew7di","revision":"53b088f6-653c-4e6d-974a-8e5e6e154f12","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.31616/asj.2019.0190"}},{"edit_id":"156b4a96-af8b-4c01-a8ec-6aa53736450f","ident":"r6j7ad7vmvg75lvel6v4e4gbb4","revision":"46647620-a36a-482b-a651-492a4e6ca1bb","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.5802/smai-jcm.53"}},{"edit_id":"67e6f76f-859d-422d-8c90-715b3173e24c","ident":"scajd3ykrjatbjmifvxwcl3yhu","revision":"d99f1da3-484a-4977-bc14-6e952d007acd","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.24193/ed21.2019.17.11"}},{"edit_id":"22395a5e-2a2e-4bf2-9547-0fb5122f8a7f","ident":"sk22wngc3fh23b74nmkbgeeyya","revision":"7423067d-3a18-4305-98b4-5cee8127e24a","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.31616/asj.2019.0300"}},{"edit_id":"4ea1f9dd-d85e-450b-bc2a-995364fcf3fe","ident":"tma3dvw77bghjfaqrdoiwquuge","revision":"c86364bf-501b-42b4-a3ed-ef334ff26e0a","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.36447/estudios2014.v33-34.
art3"}},{"edit_id":"87d158ff-b0e0-4944-88d9-017dcef70d6e","ident":"tvuumx4n75dorebv7bu7imyiym","revision":"ae17166e-43ce-4d5a-95de-4befeeb76fa6","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.35236/jots.663726"}},{"edit_id":"2114c75b-ac23-471a-83d9-c76ae5f6bf42","ident":"twnql5u4mbfqffal2atv5qucoq","revision":"cb73ac6c-bc2b-422b-bc27-7032d14cac4d","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.31616/asj.2019.0162"}},{"edit_id":"2f1d20fd-e69e-4490-9f70-f2b043f53634","ident":"tzwyuimcgrepvhide2sj3lovjm","revision":"b9cf9229-db47-4fa0-a8f6-45320b5af440","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.33549/physiolres.934348"}},{"edit_id":"fdb6c388-9b22-45ff-8db1-1531eac43bbf","ident":"vfwz3fuvbbgcxek2hi6vjo525q","revision":"6c57ecb2-a870-4078-af17-a779ca3ceb28","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.4197/eco.29-1.8"}},{"edit_id":"3b8cf6fa-75e7-4b15-a83c-784552ff76e9","ident":"vtq5tvfltbfv3pizvux32ek5hi","revision":"7c0b5d64-6d20-407b-9de3-482a9c75f40e","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.23917/jurisprudence.v8i2.6977"}},{"edit_id":"899303e8-ce86-4f53-909f-249ef45d9a3b","ident":"vuneewh3fncxpb7bzviptx6kze","revision":"3fd4ff1c-1855-41e2-9bae-7edbae86783a","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.3846/jcem.2020.11826"}},{"edit_id":"c0595f10-72dc-4144-b243-5b344912842f","ident":"v2i53kwtnbea7kfhsnvcpkvixe","revision":"09891662-3d67-4d7f-a4ed-7f9bfa3f426f","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_req
uest_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.33549/physiolres.934347"}},{"edit_id":"80f26273-d243-4dcf-af4a-e05226ba679c","ident":"wtish6c32randpuucjxq5byjo4","revision":"a69cd59a-52bf-4c48-9323-62a4070b2440","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.29261/pakvetj/2019.033"}},{"edit_id":"79f1bdc1-2627-4328-97ff-71731295fcad","ident":"xkf7a4cavnhahnfsjj5w5aoopu","revision":"b826cb89-efcc-427f-a378-0829bb2b871c","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.36447/estudios2014.v33-34.art6"}},{"edit_id":"695feb37-ab8e-4ecd-8c9e-fae06ff63e39","ident":"yx24fslfafb6dgx7gvjbmoma5m","revision":"4109aeb1-fcc1-47f3-9d2c-8e3abfbc697e","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1039/c9ra10366j"}},{"edit_id":"48f2eb73-a5a3-440a-8bd1-c581797a82ca","ident":"3p5rah3wbfftdht7rabkpjfcrm","revision":"df754aec-4750-45eb-97e6-943928dad661","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.24193/ed21.2019.17.01"}},{"edit_id":"7a937ed3-5362-40ed-8f96-e6c6231e0adf","ident":"4r3madqhfzb5jb7jd7xmv55em4","revision":"6feccd54-38bd-4d51-8f42-fe211baf5ba3","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.35236/jots.668781"}},{"edit_id":"3beaa94e-c4e1-4a6b-96a7-d455ad13b7aa","ident":"53ooeweri5efjm5vhl2bwjcfze","revision":"a9d318ad-eb2a-4590-a463-8d6016e2e887","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1017/s2045796020000049"}},{"edit_id":"8a0b80e2-c8b2-4457-ac89-2fc7ad7548f7","ident":"546k37iji5bfffakw2egl2azxy","revision":"3a4d93
d0-c59e-4f81-8fbb-40ce21b11b1e","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1039/c9ra08019h"}},{"edit_id":"15fa5e34-9829-483e-bfa5-a4011b974c6b","ident":"6cy3aonbdfgxbjxnujg3hsqx7q","revision":"021fed8f-5109-4389-9e7f-13a70cbaf4a3","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.2478/joim-2019-0023"}},{"edit_id":"6bc507c8-ceea-49a5-8c1f-9c652463588e","ident":"6rxwlcytwzeopgrhidvi236b2q","revision":"60307cd5-28a3-4063-9111-d5e90e1cb346","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.3846/aviation.2019.11913"}},{"edit_id":"283013c6-4400-4fdc-b2b2-1dbd1a262332","ident":"7j4w24plxzc3nnrkorbowmnra4","revision":"0fa4112c-a596-49a2-9d36-82c213db3fb8","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.24193/ed21.2019.17.22"}}],"filesets":[],"webcaptures":[],"releases":[],"works":[]}}} \ No newline at end of file diff --git a/python/tests/files/file_bcah4zp5tvdhjl5bqci2c2lgfa.json b/python/tests/files/file_bcah4zp5tvdhjl5bqci2c2lgfa.json new file mode 100644 index 00000000..bed8977d --- /dev/null +++ b/python/tests/files/file_bcah4zp5tvdhjl5bqci2c2lgfa.json @@ -0,0 +1 @@ 
+{"release_ids":["5tbuas2e4vd6jaowbgzmmhhqxe"],"mimetype":"application/pdf","urls":[{"url":"https://web.archive.org/web/20200130042753/https://www.zhros.ru/jour/article/download/811/542","rel":"webarchive"},{"url":"https://www.zhros.ru/jour/article/download/811/542","rel":"web"}],"sha256":"1665cdb90b73c684233038601c52995acef77bb37aefc6e63ae13e4194d48261","sha1":"3ad4df99ff1354ec0b5a333a59fba9a3a5d9812a","md5":"39159f9c8e98a245f954c9000b0f2810","size":739980,"revision":"dcc7a975-725d-4bc9-8c3f-cd0476cd485e","ident":"bcah4zp5tvdhjl5bqci2c2lgfa","state":"active"} \ No newline at end of file diff --git a/python/tests/files/release_etodop5banbndg3faecnfm6ozi.json b/python/tests/files/release_etodop5banbndg3faecnfm6ozi.json new file mode 100644 index 00000000..1204c95d --- /dev/null +++ b/python/tests/files/release_etodop5banbndg3faecnfm6ozi.json @@ -0,0 +1 @@ +{"abstracts":[],"refs":[{"index":0,"extra":{"issue":"Suppl 1","volume":"118"},"key":"10.1111/j.1471-0528.2011.03098.x-BIB1|cit1","year":2011,"container_name":"BJOG","title":"Saving Mothers' Lives: reviewing maternal deaths to make motherhood safer-2006-2008. 
The Eighth Report of the Confidential Enquiries into Maternal Deaths in the United Kingdom"}],"contribs":[{"index":0,"raw_name":"Philip Steer","role":"author","extra":{"seq":"first"}}],"language":"en","publisher":"Wiley","pages":"1404-1404","issue":"11","volume":"118","ext_ids":{"doi":"10.1111/j.1471-0528.2011.03098.x"},"release_year":2011,"release_date":"2011-09-09","release_stage":"published","release_type":"article-journal","container_id":"hl5g6d5msjcl7hlbyyvcsbhc2u","webcaptures":[],"filesets":[],"files":[],"container":{"wikidata_qid":"Q15724571","issnl":"1470-0328","publisher":"Wiley (Blackwell Publishing)","container_type":"journal","name":"BJOG: an International Journal of Obstetrics and Gynaecology","extra":{"abbrev":"BJOG","country":"gb","ia":{"sim":{"year_spans":[[1902,1915],[1921,2015]]}},"issne":"1471-0528","issnp":"1470-0328","kbart":{"clockss":{"year_spans":[[1989,1989],[1993,1993],[2002,2003],[2009,2017]]},"portico":{"year_spans":[[1902,2019]]}},"languages":["en"],"sherpa_romeo":{"color":"yellow"},"urls":["http://www.bjog.org/view/0/index.html"]},"revision":"ec26766c-c1fe-453b-837d-087cc254fe07","ident":"hl5g6d5msjcl7hlbyyvcsbhc2u","state":"active"},"work_id":"wmwe5wwkzfcs7gyjfgdeanksha","title":"Saving Mothers' Lives. 
Reviewing maternal deaths to make motherhood safer: 2006-2008","state":"active","ident":"etodop5banbndg3faecnfm6ozi","revision":"deb7e050-6df6-42ed-9704-788a0e30facf","extra":{"crossref":{"type":"journal-article"},"subtitle":["Correpondence"]}} \ No newline at end of file diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py new file mode 100644 index 00000000..ab613a0a --- /dev/null +++ b/python/tests/transform_elasticsearch.py @@ -0,0 +1,114 @@ + +import json +import pytest +from fatcat_tools import * +from fatcat_openapi_client import * +from fixtures import api +from import_journal_metadata import journal_metadata_importer + +from import_crossref import crossref_importer +from import_matched import matched_importer + +def test_basic_elasticsearch_convert(crossref_importer): + with open('tests/files/crossref-works.single.json', 'r') as f: + # not a single line + raw = json.loads(f.read()) + r = crossref_importer.parse_record(raw) + r.state = 'active' + release_to_elasticsearch(r) + +def test_rich_elasticsearch_convert(): + r = ReleaseEntity( + title="something", + release_year=1234, + license_slug="CC-BY-NC", + ext_ids=ReleaseExtIds(), + refs=[ + ReleaseRef(), + ReleaseRef(target_release_id="iznnn644szdwva7khyxqzc73bi"), + ], + ) + r.state = 'active' + r.container = ContainerEntity( + name="dummy journal", + extra={ + "ia": { + "sim": { + "year_spans": [[1000, 1100]], + }, + }, + "kbart": { + "lockss": { + "year_spans": [[1200, 1300]], + }, + "jstor": { + "year_spans": [[1950, 1960], [1980, 2005]], + }, + }, + "sherpa_romeo": {"color": "blue"}, + "doaj": {"as_of": "2010-02-03"}, + }, + ) + r.files = [FileEntity( + mimetype="application/pdf", + urls=[ + FileUrl(rel="dweb", url="dat://a954329dlk/thingie"), + FileUrl(rel="webarchive", url="https://web.archive.org/web/20001122030405/http://example.com"), + FileUrl(rel="web", url="https://archive.org/details/blah/file.pdf"), + ], + extra={ + "shadows": {}, + }, + )] + es = 
release_to_elasticsearch(r) + assert es['release_year'] == r.release_year + assert es['in_ia'] == True + assert es['in_jstor'] == False + assert es['in_ia_sim'] == False + assert es['in_ia'] == True + assert es['in_web'] == True + assert es['in_dweb'] == True + assert es['is_oa'] == True + assert es['is_longtail_oa'] == False + assert es['ref_count'] == 2 + assert es['ref_linked_count'] == 1 + +def test_elasticsearch_release_from_json(): + r = entity_from_json(open('./tests/files/release_etodop5banbndg3faecnfm6ozi.json', 'r').read(), ReleaseEntity) + es = release_to_elasticsearch(r) + + assert es['subtitle'] == "Correpondence" + assert es['ident'] == "etodop5banbndg3faecnfm6ozi" + assert es['container_name'] == "BJOG: an International Journal of Obstetrics and Gynaecology" + assert es['first_page'] == "1404" + assert es['issue'] == "11" + assert es['volume'] == "118" + assert es['number'] == None + assert es['in_ia_sim'] == True + assert es['in_kbart'] == True + +def test_elasticsearch_container_transform(journal_metadata_importer): + with open('tests/files/journal_metadata.sample.json', 'r') as f: + raw = json.loads(f.readline()) + c = journal_metadata_importer.parse_record(raw) + c.state = 'active' + es = container_to_elasticsearch(c) + assert es['publisher'] == c.publisher + +def test_elasticsearch_file_transform(matched_importer): + f = entity_from_json(open('./tests/files/file_bcah4zp5tvdhjl5bqci2c2lgfa.json', 'r').read(), FileEntity) + + f.state = 'active' + es = file_to_elasticsearch(f) + assert es['sha1'] == f.sha1 + assert es['sha256'] == f.sha256 + assert es['md5'] == f.md5 + assert es['size_bytes'] == f.size + assert es['mimetype'] == f.mimetype + assert es['in_ia'] == True + assert 'publisher' in es['rel'] + + # XXX: implement hosts and domain parsing with urlcanon + #assert 'journals.plos.org' in es['host'] + #assert 'plos.org' in es['domain'] + diff --git a/python/tests/transform_tests.py b/python/tests/transform_tests.py deleted file mode 100644 
index 7b583ac4..00000000 --- a/python/tests/transform_tests.py +++ /dev/null @@ -1,106 +0,0 @@ - -import json -import pytest -from fatcat_tools import * -from fatcat_openapi_client import * -from fixtures import api -from import_journal_metadata import journal_metadata_importer - -from import_crossref import crossref_importer -from import_matched import matched_importer - -def test_basic_elasticsearch_convert(crossref_importer): - with open('tests/files/crossref-works.single.json', 'r') as f: - # not a single line - raw = json.loads(f.read()) - r = crossref_importer.parse_record(raw) - r.state = 'active' - release_to_elasticsearch(r) - -def test_rich_elasticsearch_convert(): - r = ReleaseEntity( - title="something", - release_year=1234, - license_slug="CC-BY-NC", - ext_ids=ReleaseExtIds(), - refs=[ - ReleaseRef(), - ReleaseRef(target_release_id="iznnn644szdwva7khyxqzc73bi"), - ], - ) - r.state = 'active' - r.container = ContainerEntity( - name="dummy journal", - extra={ - "ia": { - "sim": { - "year_spans": [[1000, 1100]], - }, - }, - "kbart": { - "lockss": { - "year_spans": [[1200, 1300]], - }, - "jstor": { - "year_spans": [[1950, 1960], [1980, 2005]], - }, - }, - "sherpa_romeo": {"color": "blue"}, - "doaj": {"as_of": "2010-02-03"}, - }, - ) - r.files = [FileEntity( - mimetype="application/pdf", - urls=[ - FileUrl(rel="dweb", url="dat://a954329dlk/thingie"), - FileUrl(rel="webarchive", url="https://web.archive.org/web/20001122030405/http://example.com"), - FileUrl(rel="web", url="https://archive.org/details/blah/file.pdf"), - ], - extra={ - "shadows": {}, - }, - )] - es = release_to_elasticsearch(r) - assert es['release_year'] == r.release_year - assert es['in_ia'] == True - assert es['in_jstor'] == False - assert es['in_ia_sim'] == False - assert es['in_ia'] == True - assert es['in_web'] == True - assert es['in_dweb'] == True - assert es['is_oa'] == True - assert es['is_longtail_oa'] == False - assert es['ref_count'] == 2 - assert es['ref_linked_count'] == 1 - 
-def test_elasticsearch_release_from_json(): - r = entity_from_json(open('./tests/files/math_universe.json', 'r').read(), ReleaseEntity) - release_to_elasticsearch(r) - -def test_elasticsearch_container_transform(journal_metadata_importer): - with open('tests/files/journal_metadata.sample.json', 'r') as f: - raw = json.loads(f.readline()) - c = journal_metadata_importer.parse_record(raw) - c.state = 'active' - es = container_to_elasticsearch(c) - assert es['publisher'] == c.publisher - -def test_elasticsearch_file_transform(matched_importer): - with open('tests/files/example_matched.json', 'r') as f: - raw = json.loads(f.readline()) - f = matched_importer.parse_record(raw) - - f.state = 'active' - es = file_to_elasticsearch(f) - assert es['sha1'] == f.sha1 - assert es['sha256'] == f.sha256 - assert es['md5'] == f.md5 - assert es['size_bytes'] == f.size - assert es['mimetype'] == f.mimetype - assert es['in_ia'] == True - assert 'publisher' in es['rel'] - - # XXX: implement hosts and domain parsing with urlcanon - #assert 'journals.plos.org' in es['host'] - #assert 'plos.org' in es['domain'] - -- cgit v1.2.3 From d58c3891ac2122dac53ced606568108f543f2d80 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 29 Jan 2020 21:52:58 -0800 Subject: actually implement changelog transform --- extra/elasticsearch/changelog_schema.json | 11 ++++- python/fatcat_tools/transforms/elasticsearch.py | 62 ++++++++++++++++++------- python/tests/transform_elasticsearch.py | 24 +++++++++- 3 files changed, 78 insertions(+), 19 deletions(-) diff --git a/extra/elasticsearch/changelog_schema.json b/extra/elasticsearch/changelog_schema.json index f3211e99..77c77238 100644 --- a/extra/elasticsearch/changelog_schema.json +++ b/extra/elasticsearch/changelog_schema.json @@ -16,20 +16,29 @@ "changelog": { "properties": { "index": { "type": "integer" }, - "editgroup_id": { "type": "keyword" }, + "editgroup_id": { "type": "keyword", "doc_values": false }, "timestamp": { "type": "date" }, 
"editor_id": { "type": "keyword" }, "username": { "type": "keyword" }, "is_bot": { "type": "boolean" }, "is_admin": { "type": "boolean" }, "agent": { "type": "keyword" }, + "containers": { "type": "integer" }, + "new_containers": { "type": "integer" }, "creators": { "type": "integer" }, + "new_creators": { "type": "integer" }, "files": { "type": "integer" }, + "new_files": { "type": "integer" }, "filessets": { "type": "integer" }, + "new_filessets": { "type": "integer" }, "webcaptures": { "type": "integer" }, + "new_webcaptures": { "type": "integer" }, "releases": { "type": "integer" }, + "new_releases": { "type": "integer" }, "works": { "type": "integer" }, + "new_works": { "type": "integer" }, + "created": { "type": "integer" }, "updated": { "type": "integer" }, "deleted": { "type": "integer" }, diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index 812cd1fd..c8547b27 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -401,36 +401,64 @@ def container_to_elasticsearch(entity, force_bool=True): return t +def _type_of_edit(edit): + if edit.revision == None and edit.redirect_ident == None: + return 'delete' + elif edit.redirect_ident: + # redirect + return 'update' + elif edit.prev_revision == None and edit.redirect_ident == None and edit.revision: + return 'create' + else: + return 'update' + + def changelog_to_elasticsearch(entity): editgroup = entity.editgroup t = dict( index=entity.index, editgroup_id=entity.editgroup_id, - timestamp=entity.timestamp, + timestamp=entity.timestamp.isoformat(), editor_id=editgroup.editor_id, + username=editgroup.editor.username, + is_bot=editgroup.editor.is_bot, + is_admin=editgroup.editor.is_admin, ) extra = editgroup.extra or dict() if extra.get('agent'): t['agent'] = extra['agent'] - t['containers'] = len(editgroup.edits.containers) - t['creators'] = len(editgroup.edits.containers) - t['files'] = 
len(editgroup.edits.containers) - t['filesets'] = len(editgroup.edits.containers) - t['webcaptures'] = len(editgroup.edits.containers) - t['releases'] = len(editgroup.edits.containers) - t['works'] = len(editgroup.edits.containers) - - # TODO: parse and pull out counts - #created = 0 - #updated = 0 - #deleted = 0 - #t['created'] = created - #t['updated'] = updated - #t['deleted'] = deleted - #t['total'] = created + updated + deleted + containers = [_type_of_edit(e) for e in editgroup.edits.containers] + creators = [_type_of_edit(e) for e in editgroup.edits.creators] + files = [_type_of_edit(e) for e in editgroup.edits.files] + filesets = [_type_of_edit(e) for e in editgroup.edits.filesets] + webcaptures = [_type_of_edit(e) for e in editgroup.edits.webcaptures] + releases = [_type_of_edit(e) for e in editgroup.edits.releases] + works = [_type_of_edit(e) for e in editgroup.edits.works] + + t['containers'] = len(containers) + t['new_containers'] = len([e for e in containers if e == 'create']) + t['creators'] = len(creators) + t['new_creators'] = len([e for e in creators if e == 'create']) + t['files'] = len(files) + t['new_files'] = len([e for e in files if e == 'create']) + t['filesets'] = len(filesets) + t['new_filesets'] = len([e for e in filesets if e == 'create']) + t['webcaptures'] = len(webcaptures) + t['new_webcaptures'] = len([e for e in webcaptures if e == 'create']) + t['releases'] = len(releases) + t['new_releases'] = len([e for e in releases if e == 'create']) + t['works'] = len(works) + t['new_works'] = len([e for e in works if e == 'create']) + + all_edits = containers + creators + files + filesets + webcaptures + releases + works + + t['created'] = len([e for e in all_edits if e == 'create']) + t['updated'] = len([e for e in all_edits if e == 'update']) + t['deleted'] = len([e for e in all_edits if e == 'delete']) + t['total'] = len(all_edits) return t diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py index 
ab613a0a..89a4eef8 100644 --- a/python/tests/transform_elasticsearch.py +++ b/python/tests/transform_elasticsearch.py @@ -106,9 +106,31 @@ def test_elasticsearch_file_transform(matched_importer): assert es['size_bytes'] == f.size assert es['mimetype'] == f.mimetype assert es['in_ia'] == True - assert 'publisher' in es['rel'] + assert 'web' in es['rel'] # XXX: implement hosts and domain parsing with urlcanon #assert 'journals.plos.org' in es['host'] #assert 'plos.org' in es['domain'] +def test_elasticsearch_changelog_transform(matched_importer): + ce = entity_from_json(open('./tests/files/changelog_3469683.json', 'r').read(), ChangelogEntry) + + es = changelog_to_elasticsearch(ce) + assert es['index'] == 3469683 + # len("2020-01-30T05:04:39") => 19 + assert es['timestamp'][:19] == "2020-01-30T05:04:39.738601Z"[:19] + assert es['editor_id'] == "scmbogxw25evtcesfcab5qaboa" + assert es['username'] == "crawl-bot" + assert es['is_bot'] == True + assert es['is_admin'] == True + assert es['agent'] == "fatcat_tools.IngestFileResultImporter" + + assert es['total'] == 50 + assert es['files'] == 50 + assert es['new_files'] == 50 + assert es['created'] == 50 + + assert es['releases'] == 0 + assert es['new_releases'] == 0 + assert es['updated'] == 0 + assert es['deleted'] == 0 -- cgit v1.2.3 From d5d83762063b8ec7f512c20567f46c03f2e6b542 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 29 Jan 2020 21:57:32 -0800 Subject: update ES docs and proposal --- extra/elasticsearch/README.md | 2 ++ proposals/2020_elasticsearch_schemas.md | 8 ++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/extra/elasticsearch/README.md b/extra/elasticsearch/README.md index 3a48a178..3e0857b4 100644 --- a/extra/elasticsearch/README.md +++ b/extra/elasticsearch/README.md @@ -40,9 +40,11 @@ Drop and rebuild the schema: http delete :9200/fatcat_release http delete :9200/fatcat_container + http delete :9200/fatcat_file http delete :9200/fatcat_changelog http put :9200/fatcat_release 
< release_schema.json http put :9200/fatcat_container < container_schema.json + http put :9200/fatcat_file < file_schema.json http put :9200/fatcat_changelog < changelog_schema.json Put a single object (good for debugging): diff --git a/proposals/2020_elasticsearch_schemas.md b/proposals/2020_elasticsearch_schemas.md index 83db884f..5fb28d19 100644 --- a/proposals/2020_elasticsearch_schemas.md +++ b/proposals/2020_elasticsearch_schemas.md @@ -14,8 +14,6 @@ Simple additions: - pages - `first_page` (parsed from pages) (?) - number -- `in_shadow` -- OA license slug (?) - `doi_prefix` - `doi_registrar` (based on extra) - `first_author` (surname; for matching) @@ -25,6 +23,8 @@ Simple additions: - referenced releases idents - contrib creator idents +Add affiliations, both as raw strings and ROR identifiers. + ## Preservation Summary Field @@ -128,8 +128,8 @@ hit does not}"). ## Container Fields -- `all_issns` -- `release_count` +- `issn` (all issns) +- `original_name` The `release_count` would not be indexed (left null) by default, and would be "patched" in to entities by a separate script (periodically?). 
-- cgit v1.2.3 From bf718fd076476c1a54e80ca88cd02ede606ab6f3 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 29 Jan 2020 23:23:39 -0800 Subject: add country to v03b release schema --- extra/elasticsearch/release_schema.json | 1 + python/fatcat_tools/transforms/elasticsearch.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index 98a1c28e..2b67c5f5 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -40,6 +40,7 @@ "release_stage": { "type": "keyword" }, "withdrawn_status": { "type": "keyword", "copy_to": "biblio" }, "language": { "type": "keyword" }, + "country": { "type": "keyword" }, "volume": { "type": "keyword", "copy_to": "biblio" }, "issue": { "type": "keyword", "copy_to": "biblio" }, "pages": { "type": "keyword", "copy_to": "biblio" }, diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index c8547b27..f0146d01 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -155,6 +155,8 @@ def release_to_elasticsearch(entity, force_bool=True): if c_extra.get('szczepanski'): if c_extra['szczepanski'].get('as_of'): is_oa = True + if c_extra.get('country'): + t['country'] = c_extra['country'] # fall back to release-level container metadata if container not linked or # missing context -- cgit v1.2.3 From e98f389a53d886b4fa8f0237b90b086999770f78 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 29 Jan 2020 23:26:58 -0800 Subject: elastic schema fixes --- extra/elasticsearch/file_schema.json | 12 ++++++------ extra/elasticsearch/release_schema.json | 2 +- python/fatcat_tools/transforms/elasticsearch.py | 5 +++++ 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json index 66d81e0b..2a7e5be0 100644 --- 
a/extra/elasticsearch/file_schema.json +++ b/extra/elasticsearch/file_schema.json @@ -13,7 +13,7 @@ } }, "mappings": { - "changelog": { + "file": { "properties": { "ident": { "type": "keyword", "doc_values": false }, "state": { "type": "keyword" }, @@ -33,13 +33,13 @@ "in_ia": { "type": "boolean" }, "release_id": { "type": "alias", "path": "release_ids" }, - "sha1hex": { "type": "alias", "path": "sha1hex" }, - "sha256hex": { "type": "alias", "path": "sha256hex" }, - "md5hex": { "type": "alias", "path": "md5hex" }, + "sha1hex": { "type": "alias", "path": "sha1" }, + "sha256hex": { "type": "alias", "path": "sha256" }, + "md5hex": { "type": "alias", "path": "md5" }, "size": { "type": "alias", "path": "size_bytes" }, "domain": { "type": "alias", "path": "domains" }, - "host": { "type": "alias", "path": "host" }, - "rel": { "type": "alias", "path": "rel" } + "host": { "type": "alias", "path": "hosts" }, + "rel": { "type": "alias", "path": "rels" } } } } diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index 2b67c5f5..3d301dba 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -99,7 +99,7 @@ "affilation": { "type": "alias", "path": "affiliations" }, "ror": { "type": "alias", "path": "affiliation_rors" }, - "creator_id": { "type": "alias", "path": "creator_id" }, + "creator_id": { "type": "alias", "path": "creator_ids" }, "ref_release_id": { "type": "alias", "path": "ref_release_ids" }, "author": { "type": "alias", "path": "contrib_names" }, "journal": { "type": "alias", "path": "container_name" }, diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index f0146d01..42669bbf 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -416,6 +416,11 @@ def _type_of_edit(edit): def changelog_to_elasticsearch(entity): + """ + Note that this importer requires 
expanded fill info to work. Calling code + may need to re-fetch editgroup from API to get the 'editor' field. Some of + the old kafka feed content doesn't include editor in particular. + """ editgroup = entity.editgroup t = dict( -- cgit v1.2.3 From 749295a16dd18aa64b87f6b7c3e0e6052931161a Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 29 Jan 2020 23:54:17 -0800 Subject: new biblio-only general search The other fields are now "copy_to" the merged biblio field. --- python/fatcat_web/search.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py index 7c60a6dd..6b2b9cc1 100644 --- a/python/fatcat_web/search.py +++ b/python/fatcat_web/search.py @@ -77,7 +77,7 @@ def do_release_search(q, limit=30, fulltext_only=True, offset=0): "default_operator": "AND", "analyze_wildcard": True, "lenient": True, - "fields": ["title^5", "contrib_names^2", "container_title"], + "fields": ["biblio"], }, }, } @@ -106,7 +106,7 @@ def do_container_search(q, limit=30, offset=0): "default_operator": "AND", "analyze_wildcard": True, "lenient": True, - "fields": ["name^5", "publisher"], + "fields": ["biblio"], }, }, } -- cgit v1.2.3 From ade1eb9ff955ca5ba58acdc8b76e344c9cc54790 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 29 Jan 2020 23:56:27 -0800 Subject: fix ES file schema plural field names --- python/fatcat_tools/transforms/elasticsearch.py | 7 +++---- python/tests/transform_elasticsearch.py | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index 42669bbf..5a492fb4 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -497,13 +497,12 @@ def file_to_elasticsearch(entity): sha1 = entity.sha1, sha256 = entity.sha256, md5 = entity.md5, - rel = [u.rel for u in entity.urls], ) # TODO: domain, hosts (from urls; use proper
urlcanon) - t['rel'] = list(set([u.rel for u in entity.urls])) - t['host'] = [] - t['domain'] = [] + t['rels'] = list(set([u.rel for u in entity.urls])) + t['hosts'] = [] + t['domains'] = [] in_ia = False for u in entity.urls: diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py index 89a4eef8..c247e745 100644 --- a/python/tests/transform_elasticsearch.py +++ b/python/tests/transform_elasticsearch.py @@ -106,7 +106,7 @@ def test_elasticsearch_file_transform(matched_importer): assert es['size_bytes'] == f.size assert es['mimetype'] == f.mimetype assert es['in_ia'] == True - assert 'web' in es['rel'] + assert 'web' in es['rels'] # XXX: implement hosts and domain parsing with urlcanon #assert 'journals.plos.org' in es['host'] -- cgit v1.2.3 From 461376e6c6107da9a1c0a41c379465ef1c39f051 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 29 Jan 2020 23:56:47 -0800 Subject: pipenv: add tldextract (url parser) and update deps --- python/Pipfile | 1 + python/Pipfile.lock | 294 ++++++++++++++++++++++++++++------------------------ 2 files changed, 159 insertions(+), 136 deletions(-) diff --git a/python/Pipfile b/python/Pipfile index 1a19a145..a5389e4f 100644 --- a/python/Pipfile +++ b/python/Pipfile @@ -51,6 +51,7 @@ dateparser = ">=0.7" langdetect = "*" pathlib2 = "*" pycountry = "*" +tldextract = "*" [requires] # Python 3.5 is the bundled (system) version of python for Ubuntu 16.04 diff --git a/python/Pipfile.lock b/python/Pipfile.lock index d813dd32..f3cbfbd6 100644 --- a/python/Pipfile.lock +++ b/python/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "edac46fb8a4b49aea89946621337abba881fff6f6768107d2529c752bcf702ac" + "sha256": "8288b1a7102a0a34644ef56817d60e936dc57695604208036f229e96bc14c42f" }, "pipfile-spec": 6, "requires": { @@ -235,6 +235,7 @@ }, "flask-misaka": { "hashes": [ + "sha256:bcfdacc0803ccea75d377737e82c83489b2153d922c9d9f9eabc5148d216ed70", 
"sha256:d0cfb0efd9e5afacda76defd4a605a68390f4fb1bef283c71534fd3ce0d3efb5", "sha256:f423c3beb5502742a57330a272f81d53223f6f99d45cc45b03926e3a3034f589" ], @@ -292,10 +293,10 @@ }, "jinja2": { "hashes": [ - "sha256:74320bb91f31270f9551d46522e33af46a80c3d619f4a4bf42b3164d30b5911f", - "sha256:9fe95f19286cfefaa917656583d020be14e7859c6b0252588391e47db34527de" + "sha256:6e7a3c2934694d59ad334c93dd1b6c96699cf24c53fdb8ec848ac6b23e685734", + "sha256:d6609ae5ec3d56212ca7d802eda654eaf2310000816ce815361041465b108be4" ], - "version": "==2.10.3" + "version": "==2.11.0" }, "kazoo": { "hashes": [ @@ -321,34 +322,35 @@ }, "lxml": { "hashes": [ - "sha256:00ac0d64949fef6b3693813fe636a2d56d97a5a49b5bbb86e4cc4cc50ebc9ea2", - "sha256:0571e607558665ed42e450d7bf0e2941d542c18e117b1ebbf0ba72f287ad841c", - "sha256:0e3f04a7615fdac0be5e18b2406529521d6dbdb0167d2a690ee328bef7807487", - "sha256:13cf89be53348d1c17b453867da68704802966c433b2bb4fa1f970daadd2ef70", - "sha256:217262fcf6a4c2e1c7cb1efa08bd9ebc432502abc6c255c4abab611e8be0d14d", - "sha256:223e544828f1955daaf4cefbb4853bc416b2ec3fd56d4f4204a8b17007c21250", - "sha256:277cb61fede2f95b9c61912fefb3d43fbd5f18bf18a14fae4911b67984486f5d", - "sha256:3213f753e8ae86c396e0e066866e64c6b04618e85c723b32ecb0909885211f74", - "sha256:4690984a4dee1033da0af6df0b7a6bde83f74e1c0c870623797cec77964de34d", - "sha256:4fcc472ef87f45c429d3b923b925704aa581f875d65bac80f8ab0c3296a63f78", - "sha256:61409bd745a265a742f2693e4600e4dbd45cc1daebe1d5fad6fcb22912d44145", - "sha256:678f1963f755c5d9f5f6968dded7b245dd1ece8cf53c1aa9d80e6734a8c7f41d", - "sha256:6c6d03549d4e2734133badb9ab1c05d9f0ef4bcd31d83e5d2b4747c85cfa21da", - "sha256:6e74d5f4d6ecd6942375c52ffcd35f4318a61a02328f6f1bd79fcb4ffedf969e", - "sha256:7b4fc7b1ecc987ca7aaf3f4f0e71bbfbd81aaabf87002558f5bc95da3a865bcd", - "sha256:7ed386a40e172ddf44c061ad74881d8622f791d9af0b6f5be20023029129bc85", - "sha256:8f54f0924d12c47a382c600c880770b5ebfc96c9fd94cf6f6bdc21caf6163ea7", - 
"sha256:ad9b81351fdc236bda538efa6879315448411a81186c836d4b80d6ca8217cdb9", - "sha256:bbd00e21ea17f7bcc58dccd13869d68441b32899e89cf6cfa90d624a9198ce85", - "sha256:c3c289762cc09735e2a8f8a49571d0e8b4f57ea831ea11558247b5bdea0ac4db", - "sha256:cf4650942de5e5685ad308e22bcafbccfe37c54aa7c0e30cd620c2ee5c93d336", - "sha256:cfcbc33c9c59c93776aa41ab02e55c288a042211708b72fdb518221cc803abc8", - "sha256:e301055deadfedbd80cf94f2f65ff23126b232b0d1fea28f332ce58137bcdb18", - "sha256:ebbfe24df7f7b5c6c7620702496b6419f6a9aa2fd7f005eb731cc80d7b4692b9", - "sha256:eff69ddbf3ad86375c344339371168640951c302450c5d3e9936e98d6459db06", - "sha256:f6ed60a62c5f1c44e789d2cf14009423cb1646b44a43e40a9cf6a21f077678a1" - ], - "version": "==4.4.2" + "sha256:06d4e0bbb1d62e38ae6118406d7cdb4693a3fa34ee3762238bcb96c9e36a93cd", + "sha256:0701f7965903a1c3f6f09328c1278ac0eee8f56f244e66af79cb224b7ef3801c", + "sha256:1f2c4ec372bf1c4a2c7e4bb20845e8bcf8050365189d86806bad1e3ae473d081", + "sha256:4235bc124fdcf611d02047d7034164897ade13046bda967768836629bc62784f", + "sha256:5828c7f3e615f3975d48f40d4fe66e8a7b25f16b5e5705ffe1d22e43fb1f6261", + "sha256:585c0869f75577ac7a8ff38d08f7aac9033da2c41c11352ebf86a04652758b7a", + "sha256:5d467ce9c5d35b3bcc7172c06320dddb275fea6ac2037f72f0a4d7472035cea9", + "sha256:63dbc21efd7e822c11d5ddbedbbb08cd11a41e0032e382a0fd59b0b08e405a3a", + "sha256:7bc1b221e7867f2e7ff1933165c0cec7153dce93d0cdba6554b42a8beb687bdb", + "sha256:8620ce80f50d023d414183bf90cc2576c2837b88e00bea3f33ad2630133bbb60", + "sha256:8a0ebda56ebca1a83eb2d1ac266649b80af8dd4b4a3502b2c1e09ac2f88fe128", + "sha256:90ed0e36455a81b25b7034038e40880189169c308a3df360861ad74da7b68c1a", + "sha256:95e67224815ef86924fbc2b71a9dbd1f7262384bca4bc4793645794ac4200717", + "sha256:afdb34b715daf814d1abea0317b6d672476b498472f1e5aacbadc34ebbc26e89", + "sha256:b4b2c63cc7963aedd08a5f5a454c9f67251b1ac9e22fd9d72836206c42dc2a72", + "sha256:d068f55bda3c2c3fcaec24bd083d9e2eede32c583faf084d6e4b9daaea77dde8", + 
"sha256:d5b3c4b7edd2e770375a01139be11307f04341ec709cf724e0f26ebb1eef12c3", + "sha256:deadf4df349d1dcd7b2853a2c8796593cc346600726eff680ed8ed11812382a7", + "sha256:df533af6f88080419c5a604d0d63b2c33b1c0c4409aba7d0cb6de305147ea8c8", + "sha256:e4aa948eb15018a657702fee0b9db47e908491c64d36b4a90f59a64741516e77", + "sha256:e5d842c73e4ef6ed8c1bd77806bf84a7cb535f9c0cf9b2c74d02ebda310070e1", + "sha256:ebec08091a22c2be870890913bdadd86fcd8e9f0f22bcb398abd3af914690c15", + "sha256:edc15fcfd77395e24543be48871c251f38132bb834d9fdfdad756adb6ea37679", + "sha256:f2b74784ed7e0bc2d02bd53e48ad6ba523c9b36c194260b7a5045071abbb1012", + "sha256:fa071559f14bd1e92077b1b5f6c22cf09756c6de7139370249eb372854ce51e6", + "sha256:fd52e796fee7171c4361d441796b64df1acfceb51f29e545e812f16d023c4bbc", + "sha256:fe976a0f1ef09b3638778024ab9fb8cde3118f203364212c198f71341c0715ca" + ], + "version": "==4.5.0" }, "markupsafe": { "hashes": [ @@ -356,13 +358,16 @@ "sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161", "sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235", "sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5", + "sha256:13d3144e1e340870b25e7b10b98d779608c02016d5184cfb9927a9f10c689f42", "sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff", "sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b", "sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1", "sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e", "sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183", "sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66", + "sha256:596510de112c685489095da617b5bcbbac7dd6384aeebeda4df6025d0256a81b", "sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1", + "sha256:6788b695d50a51edb699cb55e35487e430fa21f1ed838122d722e0ff0ac5ba15", "sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1", 
"sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e", "sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b", @@ -379,7 +384,9 @@ "sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6", "sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f", "sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f", - "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7" + "sha256:cdb132fc825c38e1aeec2c8aa9338310d29d337bebbd7baa06889d09a60a1fa2", + "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7", + "sha256:e8313f01ba26fbbe36c7be1966a7b7424942f670f38e666995b88d012765b9be" ], "version": "==1.1.1" }, @@ -442,6 +449,7 @@ }, "pykafka": { "hashes": [ + "sha256:6b075909a52cb0c95325bc16ab797bbcdbb37386652ea460705ed4472ce91459", "sha256:f0bbd394ae6970042a587c99fe4dc0966e67787249d963d4ce2f810dc9490577" ], "index": "pypi", @@ -498,11 +506,11 @@ }, "python-dotenv": { "hashes": [ - "sha256:debd928b49dbc2bf68040566f55cdb3252458036464806f4094487244e2a4093", - "sha256:f157d71d5fec9d4bd5f51c82746b6344dffa680ee85217c123f4a0c8117c4544" + "sha256:440c7c23d53b7d352f9c94d6f70860242c2f071cf5c029dd661ccb22d64ae42b", + "sha256:f254bfd0c970d64ccbb6c9ebef3667ab301a71473569c991253a481f1c98dddc" ], "index": "pypi", - "version": "==0.10.3" + "version": "==0.10.5" }, "python-magic": { "hashes": [ @@ -528,9 +536,6 @@ "version": "==2019.3" }, "raven": { - "extras": [ - "flask" - ], "hashes": [ "sha256:3fa6de6efa2493a7c827472e984ce9b020797d0da16f1db67197bcc23c8fae54", "sha256:44a13f87670836e153951af9a3c80405d36b43097db869a36e92809673692ce4" @@ -572,27 +577,35 @@ "index": "pypi", "version": "==2.22.0" }, + "requests-file": { + "hashes": [ + "sha256:75c175eed739270aec3c5279ffd74e6527dada275c5c0d76b5817e9c86bb7dea", + "sha256:8f04aa6201bacda0567e7ac7f677f1499b0fc76b22140c54bc06edf1ba92e2fa" + ], + "version": "==1.4.3" + }, "requests-oauthlib": { "hashes": [ 
"sha256:7f71572defaecd16372f9006f33c2ec8c077c3cfa6f5911a9a90202beb513f3d", - "sha256:b4261601a71fd721a8bd6d7aa1cc1d6a8a93b4a9f5e96626f8e4d91e8beeaa6a" + "sha256:b4261601a71fd721a8bd6d7aa1cc1d6a8a93b4a9f5e96626f8e4d91e8beeaa6a", + "sha256:fa6c47b933f01060936d87ae9327fead68768b69c6c9ea2109c48be30f2d4dbc" ], "version": "==1.3.0" }, "sickle": { "hashes": [ - "sha256:76d66ed4607af2cd36ee15568a98e7f147d4ec3dd227bd047664a1ca88b21944", - "sha256:b0aaa41d97a0c355aa6099b4dfa46c03f0bf828e6171960a15d68bd0548215ec" + "sha256:c0841b4df5edea33d2da01e893b3feadb1a8eacd2ca4dab9248e6047455ba14a", + "sha256:efdfa46c40cd34b3550f11b59c607ff0bd63adbc074efaa8b4dd662108269a2f" ], "index": "pypi", - "version": "==0.6.4" + "version": "==0.6.5" }, "six": { "hashes": [ - "sha256:1f1b7d42e254082a9db6279deae68afb421ceba6158efa6131de7b3003ee93fd", - "sha256:30f610279e8b2578cab6db20741130331735c781b56053c59c4076da27f06b66" + "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a", + "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c" ], - "version": "==1.13.0" + "version": "==1.14.0" }, "soupsieve": { "hashes": [ @@ -607,6 +620,14 @@ ], "version": "==0.8.6" }, + "tldextract": { + "hashes": [ + "sha256:16b2f7e81d89c2a5a914d25bdbddd3932c31a6b510db886c3ce0764a195c0ee7", + "sha256:9aa21a1f7827df4209e242ec4fc2293af5940ec730cde46ea80f66ed97bfc808" + ], + "index": "pypi", + "version": "==2.2.2" + }, "tzlocal": { "hashes": [ "sha256:11c9f16e0a633b4b60e1eede97d8a46340d042e67b670b290ca526576e039048", @@ -616,10 +637,10 @@ }, "urllib3": { "hashes": [ - "sha256:a8a318824cc77d1fd4b2bec2ded92646630d7fe8619497b142c84a9e6f5a7293", - "sha256:f3c5fd51747d450d4dcf6f923c81f78f811aab8205fda64b0aba34a4e48b0745" + "sha256:2f3db8b19923a873b3e5256dc9c2dedfa883e33d87c690d9c7913e1f40673cdc", + "sha256:87716c2d2a7121198ebcb7ce7cccf6ce5e9ba539041cfbaeecfb641dc0bf6acc" ], - "version": "==1.25.7" + "version": "==1.25.8" }, "wcwidth": { "hashes": [ @@ -630,10 +651,10 @@ }, "werkzeug": { 
"hashes": [ - "sha256:7280924747b5733b246fe23972186c6b348f9ae29724135a6dfc1e53cea433e7", - "sha256:e5f4a1f98b52b18a93da705a7458e55afb26f32bff83ff5d19189f92462d65c4" + "sha256:1e0dedc2acb1f46827daa2e399c1485c8fa17c0d8e70b6b875b4e7f54bf408d2", + "sha256:b353856d37dec59d6511359f97f6a4b2468442e454bd1c98298ddce53cac1f04" ], - "version": "==0.16.0" + "version": "==0.16.1" }, "wtforms": { "hashes": [ @@ -682,39 +703,39 @@ }, "coverage": { "hashes": [ - "sha256:189aac76d6e0d7af15572c51892e7326ee451c076c5a50a9d266406cd6c49708", - "sha256:1bf7ba2af1d373a1750888724f84cffdfc697738f29a353c98195f98fc011509", - "sha256:1f4ee8e2e4243971618bc16fcc4478317405205f135e95226c2496e2a3b8dbbf", - "sha256:225e79a5d485bc1642cb7ba02281419c633c216cdc6b26c26494ba959f09e69f", - "sha256:23688ff75adfa8bfa2a67254d889f9bdf9302c27241d746e17547c42c732d3f4", - "sha256:28f7f73b34a05e23758e860a89a7f649b85c6749e252eff60ebb05532d180e86", - "sha256:2d0cb9b1fe6ad0d915d45ad3d87f03a38e979093a98597e755930db1f897afae", - "sha256:47874b4711c5aeb295c31b228a758ce3d096be83dc37bd56da48ed99efb8813b", - "sha256:511ec0c00840e12fb4e852e4db58fa6a01ca4da72f36a9766fae344c3d502033", - "sha256:53e7438fef0c97bc248f88ba1edd10268cd94d5609970aaf87abbe493691af87", - "sha256:569f9ee3025682afda6e9b0f5bb14897c0db03f1a1dc088b083dd36e743f92bb", - "sha256:593853aa1ac6dcc6405324d877544c596c9d948ef20d2e9512a0f5d2d3202356", - "sha256:5b0a07158360d22492f9abd02a0f2ee7981b33f0646bf796598b7673f6bbab14", - "sha256:7ca3db38a61f3655a2613ee2c190d63639215a7a736d3c64cc7bbdb002ce6310", - "sha256:7d1cc7acc9ce55179616cf72154f9e648136ea55987edf84addbcd9886ffeba2", - "sha256:88b51153657612aea68fa684a5b88037597925260392b7bb4509d4f9b0bdd889", - "sha256:955ec084f549128fa2702f0b2dc696392001d986b71acd8fd47424f28289a9c3", - "sha256:b251c7092cbb6d789d62dc9c9e7c4fb448c9138b51285c36aeb72462cad3600e", - "sha256:bd82b684bb498c60ef47bb1541a50e6d006dde8579934dcbdbc61d67d1ea70d9", - "sha256:bfe102659e2ec13b86c7f3b1db6c9a4e7beea4255058d006351339e6b342d5d2", - 
"sha256:c1e4e39e43057396a5e9d069bfbb6ffeee892e40c5d2effbd8cd71f34ee66c4d", - "sha256:cb2b74c123f65e8166f7e1265829a6c8ed755c3cd16d7f50e75a83456a5f3fd7", - "sha256:cca38ded59105f7705ef6ffe1e960b8db6c7d8279c1e71654a4775ab4454ca15", - "sha256:cf908840896f7aa62d0ec693beb53264b154f972eb8226fb864ac38975590c4f", - "sha256:d095a7b473f8a95f7efe821f92058c8a2ecfb18f8db6677ae3819e15dc11aaae", - "sha256:d22b4297e7e4225ccf01f1aa55e7a96412ea0796b532dd614c3fcbafa341128e", - "sha256:d4a2b578a7a70e0c71f662705262f87a456f1e6c1e40ada7ea699abaf070a76d", - "sha256:ddeb42a3d5419434742bf4cc71c9eaa22df3b76808e23a82bd0b0bd360f1a9f1", - "sha256:e65a5aa1670db6263f19fdc03daee1d7dbbadb5cb67fd0a1f16033659db13c1d", - "sha256:eaad65bd20955131bcdb3967a4dea66b4e4d4ca488efed7c00d91ee0173387e8", - "sha256:f45fba420b94165c17896861bb0e8b27fb7abdcedfeb154895d8553df90b7b00" - ], - "version": "==5.0.2" + "sha256:15cf13a6896048d6d947bf7d222f36e4809ab926894beb748fc9caa14605d9c3", + "sha256:1daa3eceed220f9fdb80d5ff950dd95112cd27f70d004c7918ca6dfc6c47054c", + "sha256:1e44a022500d944d42f94df76727ba3fc0a5c0b672c358b61067abb88caee7a0", + "sha256:25dbf1110d70bab68a74b4b9d74f30e99b177cde3388e07cc7272f2168bd1477", + "sha256:3230d1003eec018ad4a472d254991e34241e0bbd513e97a29727c7c2f637bd2a", + "sha256:3dbb72eaeea5763676a1a1efd9b427a048c97c39ed92e13336e726117d0b72bf", + "sha256:5012d3b8d5a500834783689a5d2292fe06ec75dc86ee1ccdad04b6f5bf231691", + "sha256:51bc7710b13a2ae0c726f69756cf7ffd4362f4ac36546e243136187cfcc8aa73", + "sha256:527b4f316e6bf7755082a783726da20671a0cc388b786a64417780b90565b987", + "sha256:722e4557c8039aad9592c6a4213db75da08c2cd9945320220634f637251c3894", + "sha256:76e2057e8ffba5472fd28a3a010431fd9e928885ff480cb278877c6e9943cc2e", + "sha256:77afca04240c40450c331fa796b3eab6f1e15c5ecf8bf2b8bee9706cd5452fef", + "sha256:7afad9835e7a651d3551eab18cbc0fdb888f0a6136169fbef0662d9cdc9987cf", + "sha256:9bea19ac2f08672636350f203db89382121c9c2ade85d945953ef3c8cf9d2a68", + 
"sha256:a8b8ac7876bc3598e43e2603f772d2353d9931709345ad6c1149009fd1bc81b8", + "sha256:b0840b45187699affd4c6588286d429cd79a99d509fe3de0f209594669bb0954", + "sha256:b26aaf69713e5674efbde4d728fb7124e429c9466aeaf5f4a7e9e699b12c9fe2", + "sha256:b63dd43f455ba878e5e9f80ba4f748c0a2156dde6e0e6e690310e24d6e8caf40", + "sha256:be18f4ae5a9e46edae3f329de2191747966a34a3d93046dbdf897319923923bc", + "sha256:c312e57847db2526bc92b9bfa78266bfbaabac3fdcd751df4d062cd4c23e46dc", + "sha256:c60097190fe9dc2b329a0eb03393e2e0829156a589bd732e70794c0dd804258e", + "sha256:c62a2143e1313944bf4a5ab34fd3b4be15367a02e9478b0ce800cb510e3bbb9d", + "sha256:cc1109f54a14d940b8512ee9f1c3975c181bbb200306c6d8b87d93376538782f", + "sha256:cd60f507c125ac0ad83f05803063bed27e50fa903b9c2cfee3f8a6867ca600fc", + "sha256:d513cc3db248e566e07a0da99c230aca3556d9b09ed02f420664e2da97eac301", + "sha256:d649dc0bcace6fcdb446ae02b98798a856593b19b637c1b9af8edadf2b150bea", + "sha256:d7008a6796095a79544f4da1ee49418901961c97ca9e9d44904205ff7d6aa8cb", + "sha256:da93027835164b8223e8e5af2cf902a4c80ed93cb0909417234f4a9df3bcd9af", + "sha256:e69215621707119c6baf99bda014a45b999d37602cb7043d943c76a59b05bf52", + "sha256:ea9525e0fef2de9208250d6c5aeeee0138921057cd67fcef90fbed49c4d62d37", + "sha256:fca1669d464f0c9831fd10be2eef6b86f5ebd76c724d1e0706ebdff86bb4adf0" + ], + "version": "==5.0.3" }, "decorator": { "hashes": [ @@ -732,11 +753,11 @@ }, "importlib-metadata": { "hashes": [ - "sha256:073a852570f92da5f744a3472af1b61e28e9f78ccf0c9117658dc32b15de7b45", - "sha256:d95141fbfa7ef2ec65cfd945e2af7e5a6ddbd7c8d9a25e66ff3be8e3daf9f60f" + "sha256:06f5b3a99029c7134207dd882428a66992a9de2bef7c2b699b5641f9886c3302", + "sha256:b97607a1a18a5100839aec1dc26a1ea17ee0d93b20b0f008d80a5a050afb200b" ], "markers": "python_version < '3.8'", - "version": "==1.3.0" + "version": "==1.5.0" }, "ipython": { "hashes": [ @@ -762,10 +783,10 @@ }, "jedi": { "hashes": [ - "sha256:1349c1e8c107095a55386628bb3b2a79422f3a2cab8381e34ce19909e0cf5064", - 
"sha256:e909527104a903606dd63bea6e8e888833f0ef087057829b89a18364a856f807" + "sha256:b4f4052551025c6b0b0b193b29a6ff7bdb74c52450631206c262aef9f7159ad2", + "sha256:d5c871cb9360b414f981e7072c52c33258d598305280fef91c6cae34739d65d5" ], - "version": "==0.15.2" + "version": "==0.16.0" }, "lazy-object-proxy": { "hashes": [ @@ -802,24 +823,24 @@ }, "more-itertools": { "hashes": [ - "sha256:b84b238cce0d9adad5ed87e745778d20a3f8487d0f0cb8b8a586816c7496458d", - "sha256:c833ef592a0324bcc6a60e48440da07645063c453880c9477ceb22490aec1564" + "sha256:5dd8bcf33e5f9513ffa06d5ad33d78f31e1931ac9a18f33d37e77a180d393a7c", + "sha256:b1ddb932186d8a6ac451e1d95844b382f55e12686d51ca0c68b6f61f2ab7a507" ], - "version": "==8.0.2" + "version": "==8.2.0" }, "packaging": { "hashes": [ - "sha256:aec3fdbb8bc9e4bb65f0634b9f551ced63983a529d6a8931817d52fdd0816ddb", - "sha256:fe1d8331dfa7cc0a883b49d75fc76380b2ab2734b220fbb87d774e4fd4b851f8" + "sha256:170748228214b70b672c581a3dd610ee51f733018650740e98c7df862a583f73", + "sha256:e665345f9eef0c621aa0bf2f8d78cf6d21904eef16a93f020240b704a57f1334" ], - "version": "==20.0" + "version": "==20.1" }, "parso": { "hashes": [ - "sha256:55cf25df1a35fd88b878715874d2c4dc1ad3f0eebd1e0266a67e1f55efccfbe1", - "sha256:5c1f7791de6bd5dbbeac8db0ef5594b36799de198b3f7f7014643b0c5536b9d3" + "sha256:1376bdc8cb81377ca481976933773295218a2df47d3e1182ba76d372b1acb128", + "sha256:597f36de5102a8db05ffdf7ecdc761838b86565a4a111604c6e78beaedf1b045" ], - "version": "==0.5.2" + "version": "==0.6.0" }, "pathlib2": { "hashes": [ @@ -831,11 +852,11 @@ }, "pexpect": { "hashes": [ - "sha256:2094eefdfcf37a1fdbfb9aa090862c1a4878e5c7e0e7e7088bdb511c558e5cd1", - "sha256:9e2c1fd0e6ee3a49b28f95d4b33bc389c89b20af6a1255906e90ff1262ce62eb" + "sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937", + "sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c" ], "markers": "sys_platform != 'win32'", - "version": "==4.7.0" + "version": "==4.8.0" }, "pg-view": { "hashes": [ @@ 
-922,11 +943,11 @@ }, "pytest": { "hashes": [ - "sha256:6b571215b5a790f9b41f19f3531c53a45cf6bb8ef2988bc1ff9afb38270b25fa", - "sha256:e41d489ff43948babd0fad7ad5e49b8735d5d55e26628a58673c39ff61d95de4" + "sha256:0d5fe9189a148acc3c3eb2ac8e1ac0742cb7618c084f3d228baaec0c254b318d", + "sha256:ff615c761e25eb25df19edddc0b970302d2a9091fbce0e7213298d85fb61fef6" ], "index": "pypi", - "version": "==5.3.2" + "version": "==5.3.5" }, "pytest-cov": { "hashes": [ @@ -984,10 +1005,10 @@ }, "six": { "hashes": [ - "sha256:1f1b7d42e254082a9db6279deae68afb421ceba6158efa6131de7b3003ee93fd", - "sha256:30f610279e8b2578cab6db20741130331735c781b56053c59c4076da27f06b66" + "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a", + "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c" ], - "version": "==1.13.0" + "version": "==1.14.0" }, "traitlets": { "hashes": [ @@ -998,36 +1019,37 @@ }, "typed-ast": { "hashes": [ - "sha256:1170afa46a3799e18b4c977777ce137bb53c7485379d9706af8a59f2ea1aa161", - "sha256:18511a0b3e7922276346bcb47e2ef9f38fb90fd31cb9223eed42c85d1312344e", - "sha256:262c247a82d005e43b5b7f69aff746370538e176131c32dda9cb0f324d27141e", - "sha256:2b907eb046d049bcd9892e3076c7a6456c93a25bebfe554e931620c90e6a25b0", - "sha256:354c16e5babd09f5cb0ee000d54cfa38401d8b8891eefa878ac772f827181a3c", - "sha256:48e5b1e71f25cfdef98b013263a88d7145879fbb2d5185f2a0c79fa7ebbeae47", - "sha256:4e0b70c6fc4d010f8107726af5fd37921b666f5b31d9331f0bd24ad9a088e631", - "sha256:630968c5cdee51a11c05a30453f8cd65e0cc1d2ad0d9192819df9978984529f4", - "sha256:66480f95b8167c9c5c5c87f32cf437d585937970f3fc24386f313a4c97b44e34", - "sha256:71211d26ffd12d63a83e079ff258ac9d56a1376a25bc80b1cdcdf601b855b90b", - "sha256:7954560051331d003b4e2b3eb822d9dd2e376fa4f6d98fee32f452f52dd6ebb2", - "sha256:838997f4310012cf2e1ad3803bce2f3402e9ffb71ded61b5ee22617b3a7f6b6e", - "sha256:95bd11af7eafc16e829af2d3df510cecfd4387f6453355188342c3e79a2ec87a", - 
"sha256:bc6c7d3fa1325a0c6613512a093bc2a2a15aeec350451cbdf9e1d4bffe3e3233", - "sha256:cc34a6f5b426748a507dd5d1de4c1978f2eb5626d51326e43280941206c209e1", - "sha256:d755f03c1e4a51e9b24d899561fec4ccaf51f210d52abdf8c07ee2849b212a36", - "sha256:d7c45933b1bdfaf9f36c579671fec15d25b06c8398f113dab64c18ed1adda01d", - "sha256:d896919306dd0aa22d0132f62a1b78d11aaf4c9fc5b3410d3c666b818191630a", - "sha256:fdc1c9bbf79510b76408840e009ed65958feba92a88833cdceecff93ae8fff66", - "sha256:ffde2fbfad571af120fcbfbbc61c72469e72f550d676c3342492a9dfdefb8f12" + "sha256:0666aa36131496aed8f7be0410ff974562ab7eeac11ef351def9ea6fa28f6355", + "sha256:0c2c07682d61a629b68433afb159376e24e5b2fd4641d35424e462169c0a7919", + "sha256:249862707802d40f7f29f6e1aad8d84b5aa9e44552d2cc17384b209f091276aa", + "sha256:24995c843eb0ad11a4527b026b4dde3da70e1f2d8806c99b7b4a7cf491612652", + "sha256:269151951236b0f9a6f04015a9004084a5ab0d5f19b57de779f908621e7d8b75", + "sha256:4083861b0aa07990b619bd7ddc365eb7fa4b817e99cf5f8d9cf21a42780f6e01", + "sha256:498b0f36cc7054c1fead3d7fc59d2150f4d5c6c56ba7fb150c013fbc683a8d2d", + "sha256:4e3e5da80ccbebfff202a67bf900d081906c358ccc3d5e3c8aea42fdfdfd51c1", + "sha256:6daac9731f172c2a22ade6ed0c00197ee7cc1221aa84cfdf9c31defeb059a907", + "sha256:715ff2f2df46121071622063fc7543d9b1fd19ebfc4f5c8895af64a77a8c852c", + "sha256:73d785a950fc82dd2a25897d525d003f6378d1cb23ab305578394694202a58c3", + "sha256:8c8aaad94455178e3187ab22c8b01a3837f8ee50e09cf31f1ba129eb293ec30b", + "sha256:8ce678dbaf790dbdb3eba24056d5364fb45944f33553dd5869b7580cdbb83614", + "sha256:aaee9905aee35ba5905cfb3c62f3e83b3bec7b39413f0a7f19be4e547ea01ebb", + "sha256:bcd3b13b56ea479b3650b82cabd6b5343a625b0ced5429e4ccad28a8973f301b", + "sha256:c9e348e02e4d2b4a8b2eedb48210430658df6951fa484e59de33ff773fbd4b41", + "sha256:d205b1b46085271b4e15f670058ce182bd1199e56b317bf2ec004b6a44f911f6", + "sha256:d43943ef777f9a1c42bf4e552ba23ac77a6351de620aa9acf64ad54933ad4d34", + "sha256:d5d33e9e7af3b34a40dc05f498939f0ebf187f07c385fd58d591c533ad8562fe", 
+ "sha256:fc0fea399acb12edbf8a628ba8d2312f583bdbdb3335635db062fa98cf71fca4", + "sha256:fe460b922ec15dd205595c9b5b99e2f056fd98ae8f9f56b888e7a17dc2b757e7" ], "markers": "implementation_name == 'cpython' and python_version < '3.8'", - "version": "==1.4.0" + "version": "==1.4.1" }, "urllib3": { "hashes": [ - "sha256:a8a318824cc77d1fd4b2bec2ded92646630d7fe8619497b142c84a9e6f5a7293", - "sha256:f3c5fd51747d450d4dcf6f923c81f78f811aab8205fda64b0aba34a4e48b0745" + "sha256:2f3db8b19923a873b3e5256dc9c2dedfa883e33d87c690d9c7913e1f40673cdc", + "sha256:87716c2d2a7121198ebcb7ce7cccf6ce5e9ba539041cfbaeecfb641dc0bf6acc" ], - "version": "==1.25.7" + "version": "==1.25.8" }, "wcwidth": { "hashes": [ @@ -1044,10 +1066,10 @@ }, "zipp": { "hashes": [ - "sha256:3718b1cbcd963c7d4c5511a8240812904164b7f381b647143a89d3b98f9bcd8e", - "sha256:f06903e9f1f43b12d371004b4ac7b06ab39a44adc747266928ae6debfa7b3335" + "sha256:ccc94ed0909b58ffe34430ea5451f07bc0c76467d7081619a454bf5c98b89e28", + "sha256:feae2f18633c32fc71f2de629bfb3bd3c9325cd4419642b1f1da42ee488d9b98" ], - "version": "==0.6.0" + "version": "==2.1.0" } } } -- cgit v1.2.3 From 4cbee44529dd967c966ed3f2cc2bb80176be4e43 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 30 Jan 2020 00:08:41 -0800 Subject: implement host+domain parsing for file ES transform --- python/fatcat_tools/transforms/elasticsearch.py | 14 +++++--------- python/tests/transform_elasticsearch.py | 7 +++---- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index 5a492fb4..e1980d90 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -1,6 +1,6 @@ - import collections +import tldextract from fatcat_openapi_client import ApiClient @@ -499,15 +499,11 @@ def file_to_elasticsearch(entity): md5 = entity.md5, ) - # TODO: domain, hosts (from urls; use proper urlcanon) + parsed_urls = 
[tldextract.extract(u.url) for u in entity.urls] + t['hosts'] = list(set(['.'.join(pu) for pu in parsed_urls])) + t['domains'] = list(set([pu.registered_domain for pu in parsed_urls])) t['rels'] = list(set([u.rel for u in entity.urls])) - t['hosts'] = [] - t['domains'] = [] - in_ia = False - for u in entity.urls: - if '://archive.org/' in u.url or '://web.archive.org/' in u.url: - in_ia = True - t['in_ia'] = bool(in_ia) + t['in_ia'] = bool('archive.org' in t['domains']) return t diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py index c247e745..e67681c6 100644 --- a/python/tests/transform_elasticsearch.py +++ b/python/tests/transform_elasticsearch.py @@ -106,11 +106,10 @@ def test_elasticsearch_file_transform(matched_importer): assert es['size_bytes'] == f.size assert es['mimetype'] == f.mimetype assert es['in_ia'] == True - assert 'web' in es['rels'] - # XXX: implement hosts and domain parsing with urlcanon - #assert 'journals.plos.org' in es['host'] - #assert 'plos.org' in es['domain'] + assert 'web' in es['rels'] + assert 'www.zhros.ru' in es['hosts'] + assert 'zhros.ru' in es['domains'] def test_elasticsearch_changelog_transform(matched_importer): ce = entity_from_json(open('./tests/files/changelog_3469683.json', 'r').read(), ChangelogEntry) -- cgit v1.2.3 From 59912583926077260d99a9bf77a938c2215eb6c8 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 30 Jan 2020 00:20:34 -0800 Subject: tweak file ES archive.org domain tracking --- extra/elasticsearch/file_schema.json | 1 + python/fatcat_tools/transforms/elasticsearch.py | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json index 2a7e5be0..a0ac3346 100644 --- a/extra/elasticsearch/file_schema.json +++ b/extra/elasticsearch/file_schema.json @@ -31,6 +31,7 @@ "hosts": { "type": "keyword" }, "rels": { "type": "keyword" }, "in_ia": { "type": "boolean" }, + "in_ia_petabox": { "type": 
"boolean" }, "release_id": { "type": "alias", "path": "release_ids" }, "sha1hex": { "type": "alias", "path": "sha1" }, diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index e1980d90..9aa3cece 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -505,5 +505,11 @@ def file_to_elasticsearch(entity): t['rels'] = list(set([u.rel for u in entity.urls])) t['in_ia'] = bool('archive.org' in t['domains']) + t['in_ia_petabox'] = bool('archive.org' in t['hosts']) + + # ok, but actually remove archive.org hosts, because they make other + # aggregations hard and are a waste of storage + t['hosts'] = [h for h in t['hosts'] if h not in ('archive.org', 'web.archive.org')] + t['domains'] = [h for h in t['hosts'] if h not in ('archive.org')] return t -- cgit v1.2.3 From b7404fb0f696807db3a92bc2c4c73c2d208e59ef Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 30 Jan 2020 00:51:56 -0800 Subject: ES schemas: make keywords case-insensitive by default But not applying asciifolding; don't see any need to do so? 
--- extra/elasticsearch/changelog_schema.json | 20 +++++-- extra/elasticsearch/container_schema.json | 38 ++++++++----- extra/elasticsearch/file_schema.json | 34 ++++++++---- extra/elasticsearch/release_schema.json | 89 ++++++++++++++++++------------- 4 files changed, 115 insertions(+), 66 deletions(-) diff --git a/extra/elasticsearch/changelog_schema.json b/extra/elasticsearch/changelog_schema.json index 77c77238..d958fed9 100644 --- a/extra/elasticsearch/changelog_schema.json +++ b/extra/elasticsearch/changelog_schema.json @@ -8,6 +8,18 @@ "tokenizer": "standard", "filter": [ "lowercase", "asciifolding" ] } + }, + "normalizer": { + "default": { + "type": "custom", + "char_filter": [], + "filter": ["lowercase"] + }, + "caseSensitive": { + "type": "custom", + "char_filter": [], + "filter": [] + } } } } @@ -16,13 +28,13 @@ "changelog": { "properties": { "index": { "type": "integer" }, - "editgroup_id": { "type": "keyword", "doc_values": false }, + "editgroup_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, "timestamp": { "type": "date" }, - "editor_id": { "type": "keyword" }, - "username": { "type": "keyword" }, + "editor_id": { "type": "keyword", "normalizer": "default" }, + "username": { "type": "keyword", "normalize": "caseSensitive" }, "is_bot": { "type": "boolean" }, "is_admin": { "type": "boolean" }, - "agent": { "type": "keyword" }, + "agent": { "type": "keyword", "normalize": "caseSensitive" }, "containers": { "type": "integer" }, "new_containers": { "type": "integer" }, diff --git a/extra/elasticsearch/container_schema.json b/extra/elasticsearch/container_schema.json index 3be261a2..be3a408e 100644 --- a/extra/elasticsearch/container_schema.json +++ b/extra/elasticsearch/container_schema.json @@ -20,6 +20,18 @@ "char_filter": [ "icu_normalizer" ], "filter": [ "icu_folding" ] } + }, + "normalizer": { + "default": { + "type": "custom", + "char_filter": [], + "filter": ["lowercase"] + }, + "caseSensitive": { + "type": "custom", + 
"char_filter": [], + "filter": [] + } } } } @@ -27,23 +39,23 @@ "mappings": { "container": { "properties": { - "ident": { "type": "keyword", "doc_values": false }, - "state": { "type": "keyword" }, - "revision": { "type": "keyword", "doc_values": false }, + "ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "state": { "type": "keyword", "normalizer": "default" }, + "revision": { "type": "keyword", "normalizer": "default", "doc_values": false }, "name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "original_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "abbrev": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "aliases": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, - "container_type": { "type": "keyword" }, - "issnl": { "type": "keyword" }, - "issns": { "type": "keyword" }, - "wikidata_qid": { "type": "keyword" }, - "country": { "type": "keyword" }, - "region": { "type": "keyword" }, - "discipline": { "type": "keyword" }, - "languages": { "type": "keyword" }, - "mimetypes": { "type": "keyword" }, + "container_type": { "type": "keyword", "normalizer": "default" }, + "issnl": { "type": "keyword", "normalizer": "default" }, + "issns": { "type": "keyword", "normalizer": "default" }, + "wikidata_qid": { "type": "keyword", "normalizer": "default" }, + "country": { "type": "keyword", "normalizer": "default" }, + "region": { "type": "keyword", "normalizer": "default" }, + "discipline": { "type": "keyword", "normalizer": "default" }, + "languages": { "type": "keyword", "normalizer": "default" }, + "mimetypes": { "type": "keyword", "normalizer": 
"default" }, "first_year": { "type": "integer" }, "last_year": { "type": "integer" }, @@ -57,7 +69,7 @@ "any_kbart": { "type": "boolean" }, "any_jstor": { "type": "boolean" }, "any_ia_sim": { "type": "boolean" }, - "sherpa_romeo_color": { "type": "keyword" }, + "sherpa_romeo_color": { "type": "keyword", "normalizer": "default" }, "releases_total": { "type": "integer" }, "releases_kbart": { "type": "integer" }, diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json index a0ac3346..9c8ee64c 100644 --- a/extra/elasticsearch/file_schema.json +++ b/extra/elasticsearch/file_schema.json @@ -8,6 +8,18 @@ "tokenizer": "standard", "filter": [ "lowercase", "asciifolding" ] } + }, + "normalizer": { + "default": { + "type": "custom", + "char_filter": [], + "filter": ["lowercase"] + }, + "caseSensitive": { + "type": "custom", + "char_filter": [], + "filter": [] + } } } } @@ -15,21 +27,21 @@ "mappings": { "file": { "properties": { - "ident": { "type": "keyword", "doc_values": false }, - "state": { "type": "keyword" }, - "revision": { "type": "keyword", "doc_values": false }, + "ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "state": { "type": "keyword", "normalizer": "default" }, + "revision": { "type": "keyword", "normalizer": "default", "doc_values": false }, - "release_ids": { "type": "keyword", "doc_values": false }, + "release_ids": { "type": "keyword", "normalizer": "default", "doc_values": false }, "release_count": { "type": "integer" }, - "mimetype": { "type": "keyword" }, + "mimetype": { "type": "keyword", "normalizer": "default" }, "size_bytes": { "type": "integer" }, - "sha1": { "type": "keyword", "doc_values": false }, - "sha256": { "type": "keyword", "doc_values": false }, - "md5": { "type": "keyword", "doc_values": false }, + "sha1": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "sha256": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "md5": { "type": 
"keyword", "normalizer": "default", "doc_values": false }, - "domains": { "type": "keyword" }, - "hosts": { "type": "keyword" }, - "rels": { "type": "keyword" }, + "domains": { "type": "keyword", "normalizer": "default" }, + "hosts": { "type": "keyword", "normalizer": "default" }, + "rels": { "type": "keyword", "normalizer": "default" }, "in_ia": { "type": "boolean" }, "in_ia_petabox": { "type": "boolean" }, diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index 3d301dba..f983a703 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -20,58 +20,71 @@ "char_filter": [ "icu_normalizer" ], "filter": [ "icu_folding" ] } + }, + "normalizer": { + "default": { + "type": "custom", + "char_filter": [], + "filter": ["lowercase"] + }, + "caseSensitive": { + "type": "custom", + "char_filter": [], + "filter": [] + } } } } +} }, "mappings": { "release": { "properties": { - "ident": { "type": "keyword", "doc_values": false }, - "state": { "type": "keyword" }, - "revision": { "type": "keyword", "doc_values": false }, - "work_id": { "type": "keyword", "doc_values": false }, + "ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "state": { "type": "keyword", "normalizer": "default" }, + "revision": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "work_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "release_date": { "type": "date" }, "release_year": { "type": "integer", "copy_to": "biblio" }, - "release_type": { "type": "keyword", "copy_to": 
"biblio" }, - "release_stage": { "type": "keyword" }, - "withdrawn_status": { "type": "keyword", "copy_to": "biblio" }, - "language": { "type": "keyword" }, - "country": { "type": "keyword" }, - "volume": { "type": "keyword", "copy_to": "biblio" }, - "issue": { "type": "keyword", "copy_to": "biblio" }, - "pages": { "type": "keyword", "copy_to": "biblio" }, - "first_page": { "type": "keyword" }, - "number": { "type": "keyword", "copy_to": "biblio" }, - "doi": { "type": "keyword", "doc_values": false }, - "doi_prefix": { "type": "keyword" }, - "doi_registrar": { "type": "keyword" }, - "pmid": { "type": "keyword", "doc_values": false }, - "pmcid": { "type": "keyword", "doc_values": false }, - "isbn13": { "type": "keyword", "doc_values": false }, - "wikidata_qid": { "type": "keyword", "doc_values": false }, - "core_id": { "type": "keyword", "doc_values": false }, - "axiv_id": { "type": "keyword", "doc_values": false }, - "jstor_id": { "type": "keyword", "doc_values": false }, - "ark_id": { "type": "keyword", "doc_values": false }, - "mag_id": { "type": "keyword", "doc_values": false }, - "license": { "type": "keyword" }, + "release_type": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "release_stage": { "type": "keyword", "normalizer": "default" }, + "withdrawn_status": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "language": { "type": "keyword", "normalizer": "default" }, + "country": { "type": "keyword", "normalizer": "default" }, + "volume": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "issue": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "pages": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "first_page": { "type": "keyword", "normalizer": "default" }, + "number": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "doi": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "doi_prefix": { "type": 
"keyword", "normalizer": "default" }, + "doi_registrar": { "type": "keyword", "normalizer": "default" }, + "pmid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "pmcid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "isbn13": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "wikidata_qid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "core_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "axiv_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "jstor_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "ark_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "mag_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "license": { "type": "keyword", "normalizer": "default" }, "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, - "container_id": { "type": "keyword" }, - "container_issnl": { "type": "keyword" }, - "container_type": { "type": "keyword" }, + "container_id": { "type": "keyword", "normalizer": "default" }, + "container_issnl": { "type": "keyword", "normalizer": "default" }, + "container_type": { "type": "keyword", "normalizer": "default" }, "contrib_count": { "type": "integer" }, "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "affiliations": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "affiliation_rors": { "type": "keyword" }, - "creator_ids": { "type": "keyword" }, + "affiliation_rors": { "type": "keyword", "normalizer": "default" }, + "creator_ids": { "type": "keyword", "normalizer": "default" }, "ref_count": { "type": 
"integer" }, "ref_linked_count": { "type": "integer" }, - "ref_release_ids": { "type": "keyword" }, + "ref_release_ids": { "type": "keyword", "normalizer": "default" }, "file_count": { "type": "integer" }, "fileset_count": { "type": "integer" }, "webcapture_count": { "type": "integer" }, @@ -79,11 +92,11 @@ "biblio": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "best_pdf_url": { "type": "keyword", "doc_values": false }, - "ia_pdf_url": { "type": "keyword", "doc_values": false }, - "ia_microfilm_url": { "type": "keyword", "doc_values": false }, + "best_pdf_url": { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false }, + "ia_pdf_url": { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false }, + "ia_microfilm_url": { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false }, "is_oa": { "type": "boolean" }, - "oa_color": { "type": "keyword" }, + "oa_color": { "type": "keyword", "normalizer": "default" }, "is_longtail_oa": { "type": "boolean" }, "is_preserved": { "type": "boolean" }, "in_kbart": { "type": "boolean" }, @@ -95,7 +108,7 @@ "in_shadows": { "type": "boolean" }, "is_superceded": { "type": "boolean" }, "is_retracted": { "type": "boolean" }, - "preservation": { "type": "keyword" }, + "preservation": { "type": "keyword", "normalizer": "default" }, "affilation": { "type": "alias", "path": "affiliations" }, "ror": { "type": "alias", "path": "affiliation_rors" }, -- cgit v1.2.3 From ca283a45cc151f3346e403c8d57f55ec75f40672 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 30 Jan 2020 01:00:12 -0800 Subject: JSON typo in release mapping --- extra/elasticsearch/release_schema.json | 1 - 1 file changed, 1 deletion(-) diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index f983a703..07601f36 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -35,7 +35,6 @@ } } } -} }, 
"mappings": { "release": { -- cgit v1.2.3 From caa588612b91181950697756eace8fda270fd092 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 30 Jan 2020 01:03:08 -0800 Subject: add upper-case work-around from kibana map join --- extra/elasticsearch/release_schema.json | 1 + python/fatcat_tools/transforms/elasticsearch.py | 1 + 2 files changed, 2 insertions(+) diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index 07601f36..c0bbda22 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -53,6 +53,7 @@ "withdrawn_status": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, "language": { "type": "keyword", "normalizer": "default" }, "country": { "type": "keyword", "normalizer": "default" }, + "country_upper": { "type": "keyword", "normalizer": "caseSensitive" }, "volume": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, "issue": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, "pages": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index 9aa3cece..ded239d3 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -157,6 +157,7 @@ def release_to_elasticsearch(entity, force_bool=True): is_oa = True if c_extra.get('country'): t['country'] = c_extra['country'] + t['country_upper'] = c_extra['country'].upper() # fall back to release-level container metadata if container not linked or # missing context -- cgit v1.2.3 From 8aac86c4484f0376c46cdd51c69d5ada478b7f72 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 30 Jan 2020 01:12:51 -0800 Subject: fix json typos in changelog schema --- extra/elasticsearch/changelog_schema.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/extra/elasticsearch/changelog_schema.json b/extra/elasticsearch/changelog_schema.json index d958fed9..d8342549 100644 --- a/extra/elasticsearch/changelog_schema.json +++ b/extra/elasticsearch/changelog_schema.json @@ -31,10 +31,10 @@ "editgroup_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, "timestamp": { "type": "date" }, "editor_id": { "type": "keyword", "normalizer": "default" }, - "username": { "type": "keyword", "normalize": "caseSensitive" }, + "username": { "type": "keyword", "normalizer": "caseSensitive" }, "is_bot": { "type": "boolean" }, "is_admin": { "type": "boolean" }, - "agent": { "type": "keyword", "normalize": "caseSensitive" }, + "agent": { "type": "keyword", "normalizer": "caseSensitive" }, "containers": { "type": "integer" }, "new_containers": { "type": "integer" }, -- cgit v1.2.3 From 5ba91951bb4ebc59cb59340e82cba2a7d763dc59 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 30 Jan 2020 13:22:45 -0800 Subject: fix release es transform missing 'issue' --- python/fatcat_tools/transforms/elasticsearch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index ded239d3..b5abe2ae 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -248,6 +248,7 @@ def release_to_elasticsearch(entity, force_bool=True): # special case as a demo for now. 
if release.container_id == "hl5g6d5msjcl7hlbyyvcsbhc2u" \ and release.release_year in (2011, 2013) \ + and release.issue \ and release.issue.isdigit() \ and t['first_page']: t['ia_microfilm_url'] = "https://archive.org/details/sim_bjog_{}-{:02d}/page/n{}".format( -- cgit v1.2.3 From 0d037d0d2f73b18014d8d98a06fa3f7bc2c9b794 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 30 Jan 2020 16:52:29 -0800 Subject: pipenv: lock zipp version to work around python3.6 requirement --- python/Pipfile | 4 ++++ python/Pipfile.lock | 23 ++++++++++++++++------- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/python/Pipfile b/python/Pipfile index a5389e4f..838cc2c0 100644 --- a/python/Pipfile +++ b/python/Pipfile @@ -53,6 +53,10 @@ pathlib2 = "*" pycountry = "*" tldextract = "*" +# this is only to lock to a python3.5-compatible version. needed by an +# importlib-metadata, under pytest +zipp = "<2.0.0" + [requires] # Python 3.5 is the bundled (system) version of python for Ubuntu 16.04 # (Xenial), currently the default on Internet Archive production VMs, as well diff --git a/python/Pipfile.lock b/python/Pipfile.lock index f3cbfbd6..05894aee 100644 --- a/python/Pipfile.lock +++ b/python/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "8288b1a7102a0a34644ef56817d60e936dc57695604208036f229e96bc14c42f" + "sha256": "0ce2be10751676f4bc5c6f411825efad087a7576cae7cf09e8540aefe321c53b" }, "pipfile-spec": 6, "requires": { @@ -293,10 +293,10 @@ }, "jinja2": { "hashes": [ - "sha256:6e7a3c2934694d59ad334c93dd1b6c96699cf24c53fdb8ec848ac6b23e685734", - "sha256:d6609ae5ec3d56212ca7d802eda654eaf2310000816ce815361041465b108be4" + "sha256:93187ffbc7808079673ef52771baa950426fd664d3aad1d0fa3e95644360e250", + "sha256:b0eaf100007721b5c16c1fc1eecb87409464edc10469ddc9a22a27a99123be49" ], - "version": "==2.11.0" + "version": "==2.11.1" }, "kazoo": { "hashes": [ @@ -663,6 +663,14 @@ ], "index": "pypi", "version": "==2.2.1" + }, + "zipp": { + "hashes": [ + 
"sha256:15428d652e993b6ce86694c3cccf0d71aa7afdc6ef1807fa25a920e9444e0281", + "sha256:d9d2efe11d3a3fb9184da550d35bd1319dc8e30a63255927c82bb42fca1f4f7c" + ], + "index": "pypi", + "version": "==1.1.0" } }, "develop": { @@ -1066,10 +1074,11 @@ }, "zipp": { "hashes": [ - "sha256:ccc94ed0909b58ffe34430ea5451f07bc0c76467d7081619a454bf5c98b89e28", - "sha256:feae2f18633c32fc71f2de629bfb3bd3c9325cd4419642b1f1da42ee488d9b98" + "sha256:15428d652e993b6ce86694c3cccf0d71aa7afdc6ef1807fa25a920e9444e0281", + "sha256:d9d2efe11d3a3fb9184da550d35bd1319dc8e30a63255927c82bb42fca1f4f7c" ], - "version": "==2.1.0" + "index": "pypi", + "version": "==1.1.0" } } } -- cgit v1.2.3 From 741c7b1efae5e39f3ee2c082e3ca28c6c5c85b76 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 31 Jan 2020 13:31:59 -0800 Subject: ES releases: host/domain fixes --- python/fatcat_tools/transforms/elasticsearch.py | 4 ++-- python/tests/transform_elasticsearch.py | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index b5abe2ae..f8bc05fb 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -502,7 +502,7 @@ def file_to_elasticsearch(entity): ) parsed_urls = [tldextract.extract(u.url) for u in entity.urls] - t['hosts'] = list(set(['.'.join(pu) for pu in parsed_urls])) + t['hosts'] = list(set(['.'.join([seg for seg in pu if seg]) for pu in parsed_urls])) t['domains'] = list(set([pu.registered_domain for pu in parsed_urls])) t['rels'] = list(set([u.rel for u in entity.urls])) @@ -512,6 +512,6 @@ def file_to_elasticsearch(entity): # ok, but actually remove archive.org hosts, because they make other # aggregations hard and are a waste of storage t['hosts'] = [h for h in t['hosts'] if h not in ('archive.org', 'web.archive.org')] - t['domains'] = [h for h in t['hosts'] if h not in ('archive.org')] + t['domains'] = [h for h in t['domains'] if 
h not in ('archive.org')] return t diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py index e67681c6..c94ab375 100644 --- a/python/tests/transform_elasticsearch.py +++ b/python/tests/transform_elasticsearch.py @@ -110,6 +110,9 @@ def test_elasticsearch_file_transform(matched_importer): assert 'web' in es['rels'] assert 'www.zhros.ru' in es['hosts'] assert 'zhros.ru' in es['domains'] + assert not '.archive.org' in (es['hosts'] + es['domains']) + assert not 'archive.org' in (es['hosts'] + es['domains']) + assert not 'web.archive.org' in (es['hosts'] + es['domains']) def test_elasticsearch_changelog_transform(matched_importer): ce = entity_from_json(open('./tests/files/changelog_3469683.json', 'r').read(), ChangelogEntry) -- cgit v1.2.3 From fbd79c7315cad4789eb0e92c136c59da8f38c4f3 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 31 Jan 2020 13:33:38 -0800 Subject: ES release schema: fix typo --- extra/elasticsearch/release_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index c0bbda22..2cc9169c 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -110,7 +110,7 @@ "is_retracted": { "type": "boolean" }, "preservation": { "type": "keyword", "normalizer": "default" }, - "affilation": { "type": "alias", "path": "affiliations" }, + "affiliation": { "type": "alias", "path": "affiliations" }, "ror": { "type": "alias", "path": "affiliation_rors" }, "creator_id": { "type": "alias", "path": "creator_ids" }, "ref_release_id": { "type": "alias", "path": "ref_release_ids" }, -- cgit v1.2.3 From 8007cdfc4e06753a9bbba56d1fa7f9686775e5e8 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 4 Feb 2020 15:10:26 -0800 Subject: fix axiv/arxiv typo in release schema --- extra/elasticsearch/release_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index 2cc9169c..607bacf1 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -67,7 +67,7 @@ "isbn13": { "type": "keyword", "normalizer": "default", "doc_values": false }, "wikidata_qid": { "type": "keyword", "normalizer": "default", "doc_values": false }, "core_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, - "axiv_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "arxiv_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, "jstor_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, "ark_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, "mag_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, -- cgit v1.2.3 From 3655bbe6c539fdeccfbfaa19b6fc93a4859e0ca7 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 5 Feb 2020 15:42:43 -0800 Subject: ES release: actually do want doc_values for work_id Eg, for fast "unique count" --- extra/elasticsearch/release_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index 607bacf1..b85fc8a4 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -42,7 +42,7 @@ "ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, "state": { "type": "keyword", "normalizer": "default" }, "revision": { "type": "keyword", "normalizer": "default", "doc_values": false }, - "work_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "work_id": { "type": "keyword", "normalizer": "default" }, "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "subtitle": { "type": "text", "index": true, "analyzer": "textIcu", 
"search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, -- cgit v1.2.3 From 83387210e6775751e5eb690a7d8b56fe99dbe380 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 7 Feb 2020 14:38:13 -0800 Subject: ES files: don't remove archive.org domains/hosts --- python/fatcat_tools/transforms/elasticsearch.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index f8bc05fb..e00d7830 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -509,9 +509,4 @@ def file_to_elasticsearch(entity): t['in_ia'] = bool('archive.org' in t['domains']) t['in_ia_petabox'] = bool('archive.org' in t['hosts']) - # ok, but actually remove archive.org hosts, because they make other - # aggregations hard and are a waste of storage - t['hosts'] = [h for h in t['hosts'] if h not in ('archive.org', 'web.archive.org')] - t['domains'] = [h for h in t['domains'] if h not in ('archive.org')] - return t -- cgit v1.2.3 From 2f8788152ff740d049d11e2e263cac978d526e2a Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 13 Feb 2020 14:22:59 -0800 Subject: release schema: do doc_value on DOIs Because DOIs are pseudo-structured (prefix, and often structure within the publisher-controlled area), I suspect we will in fact be wanting to do analytics over these strings. 
--- extra/elasticsearch/release_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index b85fc8a4..1b91696c 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -59,7 +59,7 @@ "pages": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, "first_page": { "type": "keyword", "normalizer": "default" }, "number": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, - "doi": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "doi": { "type": "keyword", "normalizer": "default" }, "doi_prefix": { "type": "keyword", "normalizer": "default" }, "doi_registrar": { "type": "keyword", "normalizer": "default" }, "pmid": { "type": "keyword", "normalizer": "default", "doc_values": false }, -- cgit v1.2.3 From ed38bfde4e1eaddd6d710802b6f372c7b0aab26b Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 14 Feb 2020 00:07:56 -0800 Subject: ES updates: fix tests to accept archive.org in host/domain --- python/tests/transform_elasticsearch.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py index c94ab375..a954fc4d 100644 --- a/python/tests/transform_elasticsearch.py +++ b/python/tests/transform_elasticsearch.py @@ -110,9 +110,10 @@ def test_elasticsearch_file_transform(matched_importer): assert 'web' in es['rels'] assert 'www.zhros.ru' in es['hosts'] assert 'zhros.ru' in es['domains'] + assert 'archive.org' in (es['hosts'] + es['domains']) + assert 'web.archive.org' in (es['hosts'] + es['domains']) + # old regression assert not '.archive.org' in (es['hosts'] + es['domains']) - assert not 'archive.org' in (es['hosts'] + es['domains']) - assert not 'web.archive.org' in (es['hosts'] + es['domains']) def test_elasticsearch_changelog_transform(matched_importer): ce = 
entity_from_json(open('./tests/files/changelog_3469683.json', 'r').read(), ChangelogEntry) -- cgit v1.2.3 From 0450f22006c9b991cdc4695458fc3b3e3e97bfbb Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 26 Feb 2020 11:22:30 -0800 Subject: ES release: last minor tweaks --- extra/elasticsearch/release_schema.json | 8 +++++--- python/fatcat_tools/transforms/elasticsearch.py | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index 1b91696c..666a672f 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -52,8 +52,8 @@ "release_stage": { "type": "keyword", "normalizer": "default" }, "withdrawn_status": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, "language": { "type": "keyword", "normalizer": "default" }, - "country": { "type": "keyword", "normalizer": "default" }, - "country_upper": { "type": "keyword", "normalizer": "caseSensitive" }, + "country_code": { "type": "keyword", "normalizer": "default" }, + "country_code_upper": { "type": "keyword", "normalizer": "caseSensitive" }, "volume": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, "issue": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, "pages": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, @@ -71,8 +71,10 @@ "jstor_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, "ark_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, "mag_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "s2_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, "license": { "type": "keyword", "normalizer": "default" }, "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "publisher_type": { "type": "keyword", "normalizer": "default" }, "container_name": { "type": 
"text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "container_id": { "type": "keyword", "normalizer": "default" }, "container_issnl": { "type": "keyword", "normalizer": "default" }, @@ -110,7 +112,7 @@ "is_retracted": { "type": "boolean" }, "preservation": { "type": "keyword", "normalizer": "default" }, - "affiliation": { "type": "alias", "path": "affiliations" }, + "affiliation": { "type": "alias", "path": "affiliations" }, "ror": { "type": "alias", "path": "affiliation_rors" }, "creator_id": { "type": "alias", "path": "creator_ids" }, "ref_release_id": { "type": "alias", "path": "ref_release_ids" }, diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index e00d7830..cbafca7e 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -156,8 +156,8 @@ def release_to_elasticsearch(entity, force_bool=True): if c_extra['szczepanski'].get('as_of'): is_oa = True if c_extra.get('country'): - t['country'] = c_extra['country'] - t['country_upper'] = c_extra['country'].upper() + t['country_code'] = c_extra['country'] + t['country_code_upper'] = c_extra['country'].upper() # fall back to release-level container metadata if container not linked or # missing context -- cgit v1.2.3 From 4e6bc246d01183f4c7ffad7d0d474e683f04c07f Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 26 Feb 2020 11:28:05 -0800 Subject: ES container last tweaks --- extra/elasticsearch/container_schema.json | 7 ++++--- python/fatcat_tools/transforms/elasticsearch.py | 3 +++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/extra/elasticsearch/container_schema.json b/extra/elasticsearch/container_schema.json index be3a408e..5cd85b04 100644 --- a/extra/elasticsearch/container_schema.json +++ b/extra/elasticsearch/container_schema.json @@ -47,11 +47,12 @@ "publisher": { "type": "text", "index": true, "analyzer": "textIcu", 
"search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "abbrev": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "aliases": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "publisher_type": { "type": "keyword", "normalizer": "default" }, "container_type": { "type": "keyword", "normalizer": "default" }, "issnl": { "type": "keyword", "normalizer": "default" }, "issns": { "type": "keyword", "normalizer": "default" }, "wikidata_qid": { "type": "keyword", "normalizer": "default" }, - "country": { "type": "keyword", "normalizer": "default" }, + "country_code": { "type": "keyword", "normalizer": "default" }, "region": { "type": "keyword", "normalizer": "default" }, "discipline": { "type": "keyword", "normalizer": "default" }, "languages": { "type": "keyword", "normalizer": "default" }, @@ -74,8 +75,8 @@ "releases_total": { "type": "integer" }, "releases_kbart": { "type": "integer" }, "releases_ia": { "type": "integer" }, - "releases_sim": { "type": "integer" }, - "releases_shadow": { "type": "integer" }, + "releases_ia_sim": { "type": "integer" }, + "releases_shadows": { "type": "integer" }, "releases_any_file": { "type": "integer" }, "releases_any_fileset": { "type": "integer" }, "releases_any_webcapture": { "type": "integer" }, diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index cbafca7e..8581febd 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -342,6 +342,9 @@ def container_to_elasticsearch(entity, force_bool=True): if entity.extra.get(key): t[key] = entity.extra[key] + if 'country' in t: + t['country_code'] = t.pop('country') + t['issns'] = [] if entity.issnl: t['issns'].append(entity.issnl) -- cgit v1.2.3 From c57a743cb8b774750c99c6f079438666a87f6476 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: 
Wed, 26 Feb 2020 11:38:51 -0800 Subject: bulk ES transform: skip non-active entities --- python/fatcat_transform.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/fatcat_transform.py b/python/fatcat_transform.py index 42d2ea99..9ddbaa4d 100755 --- a/python/fatcat_transform.py +++ b/python/fatcat_transform.py @@ -30,6 +30,8 @@ def run_elasticsearch_releases(args): if not line: continue entity = entity_from_json(line, ReleaseEntity, api_client=args.api.api_client) + if entity['state'] != 'active': + continue args.json_output.write( json.dumps(release_to_elasticsearch(entity)) + '\n') @@ -39,6 +41,8 @@ def run_elasticsearch_containers(args): if not line: continue entity = entity_from_json(line, ContainerEntity, api_client=args.api.api_client) + if entity['state'] != 'active': + continue args.json_output.write( json.dumps(container_to_elasticsearch(entity)) + '\n') @@ -48,6 +52,8 @@ def run_elasticsearch_files(args): if not line: continue entity = entity_from_json(line, FileEntity, api_client=args.api.api_client) + if entity['state'] != 'active': + continue args.json_output.write( json.dumps(file_to_elasticsearch(entity)) + '\n') @@ -66,6 +72,8 @@ def run_citeproc_releases(args): if not line: continue entity = entity_from_json(line, ReleaseEntity, api_client=args.api.api_client) + if entity['state'] != 'active': + continue csl_json = release_to_csl(entity) csl_json['id'] = "release:" + (entity.ident or "unknown") out = citeproc_csl(csl_json, args.style, args.html) -- cgit v1.2.3 From 21239503ddd71c69ddf651260f2953c93f227dfc Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 26 Feb 2020 12:16:22 -0800 Subject: fix fatcat_transform state filters --- python/fatcat_transform.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/fatcat_transform.py b/python/fatcat_transform.py index 9ddbaa4d..23a56109 100755 --- a/python/fatcat_transform.py +++ b/python/fatcat_transform.py @@ -30,7 +30,7 @@ def 
run_elasticsearch_releases(args): if not line: continue entity = entity_from_json(line, ReleaseEntity, api_client=args.api.api_client) - if entity['state'] != 'active': + if entity.state != 'active': continue args.json_output.write( json.dumps(release_to_elasticsearch(entity)) + '\n') @@ -41,7 +41,7 @@ def run_elasticsearch_containers(args): if not line: continue entity = entity_from_json(line, ContainerEntity, api_client=args.api.api_client) - if entity['state'] != 'active': + if entity.state != 'active': continue args.json_output.write( json.dumps(container_to_elasticsearch(entity)) + '\n') @@ -52,7 +52,7 @@ def run_elasticsearch_files(args): if not line: continue entity = entity_from_json(line, FileEntity, api_client=args.api.api_client) - if entity['state'] != 'active': + if entity.state != 'active': continue args.json_output.write( json.dumps(file_to_elasticsearch(entity)) + '\n') @@ -72,7 +72,7 @@ def run_citeproc_releases(args): if not line: continue entity = entity_from_json(line, ReleaseEntity, api_client=args.api.api_client) - if entity['state'] != 'active': + if entity.state != 'active': continue csl_json = release_to_csl(entity) csl_json['id'] = "release:" + (entity.ident or "unknown") -- cgit v1.2.3 From 0ab3f66664fd4cc63cf9040e351d725c6a5c22b9 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 26 Feb 2020 12:27:28 -0800 Subject: update ES transform README - smaller batch sizes to prevent esbulk errors - file transform/index --- extra/elasticsearch/README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/extra/elasticsearch/README.md b/extra/elasticsearch/README.md index 3e0857b4..df4cb918 100644 --- a/extra/elasticsearch/README.md +++ b/extra/elasticsearch/README.md @@ -59,8 +59,9 @@ Bulk insert from a file on disk: Or, in a bulk production live-stream conversion: export LC_ALL=C.UTF-8 - time zcat /srv/fatcat/snapshots/release_export_expanded.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe 
./fatcat_transform.py elasticsearch-releases - - | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat_release -type release - time zcat /srv/fatcat/snapshots/container_export.json.gz | pv -l | ./fatcat_transform.py elasticsearch-containers - - | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat_container -type container + time zcat /srv/fatcat/snapshots/release_export_expanded.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-releases - - | esbulk -verbose -size 2000 -id ident -w 8 -index fatcat_release -type release + time zcat /srv/fatcat/snapshots/container_export.json.gz | pv -l | ./fatcat_transform.py elasticsearch-containers - - | esbulk -verbose -size 2000 -id ident -w 8 -index fatcat_container -type container + time zcat /srv/fatcat/snapshots/file_export.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-files - - | esbulk -verbose -size 2000 -id ident -w 8 -index fatcat_file -type file ## Index Aliases -- cgit v1.2.3 From 81e0784813500a39955c20278140e25d7940d9c6 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 26 Feb 2020 22:04:35 -0800 Subject: improve is_oa flag accuracy Particularly, the ezb=green match seems mostly incorrect. Note that pmcid being assigned could still be in an embargo window? 
--- proposals/2020_elasticsearch_schemas.md | 4 ++-- python/fatcat_tools/transforms/elasticsearch.py | 12 ++++-------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/proposals/2020_elasticsearch_schemas.md b/proposals/2020_elasticsearch_schemas.md index 5fb28d19..c3e79073 100644 --- a/proposals/2020_elasticsearch_schemas.md +++ b/proposals/2020_elasticsearch_schemas.md @@ -33,8 +33,8 @@ status (from `in_kbart`, `in_ia`, etc) to a `preservation_status` flag which is: - `bright` -- `dark_only` -- `shadow_only` +- `dark` +- `shadows_only` - `none` Note that these don't align with OA color or work-level preservation (aka, no diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index 8581febd..87e054ec 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -149,9 +149,6 @@ def release_to_elasticsearch(entity, force_bool=True): if c_extra.get('road'): if c_extra['road'].get('as_of'): is_oa = True - if c_extra.get('ezb'): - if c_extra['ezb'].get('color') == 'green': - is_oa = True if c_extra.get('szczepanski'): if c_extra['szczepanski'].get('as_of'): is_oa = True @@ -210,6 +207,8 @@ def release_to_elasticsearch(entity, force_bool=True): # TODO: more/better checks here, particularly strict *not* OA licenses if release.license_slug.startswith("CC-"): is_oa = True + if release.license_slug.startswith("ARXIV-"): + is_oa = True extra = release.extra or dict() if extra: @@ -293,10 +292,10 @@ def release_to_elasticsearch(entity, force_bool=True): t['in_ia'] = bool(in_ia) t['is_preserved'] = bool(is_preserved or in_ia or in_kbart or in_jstor) - if in_ia: + if in_ia or t.get('pmcid') or t.get('arxiv_id'): t['preservation'] = 'bright' elif in_kbart or in_jstor: - t['preservation'] = 'dark_only' + t['preservation'] = 'dark' elif in_shadows: t['preservation'] = 'shadows_only' else: @@ -367,9 +366,6 @@ def container_to_elasticsearch(entity, 
force_bool=True): if extra.get('road'): if extra['road'].get('as_of'): in_road = True - if extra.get('ezb'): - if extra['ezb'].get('color') == 'green': - is_oa = True if extra.get('szczepanski'): if extra['szczepanski'].get('as_of'): is_oa = True -- cgit v1.2.3