From f6f7450903bdbe36bd5fff146b942e34ad221557 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 25 Jan 2019 18:41:33 -0800 Subject: transform and import fixes/tweaks --- extra/elasticsearch/README.md | 16 ++-- extra/elasticsearch/changelog_schema.json | 5 +- extra/elasticsearch/fatcat_schema.json | 109 ++++++++++++++++++++++ python/fatcat_export.py | 45 ++++++++- python/fatcat_tools/__init__.py | 4 +- python/fatcat_tools/importers/common.py | 4 +- python/fatcat_tools/importers/journal_metadata.py | 10 +- python/fatcat_tools/transforms.py | 51 +++++++--- 8 files changed, 214 insertions(+), 30 deletions(-) create mode 100644 extra/elasticsearch/fatcat_schema.json diff --git a/extra/elasticsearch/README.md b/extra/elasticsearch/README.md index 761ad6ab..691c6ed5 100644 --- a/extra/elasticsearch/README.md +++ b/extra/elasticsearch/README.md @@ -38,13 +38,17 @@ There is a Dockerfile in this directory which includes this installation. Drop and rebuild the schema: - http delete :9200/fatcat - http put :9200/fatcat < release_schema.json + http delete :9200/fatcat_release + http delete :9200/fatcat_container + http delete :9200/fatcat_changelog + http put :9200/fatcat_release < release_schema.json + http put :9200/fatcat_container < container_schema.json + http put :9200/fatcat_changelog < changelog_schema.json Put a single object (good for debugging): - head -n1 examples.json | http post :9200/fatcat/release/0 - http get :9200/fatcat/release/0 + head -n1 examples.json | http post :9200/fatcat_release/release/0 + http get :9200/fatcat_release/release/0 Bulk insert from a file on disk: @@ -53,14 +57,14 @@ Bulk insert from a file on disk: Or, in a bulk production live-stream conversion: export LC_ALL=C.UTF-8 - time zcat /srv/fatcat/snapshots/fatcat_release_dump_expanded.json.gz | ./fatcat_export.py transform-releases - - | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat -type release + time zcat /srv/fatcat_release/snapshots/fatcat_release_dump_expanded.json.gz 
| ./fatcat_export.py transform-releases - - | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat_release -type release ## Full-Text Querying A generic full-text "query string" query look like this (replace "blood" with actual query string, and "size" field with the max results to return): - GET /fatcat/release/_search + GET /fatcat_release/release/_search { "query": { "query_string": { diff --git a/extra/elasticsearch/changelog_schema.json b/extra/elasticsearch/changelog_schema.json index 7a7ec90c..f3211e99 100644 --- a/extra/elasticsearch/changelog_schema.json +++ b/extra/elasticsearch/changelog_schema.json @@ -16,8 +16,9 @@ "changelog": { "properties": { "index": { "type": "integer" }, - "editgorup_id": { "type": "keyword" }, - "timestamp": { "type": "datetime" }, + "editgroup_id": { "type": "keyword" }, + "timestamp": { "type": "date" }, + "editor_id": { "type": "keyword" }, "username": { "type": "keyword" }, "is_bot": { "type": "boolean" }, "is_admin": { "type": "boolean" }, diff --git a/extra/elasticsearch/fatcat_schema.json b/extra/elasticsearch/fatcat_schema.json new file mode 100644 index 00000000..05583330 --- /dev/null +++ b/extra/elasticsearch/fatcat_schema.json @@ -0,0 +1,109 @@ +{ +"settings": { + "index": { + "analysis": { + "analyzer": { + "default": { + "type": "custom", + "tokenizer": "standard", + "filter": [ "lowercase", "asciifolding" ] + }, + "textIcu": { + "type": "custom", + "tokenizer": "icu_tokenizer", + "char_filter": [ "icu_normalizer" ], + "filter": [ "icu_folding" ] + }, + "textIcuSearch": { + "type": "custom", + "tokenizer": "icu_tokenizer", + "char_filter": [ "icu_normalizer" ], + "filter": [ "icu_folding" ] + } + } + } + } +}, +"mappings": { + "release": { + "properties": { + "ident": { "type": "keyword" }, + "state": { "type": "keyword" }, + "revision": { "type": "keyword" }, + "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "original_title": { "type": "text", "index": true, 
"analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "release_date": { "type": "date" }, + "release_year": { "type": "integer" }, + "release_type": { "type": "keyword" }, + "release_status": { "type": "keyword" }, + "language": { "type": "keyword" }, + "doi": { "type": "keyword" }, + "pmid": { "type": "keyword" }, + "pmcid": { "type": "keyword" }, + "isbn13": { "type": "keyword" }, + "wikidata_qid": { "type": "keyword" }, + "core_id": { "type": "keyword" }, + "arxiv_id": { "type": "keyword" }, + "jstor_id": { "type": "keyword" }, + "license": { "type": "keyword" }, + "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "container_issnl": { "type": "keyword" }, + "container_type": { "type": "keyword" }, + "contrib_count": { "type": "integer" }, + "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "ref_count": { "type": "integer" }, + "file_count": { "type": "integer" }, + "fileset_count": { "type": "integer" }, + "webcapture_count": { "type": "integer" }, + "any_abstract": { "type": "boolean" }, + + "best_pdf_url": { "type": "keyword" }, + "ia_pdf_url": { "type": "keyword" }, + "is_oa": { "type": "boolean" }, + "is_longtail_oa": { "type": "boolean" }, + "is_preserved": { "type": "boolean" }, + "in_kbart": { "type": "boolean" }, + "in_jstor": { "type": "boolean" }, + "in_dweb": { "type": "boolean" }, + "in_web": { "type": "boolean" }, + "in_ia": { "type": "boolean" }, + "in_ia_sim": { "type": "boolean" }, + "in_shadows": { "type": "boolean" }, + + "author": { "type": "alias", "path": "contrib_names" }, + "journal": { "type": "alias", "path": "container_name" }, + "date": { "type": "alias", "path": "release_date" }, + "year": { "type": "alias", "path": "release_year" }, + "issn": { "type": "alias", "path": "container_issnl" }, + "oa": 
{ "type": "alias", "path": "is_oa" }, + "longtail": { "type": "alias", "path": "is_longtail_oa" }, + "lang": { "type": "alias", "path": "language" }, + "file_pdf_url": { "type": "alias", "path": "best_pdf_url" }, + "is_kept": { "type": "alias", "path": "in_kbart" } + } + }, + "changelog": { + "properties": { + "index": { "type": "integer" }, + "editgroup_id": { "type": "keyword" }, + "timestamp": { "type": "date" }, + "username": { "type": "keyword" }, + "is_bot": { "type": "boolean" }, + "is_admin": { "type": "boolean" }, + "agent": { "type": "keyword" }, + "containers": { "type": "integer" }, + "creators": { "type": "integer" }, + "files": { "type": "integer" }, + "filesets": { "type": "integer" }, + "webcaptures": { "type": "integer" }, + "releases": { "type": "integer" }, + "works": { "type": "integer" }, + "created": { "type": "integer" }, + "updated": { "type": "integer" }, + "deleted": { "type": "integer" }, + "total": { "type": "integer" } + } + } +} +} diff --git a/python/fatcat_export.py b/python/fatcat_export.py index cf8bf1c3..027d6c0a 100755 --- a/python/fatcat_export.py +++ b/python/fatcat_export.py @@ -12,9 +12,10 @@ import json import argparse import fatcat_client from fatcat_client.rest import ApiException -from fatcat_client import ReleaseEntity +from fatcat_client import ReleaseEntity, ContainerEntity, ChangelogEntry from fatcat_tools import uuid2fcid, entity_from_json, entity_to_dict, \ - release_to_elasticsearch, public_api + release_to_elasticsearch, container_to_elasticsearch, \ + changelog_to_elasticsearch, public_api def run_export_releases(args): @@ -30,9 +31,27 @@ def run_transform_releases(args): line = line.strip() if not line: continue - release = entity_from_json(line, ReleaseEntity) + entity = entity_from_json(line, ReleaseEntity) args.json_output.write( - json.dumps(release_to_elasticsearch(release)) + '\n') + json.dumps(release_to_elasticsearch(entity)) + '\n') + +def run_transform_containers(args): + for line in args.json_input: 
+ line = line.strip() + if not line: + continue + entity = entity_from_json(line, ContainerEntity) + args.json_output.write( + json.dumps(container_to_elasticsearch(entity)) + '\n') + +def run_transform_changelogs(args): + for line in args.json_input: + line = line.strip() + if not line: + continue + entity = entity_from_json(line, ChangelogEntry) + args.json_output.write( + json.dumps(changelog_to_elasticsearch(entity)) + '\n') def run_export_changelog(args): api = args.api @@ -74,6 +93,24 @@ def main(): help="where to send output", default=sys.stdout, type=argparse.FileType('w')) + sub_transform_containers = subparsers.add_parser('transform-containers') + sub_transform_containers.set_defaults(func=run_transform_containers) + sub_transform_containers.add_argument('json_input', + help="JSON-per-line of container entities", + default=sys.stdin, type=argparse.FileType('r')) + sub_transform_containers.add_argument('json_output', + help="where to send output", + default=sys.stdout, type=argparse.FileType('w')) + + sub_transform_changelogs = subparsers.add_parser('transform-changelogs') + sub_transform_changelogs.set_defaults(func=run_transform_changelogs) + sub_transform_changelogs.add_argument('json_input', + help="JSON-per-line of changelog entries", + default=sys.stdin, type=argparse.FileType('r')) + sub_transform_changelogs.add_argument('json_output', + help="where to send output", + default=sys.stdout, type=argparse.FileType('w')) + sub_changelog = subparsers.add_parser('changelog') sub_changelog.set_defaults(func=run_export_changelog) sub_changelog.add_argument('--start', diff --git a/python/fatcat_tools/__init__.py b/python/fatcat_tools/__init__.py index e2b1e3a2..64c45062 100644 --- a/python/fatcat_tools/__init__.py +++ b/python/fatcat_tools/__init__.py @@ -1,4 +1,6 @@ from .api_auth import authenticated_api, public_api from .fcid import fcid2uuid, uuid2fcid -from .transforms import entity_to_dict, entity_from_json, release_to_elasticsearch +from .transforms 
import entity_to_dict, entity_from_json, \ + release_to_elasticsearch, container_to_elasticsearch, \ + changelog_to_elasticsearch diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index ebdce56f..a29b3019 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -236,8 +236,8 @@ class EntityImporter: self._entity_queue.append(entity) if len(self._entity_queue) >= self.edit_batch_size: self.insert_batch(self._entity_queue) - self.counts['insert'] += len(_entity_queue) - self._entity_queue = 0 + self.counts['insert'] += len(self._entity_queue) + self._entity_queue = [] def want(self, raw_record): """ diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py index 7f6b1ee8..be62d63a 100644 --- a/python/fatcat_tools/importers/journal_metadata.py +++ b/python/fatcat_tools/importers/journal_metadata.py @@ -44,7 +44,7 @@ class JournalMetadataImporter(EntityImporter): editgroup_extra=eg_extra) def want(self, raw_record): - if raw_record.get('issnl'): + if raw_record.get('issnl') and raw_record.get('name'): return True return False @@ -55,6 +55,10 @@ class JournalMetadataImporter(EntityImporter): returns a ContainerEntity (or None if invalid or couldn't parse) """ + if not row.get('name'): + # Name is required (by schema) + return None + extra = dict() for key in ('issne', 'issnp', 'languages', 'country', 'urls', 'abbrev', 'coden', 'aliases', 'original_name', 'first_year', 'last_year', @@ -76,8 +80,10 @@ class JournalMetadataImporter(EntityImporter): extra_ia = dict() # TODO: would like an ia.longtail_ia flag if row.get('sim'): + # NB: None case of the .get() here is blech, but otherwise + # extra['ia'].get('sim') would be false-y, breaking 'any_ia_sim' later on extra_ia['sim'] = { - 'year_spans': row['sim']['year_spans'], + 'year_spans': row['sim'].get('year_spans'), } if extra_ia: extra['ia'] = extra_ia diff --git 
a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms.py index a85c877c..7bb75c3e 100644 --- a/python/fatcat_tools/transforms.py +++ b/python/fatcat_tools/transforms.py @@ -231,20 +231,12 @@ def container_to_elasticsearch(entity): container_type = entity.container_type, issnl = entity.issnl, wikidata_qid = entity.wikidata_qid, - - entity_status = entity.entity_status, - language = entity.language, - license = entity.license_slug, - doi = entity.doi, - pmid = entity.pmid, - isbn13 = entity.isbn13, - core_id = entity.core_id, - arxiv_id = entity.core_id, - jstor_id = entity.jstor_id, ) # TODO: region, discipline # TODO: single primary language? + if not entity.extra: + entity.extra = dict() for key in ('country', 'languages', 'mimetypes', 'first_year', 'last_year'): if entity.extra.get(key): t[key] = entity.extra[key] @@ -285,13 +277,46 @@ def container_to_elasticsearch(entity): if extra['ia'].get('sim'): any_ia_sim = True - t['in_doaj'] = is_doaj - t['in_road'] = is_road + t['in_doaj'] = in_doaj + t['in_road'] = in_road t['in_doi'] = in_doi t['in_sherpa_romeo'] = in_sherpa_romeo - t['is_oa'] = in_doaj or in_road or is_longtail_oa or ia_oa + t['is_oa'] = in_doaj or in_road or is_longtail_oa or is_oa t['is_longtail_oa'] = is_longtail_oa t['any_kbart'] = any_ia_sim t['any_jstor'] = any_ia_sim t['any_ia_sim'] = bool(any_ia_sim) return t + + +def changelog_to_elasticsearch(entity): + """Flatten a changelog entry and its editgroup into a dict, with one edit count per entity type.""" + editgroup = entity.editgroup + t = dict( + index=entity.index, + editgroup_id=entity.editgroup_id, + timestamp=entity.timestamp, + editor_id=editgroup.editor_id, + ) + + extra = editgroup.extra or dict() + if extra.get('agent'): + t['agent'] = extra['agent'] + + t['containers'] = len(editgroup.edits.containers) + t['creators'] = len(editgroup.edits.creators) + t['files'] = len(editgroup.edits.files) + t['filesets'] = len(editgroup.edits.filesets) + t['webcaptures'] = len(editgroup.edits.webcaptures) + t['releases'] = len(editgroup.edits.releases) + t['works'] = 
len(editgroup.edits.works) + + # TODO: parse and pull out counts + #created = 0 + #updated = 0 + #deleted = 0 + #t['created'] = created + #t['updated'] = updated + #t['deleted'] = deleted + #t['total'] = created + updated + deleted + return t -- cgit v1.2.3