summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2019-01-25 18:41:33 -0800
committerBryan Newbold <bnewbold@robocracy.org>2019-01-25 18:41:33 -0800
commitf6f7450903bdbe36bd5fff146b942e34ad221557 (patch)
treec50332c832f414b5c0070e58a42ceb4751ed4d81
parent16256f8ed119c072c09b13b0b1a6d4a56bed5113 (diff)
downloadfatcat-f6f7450903bdbe36bd5fff146b942e34ad221557.tar.gz
fatcat-f6f7450903bdbe36bd5fff146b942e34ad221557.zip
transform and import fixes/tweaks
-rw-r--r--extra/elasticsearch/README.md16
-rw-r--r--extra/elasticsearch/changelog_schema.json5
-rw-r--r--extra/elasticsearch/fatcat_schema.json109
-rwxr-xr-xpython/fatcat_export.py45
-rw-r--r--python/fatcat_tools/__init__.py4
-rw-r--r--python/fatcat_tools/importers/common.py4
-rw-r--r--python/fatcat_tools/importers/journal_metadata.py10
-rw-r--r--python/fatcat_tools/transforms.py51
8 files changed, 214 insertions, 30 deletions
diff --git a/extra/elasticsearch/README.md b/extra/elasticsearch/README.md
index 761ad6ab..691c6ed5 100644
--- a/extra/elasticsearch/README.md
+++ b/extra/elasticsearch/README.md
@@ -38,13 +38,17 @@ There is a Dockerfile in this directory which includes this installation.
Drop and rebuild the schema:
- http delete :9200/fatcat
- http put :9200/fatcat < release_schema.json
+ http delete :9200/fatcat_release
+ http delete :9200/fatcat_container
+ http delete :9200/fatcat_changelog
+ http put :9200/fatcat_release < release_schema.json
+ http put :9200/fatcat_container < container_schema.json
+ http put :9200/fatcat_changelog < changelog_schema.json
Put a single object (good for debugging):
- head -n1 examples.json | http post :9200/fatcat/release/0
- http get :9200/fatcat/release/0
+ head -n1 examples.json | http post :9200/fatcat_release/release/0
+ http get :9200/fatcat_release/release/0
Bulk insert from a file on disk:
@@ -53,14 +57,14 @@ Bulk insert from a file on disk:
Or, in a bulk production live-stream conversion:
export LC_ALL=C.UTF-8
- time zcat /srv/fatcat/snapshots/fatcat_release_dump_expanded.json.gz | ./fatcat_export.py transform-releases - - | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat -type release
+ time zcat /srv/fatcat/snapshots/fatcat_release_dump_expanded.json.gz | ./fatcat_export.py transform-releases - - | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat_release -type release
## Full-Text Querying
A generic full-text "query string" query looks like this (replace "blood" with
the actual query string, and the "size" field with the max results to return):
- GET /fatcat/release/_search
+ GET /fatcat_release/release/_search
{
"query": {
"query_string": {
diff --git a/extra/elasticsearch/changelog_schema.json b/extra/elasticsearch/changelog_schema.json
index 7a7ec90c..f3211e99 100644
--- a/extra/elasticsearch/changelog_schema.json
+++ b/extra/elasticsearch/changelog_schema.json
@@ -16,8 +16,9 @@
"changelog": {
"properties": {
"index": { "type": "integer" },
- "editgorup_id": { "type": "keyword" },
- "timestamp": { "type": "datetime" },
+ "editgroup_id": { "type": "keyword" },
+ "timestamp": { "type": "date" },
+ "editor_id": { "type": "keyword" },
"username": { "type": "keyword" },
"is_bot": { "type": "boolean" },
"is_admin": { "type": "boolean" },
diff --git a/extra/elasticsearch/fatcat_schema.json b/extra/elasticsearch/fatcat_schema.json
new file mode 100644
index 00000000..05583330
--- /dev/null
+++ b/extra/elasticsearch/fatcat_schema.json
@@ -0,0 +1,109 @@
+{
+"settings": {
+ "index": {
+ "analysis": {
+ "analyzer": {
+ "default": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [ "lowercase", "asciifolding" ]
+ },
+ "textIcu": {
+ "type": "custom",
+ "tokenizer": "icu_tokenizer",
+ "char_filter": [ "icu_normalizer" ],
+ "filter": [ "icu_folding" ]
+ },
+ "textIcuSearch": {
+ "type": "custom",
+ "tokenizer": "icu_tokenizer",
+ "char_filter": [ "icu_normalizer" ],
+ "filter": [ "icu_folding" ]
+ }
+ }
+ }
+ }
+},
+"mappings": {
+ "release": {
+ "properties": {
+ "ident": { "type": "keyword" },
+ "state": { "type": "keyword" },
+ "revision": { "type": "keyword" },
+ "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "release_date": { "type": "date" },
+ "release_year": { "type": "integer" },
+ "release_type": { "type": "keyword" },
+ "release_status": { "type": "keyword" },
+ "language": { "type": "keyword" },
+ "doi": { "type": "keyword" },
+ "pmid": { "type": "keyword" },
+ "pmcid": { "type": "keyword" },
+ "isbn13": { "type": "keyword" },
+ "wikidata_qid": { "type": "keyword" },
+ "core_id": { "type": "keyword" },
+ "arxiv_id": { "type": "keyword" },
+ "jstor_id": { "type": "keyword" },
+ "license": { "type": "keyword" },
+ "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "container_issnl": { "type": "keyword" },
+ "container_type": { "type": "keyword" },
+ "contrib_count": { "type": "integer" },
+ "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "ref_count": { "type": "integer" },
+ "file_count": { "type": "integer" },
+ "fileset_count": { "type": "integer" },
+ "webcapture_count": { "type": "integer" },
+ "any_abstract": { "type": "boolean" },
+
+ "best_pdf_url": { "type": "keyword" },
+ "ia_pdf_url": { "type": "keyword" },
+ "is_oa": { "type": "boolean" },
+ "is_longtail_oa": { "type": "boolean" },
+ "is_preserved": { "type": "boolean" },
+ "in_kbart": { "type": "boolean" },
+ "in_jstor": { "type": "boolean" },
+ "in_dweb": { "type": "boolean" },
+ "in_web": { "type": "boolean" },
+ "in_ia": { "type": "boolean" },
+ "in_ia_sim": { "type": "boolean" },
+ "in_shadows": { "type": "boolean" },
+
+ "author": { "type": "alias", "path": "contrib_names" },
+ "journal": { "type": "alias", "path": "container_name" },
+ "date": { "type": "alias", "path": "release_date" },
+ "year": { "type": "alias", "path": "release_year" },
+ "issn": { "type": "alias", "path": "container_issnl" },
+ "oa": { "type": "alias", "path": "is_oa" },
+ "longtail": { "type": "alias", "path": "is_longtail_oa" },
+ "lang": { "type": "alias", "path": "language" },
+ "file_pdf_url": { "type": "alias", "path": "best_pdf_url" },
+ "is_kept": { "type": "alias", "path": "in_kbart" }
+ }
+ },
+ "changelog": {
+ "properties": {
+ "index": { "type": "integer" },
+ "editgroup_id": { "type": "keyword" },
+ "timestamp": { "type": "date" },
+ "username": { "type": "keyword" },
+ "is_bot": { "type": "boolean" },
+ "is_admin": { "type": "boolean" },
+ "agent": { "type": "keyword" },
+ "containers": { "type": "integer" },
+ "creators": { "type": "integer" },
+ "files": { "type": "integer" },
+ "filesets": { "type": "integer" },
+ "webcaptures": { "type": "integer" },
+ "releases": { "type": "integer" },
+ "works": { "type": "integer" },
+ "created": { "type": "integer" },
+ "updated": { "type": "integer" },
+ "deleted": { "type": "integer" },
+ "total": { "type": "integer" }
+ }
+ }
+}
+}
diff --git a/python/fatcat_export.py b/python/fatcat_export.py
index cf8bf1c3..027d6c0a 100755
--- a/python/fatcat_export.py
+++ b/python/fatcat_export.py
@@ -12,9 +12,10 @@ import json
import argparse
import fatcat_client
from fatcat_client.rest import ApiException
-from fatcat_client import ReleaseEntity
+from fatcat_client import ReleaseEntity, ContainerEntity, ChangelogEntry
from fatcat_tools import uuid2fcid, entity_from_json, entity_to_dict, \
- release_to_elasticsearch, public_api
+ release_to_elasticsearch, container_to_elasticsearch, \
+ changelog_to_elasticsearch, public_api
def run_export_releases(args):
@@ -30,9 +31,27 @@ def run_transform_releases(args):
line = line.strip()
if not line:
continue
- release = entity_from_json(line, ReleaseEntity)
+ entity = entity_from_json(line, ReleaseEntity)
args.json_output.write(
- json.dumps(release_to_elasticsearch(release)) + '\n')
+ json.dumps(release_to_elasticsearch(entity)) + '\n')
+
+def run_transform_containers(args):
+    """
+    Transforms container entities into elasticsearch documents.
+
+    Reads one JSON object per line from args.json_input (blank lines are
+    skipped) and writes one JSON document per line to args.json_output.
+    """
+    for raw in args.json_input:
+        raw = raw.strip()
+        if raw:
+            container = entity_from_json(raw, ContainerEntity)
+            doc = container_to_elasticsearch(container)
+            args.json_output.write(json.dumps(doc) + '\n')
+
+def run_transform_changelogs(args):
+    """
+    Transforms changelog entries into elasticsearch documents.
+
+    Reads one JSON object per line from args.json_input (blank lines are
+    skipped) and writes one JSON document per line to args.json_output.
+    """
+    for raw in args.json_input:
+        raw = raw.strip()
+        if raw:
+            entry = entity_from_json(raw, ChangelogEntry)
+            doc = changelog_to_elasticsearch(entry)
+            args.json_output.write(json.dumps(doc) + '\n')
def run_export_changelog(args):
api = args.api
@@ -74,6 +93,24 @@ def main():
help="where to send output",
default=sys.stdout, type=argparse.FileType('w'))
+ sub_transform_containers = subparsers.add_parser('transform-containers')
+ sub_transform_containers.set_defaults(func=run_transform_containers)
+ sub_transform_containers.add_argument('json_input',
+ help="JSON-per-line of container entities",
+ default=sys.stdin, type=argparse.FileType('r'))
+ sub_transform_containers.add_argument('json_output',
+ help="where to send output",
+ default=sys.stdout, type=argparse.FileType('w'))
+
+ sub_transform_changelogs = subparsers.add_parser('transform-changelogs')
+ sub_transform_changelogs.set_defaults(func=run_transform_changelogs)
+ sub_transform_changelogs.add_argument('json_input',
+ help="JSON-per-line of changelog entries",
+ default=sys.stdin, type=argparse.FileType('r'))
+ sub_transform_changelogs.add_argument('json_output',
+ help="where to send output",
+ default=sys.stdout, type=argparse.FileType('w'))
+
sub_changelog = subparsers.add_parser('changelog')
sub_changelog.set_defaults(func=run_export_changelog)
sub_changelog.add_argument('--start',
diff --git a/python/fatcat_tools/__init__.py b/python/fatcat_tools/__init__.py
index e2b1e3a2..64c45062 100644
--- a/python/fatcat_tools/__init__.py
+++ b/python/fatcat_tools/__init__.py
@@ -1,4 +1,6 @@
from .api_auth import authenticated_api, public_api
from .fcid import fcid2uuid, uuid2fcid
-from .transforms import entity_to_dict, entity_from_json, release_to_elasticsearch
+from .transforms import entity_to_dict, entity_from_json, \
+ release_to_elasticsearch, container_to_elasticsearch, \
+ changelog_to_elasticsearch
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index ebdce56f..a29b3019 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -236,8 +236,8 @@ class EntityImporter:
self._entity_queue.append(entity)
if len(self._entity_queue) >= self.edit_batch_size:
self.insert_batch(self._entity_queue)
- self.counts['insert'] += len(_entity_queue)
- self._entity_queue = 0
+ self.counts['insert'] += len(self._entity_queue)
+ self._entity_queue = []
def want(self, raw_record):
"""
diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py
index 7f6b1ee8..be62d63a 100644
--- a/python/fatcat_tools/importers/journal_metadata.py
+++ b/python/fatcat_tools/importers/journal_metadata.py
@@ -44,7 +44,7 @@ class JournalMetadataImporter(EntityImporter):
editgroup_extra=eg_extra)
def want(self, raw_record):
- if raw_record.get('issnl'):
+ if raw_record.get('issnl') and raw_record.get('name'):
return True
return False
@@ -55,6 +55,10 @@ class JournalMetadataImporter(EntityImporter):
returns a ContainerEntity (or None if invalid or couldn't parse)
"""
+ if not row.get('name'):
+ # Name is required (by schema)
+ return None
+
extra = dict()
for key in ('issne', 'issnp', 'languages', 'country', 'urls', 'abbrev',
'coden', 'aliases', 'original_name', 'first_year', 'last_year',
@@ -76,8 +80,10 @@ class JournalMetadataImporter(EntityImporter):
extra_ia = dict()
# TODO: would like an ia.longtail_ia flag
if row.get('sim'):
+ # NB: the None case of the .get() here is blech, but otherwise
+ # extra['ia'].get('sim') would be false-y, breaking 'any_ia_sim' later on
extra_ia['sim'] = {
- 'year_spans': row['sim']['year_spans'],
+ 'year_spans': row['sim'].get('year_spans'),
}
if extra_ia:
extra['ia'] = extra_ia
diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms.py
index a85c877c..7bb75c3e 100644
--- a/python/fatcat_tools/transforms.py
+++ b/python/fatcat_tools/transforms.py
@@ -231,20 +231,12 @@ def container_to_elasticsearch(entity):
container_type = entity.container_type,
issnl = entity.issnl,
wikidata_qid = entity.wikidata_qid,
-
- entity_status = entity.entity_status,
- language = entity.language,
- license = entity.license_slug,
- doi = entity.doi,
- pmid = entity.pmid,
- isbn13 = entity.isbn13,
- core_id = entity.core_id,
- arxiv_id = entity.core_id,
- jstor_id = entity.jstor_id,
)
# TODO: region, discipline
# TODO: single primary language?
+ if not entity.extra:
+ entity.extra = dict()
for key in ('country', 'languages', 'mimetypes', 'first_year', 'last_year'):
if entity.extra.get(key):
t[key] = entity.extra[key]
@@ -285,13 +277,46 @@ def container_to_elasticsearch(entity):
if extra['ia'].get('sim'):
any_ia_sim = True
- t['in_doaj'] = is_doaj
- t['in_road'] = is_road
+ t['in_doaj'] = in_doaj
+ t['in_road'] = in_road
t['in_doi'] = in_doi
t['in_sherpa_romeo'] = in_sherpa_romeo
- t['is_oa'] = in_doaj or in_road or is_longtail_oa or ia_oa
+ t['is_oa'] = in_doaj or in_road or is_longtail_oa or is_oa
t['is_longtail_oa'] = is_longtail_oa
t['any_kbart'] = any_ia_sim
t['any_jstor'] = any_ia_sim
t['any_ia_sim'] = bool(any_ia_sim)
return t
+
+
+def changelog_to_elasticsearch(entity):
+    """
+    Converts a changelog entry (with its editgroup attached) into a flat dict
+    suitable for indexing into elasticsearch.
+
+    entity: changelog entry object; this function reads .index,
+        .editgroup_id, .timestamp and .editgroup (.editor_id, .extra, .edits)
+
+    Returns a dict with 'index', 'editgroup_id', 'timestamp', 'editor_id',
+    'agent' (only when set in the editgroup's extra metadata), and one edit
+    count per entity type.
+    """
+
+    editgroup = entity.editgroup
+    # NOTE(review): callers pass this dict straight to json.dumps(); if
+    # timestamp is a datetime rather than a string it needs converting first
+    # -- confirm
+    t = dict(
+        index=entity.index,
+        editgroup_id=entity.editgroup_id,
+        timestamp=entity.timestamp,
+        editor_id=editgroup.editor_id,
+    )
+
+    extra = editgroup.extra or dict()
+    if extra.get('agent'):
+        t['agent'] = extra['agent']
+
+    # Count edits separately for each entity type; every count must read its
+    # own list, not the containers list.
+    # NOTE(review): assumes editgroup.edits exposes one list per entity type
+    # under exactly these attribute names -- confirm against fatcat_client
+    for entity_type in ('containers', 'creators', 'files', 'filesets',
+                        'webcaptures', 'releases', 'works'):
+        t[entity_type] = len(getattr(editgroup.edits, entity_type))
+
+    # TODO: parse and pull out counts
+    #created = 0
+    #updated = 0
+    #deleted = 0
+    #t['created'] = created
+    #t['updated'] = updated
+    #t['deleted'] = deleted
+    #t['total'] = created + updated + deleted
+    return t