diff options
-rw-r--r-- | extra/elasticsearch/README.md | 16
-rw-r--r-- | extra/elasticsearch/changelog_schema.json | 5
-rw-r--r-- | extra/elasticsearch/fatcat_schema.json | 109
-rwxr-xr-x | python/fatcat_export.py | 45
-rw-r--r-- | python/fatcat_tools/__init__.py | 4
-rw-r--r-- | python/fatcat_tools/importers/common.py | 4
-rw-r--r-- | python/fatcat_tools/importers/journal_metadata.py | 10
-rw-r--r-- | python/fatcat_tools/transforms.py | 51
8 files changed, 214 insertions, 30 deletions
diff --git a/extra/elasticsearch/README.md b/extra/elasticsearch/README.md index 761ad6ab..691c6ed5 100644 --- a/extra/elasticsearch/README.md +++ b/extra/elasticsearch/README.md @@ -38,13 +38,17 @@ There is a Dockerfile in this directory which includes this installation. Drop and rebuild the schema: - http delete :9200/fatcat - http put :9200/fatcat < release_schema.json + http delete :9200/fatcat_release + http delete :9200/fatcat_container + http delete :9200/fatcat_changelog + http put :9200/fatcat_release < release_schema.json + http put :9200/fatcat_container < container_schema.json + http put :9200/fatcat_changelog < changelog_schema.json Put a single object (good for debugging): - head -n1 examples.json | http post :9200/fatcat/release/0 - http get :9200/fatcat/release/0 + head -n1 examples.json | http post :9200/fatcat_release/release/0 + http get :9200/fatcat_release/release/0 Bulk insert from a file on disk: @@ -53,14 +57,14 @@ Bulk insert from a file on disk: Or, in a bulk production live-stream conversion: export LC_ALL=C.UTF-8 - time zcat /srv/fatcat/snapshots/fatcat_release_dump_expanded.json.gz | ./fatcat_export.py transform-releases - - | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat -type release + time zcat /srv/fatcat/snapshots/fatcat_release_dump_expanded.json.gz | ./fatcat_export.py transform-releases - - | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat_release -type release ## Full-Text Querying A generic full-text "query string" query look like this (replace "blood" with actual query string, and "size" field with the max results to return): - GET /fatcat/release/_search + GET /fatcat_release/release/_search { "query": { "query_string": { diff --git a/extra/elasticsearch/changelog_schema.json b/extra/elasticsearch/changelog_schema.json index 7a7ec90c..f3211e99 100644 --- a/extra/elasticsearch/changelog_schema.json +++ b/extra/elasticsearch/changelog_schema.json @@ -16,8 +16,9 @@ "changelog": { "properties": { "index": { 
"type": "integer" }, - "editgorup_id": { "type": "keyword" }, - "timestamp": { "type": "datetime" }, + "editgroup_id": { "type": "keyword" }, + "timestamp": { "type": "date" }, + "editor_id": { "type": "keyword" }, "username": { "type": "keyword" }, "is_bot": { "type": "boolean" }, "is_admin": { "type": "boolean" }, diff --git a/extra/elasticsearch/fatcat_schema.json b/extra/elasticsearch/fatcat_schema.json new file mode 100644 index 00000000..05583330 --- /dev/null +++ b/extra/elasticsearch/fatcat_schema.json @@ -0,0 +1,109 @@ +{ +"settings": { + "index": { + "analysis": { + "analyzer": { + "default": { + "type": "custom", + "tokenizer": "standard", + "filter": [ "lowercase", "asciifolding" ] + }, + "textIcu": { + "type": "custom", + "tokenizer": "icu_tokenizer", + "char_filter": [ "icu_normalizer" ], + "filter": [ "icu_folding" ] + }, + "textIcuSearch": { + "type": "custom", + "tokenizer": "icu_tokenizer", + "char_filter": [ "icu_normalizer" ], + "filter": [ "icu_folding" ] + } + } + } + } +}, +"mappings": { + "release": { + "properties": { + "ident": { "type": "keyword" }, + "state": { "type": "keyword" }, + "revision": { "type": "keyword" }, + "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "release_date": { "type": "date" }, + "release_year": { "type": "integer" }, + "release_type": { "type": "keyword" }, + "release_status": { "type": "keyword" }, + "language": { "type": "keyword" }, + "doi": { "type": "keyword" }, + "pmid": { "type": "keyword" }, + "pmcid": { "type": "keyword" }, + "isbn13": { "type": "keyword" }, + "wikidata_qid": { "type": "keyword" }, + "core_id": { "type": "keyword" }, + "axiv_id": { "type": "keyword" }, + "jstor_id": { "type": "keyword" }, + "license": { "type": "keyword" }, + "publisher": { "type": "text", "index": true, "analyzer": "textIcu", 
"search_analyzer":"textIcuSearch" }, + "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "container_issnl": { "type": "keyword" }, + "container_type": { "type": "keyword" }, + "contrib_count": { "type": "integer" }, + "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "ref_count": { "type": "integer" }, + "file_count": { "type": "integer" }, + "fileset_count": { "type": "integer" }, + "webcapture_count": { "type": "integer" }, + "any_abstract": { "type": "boolean" }, + + "best_pdf_url": { "type": "keyword" }, + "ia_pdf_url": { "type": "keyword" }, + "is_oa": { "type": "boolean" }, + "is_longtail_oa": { "type": "boolean" }, + "is_preserved": { "type": "boolean" }, + "in_kbart": { "type": "boolean" }, + "in_jstor": { "type": "boolean" }, + "in_dweb": { "type": "boolean" }, + "in_web": { "type": "boolean" }, + "in_ia": { "type": "boolean" }, + "in_ia_sim": { "type": "boolean" }, + "in_shadows": { "type": "boolean" }, + + "author": { "type": "alias", "path": "contrib_names" }, + "journal": { "type": "alias", "path": "container_name" }, + "date": { "type": "alias", "path": "release_date" }, + "year": { "type": "alias", "path": "release_year" }, + "issn": { "type": "alias", "path": "container_issnl" }, + "oa": { "type": "alias", "path": "is_oa" }, + "longtail": { "type": "alias", "path": "is_longtail_oa" }, + "lang": { "type": "alias", "path": "language" }, + "file_pdf_url": { "type": "alias", "path": "best_pdf_url" }, + "is_kept": { "type": "alias", "path": "in_kbart" } + } + }, + "changelog": { + "properties": { + "index": { "type": "integer" }, + "editgorup_id": { "type": "keyword" }, + "timestamp": { "type": "date" }, + "username": { "type": "keyword" }, + "is_bot": { "type": "boolean" }, + "is_admin": { "type": "boolean" }, + "agent": { "type": "keyword" }, + "containers": { "type": "integer" }, + "creators": { "type": "integer" }, + "files": { 
"type": "integer" }, + "filessets": { "type": "integer" }, + "webcaptures": { "type": "integer" }, + "releases": { "type": "integer" }, + "works": { "type": "integer" }, + "created": { "type": "integer" }, + "updated": { "type": "integer" }, + "deleted": { "type": "integer" }, + "total": { "type": "integer" } + } + } +} +} diff --git a/python/fatcat_export.py b/python/fatcat_export.py index cf8bf1c3..027d6c0a 100755 --- a/python/fatcat_export.py +++ b/python/fatcat_export.py @@ -12,9 +12,10 @@ import json import argparse import fatcat_client from fatcat_client.rest import ApiException -from fatcat_client import ReleaseEntity +from fatcat_client import ReleaseEntity, ContainerEntity, ChangelogEntry from fatcat_tools import uuid2fcid, entity_from_json, entity_to_dict, \ - release_to_elasticsearch, public_api + release_to_elasticsearch, container_to_elasticsearch, \ + changelog_to_elasticsearch, public_api def run_export_releases(args): @@ -30,9 +31,27 @@ def run_transform_releases(args): line = line.strip() if not line: continue - release = entity_from_json(line, ReleaseEntity) + entity = entity_from_json(line, ReleaseEntity) args.json_output.write( - json.dumps(release_to_elasticsearch(release)) + '\n') + json.dumps(release_to_elasticsearch(entity)) + '\n') + +def run_transform_containers(args): + for line in args.json_input: + line = line.strip() + if not line: + continue + entity = entity_from_json(line, ContainerEntity) + args.json_output.write( + json.dumps(container_to_elasticsearch(entity)) + '\n') + +def run_transform_changelogs(args): + for line in args.json_input: + line = line.strip() + if not line: + continue + entity = entity_from_json(line, ChangelogEntry) + args.json_output.write( + json.dumps(changelog_to_elasticsearch(entity)) + '\n') def run_export_changelog(args): api = args.api @@ -74,6 +93,24 @@ def main(): help="where to send output", default=sys.stdout, type=argparse.FileType('w')) + sub_transform_containers = 
subparsers.add_parser('transform-containers') + sub_transform_containers.set_defaults(func=run_transform_containers) + sub_transform_containers.add_argument('json_input', + help="JSON-per-line of container entities", + default=sys.stdin, type=argparse.FileType('r')) + sub_transform_containers.add_argument('json_output', + help="where to send output", + default=sys.stdout, type=argparse.FileType('w')) + + sub_transform_changelogs = subparsers.add_parser('transform-changelogs') + sub_transform_changelogs.set_defaults(func=run_transform_changelogs) + sub_transform_changelogs.add_argument('json_input', + help="JSON-per-line of changelog entries", + default=sys.stdin, type=argparse.FileType('r')) + sub_transform_changelogs.add_argument('json_output', + help="where to send output", + default=sys.stdout, type=argparse.FileType('w')) + sub_changelog = subparsers.add_parser('changelog') sub_changelog.set_defaults(func=run_export_changelog) sub_changelog.add_argument('--start', diff --git a/python/fatcat_tools/__init__.py b/python/fatcat_tools/__init__.py index e2b1e3a2..64c45062 100644 --- a/python/fatcat_tools/__init__.py +++ b/python/fatcat_tools/__init__.py @@ -1,4 +1,6 @@ from .api_auth import authenticated_api, public_api from .fcid import fcid2uuid, uuid2fcid -from .transforms import entity_to_dict, entity_from_json, release_to_elasticsearch +from .transforms import entity_to_dict, entity_from_json, \ + release_to_elasticsearch, container_to_elasticsearch, \ + changelog_to_elasticsearch diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index ebdce56f..a29b3019 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -236,8 +236,8 @@ class EntityImporter: self._entity_queue.append(entity) if len(self._entity_queue) >= self.edit_batch_size: self.insert_batch(self._entity_queue) - self.counts['insert'] += len(_entity_queue) - self._entity_queue = 0 + self.counts['insert'] += 
len(self._entity_queue) + self._entity_queue = [] def want(self, raw_record): """ diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py index 7f6b1ee8..be62d63a 100644 --- a/python/fatcat_tools/importers/journal_metadata.py +++ b/python/fatcat_tools/importers/journal_metadata.py @@ -44,7 +44,7 @@ class JournalMetadataImporter(EntityImporter): editgroup_extra=eg_extra) def want(self, raw_record): - if raw_record.get('issnl'): + if raw_record.get('issnl') and raw_record.get('name'): return True return False @@ -55,6 +55,10 @@ class JournalMetadataImporter(EntityImporter): returns a ContainerEntity (or None if invalid or couldn't parse) """ + if not row.get('name'): + # Name is required (by schema) + return None + extra = dict() for key in ('issne', 'issnp', 'languages', 'country', 'urls', 'abbrev', 'coden', 'aliases', 'original_name', 'first_year', 'last_year', @@ -76,8 +80,10 @@ class JournalMetadataImporter(EntityImporter): extra_ia = dict() # TODO: would like an ia.longtail_ia flag if row.get('sim'): + # NB: None case of the .get() here is blech, but othrwise + # extra['ia'].get('sim') would be false-y, breaking 'any_ia_sim' later on extra_ia['sim'] = { - 'year_spans': row['sim']['year_spans'], + 'year_spans': row['sim'].get('year_spans'), } if extra_ia: extra['ia'] = extra_ia diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms.py index a85c877c..7bb75c3e 100644 --- a/python/fatcat_tools/transforms.py +++ b/python/fatcat_tools/transforms.py @@ -231,20 +231,12 @@ def container_to_elasticsearch(entity): container_type = entity.container_type, issnl = entity.issnl, wikidata_qid = entity.wikidata_qid, - - entity_status = entity.entity_status, - language = entity.language, - license = entity.license_slug, - doi = entity.doi, - pmid = entity.pmid, - isbn13 = entity.isbn13, - core_id = entity.core_id, - arxiv_id = entity.core_id, - jstor_id = entity.jstor_id, ) # TODO: region, 
discipline # TODO: single primary language? + if not entity.extra: + entity.extra = dict() for key in ('country', 'languages', 'mimetypes', 'first_year', 'last_year'): if entity.extra.get(key): t[key] = entity.extra[key] @@ -285,13 +277,46 @@ def container_to_elasticsearch(entity): if extra['ia'].get('sim'): any_ia_sim = True - t['in_doaj'] = is_doaj - t['in_road'] = is_road + t['in_doaj'] = in_doaj + t['in_road'] = in_road t['in_doi'] = in_doi t['in_sherpa_romeo'] = in_sherpa_romeo - t['is_oa'] = in_doaj or in_road or is_longtail_oa or ia_oa + t['is_oa'] = in_doaj or in_road or is_longtail_oa or is_oa t['is_longtail_oa'] = is_longtail_oa t['any_kbart'] = any_ia_sim t['any_jstor'] = any_ia_sim t['any_ia_sim'] = bool(any_ia_sim) return t + + +def changelog_to_elasticsearch(entity): + + editgroup = entity.editgroup + t = dict( + index=entity.index, + editgroup_id=entity.editgroup_id, + timestamp=entity.timestamp, + editor_id=editgroup.editor_id, + ) + + extra = editgroup.extra or dict() + if extra.get('agent'): + t['agent'] = extra['agent'] + + t['containers'] = len(editgroup.edits.containers) + t['creators'] = len(editgroup.edits.creators) + t['files'] = len(editgroup.edits.files) + t['filesets'] = len(editgroup.edits.filesets) + t['webcaptures'] = len(editgroup.edits.webcaptures) + t['releases'] = len(editgroup.edits.releases) + t['works'] = len(editgroup.edits.works) + + # TODO: parse and pull out counts + #created = 0 + #updated = 0 + #deleted = 0 + #t['created'] = created + #t['updated'] = updated + #t['deleted'] = deleted + #t['total'] = created + updated + deleted + return t |