From 901cf998ce7d8f896cf5d609719b1defd96d01d4 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Wed, 29 Jan 2020 16:00:03 -0800
Subject: first implementation of ES file schema

Includes a trivial test and transform, but not any workers or doc
updates.
---
 python/fatcat_tools/transforms/elasticsearch.py | 45 +++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

(limited to 'python/fatcat_tools/transforms/elasticsearch.py')

diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 3a53db4d..8141a8b9 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -357,3 +357,48 @@ def changelog_to_elasticsearch(entity):
     #t['deleted'] = deleted
     #t['total'] = created + updated + deleted
     return t
+
+
+def file_to_elasticsearch(entity):  # entity: file entity exposing the attributes read below
+    """
+    Converts from an entity model/schema to elasticsearch oriented schema.
+
+    Returns: dict
+    Raises exception on error (never returns None)
+    """
+
+    if entity.state in ('redirect', 'deleted'):  # non-active entities get a stub doc: ident + state only
+        return dict(
+            ident = entity.ident,
+            state = entity.state,
+        )
+    elif entity.state != 'active':
+        raise ValueError("Unhandled entity state: {}".format(entity.state))
+
+    # First, the easy ones (direct copy of entity attributes)
+    t = dict(
+        ident = entity.ident,
+        state = entity.state,
+        revision = entity.revision,
+        release_ids = entity.release_ids,
+        release_count = len(entity.release_ids),  # NOTE(review): TypeError if release_ids is None; confirm it is always a list
+        mimetype = entity.mimetype,
+        size_bytes = entity.size,
+        sha1 = entity.sha1,
+        sha256 = entity.sha256,
+        md5 = entity.md5,
+        rel = [u.rel for u in entity.urls],  # overwritten below with a de-duplicated list
+    )
+
+    # TODO: domain, hosts (from urls; use proper urlcanon); 'host' and 'domain' stay empty until then
+    t['rel'] = list(set([u.rel for u in entity.urls]))  # unique rel values; set() does not preserve order
+    t['host'] = []
+    t['domain'] = []
+
+    in_ia = False
+    for u in entity.urls:
+        if '://archive.org/' in u.url or '://web.archive.org/' in u.url:  # plain substring match on the URL
+            in_ia = True
+    t['in_ia'] = bool(in_ia)
+
+    return t
-- 
cgit v1.2.3


From 8e8b447a1d142b7815498ffa02263c34207973b4 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Wed, 29 Jan 2020 16:21:13 -0800
Subject: container ES schema changes

---
 extra/elasticsearch/container_schema.json       | 33 ++++++++++++++----------
 python/fatcat_tools/transforms/elasticsearch.py | 34 +++++++++++++------------
 2 files changed, 38 insertions(+), 29 deletions(-)

(limited to 'python/fatcat_tools/transforms/elasticsearch.py')

diff --git a/extra/elasticsearch/container_schema.json b/extra/elasticsearch/container_schema.json
index b0a47e85..3be261a2 100644
--- a/extra/elasticsearch/container_schema.json
+++ b/extra/elasticsearch/container_schema.json
@@ -27,13 +27,17 @@
 "mappings": {
     "container": {
         "properties": {
-            "ident":          { "type": "keyword" },
+            "ident":          { "type": "keyword", "doc_values": false },
             "state":          { "type": "keyword" },
-            "revision":       { "type": "keyword" },
-            "name":           { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
-            "publisher":      { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+            "revision":       { "type": "keyword", "doc_values": false },
+            "name":           { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+            "original_name":  { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+            "publisher":      { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+            "abbrev":         { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+            "aliases":        { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
             "container_type": { "type": "keyword" },
             "issnl":          { "type": "keyword" },
+            "issns":          { "type": "keyword" },
             "wikidata_qid":   { "type": "keyword" },
             "country":        { "type": "keyword" },
             "region":         { "type": "keyword" },
@@ -43,15 +47,17 @@
             "first_year":     { "type": "integer" },
             "last_year":      { "type": "integer" },
 
-            "in_doaj":        { "type": "boolean" },
-            "in_road":        { "type": "boolean" },
-            "in_doi":         { "type": "boolean" },
-            "in_sherpa_romeo":{ "type": "boolean" },
-            "is_oa":          { "type": "boolean" },
-            "is_longtail_oa": { "type": "boolean" },
-            "any_kbart":      { "type": "boolean" },
-            "any_jstor":      { "type": "boolean" },
-            "any_ia_sim":        { "type": "boolean" },
+
+            "biblio":         { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+
+            "in_doaj":              { "type": "boolean" },
+            "in_road":              { "type": "boolean" },
+            "is_oa":                { "type": "boolean" },
+            "is_longtail_oa":       { "type": "boolean" },
+            "any_kbart":            { "type": "boolean" },
+            "any_jstor":            { "type": "boolean" },
+            "any_ia_sim":           { "type": "boolean" },
+            "sherpa_romeo_color":   { "type": "keyword" },
 
             "releases_total": { "type": "integer" },
             "releases_kbart": { "type": "integer" },
@@ -64,6 +70,7 @@
 
             "year":           { "type": "alias", "path": "first_year" },
             "type":           { "type": "alias", "path": "container_type" },
+            "issn":           { "type": "alias", "path": "issns" },
             "oa":             { "type": "alias", "path": "is_oa" },
             "longtail":       { "type": "alias", "path": "is_longtail_oa" }
         }
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 8141a8b9..edc68748 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -257,23 +257,24 @@ def container_to_elasticsearch(entity, force_bool=True):
         wikidata_qid = entity.wikidata_qid,
     )
 
-    # TODO: region, discipline
-    # TODO: single primary language?
     if not entity.extra:
         entity.extra = dict()
-    for key in ('country', 'languages', 'mimetypes', 'first_year', 'last_year'):
+    for key in ('country', 'languages', 'mimetypes', 'original_name',
+                'first_year', 'last_year', 'aliases', 'abbrev', 'region',
+                'discipline'):
         if entity.extra.get(key):
             t[key] = entity.extra[key]
 
+    t['issns'] = []
+    if entity.issnl:
+        t['issns'].append(entity.issnl)
+    for key in ('issnp', 'issne'):
+        if entity.extra.get(key):
+            t['issns'].append(entity.extra[key])
+
     in_doaj = None
     in_road = None
-    # TODO: not currently implemented
-    in_doi = None
-    # TODO: would be nice to have 'in_doaj_works', or maybe just "any_pid"
-    #in_doaj_works = None
-    in_sherpa_romeo = None
     is_oa = None
-    # TODO: not actually set/stored anywhere?
     is_longtail_oa = None
     any_kbart = None
     any_jstor = None
@@ -295,8 +296,9 @@ def container_to_elasticsearch(entity, force_bool=True):
     if extra.get('default_license'):
         if extra['default_license'].startswith('CC-'):
             is_oa = True
+    t['sherpa_romeo_color'] = None
     if extra.get('sherpa_romeo'):
-        in_sherpa_romeo = True
+        t['sherpa_romeo_color'] = extra['sherpa_romeo'].get('color')
         if extra['sherpa_romeo'].get('color') == 'white':
             is_oa = False
     if extra.get('kbart'):
@@ -306,21 +308,21 @@ def container_to_elasticsearch(entity, force_bool=True):
     if extra.get('ia'):
         if extra['ia'].get('sim'):
             any_ia_sim = True
+        if extra['ia'].get('longtail_oa'):
+            is_longtail_oa = True
     t['is_superceded'] = bool(extra.get('superceded'))
 
     t['in_doaj'] = bool(in_doaj)
     t['in_road'] = bool(in_road)
-    t['in_sherpa_romeo'] = bool(in_sherpa_romeo)
     t['any_kbart'] = bool(any_kbart)
-    t['is_longtail_oa'] = bool(is_longtail_oa)
     if force_bool:
-        t['in_doi'] = bool(in_doi)
-        t['is_oa'] = bool(in_doaj or in_road or is_longtail_oa or is_oa)
+        t['is_oa'] = bool(in_doaj or in_road or is_oa)
+        t['is_longtail_oa'] = bool(is_longtail_oa)
         t['any_jstor'] = bool(any_jstor)
         t['any_ia_sim'] = bool(any_ia_sim)
     else:
-        t['in_doi'] = in_doi
-        t['is_oa'] = in_doaj or in_road or is_longtail_oa or is_oa
+        t['is_oa'] = in_doaj or in_road or is_oa
+        t['is_longtail_oa'] = is_longtail_oa
         t['any_jstor'] = any_jstor
         t['any_ia_sim'] = any_ia_sim
     return t
-- 
cgit v1.2.3


From e047fbe1a9c495e86a6757d44eb32c9109a1b753 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Wed, 29 Jan 2020 20:39:22 -0800
Subject: ES release schema updates

---
 extra/elasticsearch/release_schema.json         | 69 ++++++++++++++-------
 python/fatcat_tools/transforms/elasticsearch.py | 81 +++++++++++++++++++++++--
 2 files changed, 122 insertions(+), 28 deletions(-)

(limited to 'python/fatcat_tools/transforms/elasticsearch.py')

diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json
index 85026060..98a1c28e 100644
--- a/extra/elasticsearch/release_schema.json
+++ b/extra/elasticsearch/release_schema.json
@@ -27,48 +27,62 @@
 "mappings": {
     "release": {
         "properties": {
-            "ident":          { "type": "keyword" },
+            "ident":          { "type": "keyword", "doc_values": false },
             "state":          { "type": "keyword" },
-            "revision":       { "type": "keyword" },
-            "work_id":        { "type": "keyword" },
-            "title":          { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
-            "subtitle":       { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
-            "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+            "revision":       { "type": "keyword", "doc_values": false },
+            "work_id":        { "type": "keyword", "doc_values": false },
+            "title":          { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+            "subtitle":       { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+            "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
             "release_date":   { "type": "date" },
-            "release_year":   { "type": "integer" },
-            "release_type":   { "type": "keyword" },
+            "release_year":   { "type": "integer", "copy_to": "biblio" },
+            "release_type":   { "type": "keyword", "copy_to": "biblio" },
             "release_stage":  { "type": "keyword" },
-            "withdrawn_status": { "type": "keyword" },
+            "withdrawn_status": { "type": "keyword", "copy_to": "biblio" },
             "language":       { "type": "keyword" },
-            "doi":            { "type": "keyword" },
-            "pmid":           { "type": "keyword" },
-            "pmcid":          { "type": "keyword" },
-            "isbn13":         { "type": "keyword" },
-            "wikidata_qid":   { "type": "keyword" },
-            "core_id":        { "type": "keyword" },
-            "axiv_id":        { "type": "keyword" },
-            "jstor_id":       { "type": "keyword" },
-            "ark_id":         { "type": "keyword" },
-            "mag_id":         { "type": "keyword" },
+            "volume":         { "type": "keyword", "copy_to": "biblio" },
+            "issue":          { "type": "keyword", "copy_to": "biblio" },
+            "pages":          { "type": "keyword", "copy_to": "biblio" },
+            "first_page":     { "type": "keyword" },
+            "number":         { "type": "keyword", "copy_to": "biblio" },
+            "doi":            { "type": "keyword", "doc_values": false },
+            "doi_prefix":     { "type": "keyword" },
+            "doi_registrar":  { "type": "keyword" },
+            "pmid":           { "type": "keyword", "doc_values": false },
+            "pmcid":          { "type": "keyword", "doc_values": false },
+            "isbn13":         { "type": "keyword", "doc_values": false },
+            "wikidata_qid":   { "type": "keyword", "doc_values": false },
+            "core_id":        { "type": "keyword", "doc_values": false },
+            "axiv_id":        { "type": "keyword", "doc_values": false },
+            "jstor_id":       { "type": "keyword", "doc_values": false },
+            "ark_id":         { "type": "keyword", "doc_values": false },
+            "mag_id":         { "type": "keyword", "doc_values": false },
             "license":        { "type": "keyword" },
             "publisher":            { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
-            "container_name":       { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+            "container_name":       { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
             "container_id":         { "type": "keyword" },
             "container_issnl":      { "type": "keyword" },
             "container_type":       { "type": "keyword" },
             "contrib_count":        { "type": "integer" },
-            "contrib_names":        { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+            "contrib_names":        { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+            "affiliations":         { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+            "affiliation_rors":     { "type": "keyword" },
             "creator_ids":          { "type": "keyword" },
             "ref_count":            { "type": "integer" },
             "ref_linked_count":     { "type": "integer" },
+            "ref_release_ids":      { "type": "keyword" },
             "file_count":           { "type": "integer" },
             "fileset_count":        { "type": "integer" },
             "webcapture_count":     { "type": "integer" },
             "any_abstract":         { "type": "boolean" },
 
-            "best_pdf_url":         { "type": "keyword" },
-            "ia_pdf_url":           { "type": "keyword" },
+            "biblio":               { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+
+            "best_pdf_url":         { "type": "keyword", "doc_values": false },
+            "ia_pdf_url":           { "type": "keyword", "doc_values": false },
+            "ia_microfilm_url":     { "type": "keyword", "doc_values": false },
             "is_oa":                { "type": "boolean" },
+            "oa_color":             { "type": "keyword" },
             "is_longtail_oa":       { "type": "boolean" },
             "is_preserved":         { "type": "boolean" },
             "in_kbart":             { "type": "boolean" },
@@ -79,7 +93,13 @@
             "in_ia_sim":            { "type": "boolean" },
             "in_shadows":           { "type": "boolean" },
             "is_superceded":        { "type": "boolean" },
+            "is_retracted":         { "type": "boolean" },
+            "preservation":         { "type": "keyword" },
 
+            "affilation":     { "type": "alias", "path": "affiliations" },
+            "ror":            { "type": "alias", "path": "affiliation_rors" },
+            "creator_id":     { "type": "alias", "path": "creator_id" },
+            "ref_release_id": { "type": "alias", "path": "ref_release_ids" },
             "author":         { "type": "alias", "path": "contrib_names" },
             "journal":        { "type": "alias", "path": "container_name" },
             "date":           { "type": "alias", "path": "release_date" },
@@ -90,6 +110,9 @@
             "lang":           { "type": "alias", "path": "language" },
             "file_pdf_url":   { "type": "alias", "path": "best_pdf_url" },
             "release_status": { "type": "alias", "path": "release_stage" },
+            "stage":          { "type": "alias", "path": "release_stage" },
+            "type":           { "type": "alias", "path": "release_type" },
+            "retracted":      { "type": "alias", "path": "is_retracted" },
             "is_kept":        { "type": "alias", "path": "in_kbart" }
         }
     }
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index edc68748..b997796d 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -50,6 +50,10 @@ def release_to_elasticsearch(entity, force_bool=True):
         release_stage = release.release_stage,
         withdrawn_status = release.withdrawn_status,
         language = release.language,
+        volume = release.volume,
+        issue = release.issue,
+        pages = release.pages,
+        number = release.number,
         license = release.license_slug,
         doi = release.ext_ids.doi,
         pmid = release.ext_ids.pmid,
@@ -72,7 +76,7 @@ def release_to_elasticsearch(entity, force_bool=True):
     in_dweb = False
     in_ia = False
     in_ia_sim = False
-    in_shadow = False
+    in_shadows = False
 
     release_year = release.release_year
     if release.release_date:
@@ -85,11 +89,15 @@ def release_to_elasticsearch(entity, force_bool=True):
 
     t['any_abstract'] = len(release.abstracts or []) > 0
     t['ref_count'] = len(release.refs or [])
-    t['ref_linked_count'] = 0
-    if release.refs:
-        t['ref_linked_count'] = len([1 for ref in release.refs if ref.target_release_id])
+    ref_release_ids = []
+    for r in (release.refs or []):
+        if r.target_release_id:
+            ref_release_ids.append(r.target_release_id)
+    t['ref_release_ids'] = ref_release_ids
+    t['ref_linked_count'] = len(ref_release_ids)
     t['contrib_count'] = len(release.contribs or [])
     contrib_names = []
+    contrib_affiliations = []
     creator_ids = []
     for c in (release.contribs or []):
         if c.raw_name:
@@ -98,8 +106,14 @@ def release_to_elasticsearch(entity, force_bool=True):
             contrib_names.append(c.surname)
         if c.creator_id:
             creator_ids.append(c.creator_id)
+        if c.raw_affiliation:
+            contrib_affiliations.append(c.raw_affiliation)
     t['contrib_names'] = contrib_names
     t['creator_ids'] = creator_ids
+    t['affiliations'] = contrib_affiliations
+
+    # TODO: mapping... probably by lookup?
+    t['affiliation_rors'] = None
 
     container = release.container
     if container:
@@ -140,8 +154,13 @@ def release_to_elasticsearch(entity, force_bool=True):
             if c_extra.get('szczepanski'):
                 if c_extra['szczepanski'].get('as_of'):
                     is_oa = True
-    else:
+
+    # fall back to release-level container metadata if container not linked or
+    # missing context
+    if not t.get('publisher'):
         t['publisher'] = release.publisher
+    if not t.get('container_name') and release.extra:
+        t['container_name'] = release.extra.get('container_name')
 
     if release.ext_ids.jstor or (release.ext_ids.doi and release.ext_ids.doi.startswith('10.2307/')):
         in_jstor = True
@@ -203,6 +222,46 @@ def release_to_elasticsearch(entity, force_bool=True):
             if extra['crossref'].get('archive'):
                 # all crossref archives are KBART, I believe
                 in_kbart = True
+        # backwards compatible subtitle fetching
+        if not t['subtitle'] and extra.get('subtitle'):
+            if type(extra['subtitle']) == list:
+                t['subtitle'] = extra['subtitle'][0]
+            else:
+                t['subtitle'] = extra['subtitle']
+
+    t['first_page'] = None
+    if release.pages:
+        first = release.pages.split('-')[0]
+        first = first.replace('p', '')
+        if release.pages.isdigit():
+            t['first_page'] = release.pages
+        # TODO: non-numerical first pages
+
+    t['ia_microfilm_url'] = None
+    if in_ia_sim:
+        # TODO: determine URL somehow? I think this is in flux. Will probably
+        # need extra metadata in the container extra field.
+        # special case as a demo for now.
+        if release.container_id == "hl5g6d5msjcl7hlbyyvcsbhc2u" \
+                and release.year in (2011, 2013) \
+                and release.volume.isdigit() \
+                and t['first_page']:
+            t['ia_microfilm_url'] = "https://archive.org/details/sim_bjog_{}-{:02d}/page/n{}".format(
+                release.year,
+                release.volume - 1,
+                t['first_page'],
+            )
+
+    t['doi_registrar'] = None
+    if extra and t['doi']:
+        for k in ('crossref', 'datacite', 'jalc'):
+            if k in extra:
+                t['doi_registrar'] = k
+        if not 'doi_registrar' in t:
+            t['doi_registrar'] = 'crossref'
+
+    if t['doi']:
+        t['doi_prefix'] = t['doi'].split('/')[0]
 
     if is_longtail_oa:
         is_oa = True
@@ -215,6 +274,7 @@ def release_to_elasticsearch(entity, force_bool=True):
         t['in_jstor'] = bool(in_jstor)
         t['in_web'] = bool(in_web)
         t['in_dweb'] = bool(in_dweb)
+        t['in_shadows'] = bool(in_shadows)
     else:
         t['is_oa'] = is_oa
         t['is_longtail_oa'] = is_longtail_oa
@@ -223,9 +283,20 @@ def release_to_elasticsearch(entity, force_bool=True):
         t['in_jstor'] = in_jstor
         t['in_web'] = in_web
         t['in_dweb'] = in_dweb
+        t['in_shadows'] = in_shadows
 
     t['in_ia'] = bool(in_ia)
     t['is_preserved'] = bool(is_preserved or in_ia or in_kbart or in_jstor)
+
+    if in_ia:
+        t['preservation'] = 'bright'
+    elif in_kbart or in_jstor:
+        t['preservation'] = 'dark_only'
+    elif in_shadows:
+        t['preservation'] = 'shadows_only'
+    else:
+        t['preservation'] = 'none'
+
     return t
 
 def container_to_elasticsearch(entity, force_bool=True):
-- 
cgit v1.2.3


From 5d458a3df7e58e6551d8ec72979e376c62fdd2f7 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Wed, 29 Jan 2020 21:52:33 -0800
Subject: fix some transform bugs, add some tests

---
 python/fatcat_tools/transforms/elasticsearch.py    |  14 +--
 python/fatcat_transform.py                         |  26 ++++-
 python/tests/files/changelog_3469683.json          |   1 +
 .../files/file_bcah4zp5tvdhjl5bqci2c2lgfa.json     |   1 +
 .../files/release_etodop5banbndg3faecnfm6ozi.json  |   1 +
 python/tests/transform_elasticsearch.py            | 114 +++++++++++++++++++++
 python/tests/transform_tests.py                    | 106 -------------------
 7 files changed, 149 insertions(+), 114 deletions(-)
 create mode 100644 python/tests/files/changelog_3469683.json
 create mode 100644 python/tests/files/file_bcah4zp5tvdhjl5bqci2c2lgfa.json
 create mode 100644 python/tests/files/release_etodop5banbndg3faecnfm6ozi.json
 create mode 100644 python/tests/transform_elasticsearch.py
 delete mode 100644 python/tests/transform_tests.py

(limited to 'python/fatcat_tools/transforms/elasticsearch.py')

diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index b997796d..812cd1fd 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -20,6 +20,7 @@ def test_check_kbart():
     assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1990, 2000]])) == False
     assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1930, 2000]])) == True
 
+
 def release_to_elasticsearch(entity, force_bool=True):
     """
     Converts from an entity model/schema to elasticsearch oriented schema.
@@ -233,8 +234,8 @@ def release_to_elasticsearch(entity, force_bool=True):
     if release.pages:
         first = release.pages.split('-')[0]
         first = first.replace('p', '')
-        if release.pages.isdigit():
-            t['first_page'] = release.pages
+        if first.isdigit():
+            t['first_page'] = first
         # TODO: non-numerical first pages
 
     t['ia_microfilm_url'] = None
@@ -243,12 +244,12 @@ def release_to_elasticsearch(entity, force_bool=True):
         # need extra metadata in the container extra field.
         # special case as a demo for now.
         if release.container_id == "hl5g6d5msjcl7hlbyyvcsbhc2u" \
-                and release.year in (2011, 2013) \
-                and release.volume.isdigit() \
+                and release.release_year in (2011, 2013) \
+                and release.issue.isdigit() \
                 and t['first_page']:
             t['ia_microfilm_url'] = "https://archive.org/details/sim_bjog_{}-{:02d}/page/n{}".format(
-                release.year,
-                release.volume - 1,
+                release.release_year,
+                int(release.issue) - 1,
                 t['first_page'],
             )
 
@@ -299,6 +300,7 @@ def release_to_elasticsearch(entity, force_bool=True):
 
     return t
 
+
 def container_to_elasticsearch(entity, force_bool=True):
     """
     Converts from an entity model/schema to elasticsearch oriented schema.
diff --git a/python/fatcat_transform.py b/python/fatcat_transform.py
index ccb13871..42d2ea99 100755
--- a/python/fatcat_transform.py
+++ b/python/fatcat_transform.py
@@ -1,6 +1,8 @@
 #!/usr/bin/env python3
 
 """
+Utility script for doing bulk conversion/transforms of entity JSON schema to
+other formats
 """
 
 import sys
@@ -15,10 +17,11 @@ from citeproc_styles import get_style_filepath
 
 import fatcat_openapi_client
 from fatcat_openapi_client.rest import ApiException
-from fatcat_openapi_client import ReleaseEntity, ContainerEntity, ChangelogEntry
+from fatcat_openapi_client import ReleaseEntity, ContainerEntity, FileEntity, ChangelogEntry
 from fatcat_tools import uuid2fcid, entity_from_json, entity_to_dict, \
     release_to_elasticsearch, container_to_elasticsearch, \
-    changelog_to_elasticsearch, public_api, release_to_csl, citeproc_csl
+    file_to_elasticsearch, changelog_to_elasticsearch, public_api, \
+    release_to_csl, citeproc_csl
 
 
 def run_elasticsearch_releases(args):
@@ -39,6 +42,15 @@ def run_elasticsearch_containers(args):
         args.json_output.write(
             json.dumps(container_to_elasticsearch(entity)) + '\n')
 
+def run_elasticsearch_files(args):  # CLI handler: file entity JSON lines in, elasticsearch docs out
+    for line in args.json_input:
+        line = line.strip()
+        if not line:  # skip blank input lines
+            continue
+        entity = entity_from_json(line, FileEntity, api_client=args.api.api_client)
+        args.json_output.write(
+            json.dumps(file_to_elasticsearch(entity)) + '\n')  # one JSON document per output line
+
 def run_elasticsearch_changelogs(args):
     for line in args.json_input:
         line = line.strip()
@@ -87,6 +99,16 @@ def main():
         help="where to send output",
         default=sys.stdout, type=argparse.FileType('w'))
 
+    sub_elasticsearch_files = subparsers.add_parser('elasticsearch-files',
+        help="convert fatcat file JSON schema to elasticsearch file schema")
+    sub_elasticsearch_files.set_defaults(func=run_elasticsearch_files)
+    sub_elasticsearch_files.add_argument('json_input',
+        help="JSON-per-line of file entities",
+        default=sys.stdin, type=argparse.FileType('r'))
+    sub_elasticsearch_files.add_argument('json_output',
+        help="where to send output",
+        default=sys.stdout, type=argparse.FileType('w'))
+
     sub_elasticsearch_changelogs = subparsers.add_parser('elasticsearch-changelogs',
         help="convert fatcat changelog JSON schema to elasticsearch changelog schema")
     sub_elasticsearch_changelogs.set_defaults(func=run_elasticsearch_changelogs)
diff --git a/python/tests/files/changelog_3469683.json b/python/tests/files/changelog_3469683.json
new file mode 100644
index 00000000..7a847b16
--- /dev/null
+++ b/python/tests/files/changelog_3469683.json
@@ -0,0 +1 @@
+{"index":3469683,"editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","timestamp":"2020-01-30T05:04:39.738601Z","editgroup":{"editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","editor_id":"scmbogxw25evtcesfcab5qaboa","editor":{"editor_id":"scmbogxw25evtcesfcab5qaboa","username":"crawl-bot","is_admin":true,"is_bot":true,"is_active":true},"changelog_index":3469683,"created":"2020-01-30T05:04:39.738601Z","description":"Files crawled from web using sandcrawler ingest tool","extra":{"agent":"fatcat_tools.IngestFileResultImporter","git_rev":"v0.3.1-280-ga889f32"},"edits":{"containers":[],"creators":[],"files":[{"edit_id":"ba819a2b-a4d0-43e6-9e5c-505284c8ae42","ident":"e3lmbzqyjjam3a5nqnccc6d654","revision":"7a606095-9d07-41ee-898a-bcf8b6bc0004","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1080/23802359.2020.1715878"}},{"edit_id":"a71c4b91-d599-4422-a7a6-527562161278","ident":"e62h2fa6fba6ve3lukv7n635fq","revision":"1374f1bd-684a-48b9-aaff-65b9e90083b5","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1186/s13071-020-3909-6"}},{"edit_id":"82e4d65d-0335-4a40-b9c9-bf38f9bd7b19","ident":"fam7ii245zasvnesikw7bhmoii","revision":"327e3358-2b2b-4919-9613-449bdbb76c55","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.23917/jjr.v9i1.8294"}},{"edit_id":"49e987dc-fc6f-4391-8b75-33176f03b5cb","ident":"fa6sljsebjapfojqapxd3dj4um","revision":"7f51f4fa-e448-410a-b7d5-7ae9cdf9fcb8","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.33536/jcpe.v4i1.293"}},{"edit_id":"e62ed1ca-3961-423b-ab05-967694e32f70","ident":"fhhzbabf3zcx7p2tor2omqveyq","revision":"d40c3f8c-255f-4913-ace8-06db6af66697","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request
_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.5802/smai-jcm.59"}},{"edit_id":"337b8ed6-1248-4872-81da-d16f6db021e6","ident":"fllcoo4smfdyrh5q5lmu72e7cq","revision":"724b26cd-6ab1-46a1-bc88-a87410fdf102","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1080/1343943x.2020.1717970"}},{"edit_id":"43a0c9d5-692e-4c73-b154-d8769854d268","ident":"fzshcc6sfzegbduum2763o3lgy","revision":"a8f5e34b-fbf4-4d52-987e-887455e6bd50","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.23917/jjr.v9i1.8099"}},{"edit_id":"48767850-1141-47e3-80ef-556605e3588c","ident":"grdztt2vwjd65ovcifeo3ysbam","revision":"12dceb74-6333-4a17-b1e8-4afac9df1888","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.26565/2220-8089-2019-36-02"}},{"edit_id":"4b3b92ac-c250-4b78-9e91-43fc893c935e","ident":"hgpzsozky5amvcts45qb6nhqum","revision":"b5951fd6-dd36-422c-9770-8cdfb7e6d82d","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.33549/physiolres.934410"}},{"edit_id":"dabff0df-6f52-4adb-a63f-1965f30d8bd2","ident":"hpxdh7mykng77jyfoolpixzw2y","revision":"0e06e57c-0756-4ee5-8102-0d96478aa23f","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.26452/ijrps.v11i1.1916"}},{"edit_id":"b757f2b5-5f97-45b8-bfcd-9c9fb08047b8","ident":"htorhznxdfdppbvpf57nrz536q","revision":"034f4640-fbb6-4123-9ad8-f934220ab820","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1039/c9ra09196c"}},{"edit_id":"5662469f-a4d3-4339-88a0-4d27ef3f5f58","ident":"if6a63p7rnaxbjkcr4egtihj74","revision":"08d384cf-396c
-41e1-a14d-6bde24b47323","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1192/bjb.2019.92"}},{"edit_id":"540a1ec1-7879-42f4-b749-d3287eb26ef7","ident":"i76ou5g2jndo7gyjrxdhoks3bm","revision":"eddbf701-59c3-4f33-b26b-87ad29297f65","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.33549/physiolres.934355"}},{"edit_id":"04a0b805-b09f-4ff6-80b3-77cb643c72d7","ident":"jl2z2az4sjfpdigdts3xjm24vy","revision":"a541e130-07be-43ae-8fbf-b0295d5b576f","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1080/23802359.2020.1719936"}},{"edit_id":"3e4566b9-7e3f-4e02-aa81-b6299545b150","ident":"jsrgx72devbadbyyiqwm2bl4aa","revision":"f51f164e-0e96-4245-973b-3408250ebc3b","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.24193/ed21.2019.17.09"}},{"edit_id":"80c2f0ca-ba43-4481-b5a5-c15d87d4d7b4","ident":"kijz6wlf25dito3av5snrz345a","revision":"fa1b5279-2221-4063-853e-070ca2d5954b","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1017/s2045796020000037"}},{"edit_id":"c0f44f12-e4c7-45d4-a79a-a3ce2d628e78","ident":"ky3cjfbzejecxaa5tjaaucurb4","revision":"0e227663-f825-42e8-bf57-ba50fea60bf5","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1017/s204579602000013x"}},{"edit_id":"92093947-7ff5-48ad-bc47-b26d4c0959c8","ident":"llfciusk6jf6rd35ofpuufmgfe","revision":"ed89f8b0-98cd-4886-b99e-b5d276b2ac9f","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.33549/physiolres.934349"}},{"edit_id":"4
cb0e042-4298-4d0b-98b8-3c0c808642ea","ident":"mdw33cq7svdfnlaevnpih7bsyu","revision":"4a0ed3ce-56f2-4299-861c-051e7c06499d","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.23917/jjr.v9i2.8073"}},{"edit_id":"611fc1e1-79e1-4e10-a702-0420919407f6","ident":"mwghms3u2zecdf2x5zk7tzs4mq","revision":"101d7d0f-6aca-4df3-a08a-f4a3f26d3cd4","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1186/s12864-020-6509-0"}},{"edit_id":"b9a350bd-3497-4c83-97c1-0a20a420a287","ident":"ooejwh3g35cfrmmk4bvjcqxrai","revision":"8a4e8a05-3fb4-4d83-a545-35f152e24f58","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.6018/cpd.340501"}},{"edit_id":"aeab2a58-ec4e-4a2a-82ee-a0d2428eca50","ident":"pepmo2ajfzh7ldtqdugj4p5zvy","revision":"7d13f42d-ce94-4537-8c56-0158d9bf99e9","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.6018/cpd.347981"}},{"edit_id":"f3cd1575-fcfd-4f6b-ab48-28b06592df85","ident":"qcw5h7c5uncbbphtrp3zibn5y4","revision":"fa7b2868-777e-4046-98ca-b146c0056180","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1017/s1742170519000425"}},{"edit_id":"3c0ffa45-7f39-4393-a73c-0615bf9543f6","ident":"qfmufcdlrbfyhcjemvut5om23i","revision":"fac63461-4790-49a8-9b75-b23e9a369529","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.34225/jidc.2019.14.2.57"}},{"edit_id":"4ad40f3a-55f4-4aa8-9016-d04f904b2163","ident":"qrpp45vopfbvzanalh4lmpiz24","revision":"2ac1355f-cba9-49ed-b260-9eea5ec09473","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changel
og","link_source":"doi","link_source_id":"10.24193/ed21.2019.17.19"}},{"edit_id":"1ec3a67b-796c-4393-93c2-ddac4ea66e0f","ident":"q7cv6lezvjalpg5xd4tckkbmsi","revision":"702adc9f-9ecb-4030-a095-936f15839c31","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.33549/physiolres.934359"}},{"edit_id":"a8ab2cf7-8aad-4060-b281-371d41df32cf","ident":"rqcd44zbfvcovipfpnqpowiq2e","revision":"bd51fd60-438a-42fe-80e9-104705b0580f","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.33549/physiolres.934350"}},{"edit_id":"6d626d5c-1c7c-42e3-8d00-c313a91fc7c1","ident":"ruzbactehngbrlyx4vq3zlaqlu","revision":"5da34289-7da0-42ad-98bb-1c0b21bb6e69","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.5802/smai-jcm.55"}},{"edit_id":"4651ac6b-228c-4918-ac74-0757ce64c031","ident":"rvbhhfvwc5dkhdpxekutlew7di","revision":"53b088f6-653c-4e6d-974a-8e5e6e154f12","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.31616/asj.2019.0190"}},{"edit_id":"156b4a96-af8b-4c01-a8ec-6aa53736450f","ident":"r6j7ad7vmvg75lvel6v4e4gbb4","revision":"46647620-a36a-482b-a651-492a4e6ca1bb","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.5802/smai-jcm.53"}},{"edit_id":"67e6f76f-859d-422d-8c90-715b3173e24c","ident":"scajd3ykrjatbjmifvxwcl3yhu","revision":"d99f1da3-484a-4977-bc14-6e952d007acd","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.24193/ed21.2019.17.11"}},{"edit_id":"22395a5e-2a2e-4bf2-9547-0fb5122f8a7f","ident":"sk22wngc3fh23b74nmkbgeeyya","revision":"7423067d-3a18-4305-98b4-5cee8127e24a","editgr
oup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.31616/asj.2019.0300"}},{"edit_id":"4ea1f9dd-d85e-450b-bc2a-995364fcf3fe","ident":"tma3dvw77bghjfaqrdoiwquuge","revision":"c86364bf-501b-42b4-a3ed-ef334ff26e0a","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.36447/estudios2014.v33-34.art3"}},{"edit_id":"87d158ff-b0e0-4944-88d9-017dcef70d6e","ident":"tvuumx4n75dorebv7bu7imyiym","revision":"ae17166e-43ce-4d5a-95de-4befeeb76fa6","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.35236/jots.663726"}},{"edit_id":"2114c75b-ac23-471a-83d9-c76ae5f6bf42","ident":"twnql5u4mbfqffal2atv5qucoq","revision":"cb73ac6c-bc2b-422b-bc27-7032d14cac4d","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.31616/asj.2019.0162"}},{"edit_id":"2f1d20fd-e69e-4490-9f70-f2b043f53634","ident":"tzwyuimcgrepvhide2sj3lovjm","revision":"b9cf9229-db47-4fa0-a8f6-45320b5af440","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.33549/physiolres.934348"}},{"edit_id":"fdb6c388-9b22-45ff-8db1-1531eac43bbf","ident":"vfwz3fuvbbgcxek2hi6vjo525q","revision":"6c57ecb2-a870-4078-af17-a779ca3ceb28","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.4197/eco.29-1.8"}},{"edit_id":"3b8cf6fa-75e7-4b15-a83c-784552ff76e9","ident":"vtq5tvfltbfv3pizvux32ek5hi","revision":"7c0b5d64-6d20-407b-9de3-482a9c75f40e","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.23917/jurisprudence.v8i2.6977"}},{"edit_id":"899303e8-ce86-4f53-909f-249ef45d9a
3b","ident":"vuneewh3fncxpb7bzviptx6kze","revision":"3fd4ff1c-1855-41e2-9bae-7edbae86783a","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.3846/jcem.2020.11826"}},{"edit_id":"c0595f10-72dc-4144-b243-5b344912842f","ident":"v2i53kwtnbea7kfhsnvcpkvixe","revision":"09891662-3d67-4d7f-a4ed-7f9bfa3f426f","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.33549/physiolres.934347"}},{"edit_id":"80f26273-d243-4dcf-af4a-e05226ba679c","ident":"wtish6c32randpuucjxq5byjo4","revision":"a69cd59a-52bf-4c48-9323-62a4070b2440","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.29261/pakvetj/2019.033"}},{"edit_id":"79f1bdc1-2627-4328-97ff-71731295fcad","ident":"xkf7a4cavnhahnfsjj5w5aoopu","revision":"b826cb89-efcc-427f-a378-0829bb2b871c","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.36447/estudios2014.v33-34.art6"}},{"edit_id":"695feb37-ab8e-4ecd-8c9e-fae06ff63e39","ident":"yx24fslfafb6dgx7gvjbmoma5m","revision":"4109aeb1-fcc1-47f3-9d2c-8e3abfbc697e","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1039/c9ra10366j"}},{"edit_id":"48f2eb73-a5a3-440a-8bd1-c581797a82ca","ident":"3p5rah3wbfftdht7rabkpjfcrm","revision":"df754aec-4750-45eb-97e6-943928dad661","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.24193/ed21.2019.17.01"}},{"edit_id":"7a937ed3-5362-40ed-8f96-e6c6231e0adf","ident":"4r3madqhfzb5jb7jd7xmv55em4","revision":"6feccd54-38bd-4d51-8f42-fe211baf5ba3","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":
"doi","link_source_id":"10.35236/jots.668781"}},{"edit_id":"3beaa94e-c4e1-4a6b-96a7-d455ad13b7aa","ident":"53ooeweri5efjm5vhl2bwjcfze","revision":"a9d318ad-eb2a-4590-a463-8d6016e2e887","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1017/s2045796020000049"}},{"edit_id":"8a0b80e2-c8b2-4457-ac89-2fc7ad7548f7","ident":"546k37iji5bfffakw2egl2azxy","revision":"3a4d93d0-c59e-4f81-8fbb-40ce21b11b1e","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.1039/c9ra08019h"}},{"edit_id":"15fa5e34-9829-483e-bfa5-a4011b974c6b","ident":"6cy3aonbdfgxbjxnujg3hsqx7q","revision":"021fed8f-5109-4389-9e7f-13a70cbaf4a3","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.2478/joim-2019-0023"}},{"edit_id":"6bc507c8-ceea-49a5-8c1f-9c652463588e","ident":"6rxwlcytwzeopgrhidvi236b2q","revision":"60307cd5-28a3-4063-9111-d5e90e1cb346","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.3846/aviation.2019.11913"}},{"edit_id":"283013c6-4400-4fdc-b2b2-1dbd1a262332","ident":"7j4w24plxzc3nnrkorbowmnra4","revision":"0fa4112c-a596-49a2-9d36-82c213db3fb8","editgroup_id":"jebmpiqsjja7jozze7hoy3bir4","extra":{"ingest_request_source":"fatcat-changelog","link_source":"doi","link_source_id":"10.24193/ed21.2019.17.22"}}],"filesets":[],"webcaptures":[],"releases":[],"works":[]}}}
\ No newline at end of file
diff --git a/python/tests/files/file_bcah4zp5tvdhjl5bqci2c2lgfa.json b/python/tests/files/file_bcah4zp5tvdhjl5bqci2c2lgfa.json
new file mode 100644
index 00000000..bed8977d
--- /dev/null
+++ b/python/tests/files/file_bcah4zp5tvdhjl5bqci2c2lgfa.json
@@ -0,0 +1 @@
+{"release_ids":["5tbuas2e4vd6jaowbgzmmhhqxe"],"mimetype":"application/pdf","urls":[{"url":"https://web.archive.org/web/20200130042753/https://www.zhros.ru/jour/article/download/811/542","rel":"webarchive"},{"url":"https://www.zhros.ru/jour/article/download/811/542","rel":"web"}],"sha256":"1665cdb90b73c684233038601c52995acef77bb37aefc6e63ae13e4194d48261","sha1":"3ad4df99ff1354ec0b5a333a59fba9a3a5d9812a","md5":"39159f9c8e98a245f954c9000b0f2810","size":739980,"revision":"dcc7a975-725d-4bc9-8c3f-cd0476cd485e","ident":"bcah4zp5tvdhjl5bqci2c2lgfa","state":"active"}
\ No newline at end of file
diff --git a/python/tests/files/release_etodop5banbndg3faecnfm6ozi.json b/python/tests/files/release_etodop5banbndg3faecnfm6ozi.json
new file mode 100644
index 00000000..1204c95d
--- /dev/null
+++ b/python/tests/files/release_etodop5banbndg3faecnfm6ozi.json
@@ -0,0 +1 @@
+{"abstracts":[],"refs":[{"index":0,"extra":{"issue":"Suppl 1","volume":"118"},"key":"10.1111/j.1471-0528.2011.03098.x-BIB1|cit1","year":2011,"container_name":"BJOG","title":"Saving Mothers' Lives: reviewing maternal deaths to make motherhood safer-2006-2008. The Eighth Report of the Confidential Enquiries into Maternal Deaths in the United Kingdom"}],"contribs":[{"index":0,"raw_name":"Philip Steer","role":"author","extra":{"seq":"first"}}],"language":"en","publisher":"Wiley","pages":"1404-1404","issue":"11","volume":"118","ext_ids":{"doi":"10.1111/j.1471-0528.2011.03098.x"},"release_year":2011,"release_date":"2011-09-09","release_stage":"published","release_type":"article-journal","container_id":"hl5g6d5msjcl7hlbyyvcsbhc2u","webcaptures":[],"filesets":[],"files":[],"container":{"wikidata_qid":"Q15724571","issnl":"1470-0328","publisher":"Wiley (Blackwell Publishing)","container_type":"journal","name":"BJOG: an International Journal of Obstetrics and Gynaecology","extra":{"abbrev":"BJOG","country":"gb","ia":{"sim":{"year_spans":[[1902,1915],[1921,2015]]}},"issne":"1471-0528","issnp":"1470-0328","kbart":{"clockss":{"year_spans":[[1989,1989],[1993,1993],[2002,2003],[2009,2017]]},"portico":{"year_spans":[[1902,2019]]}},"languages":["en"],"sherpa_romeo":{"color":"yellow"},"urls":["http://www.bjog.org/view/0/index.html"]},"revision":"ec26766c-c1fe-453b-837d-087cc254fe07","ident":"hl5g6d5msjcl7hlbyyvcsbhc2u","state":"active"},"work_id":"wmwe5wwkzfcs7gyjfgdeanksha","title":"Saving Mothers' Lives. Reviewing maternal deaths to make motherhood safer: 2006-2008","state":"active","ident":"etodop5banbndg3faecnfm6ozi","revision":"deb7e050-6df6-42ed-9704-788a0e30facf","extra":{"crossref":{"type":"journal-article"},"subtitle":["Correpondence"]}}
\ No newline at end of file
diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py
new file mode 100644
index 00000000..ab613a0a
--- /dev/null
+++ b/python/tests/transform_elasticsearch.py
@@ -0,0 +1,114 @@
+
+import json
+import pytest
+from fatcat_tools import *
+from fatcat_openapi_client import *
+from fixtures import api
+from import_journal_metadata import journal_metadata_importer
+
+from import_crossref import crossref_importer
+from import_matched import matched_importer
+
+def test_basic_elasticsearch_convert(crossref_importer):
+    with open('tests/files/crossref-works.single.json', 'r') as f:
+        # not a single line
+        raw = json.loads(f.read())
+        r = crossref_importer.parse_record(raw)
+    r.state = 'active'
+    release_to_elasticsearch(r)
+
+def test_rich_elasticsearch_convert():
+    r = ReleaseEntity(
+        title="something",
+        release_year=1234,
+        license_slug="CC-BY-NC",
+        ext_ids=ReleaseExtIds(),
+        refs=[
+            ReleaseRef(),
+            ReleaseRef(target_release_id="iznnn644szdwva7khyxqzc73bi"),
+        ],
+    )
+    r.state = 'active'
+    r.container = ContainerEntity(
+        name="dummy journal",
+        extra={
+            "ia": {
+                "sim": {
+                    "year_spans": [[1000, 1100]],
+                },
+            },
+            "kbart": {
+                "lockss": {
+                    "year_spans": [[1200, 1300]],
+                },
+                "jstor": {
+                    "year_spans": [[1950, 1960], [1980, 2005]],
+                },
+            },
+            "sherpa_romeo": {"color": "blue"},
+            "doaj": {"as_of": "2010-02-03"},
+        },
+    )
+    r.files = [FileEntity(
+        mimetype="application/pdf",
+        urls=[
+            FileUrl(rel="dweb", url="dat://a954329dlk/thingie"),
+            FileUrl(rel="webarchive", url="https://web.archive.org/web/20001122030405/http://example.com"),
+            FileUrl(rel="web", url="https://archive.org/details/blah/file.pdf"),
+        ],
+        extra={
+            "shadows": {},
+        },
+    )]
+    es = release_to_elasticsearch(r)
+    assert es['release_year'] == r.release_year
+    assert es['in_ia'] == True
+    assert es['in_jstor'] == False
+    assert es['in_ia_sim'] == False
+    assert es['in_ia'] == True
+    assert es['in_web'] == True
+    assert es['in_dweb'] == True
+    assert es['is_oa'] == True
+    assert es['is_longtail_oa'] == False
+    assert es['ref_count'] == 2
+    assert es['ref_linked_count'] == 1
+
+def test_elasticsearch_release_from_json():
+    r = entity_from_json(open('./tests/files/release_etodop5banbndg3faecnfm6ozi.json', 'r').read(), ReleaseEntity)
+    es = release_to_elasticsearch(r)
+
+    assert es['subtitle'] == "Correpondence"
+    assert es['ident'] == "etodop5banbndg3faecnfm6ozi"
+    assert es['container_name'] == "BJOG: an International Journal of Obstetrics and Gynaecology"
+    assert es['first_page'] == "1404"
+    assert es['issue'] == "11"
+    assert es['volume'] == "118"
+    assert es['number'] == None
+    assert es['in_ia_sim'] == True
+    assert es['in_kbart'] == True
+
+def test_elasticsearch_container_transform(journal_metadata_importer):
+    with open('tests/files/journal_metadata.sample.json', 'r') as f:
+        raw = json.loads(f.readline())
+        c = journal_metadata_importer.parse_record(raw)
+    c.state = 'active'
+    es = container_to_elasticsearch(c)
+    assert es['publisher'] == c.publisher
+
+def test_elasticsearch_file_transform(matched_importer):
+    f = entity_from_json(open('./tests/files/file_bcah4zp5tvdhjl5bqci2c2lgfa.json', 'r').read(), FileEntity)
+
+    f.state = 'active'
+    es = file_to_elasticsearch(f)
+    assert es['sha1'] == f.sha1
+    assert es['sha256'] == f.sha256
+    assert es['md5'] == f.md5
+    assert es['size_bytes'] == f.size
+    assert es['mimetype'] == f.mimetype
+    assert es['in_ia'] == True
+    assert 'publisher' in es['rel']
+
+    # XXX: implement hosts and domain parsing with urlcanon
+    #assert 'journals.plos.org' in es['host']
+    #assert 'plos.org' in es['domain']
+
diff --git a/python/tests/transform_tests.py b/python/tests/transform_tests.py
deleted file mode 100644
index 7b583ac4..00000000
--- a/python/tests/transform_tests.py
+++ /dev/null
@@ -1,106 +0,0 @@
-
-import json
-import pytest
-from fatcat_tools import *
-from fatcat_openapi_client import *
-from fixtures import api
-from import_journal_metadata import journal_metadata_importer
-
-from import_crossref import crossref_importer
-from import_matched import matched_importer
-
-def test_basic_elasticsearch_convert(crossref_importer):
-    with open('tests/files/crossref-works.single.json', 'r') as f:
-        # not a single line
-        raw = json.loads(f.read())
-        r = crossref_importer.parse_record(raw)
-    r.state = 'active'
-    release_to_elasticsearch(r)
-
-def test_rich_elasticsearch_convert():
-    r = ReleaseEntity(
-        title="something",
-        release_year=1234,
-        license_slug="CC-BY-NC",
-        ext_ids=ReleaseExtIds(),
-        refs=[
-            ReleaseRef(),
-            ReleaseRef(target_release_id="iznnn644szdwva7khyxqzc73bi"),
-        ],
-    )
-    r.state = 'active'
-    r.container = ContainerEntity(
-        name="dummy journal",
-        extra={
-            "ia": {
-                "sim": {
-                    "year_spans": [[1000, 1100]],
-                },
-            },
-            "kbart": {
-                "lockss": {
-                    "year_spans": [[1200, 1300]],
-                },
-                "jstor": {
-                    "year_spans": [[1950, 1960], [1980, 2005]],
-                },
-            },
-            "sherpa_romeo": {"color": "blue"},
-            "doaj": {"as_of": "2010-02-03"},
-        },
-    )
-    r.files = [FileEntity(
-        mimetype="application/pdf",
-        urls=[
-            FileUrl(rel="dweb", url="dat://a954329dlk/thingie"),
-            FileUrl(rel="webarchive", url="https://web.archive.org/web/20001122030405/http://example.com"),
-            FileUrl(rel="web", url="https://archive.org/details/blah/file.pdf"),
-        ],
-        extra={
-            "shadows": {},
-        },
-    )]
-    es = release_to_elasticsearch(r)
-    assert es['release_year'] == r.release_year
-    assert es['in_ia'] == True
-    assert es['in_jstor'] == False
-    assert es['in_ia_sim'] == False
-    assert es['in_ia'] == True
-    assert es['in_web'] == True
-    assert es['in_dweb'] == True
-    assert es['is_oa'] == True
-    assert es['is_longtail_oa'] == False
-    assert es['ref_count'] == 2
-    assert es['ref_linked_count'] == 1
-
-def test_elasticsearch_release_from_json():
-    r = entity_from_json(open('./tests/files/math_universe.json', 'r').read(), ReleaseEntity)
-    release_to_elasticsearch(r)
-
-def test_elasticsearch_container_transform(journal_metadata_importer):
-    with open('tests/files/journal_metadata.sample.json', 'r') as f:
-        raw = json.loads(f.readline())
-        c = journal_metadata_importer.parse_record(raw)
-    c.state = 'active'
-    es = container_to_elasticsearch(c)
-    assert es['publisher'] == c.publisher
-
-def test_elasticsearch_file_transform(matched_importer):
-    with open('tests/files/example_matched.json', 'r') as f:
-        raw = json.loads(f.readline())
-        f = matched_importer.parse_record(raw)
-
-    f.state = 'active'
-    es = file_to_elasticsearch(f)
-    assert es['sha1'] == f.sha1
-    assert es['sha256'] == f.sha256
-    assert es['md5'] == f.md5
-    assert es['size_bytes'] == f.size
-    assert es['mimetype'] == f.mimetype
-    assert es['in_ia'] == True
-    assert 'publisher' in es['rel']
-
-    # XXX: implement hosts and domain parsing with urlcanon
-    #assert 'journals.plos.org' in es['host']
-    #assert 'plos.org' in es['domain']
-
-- 
cgit v1.2.3


From d58c3891ac2122dac53ced606568108f543f2d80 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Wed, 29 Jan 2020 21:52:58 -0800
Subject: actually implement changelog transform

---
 extra/elasticsearch/changelog_schema.json       | 11 ++++-
 python/fatcat_tools/transforms/elasticsearch.py | 62 ++++++++++++++++++-------
 python/tests/transform_elasticsearch.py         | 24 +++++++++-
 3 files changed, 78 insertions(+), 19 deletions(-)

(limited to 'python/fatcat_tools/transforms/elasticsearch.py')

diff --git a/extra/elasticsearch/changelog_schema.json b/extra/elasticsearch/changelog_schema.json
index f3211e99..77c77238 100644
--- a/extra/elasticsearch/changelog_schema.json
+++ b/extra/elasticsearch/changelog_schema.json
@@ -16,20 +16,29 @@
     "changelog": {
         "properties": {
             "index":            { "type": "integer" },
-            "editgroup_id":     { "type": "keyword" },
+            "editgroup_id":     { "type": "keyword", "doc_values": false },
             "timestamp":        { "type": "date" },
             "editor_id":        { "type": "keyword" },
             "username":         { "type": "keyword" },
             "is_bot":           { "type": "boolean" },
             "is_admin":         { "type": "boolean" },
             "agent":            { "type": "keyword" },
+
             "containers":       { "type": "integer" },
+            "new_containers":   { "type": "integer" },
             "creators":         { "type": "integer" },
+            "new_creators":     { "type": "integer" },
             "files":            { "type": "integer" },
+            "new_files":        { "type": "integer" },
             "filessets":        { "type": "integer" },
+            "new_filesets":     { "type": "integer" },
             "webcaptures":      { "type": "integer" },
+            "new_webcaptures":  { "type": "integer" },
             "releases":         { "type": "integer" },
+            "new_releases":     { "type": "integer" },
             "works":            { "type": "integer" },
+            "new_works":        { "type": "integer" },
+
             "created":          { "type": "integer" },
             "updated":          { "type": "integer" },
             "deleted":          { "type": "integer" },
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 812cd1fd..c8547b27 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -401,36 +401,64 @@ def container_to_elasticsearch(entity, force_bool=True):
     return t
 
 
+def _type_of_edit(edit):
+    if edit.revision == None and edit.redirect_ident == None:
+        return 'delete'
+    elif edit.redirect_ident:
+        # an edit that sets a redirect is counted as an update
+        return 'update'
+    elif edit.prev_revision == None and edit.redirect_ident == None and edit.revision:
+        return 'create'
+    else:
+        return 'update'
+
+
 def changelog_to_elasticsearch(entity):
 
     editgroup = entity.editgroup
     t = dict(
         index=entity.index,
         editgroup_id=entity.editgroup_id,
-        timestamp=entity.timestamp,
+        timestamp=entity.timestamp.isoformat(),
         editor_id=editgroup.editor_id,
+        username=editgroup.editor.username,
+        is_bot=editgroup.editor.is_bot,
+        is_admin=editgroup.editor.is_admin,
     )
 
     extra = editgroup.extra or dict()
     if extra.get('agent'):
         t['agent'] = extra['agent']
 
-    t['containers'] = len(editgroup.edits.containers)
-    t['creators'] = len(editgroup.edits.containers)
-    t['files'] = len(editgroup.edits.containers)
-    t['filesets'] = len(editgroup.edits.containers)
-    t['webcaptures'] = len(editgroup.edits.containers)
-    t['releases'] = len(editgroup.edits.containers)
-    t['works'] = len(editgroup.edits.containers)
-
-    # TODO: parse and pull out counts
-    #created = 0
-    #updated = 0
-    #deleted = 0
-    #t['created'] = created
-    #t['updated'] = updated
-    #t['deleted'] = deleted
-    #t['total'] = created + updated + deleted
+    containers = [_type_of_edit(e) for e in editgroup.edits.containers]
+    creators = [_type_of_edit(e) for e in editgroup.edits.creators]
+    files = [_type_of_edit(e) for e in editgroup.edits.files]
+    filesets = [_type_of_edit(e) for e in editgroup.edits.filesets]
+    webcaptures = [_type_of_edit(e) for e in editgroup.edits.webcaptures]
+    releases = [_type_of_edit(e) for e in editgroup.edits.releases]
+    works = [_type_of_edit(e) for e in editgroup.edits.works]
+
+    t['containers'] = len(containers)
+    t['new_containers'] = len([e for e in containers if e == 'create'])
+    t['creators'] = len(creators)
+    t['new_creators'] = len([e for e in creators if e == 'create'])
+    t['files'] = len(files)
+    t['new_files'] = len([e for e in files if e == 'create'])
+    t['filesets'] = len(filesets)
+    t['new_filesets'] = len([e for e in filesets if e == 'create'])
+    t['webcaptures'] = len(webcaptures)
+    t['new_webcaptures'] = len([e for e in webcaptures if e == 'create'])
+    t['releases'] = len(releases)
+    t['new_releases'] = len([e for e in releases if e == 'create'])
+    t['works'] = len(works)
+    t['new_works'] = len([e for e in works if e == 'create'])
+
+    all_edits = containers + creators + files + filesets + webcaptures + releases + works
+
+    t['created'] = len([e for e in all_edits if e == 'create'])
+    t['updated'] = len([e for e in all_edits if e == 'update'])
+    t['deleted'] = len([e for e in all_edits if e == 'delete'])
+    t['total'] = len(all_edits)
     return t
 
 
diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py
index ab613a0a..89a4eef8 100644
--- a/python/tests/transform_elasticsearch.py
+++ b/python/tests/transform_elasticsearch.py
@@ -106,9 +106,31 @@ def test_elasticsearch_file_transform(matched_importer):
     assert es['size_bytes'] == f.size
     assert es['mimetype'] == f.mimetype
     assert es['in_ia'] == True
-    assert 'publisher' in es['rel']
+    assert 'web' in es['rel']
 
     # XXX: implement hosts and domain parsing with urlcanon
     #assert 'journals.plos.org' in es['host']
     #assert 'plos.org' in es['domain']
 
+def test_elasticsearch_changelog_transform(matched_importer):
+    ce = entity_from_json(open('./tests/files/changelog_3469683.json', 'r').read(), ChangelogEntry)
+
+    es = changelog_to_elasticsearch(ce)
+    assert es['index'] == 3469683
+    # len("2020-01-30T05:04:39") => 19
+    assert es['timestamp'][:19] == "2020-01-30T05:04:39.738601Z"[:19]
+    assert es['editor_id'] == "scmbogxw25evtcesfcab5qaboa"
+    assert es['username'] == "crawl-bot"
+    assert es['is_bot'] == True
+    assert es['is_admin'] == True
+    assert es['agent'] == "fatcat_tools.IngestFileResultImporter"
+
+    assert es['total'] == 50
+    assert es['files'] == 50
+    assert es['new_files'] == 50
+    assert es['created'] == 50
+
+    assert es['releases'] == 0
+    assert es['new_releases'] == 0
+    assert es['updated'] == 0
+    assert es['deleted'] == 0
-- 
cgit v1.2.3


From bf718fd076476c1a54e80ca88cd02ede606ab6f3 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Wed, 29 Jan 2020 23:23:39 -0800
Subject: add country to v03b release schema

---
 extra/elasticsearch/release_schema.json         | 1 +
 python/fatcat_tools/transforms/elasticsearch.py | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'python/fatcat_tools/transforms/elasticsearch.py')

diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json
index 98a1c28e..2b67c5f5 100644
--- a/extra/elasticsearch/release_schema.json
+++ b/extra/elasticsearch/release_schema.json
@@ -40,6 +40,7 @@
             "release_stage":  { "type": "keyword" },
             "withdrawn_status": { "type": "keyword", "copy_to": "biblio" },
             "language":       { "type": "keyword" },
+            "country":        { "type": "keyword" },
             "volume":         { "type": "keyword", "copy_to": "biblio" },
             "issue":          { "type": "keyword", "copy_to": "biblio" },
             "pages":          { "type": "keyword", "copy_to": "biblio" },
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index c8547b27..f0146d01 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -155,6 +155,8 @@ def release_to_elasticsearch(entity, force_bool=True):
             if c_extra.get('szczepanski'):
                 if c_extra['szczepanski'].get('as_of'):
                     is_oa = True
+            if c_extra.get('country'):
+                t['country'] = c_extra['country']
 
     # fall back to release-level container metadata if container not linked or
     # missing context
-- 
cgit v1.2.3


From e98f389a53d886b4fa8f0237b90b086999770f78 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Wed, 29 Jan 2020 23:26:58 -0800
Subject: elastic schema fixes

---
 extra/elasticsearch/file_schema.json            | 12 ++++++------
 extra/elasticsearch/release_schema.json         |  2 +-
 python/fatcat_tools/transforms/elasticsearch.py |  5 +++++
 3 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'python/fatcat_tools/transforms/elasticsearch.py')

diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json
index 66d81e0b..2a7e5be0 100644
--- a/extra/elasticsearch/file_schema.json
+++ b/extra/elasticsearch/file_schema.json
@@ -13,7 +13,7 @@
     }
 },
 "mappings": {
-    "changelog": {
+    "file": {
         "properties": {
             "ident":            { "type": "keyword", "doc_values": false },
             "state":            { "type": "keyword" },
@@ -33,13 +33,13 @@
             "in_ia":            { "type": "boolean" },
 
             "release_id":       { "type": "alias", "path": "release_ids" },
-            "sha1hex":          { "type": "alias", "path": "sha1hex" },
-            "sha256hex":        { "type": "alias", "path": "sha256hex" },
-            "md5hex":           { "type": "alias", "path": "md5hex" },
+            "sha1hex":          { "type": "alias", "path": "sha1" },
+            "sha256hex":        { "type": "alias", "path": "sha256" },
+            "md5hex":           { "type": "alias", "path": "md5" },
             "size":             { "type": "alias", "path": "size_bytes" },
             "domain":           { "type": "alias", "path": "domains" },
-            "host":             { "type": "alias", "path": "host" },
-            "rel":              { "type": "alias", "path": "rel" }
+            "host":             { "type": "alias", "path": "hosts" },
+            "rel":              { "type": "alias", "path": "rels" }
         }
     }
 }
diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json
index 2b67c5f5..3d301dba 100644
--- a/extra/elasticsearch/release_schema.json
+++ b/extra/elasticsearch/release_schema.json
@@ -99,7 +99,7 @@
 
             "affilation":     { "type": "alias", "path": "affiliations" },
             "ror":            { "type": "alias", "path": "affiliation_rors" },
-            "creator_id":     { "type": "alias", "path": "creator_id" },
+            "creator_id":     { "type": "alias", "path": "creator_ids" },
             "ref_release_id": { "type": "alias", "path": "ref_release_ids" },
             "author":         { "type": "alias", "path": "contrib_names" },
             "journal":        { "type": "alias", "path": "container_name" },
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index f0146d01..42669bbf 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -416,6 +416,11 @@ def _type_of_edit(edit):
 
 
 def changelog_to_elasticsearch(entity):
+    """
+    Note that this importer requires expanded fill info to work. Calling code
+    may need to re-fetch editgroup from API to get the 'editor' field. Some of
+    the old kafka feed content doesn't include editor in particular.
+    """
 
     editgroup = entity.editgroup
     t = dict(
-- 
cgit v1.2.3


From ade1eb9ff955ca5ba58acdc8b76e344c9cc54790 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Wed, 29 Jan 2020 23:56:27 -0800
Subject: fix ES file schema plural field names

---
 python/fatcat_tools/transforms/elasticsearch.py | 7 +++----
 python/tests/transform_elasticsearch.py         | 2 +-
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'python/fatcat_tools/transforms/elasticsearch.py')

diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 42669bbf..5a492fb4 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -497,13 +497,12 @@ def file_to_elasticsearch(entity):
         sha1 = entity.sha1,
         sha256 = entity.sha256,
         md5 = entity.md5,
-        rel = [u.rel for u in entity.urls],
     )
 
     # TODO: domain, hosts (from urls; use proper urlcanon)
-    t['rel'] = list(set([u.rel for u in entity.urls]))
-    t['host'] = []
-    t['domain'] = []
+    t['rels'] = list(set([u.rel for u in entity.urls]))
+    t['hosts'] = []
+    t['domains'] = []
 
     in_ia = False
     for u in entity.urls:
diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py
index 89a4eef8..c247e745 100644
--- a/python/tests/transform_elasticsearch.py
+++ b/python/tests/transform_elasticsearch.py
@@ -106,7 +106,7 @@ def test_elasticsearch_file_transform(matched_importer):
     assert es['size_bytes'] == f.size
     assert es['mimetype'] == f.mimetype
     assert es['in_ia'] == True
-    assert 'web' in es['rel']
+    assert 'web' in es['rels']
 
     # XXX: implement hosts and domain parsing with urlcanon
     #assert 'journals.plos.org' in es['host']
-- 
cgit v1.2.3


From 4cbee44529dd967c966ed3f2cc2bb80176be4e43 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Thu, 30 Jan 2020 00:08:41 -0800
Subject: implement host+domain parsing for file ES transform

---
 python/fatcat_tools/transforms/elasticsearch.py | 14 +++++---------
 python/tests/transform_elasticsearch.py         |  7 +++----
 2 files changed, 8 insertions(+), 13 deletions(-)

(limited to 'python/fatcat_tools/transforms/elasticsearch.py')

diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 5a492fb4..e1980d90 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -1,6 +1,6 @@
 
-
 import collections
+import tldextract
 from fatcat_openapi_client import ApiClient
 
 
@@ -499,15 +499,11 @@ def file_to_elasticsearch(entity):
         md5 = entity.md5,
     )
 
-    # TODO: domain, hosts (from urls; use proper urlcanon)
+    parsed_urls = [tldextract.extract(u.url) for u in entity.urls]
+    t['hosts'] = list(set(['.'.join(pu) for pu in parsed_urls]))
+    t['domains'] = list(set([pu.registered_domain for pu in parsed_urls]))
     t['rels'] = list(set([u.rel for u in entity.urls]))
-    t['hosts'] = []
-    t['domains'] = []
 
-    in_ia = False
-    for u in entity.urls:
-        if '://archive.org/' in u.url or '://web.archive.org/' in u.url:
-            in_ia = True
-    t['in_ia'] = bool(in_ia)
+    t['in_ia'] = bool('archive.org' in t['domains'])
 
     return t
diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py
index c247e745..e67681c6 100644
--- a/python/tests/transform_elasticsearch.py
+++ b/python/tests/transform_elasticsearch.py
@@ -106,11 +106,10 @@ def test_elasticsearch_file_transform(matched_importer):
     assert es['size_bytes'] == f.size
     assert es['mimetype'] == f.mimetype
     assert es['in_ia'] == True
-    assert 'web' in es['rels']
 
-    # XXX: implement hosts and domain parsing with urlcanon
-    #assert 'journals.plos.org' in es['host']
-    #assert 'plos.org' in es['domain']
+    assert 'web' in es['rels']
+    assert 'www.zhros.ru' in es['hosts']
+    assert 'zhros.ru' in es['domains']
 
 def test_elasticsearch_changelog_transform(matched_importer):
     ce = entity_from_json(open('./tests/files/changelog_3469683.json', 'r').read(), ChangelogEntry)
-- 
cgit v1.2.3


From 59912583926077260d99a9bf77a938c2215eb6c8 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Thu, 30 Jan 2020 00:20:34 -0800
Subject: tweak file ES archive.org domain tracking

---
 extra/elasticsearch/file_schema.json            | 1 +
 python/fatcat_tools/transforms/elasticsearch.py | 6 ++++++
 2 files changed, 7 insertions(+)

(limited to 'python/fatcat_tools/transforms/elasticsearch.py')

diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json
index 2a7e5be0..a0ac3346 100644
--- a/extra/elasticsearch/file_schema.json
+++ b/extra/elasticsearch/file_schema.json
@@ -31,6 +31,7 @@
             "hosts":            { "type": "keyword" },
             "rels":             { "type": "keyword" },
             "in_ia":            { "type": "boolean" },
+            "in_ia_petabox":    { "type": "boolean" },
 
             "release_id":       { "type": "alias", "path": "release_ids" },
             "sha1hex":          { "type": "alias", "path": "sha1" },
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index e1980d90..9aa3cece 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -505,5 +505,11 @@ def file_to_elasticsearch(entity):
     t['rels'] = list(set([u.rel for u in entity.urls]))
 
     t['in_ia'] = bool('archive.org' in t['domains'])
+    t['in_ia_petabox'] = bool('archive.org' in t['hosts'])
+
+    # ok, but actually remove archive.org hosts, because they make other
+    # aggregations hard and are a waste of storage
+    t['hosts'] = [h for h in t['hosts'] if h not in ('archive.org', 'web.archive.org')]
+    t['domains'] = [h for h in t['hosts'] if h not in ('archive.org')]
 
     return t
-- 
cgit v1.2.3


From caa588612b91181950697756eace8fda270fd092 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Thu, 30 Jan 2020 01:03:08 -0800
Subject: add upper-case work-around from kibana map join

---
 extra/elasticsearch/release_schema.json         | 1 +
 python/fatcat_tools/transforms/elasticsearch.py | 1 +
 2 files changed, 2 insertions(+)

(limited to 'python/fatcat_tools/transforms/elasticsearch.py')

diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json
index 07601f36..c0bbda22 100644
--- a/extra/elasticsearch/release_schema.json
+++ b/extra/elasticsearch/release_schema.json
@@ -53,6 +53,7 @@
             "withdrawn_status": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" },
             "language":       { "type": "keyword", "normalizer": "default" },
             "country":        { "type": "keyword", "normalizer": "default" },
+            "country_upper":  { "type": "keyword", "normalizer": "caseSensitive" },
             "volume":         { "type": "keyword", "normalizer": "default", "copy_to": "biblio" },
             "issue":          { "type": "keyword", "normalizer": "default", "copy_to": "biblio" },
             "pages":          { "type": "keyword", "normalizer": "default", "copy_to": "biblio" },
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 9aa3cece..ded239d3 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -157,6 +157,7 @@ def release_to_elasticsearch(entity, force_bool=True):
                     is_oa = True
             if c_extra.get('country'):
                 t['country'] = c_extra['country']
+                t['country_upper'] = c_extra['country'].upper()
 
     # fall back to release-level container metadata if container not linked or
     # missing context
-- 
cgit v1.2.3


From 5ba91951bb4ebc59cb59340e82cba2a7d763dc59 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Thu, 30 Jan 2020 13:22:45 -0800
Subject: fix release es transform missing 'issue'

---
 python/fatcat_tools/transforms/elasticsearch.py | 1 +
 1 file changed, 1 insertion(+)

(limited to 'python/fatcat_tools/transforms/elasticsearch.py')

diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index ded239d3..b5abe2ae 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -248,6 +248,7 @@ def release_to_elasticsearch(entity, force_bool=True):
         # special case as a demo for now.
         if release.container_id == "hl5g6d5msjcl7hlbyyvcsbhc2u" \
                 and release.release_year in (2011, 2013) \
+                and release.issue \
                 and release.issue.isdigit() \
                 and t['first_page']:
             t['ia_microfilm_url'] = "https://archive.org/details/sim_bjog_{}-{:02d}/page/n{}".format(
-- 
cgit v1.2.3


From 741c7b1efae5e39f3ee2c082e3ca28c6c5c85b76 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Fri, 31 Jan 2020 13:31:59 -0800
Subject: ES releases: host/domain fixes

---
 python/fatcat_tools/transforms/elasticsearch.py | 4 ++--
 python/tests/transform_elasticsearch.py         | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'python/fatcat_tools/transforms/elasticsearch.py')

diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index b5abe2ae..f8bc05fb 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -502,7 +502,7 @@ def file_to_elasticsearch(entity):
     )
 
     parsed_urls = [tldextract.extract(u.url) for u in entity.urls]
-    t['hosts'] = list(set(['.'.join(pu) for pu in parsed_urls]))
+    t['hosts'] = list(set(['.'.join([seg for seg in pu if seg]) for pu in parsed_urls]))
     t['domains'] = list(set([pu.registered_domain for pu in parsed_urls]))
     t['rels'] = list(set([u.rel for u in entity.urls]))
 
@@ -512,6 +512,6 @@ def file_to_elasticsearch(entity):
     # ok, but actually remove archive.org hosts, because they make other
     # aggregations hard and are a waste of storage
     t['hosts'] = [h for h in t['hosts'] if h not in ('archive.org', 'web.archive.org')]
-    t['domains'] = [h for h in t['hosts'] if h not in ('archive.org')]
+    t['domains'] = [h for h in t['domains'] if h not in ('archive.org')]
 
     return t
diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py
index e67681c6..c94ab375 100644
--- a/python/tests/transform_elasticsearch.py
+++ b/python/tests/transform_elasticsearch.py
@@ -110,6 +110,9 @@ def test_elasticsearch_file_transform(matched_importer):
     assert 'web' in es['rels']
     assert 'www.zhros.ru' in es['hosts']
     assert 'zhros.ru' in es['domains']
+    assert not '.archive.org' in (es['hosts'] + es['domains'])
+    assert not 'archive.org' in (es['hosts'] + es['domains'])
+    assert not 'web.archive.org' in (es['hosts'] + es['domains'])
 
 def test_elasticsearch_changelog_transform(matched_importer):
     ce = entity_from_json(open('./tests/files/changelog_3469683.json', 'r').read(), ChangelogEntry)
-- 
cgit v1.2.3


From 83387210e6775751e5eb690a7d8b56fe99dbe380 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Fri, 7 Feb 2020 14:38:13 -0800
Subject: ES files: don't remove archive.org domains/hosts

---
 python/fatcat_tools/transforms/elasticsearch.py | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'python/fatcat_tools/transforms/elasticsearch.py')

diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index f8bc05fb..e00d7830 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -509,9 +509,4 @@ def file_to_elasticsearch(entity):
     t['in_ia'] = bool('archive.org' in t['domains'])
     t['in_ia_petabox'] = bool('archive.org' in t['hosts'])
 
-    # ok, but actually remove archive.org hosts, because they make other
-    # aggregations hard and are a waste of storage
-    t['hosts'] = [h for h in t['hosts'] if h not in ('archive.org', 'web.archive.org')]
-    t['domains'] = [h for h in t['domains'] if h not in ('archive.org')]
-
     return t
-- 
cgit v1.2.3


From 0450f22006c9b991cdc4695458fc3b3e3e97bfbb Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Wed, 26 Feb 2020 11:22:30 -0800
Subject: ES release: last minor tweaks

---
 extra/elasticsearch/release_schema.json         | 8 +++++---
 python/fatcat_tools/transforms/elasticsearch.py | 4 ++--
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'python/fatcat_tools/transforms/elasticsearch.py')

diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json
index 1b91696c..666a672f 100644
--- a/extra/elasticsearch/release_schema.json
+++ b/extra/elasticsearch/release_schema.json
@@ -52,8 +52,8 @@
             "release_stage":  { "type": "keyword", "normalizer": "default" },
             "withdrawn_status": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" },
             "language":       { "type": "keyword", "normalizer": "default" },
-            "country":        { "type": "keyword", "normalizer": "default" },
-            "country_upper":  { "type": "keyword", "normalizer": "caseSensitive" },
+            "country_code":        { "type": "keyword", "normalizer": "default" },
+            "country_code_upper":  { "type": "keyword", "normalizer": "caseSensitive" },
             "volume":         { "type": "keyword", "normalizer": "default", "copy_to": "biblio" },
             "issue":          { "type": "keyword", "normalizer": "default", "copy_to": "biblio" },
             "pages":          { "type": "keyword", "normalizer": "default", "copy_to": "biblio" },
@@ -71,8 +71,10 @@
             "jstor_id":       { "type": "keyword", "normalizer": "default", "doc_values": false },
             "ark_id":         { "type": "keyword", "normalizer": "default", "doc_values": false },
             "mag_id":         { "type": "keyword", "normalizer": "default", "doc_values": false },
+            "s2_id":          { "type": "keyword", "normalizer": "default", "doc_values": false },
             "license":        { "type": "keyword", "normalizer": "default" },
             "publisher":            { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+            "publisher_type":       { "type": "keyword", "normalizer": "default" },
             "container_name":       { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
             "container_id":         { "type": "keyword", "normalizer": "default" },
             "container_issnl":      { "type": "keyword", "normalizer": "default" },
@@ -110,7 +112,7 @@
             "is_retracted":         { "type": "boolean" },
             "preservation":         { "type": "keyword", "normalizer": "default" },
 
-            "affiliation":     { "type": "alias", "path": "affiliations" },
+            "affiliation":    { "type": "alias", "path": "affiliations" },
             "ror":            { "type": "alias", "path": "affiliation_rors" },
             "creator_id":     { "type": "alias", "path": "creator_ids" },
             "ref_release_id": { "type": "alias", "path": "ref_release_ids" },
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index e00d7830..cbafca7e 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -156,8 +156,8 @@ def release_to_elasticsearch(entity, force_bool=True):
                 if c_extra['szczepanski'].get('as_of'):
                     is_oa = True
             if c_extra.get('country'):
-                t['country'] = c_extra['country']
-                t['country_upper'] = c_extra['country'].upper()
+                t['country_code'] = c_extra['country']
+                t['country_code_upper'] = c_extra['country'].upper()
 
     # fall back to release-level container metadata if container not linked or
     # missing context
-- 
cgit v1.2.3


From 4e6bc246d01183f4c7ffad7d0d474e683f04c07f Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Wed, 26 Feb 2020 11:28:05 -0800
Subject: ES container last tweaks

---
 extra/elasticsearch/container_schema.json       | 7 ++++---
 python/fatcat_tools/transforms/elasticsearch.py | 3 +++
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'python/fatcat_tools/transforms/elasticsearch.py')

diff --git a/extra/elasticsearch/container_schema.json b/extra/elasticsearch/container_schema.json
index be3a408e..5cd85b04 100644
--- a/extra/elasticsearch/container_schema.json
+++ b/extra/elasticsearch/container_schema.json
@@ -47,11 +47,12 @@
             "publisher":      { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
             "abbrev":         { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
             "aliases":        { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
+            "publisher_type": { "type": "keyword", "normalizer": "default" },
             "container_type": { "type": "keyword", "normalizer": "default" },
             "issnl":          { "type": "keyword", "normalizer": "default" },
             "issns":          { "type": "keyword", "normalizer": "default" },
             "wikidata_qid":   { "type": "keyword", "normalizer": "default" },
-            "country":        { "type": "keyword", "normalizer": "default" },
+            "country_code":   { "type": "keyword", "normalizer": "default" },
             "region":         { "type": "keyword", "normalizer": "default" },
             "discipline":     { "type": "keyword", "normalizer": "default" },
             "languages":      { "type": "keyword", "normalizer": "default" },
@@ -74,8 +75,8 @@
             "releases_total": { "type": "integer" },
             "releases_kbart": { "type": "integer" },
             "releases_ia":    { "type": "integer" },
-            "releases_sim":   { "type": "integer" },
-            "releases_shadow":          { "type": "integer" },
+            "releases_ia_sim":          { "type": "integer" },
+            "releases_shadows":         { "type": "integer" },
             "releases_any_file":        { "type": "integer" },
             "releases_any_fileset":     { "type": "integer" },
             "releases_any_webcapture":  { "type": "integer" },
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index cbafca7e..8581febd 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -342,6 +342,9 @@ def container_to_elasticsearch(entity, force_bool=True):
         if entity.extra.get(key):
             t[key] = entity.extra[key]
 
+    if 'country' in t:
+        t['country_code'] = t.pop('country')
+
     t['issns'] = []
     if entity.issnl:
         t['issns'].append(entity.issnl)
-- 
cgit v1.2.3


From 81e0784813500a39955c20278140e25d7940d9c6 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Wed, 26 Feb 2020 22:04:35 -0800
Subject: improve is_oa flag accuracy

Particularly, the ezb=green match seems mostly incorrect.

Note that a release with an assigned pmcid might still be within an embargo window.
---
 proposals/2020_elasticsearch_schemas.md         |  4 ++--
 python/fatcat_tools/transforms/elasticsearch.py | 12 ++++--------
 2 files changed, 6 insertions(+), 10 deletions(-)

(limited to 'python/fatcat_tools/transforms/elasticsearch.py')

diff --git a/proposals/2020_elasticsearch_schemas.md b/proposals/2020_elasticsearch_schemas.md
index 5fb28d19..c3e79073 100644
--- a/proposals/2020_elasticsearch_schemas.md
+++ b/proposals/2020_elasticsearch_schemas.md
@@ -33,8 +33,8 @@ status (from `in_kbart`, `in_ia`, etc) to a `preservation_status` flag which
 is:
 
 - `bright`
-- `dark_only`
-- `shadow_only`
+- `dark`
+- `shadows_only`
 - `none`
 
 Note that these don't align with OA color or work-level preservation (aka, no
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 8581febd..87e054ec 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -149,9 +149,6 @@ def release_to_elasticsearch(entity, force_bool=True):
             if c_extra.get('road'):
                 if c_extra['road'].get('as_of'):
                     is_oa = True
-            if c_extra.get('ezb'):
-                if c_extra['ezb'].get('color') == 'green':
-                    is_oa = True
             if c_extra.get('szczepanski'):
                 if c_extra['szczepanski'].get('as_of'):
                     is_oa = True
@@ -210,6 +207,8 @@ def release_to_elasticsearch(entity, force_bool=True):
         # TODO: more/better checks here, particularly strict *not* OA licenses
         if release.license_slug.startswith("CC-"):
             is_oa = True
+        if release.license_slug.startswith("ARXIV-"):
+            is_oa = True
 
     extra = release.extra or dict()
     if extra:
@@ -293,10 +292,10 @@ def release_to_elasticsearch(entity, force_bool=True):
     t['in_ia'] = bool(in_ia)
     t['is_preserved'] = bool(is_preserved or in_ia or in_kbart or in_jstor)
 
-    if in_ia:
+    if in_ia or t.get('pmcid') or t.get('arxiv_id'):
         t['preservation'] = 'bright'
     elif in_kbart or in_jstor:
-        t['preservation'] = 'dark_only'
+        t['preservation'] = 'dark'
     elif in_shadows:
         t['preservation'] = 'shadows_only'
     else:
@@ -367,9 +366,6 @@ def container_to_elasticsearch(entity, force_bool=True):
     if extra.get('road'):
         if extra['road'].get('as_of'):
             in_road = True
-    if extra.get('ezb'):
-        if extra['ezb'].get('color') == 'green':
-            is_oa = True
     if extra.get('szczepanski'):
         if extra['szczepanski'].get('as_of'):
             is_oa = True
-- 
cgit v1.2.3