Diffstat (limited to 'python/fatcat_tools')
-rw-r--r--  python/fatcat_tools/importers/__init__.py        |   1
-rw-r--r--  python/fatcat_tools/importers/common.py          |   9
-rw-r--r--  python/fatcat_tools/importers/ingest.py          |  14
-rw-r--r--  python/fatcat_tools/importers/shadow.py          | 195
-rw-r--r--  python/fatcat_tools/transforms/__init__.py       |   2
-rw-r--r--  python/fatcat_tools/transforms/elasticsearch.py  | 242
-rw-r--r--  python/fatcat_tools/workers/changelog.py         |  40
7 files changed, 453 insertions(+), 50 deletions(-)
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 03c7cbcc..c26446fd 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -28,3 +28,4 @@ from .arabesque import ArabesqueMatchImporter, ARABESQUE_MATCH_WHERE_CLAUSE
 from .wayback_static import auto_wayback_static
 from .cdl_dash_dat import auto_cdl_dash_dat
 from .ingest import IngestFileResultImporter, SavePaperNowFileImporter
+from .shadow import ShadowLibraryImporter
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 5f5c46b8..c000ad62 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -194,6 +194,8 @@ DOMAIN_REL_MAP = {
     "www.scielo.cl": "repository",
     "www.scielo.org.mx": "repository",
     "zenodo.org": "repository",
+    "www.biorxiv.org": "repository",
+    "www.medrxiv.org": "repository",
 
     "citeseerx.ist.psu.edu": "aggregator",
     "publisher-connector.core.ac.uk": "aggregator",
@@ -220,6 +222,13 @@ DOMAIN_REL_MAP = {
     "www.nature.com": "publisher",
     "www.pnas.org": "publisher",
     "www.tandfonline.com": "publisher",
+    "www.frontiersin.org": "publisher",
+    "www.degruyter.com": "publisher",
+    "www.mdpi.com": "publisher",
+    "www.ahajournals.org": "publisher",
+    "ehp.niehs.nih.gov": "publisher",
+    "journals.tsu.ru": "publisher",
+    "www.cogentoa.com": "publisher",
 
     "www.researchgate.net": "academicsocial",
     "academia.edu": "academicsocial",
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index fdaba176..4772bfaa 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -32,8 +32,11 @@ class IngestFileResultImporter(EntityImporter):
             'fatcat-ingest',
             'arabesque',
             'mag-corpus',
+            'mag',
             'unpaywall-corpus',
+            'unpaywall',
             's2-corpus',
+            's2',
         ]
         if kwargs.get('skip_source_whitelist', False):
             self.ingest_request_source_whitelist = []
@@ -137,7 +140,12 @@ class IngestFileResultImporter(EntityImporter):
         if not 'terminal_dt' in terminal:
             terminal['terminal_dt'] = terminal['dt']
         assert len(terminal['terminal_dt']) == 14
-        url = make_rel_url(terminal['terminal_url'], self.default_link_rel)
+
+        default_rel = self.default_link_rel
+        if request.get('link_source') == 'doi':
+            default_rel = 'publisher'
+        default_rel = request.get('rel', default_rel)
+        url = make_rel_url(terminal['terminal_url'], default_rel)
 
         if not url:
             self.counts['skip-url'] += 1
@@ -158,8 +166,8 @@ class IngestFileResultImporter(EntityImporter):
             release_ids=[release_ident],
             urls=urls,
         )
-        if fatcat and fatcat.get('edit_extra'):
-            fe.edit_extra = fatcat['edit_extra']
+        if request.get('edit_extra'):
+            fe.edit_extra = request['edit_extra']
         else:
             fe.edit_extra = dict()
         if request.get('ingest_request_source'):
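The reworked default_rel logic in ingest.py gives an explicit 'rel' on the ingest request top priority, falls back to 'publisher' when the request came from a DOI link source, and only then uses the importer-wide default. A minimal standalone sketch of that precedence (function name and request dicts are ours, for illustration only):

    def pick_rel(request, importer_default='web'):
        # same precedence as the hunk above: request 'rel', then DOI
        # link_source, then the importer-wide default
        default_rel = importer_default
        if request.get('link_source') == 'doi':
            default_rel = 'publisher'
        return request.get('rel', default_rel)

    assert pick_rel({'link_source': 'doi'}) == 'publisher'
    assert pick_rel({'link_source': 'doi', 'rel': 'repository'}) == 'repository'
    assert pick_rel({'link_source': 'unpaywall'}) == 'web'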
diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py
new file mode 100644
index 00000000..4cd22775
--- /dev/null
+++ b/python/fatcat_tools/importers/shadow.py
@@ -0,0 +1,195 @@
+
+import sys
+import json
+import sqlite3
+import itertools
+import fatcat_openapi_client
+
+from fatcat_tools.normal import *
+from .common import EntityImporter, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS
+
+
+class ShadowLibraryImporter(EntityImporter):
+    """
+    Importer for shadow library files (matched to releases)
+
+    Input format is JSON with keys:
+    - shadow
+        - shadow_corpus (string slug)
+        - shadow_id (string)
+        - doi
+        - pmid
+        - isbn13
+    - file_meta
+        - sha1hex
+        - sha256hex
+        - md5hex
+        - size_bytes
+        - mimetype
+    - cdx (may be null)
+        - url
+        - datetime
+    """
+
+    def __init__(self, api, **kwargs):
+
+        eg_desc = kwargs.pop('editgroup_description', None) or "Import of 'Shadow Library' file/release matches"
+        eg_extra = kwargs.pop('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ShadowLibraryImporter')
+        super().__init__(api,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            **kwargs)
+        self.default_link_rel = kwargs.get("default_link_rel", "web")
+
+    def want(self, raw_record):
+        """
+        Only want to import records with complete file-level metadata
+        """
+        fm = raw_record['file_meta']
+        if not (fm['mimetype'] and fm['md5hex'] and fm['sha256hex'] and fm['size_bytes']):
+            self.counts['skip-file-meta-incomplete'] += 1
+            return False
+        if fm['mimetype'] != 'application/pdf':
+            self.counts['skip-not-pdf'] += 1
+            return False
+        return True
+
+    def parse_record(self, obj):
+        """
+        We do the release lookup in this method. Try DOI, then PMID, last ISBN13.
+        """
+
+        shadow_corpus = obj['shadow']['shadow_corpus']
+        assert shadow_corpus == shadow_corpus.strip().lower()
+        doi = clean_doi(obj['shadow'].get('doi'))
+        pmid = clean_pmid(obj['shadow'].get('pmid'))
+        isbn13 = clean_isbn13(obj['shadow'].get('isbn13'))
+        shadow_id = obj['shadow'].get('shadow_id').strip()
+        assert shadow_id
+
+        extra = { '{}_id'.format(shadow_corpus): shadow_id }
+        for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]:
+            if not ext_id:
+                continue
+            extra['{}_{}'.format(shadow_corpus, ext_type)] = ext_id
+
+        # lookup release via several idents
+        re = None
+        for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]:
+            if not ext_id:
+                continue
+            try:
+                re = self.api.lookup_release(**{ext_type: ext_id})
+            except fatcat_openapi_client.rest.ApiException as err:
+                if err.status not in (404, 400):
+                    raise err
+                re = None
+            if re:
+                break
+
+        if not re:
+            self.counts['skip-release-not-found'] += 1
+            return None
+
+        release_ids = [re.ident,]
+
+        # parse single CDX into URLs (if exists)
+        urls = []
+        if obj.get('cdx'):
+            url = make_rel_url(obj['cdx']['url'], default_link_rel=self.default_link_rel)
+            if url != None:
+                urls.append(url)
+            wayback = "https://web.archive.org/web/{}/{}".format(
+                obj['cdx']['datetime'],
+                obj['cdx']['url'])
+            urls.append(("webarchive", wayback))
+        urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]
+
+        fe = fatcat_openapi_client.FileEntity(
+            md5=obj['file_meta']['md5hex'],
+            sha1=obj['file_meta']['sha1hex'],
+            sha256=obj['file_meta']['sha256hex'],
+            size=int(obj['file_meta']['size_bytes']),
+            mimetype=obj['file_meta']['mimetype'] or None,
+            release_ids=release_ids,
+            urls=urls,
+            extra=dict(shadows=extra),
+        )
+        return fe
+
+    def try_update(self, fe):
+        # lookup sha1, or create new entity
+        existing = None
+        try:
+            existing = self.api.lookup_file(sha1=fe.sha1)
+        except fatcat_openapi_client.rest.ApiException as err:
+            if err.status != 404:
+                raise err
+
+        if not existing:
+            return True
+
+        if not existing.extra:
+            existing.extra = {}
+
+        if existing.extra.get('shadows') and list(fe.extra['shadows'].keys())[0] in existing.extra['shadows']:
+            # already imported from this shadow library; skip
+            self.counts['exists'] += 1
+            return False
+
+        # check for edit conflicts
+        if existing.ident in [e.ident for e in self._edits_inflight]:
+            self.counts['skip-update-inflight'] += 1
+            return False
+        if fe.sha1 in [e.sha1 for e in self._edits_inflight]:
+            raise Exception("Inflight insert; shouldn't happen")
+
+        # minimum viable "existing" URL cleanup to fix dupes and broken links:
+        # remove 'None' wayback URLs, and set archive.org rel 'archive'
+        existing.urls = [u for u in existing.urls if not ('://web.archive.org/web/None/' in u.url)]
+        for i in range(len(existing.urls)):
+            u = existing.urls[i]
+            if u.rel == 'repository' and '://archive.org/download/' in u.url:
+                existing.urls[i].rel = 'archive'
+            if u.rel == 'social':
+                u.rel = 'academicsocial'
+
+        # merge the existing into this one and update
+        merged_urls = {}
+        for u in fe.urls + existing.urls:
+            merged_urls[u.url] = u
+        existing.urls = list(merged_urls.values())
+        if not existing.extra.get('shadows'):
+            existing.extra['shadows'] = fe.extra['shadows']
+        else:
+            existing.extra['shadows'].update(fe.extra['shadows'])
+
+        # do these "plus ones" because we really want to do these updates when possible
+        if len(existing.urls) > SANE_MAX_URLS + 1:
+            self.counts['skip-update-too-many-url'] += 1
+            return None
+        existing.release_ids = list(set(fe.release_ids + existing.release_ids))
+        if len(existing.release_ids) > SANE_MAX_RELEASES + 1:
+            self.counts['skip-update-too-many-releases'] += 1
+            return None
+        existing.mimetype = existing.mimetype or fe.mimetype
+        existing.size = existing.size or fe.size
+        existing.md5 = existing.md5 or fe.md5
+        existing.sha1 = existing.sha1 or fe.sha1
+        existing.sha256 = existing.sha256 or fe.sha256
+        edit = self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
+        # add sha1 to non-entity edit row, so we can do more aggressive
+        # group-level de-dupe
+        edit.sha1 = existing.sha1
+        self._edits_inflight.append(edit)
+        self.counts['update'] += 1
+        return False
+
+    def insert_batch(self, batch):
+        self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch(
+            editgroup=fatcat_openapi_client.Editgroup(
+                description=self.editgroup_description,
+                extra=self.editgroup_extra),
+            entity_list=batch))
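For reference, a hypothetical input line for ShadowLibraryImporter following the docstring schema above (the corpus slug, identifiers, and sizes are invented, and the hash values are elided):

    {"shadow": {"shadow_corpus": "scimag", "shadow_id": "12345678", "doi": "10.1234/example", "pmid": null, "isbn13": null},
     "file_meta": {"sha1hex": "...", "sha256hex": "...", "md5hex": "...", "size_bytes": 254871, "mimetype": "application/pdf"},
     "cdx": {"url": "https://example.com/fulltext.pdf", "datetime": "20200102030405"}}

Note that want() skips any record missing mimetype, md5hex, sha256hex, or size_bytes, so only the cdx block may be null.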
diff --git a/python/fatcat_tools/transforms/__init__.py b/python/fatcat_tools/transforms/__init__.py
index 6a4b1bba..3f4700ff 100644
--- a/python/fatcat_tools/transforms/__init__.py
+++ b/python/fatcat_tools/transforms/__init__.py
@@ -1,5 +1,5 @@
 from .entities import entity_to_dict, entity_from_json, entity_from_dict
-from .elasticsearch import release_to_elasticsearch, container_to_elasticsearch, changelog_to_elasticsearch
+from .elasticsearch import release_to_elasticsearch, container_to_elasticsearch, changelog_to_elasticsearch, file_to_elasticsearch
 from .csl import release_to_csl, citeproc_csl
 from .ingest import release_ingest_request
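With file_to_elasticsearch now exported from the transforms package, indexing code can treat file entities the same way as releases and containers. A usage sketch, assuming the existing fatcat_tools.public_api helper and a hypothetical file ident:

    from fatcat_tools import public_api
    from fatcat_tools.transforms import file_to_elasticsearch

    api = public_api('https://api.fatcat.wiki/v0')
    fe = api.get_file('aaaaaaaaaaaaamztaaaaaaaaai')  # hypothetical ident
    es_doc = file_to_elasticsearch(fe)
    print(es_doc['sha1'], es_doc['domains'], es_doc['in_ia'])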
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 3a53db4d..87e054ec 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -1,6 +1,6 @@
-
 import collections
+import tldextract
 
 from fatcat_openapi_client import ApiClient
 
@@ -20,6 +20,7 @@ def test_check_kbart():
     assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1990, 2000]])) == False
     assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1930, 2000]])) == True
 
+
 def release_to_elasticsearch(entity, force_bool=True):
     """
     Converts from an entity model/schema to elasticsearch oriented schema.
@@ -50,6 +51,10 @@ def release_to_elasticsearch(entity, force_bool=True):
         release_stage = release.release_stage,
         withdrawn_status = release.withdrawn_status,
         language = release.language,
+        volume = release.volume,
+        issue = release.issue,
+        pages = release.pages,
+        number = release.number,
         license = release.license_slug,
         doi = release.ext_ids.doi,
         pmid = release.ext_ids.pmid,
@@ -72,7 +77,7 @@ def release_to_elasticsearch(entity, force_bool=True):
     in_dweb = False
     in_ia = False
     in_ia_sim = False
-    in_shadow = False
+    in_shadows = False
 
     release_year = release.release_year
     if release.release_date:
@@ -85,11 +90,15 @@ def release_to_elasticsearch(entity, force_bool=True):
 
     t['any_abstract'] = len(release.abstracts or []) > 0
     t['ref_count'] = len(release.refs or [])
-    t['ref_linked_count'] = 0
-    if release.refs:
-        t['ref_linked_count'] = len([1 for ref in release.refs if ref.target_release_id])
+    ref_release_ids = []
+    for r in (release.refs or []):
+        if r.target_release_id:
+            ref_release_ids.append(r.target_release_id)
+    t['ref_release_ids'] = ref_release_ids
+    t['ref_linked_count'] = len(ref_release_ids)
     t['contrib_count'] = len(release.contribs or [])
     contrib_names = []
+    contrib_affiliations = []
     creator_ids = []
     for c in (release.contribs or []):
         if c.raw_name:
@@ -98,8 +107,14 @@ def release_to_elasticsearch(entity, force_bool=True):
             contrib_names.append(c.surname)
         if c.creator_id:
             creator_ids.append(c.creator_id)
+        if c.raw_affiliation:
+            contrib_affiliations.append(c.raw_affiliation)
     t['contrib_names'] = contrib_names
     t['creator_ids'] = creator_ids
+    t['affiliations'] = contrib_affiliations
+
+    # TODO: mapping... probably by lookup?
+    t['affiliation_rors'] = None
 
     container = release.container
     if container:
@@ -134,14 +149,19 @@ def release_to_elasticsearch(entity, force_bool=True):
         if c_extra.get('road'):
             if c_extra['road'].get('as_of'):
                 is_oa = True
-        if c_extra.get('ezb'):
-            if c_extra['ezb'].get('color') == 'green':
-                is_oa = True
         if c_extra.get('szczepanski'):
             if c_extra['szczepanski'].get('as_of'):
                 is_oa = True
-    else:
+        if c_extra.get('country'):
+            t['country_code'] = c_extra['country']
+            t['country_code_upper'] = c_extra['country'].upper()
+
+    # fall back to release-level container metadata if container not linked or
+    # missing context
+    if not t.get('publisher'):
         t['publisher'] = release.publisher
+    if not t.get('container_name') and release.extra:
+        t['container_name'] = release.extra.get('container_name')
 
     if release.ext_ids.jstor or (release.ext_ids.doi and release.ext_ids.doi.startswith('10.2307/')):
         in_jstor = True
@@ -187,6 +207,8 @@ def release_to_elasticsearch(entity, force_bool=True):
         # TODO: more/better checks here, particularly strict *not* OA licenses
         if release.license_slug.startswith("CC-"):
             is_oa = True
+        if release.license_slug.startswith("ARXIV-"):
+            is_oa = True
 
     extra = release.extra or dict()
     if extra:
@@ -203,6 +225,47 @@ def release_to_elasticsearch(entity, force_bool=True):
             if extra['crossref'].get('archive'):
                 # all crossref archives are KBART, I believe
                 in_kbart = True
+        # backwards compatible subtitle fetching
+        if not t['subtitle'] and extra.get('subtitle'):
+            if type(extra['subtitle']) == list:
+                t['subtitle'] = extra['subtitle'][0]
+            else:
+                t['subtitle'] = extra['subtitle']
+
+    t['first_page'] = None
+    if release.pages:
+        first = release.pages.split('-')[0]
+        first = first.replace('p', '')
+        if first.isdigit():
+            t['first_page'] = first
+        # TODO: non-numerical first pages
+
+    t['ia_microfilm_url'] = None
+    if in_ia_sim:
+        # TODO: determine URL somehow? I think this is in flux. Will probably
+        # need extra metadata in the container extra field.
+        # special case as a demo for now.
+        if release.container_id == "hl5g6d5msjcl7hlbyyvcsbhc2u" \
+                and release.release_year in (2011, 2013) \
+                and release.issue \
+                and release.issue.isdigit() \
+                and t['first_page']:
+            t['ia_microfilm_url'] = "https://archive.org/details/sim_bjog_{}-{:02d}/page/n{}".format(
+                release.release_year,
+                int(release.issue) - 1,
+                t['first_page'],
+            )
+
+    t['doi_registrar'] = None
+    if extra and t['doi']:
+        for k in ('crossref', 'datacite', 'jalc'):
+            if k in extra:
+                t['doi_registrar'] = k
+        if not 'doi_registrar' in t:
+            t['doi_registrar'] = 'crossref'
+
+    if t['doi']:
+        t['doi_prefix'] = t['doi'].split('/')[0]
 
     if is_longtail_oa:
         is_oa = True
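Two derived fields land in the hunk above: doi_prefix is just the registrant prefix of the DOI, and doi_registrar records which registrar's metadata blob is present in extra. Because the loop keeps overwriting, 'jalc' wins over 'datacite', which wins over 'crossref' when several are present (and since t['doi_registrar'] is pre-set to None, the key is always in t, so the 'crossref' fallback branch can never fire). A quick illustration with an invented extra dict:

    extra = {'crossref': {}, 'datacite': {}}
    doi = '10.1234/abc.def'

    registrar = None
    for k in ('crossref', 'datacite', 'jalc'):
        if k in extra:
            registrar = k
    assert registrar == 'datacite'          # last match wins
    assert doi.split('/')[0] == '10.1234'   # becomes t['doi_prefix']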
@@ -215,6 +278,7 @@ def release_to_elasticsearch(entity, force_bool=True):
         t['in_jstor'] = bool(in_jstor)
         t['in_web'] = bool(in_web)
         t['in_dweb'] = bool(in_dweb)
+        t['in_shadows'] = bool(in_shadows)
     else:
         t['is_oa'] = is_oa
         t['is_longtail_oa'] = is_longtail_oa
@@ -223,11 +287,23 @@ def release_to_elasticsearch(entity, force_bool=True):
         t['in_jstor'] = in_jstor
         t['in_web'] = in_web
         t['in_dweb'] = in_dweb
+        t['in_shadows'] = in_shadows
 
     t['in_ia'] = bool(in_ia)
     t['is_preserved'] = bool(is_preserved or in_ia or in_kbart or in_jstor)
+
+    if in_ia or t.get('pmcid') or t.get('arxiv_id'):
+        t['preservation'] = 'bright'
+    elif in_kbart or in_jstor:
+        t['preservation'] = 'dark'
+    elif in_shadows:
+        t['preservation'] = 'shadows_only'
+    else:
+        t['preservation'] = 'none'
+
     return t
 
+
 def container_to_elasticsearch(entity, force_bool=True):
     """
     Converts from an entity model/schema to elasticsearch oriented schema.
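The new 'preservation' field above collapses the individual preservation booleans into a single precedence ladder. A standalone sketch of the same logic (the function name is ours, not the source's):

    def preservation_status(in_ia, pmcid, arxiv_id, in_kbart, in_jstor, in_shadows):
        # same precedence as the hunk above: bright > dark > shadows_only > none
        if in_ia or pmcid or arxiv_id:
            return 'bright'
        elif in_kbart or in_jstor:
            return 'dark'
        elif in_shadows:
            return 'shadows_only'
        return 'none'

    # a file held in both a shadow library and a KBART archive counts as 'dark'
    assert preservation_status(False, None, None, True, False, True) == 'dark'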
@@ -257,23 +333,27 @@ def container_to_elasticsearch(entity, force_bool=True):
         wikidata_qid = entity.wikidata_qid,
     )
 
-    # TODO: region, discipline
-    # TODO: single primary language?
     if not entity.extra:
         entity.extra = dict()
-    for key in ('country', 'languages', 'mimetypes', 'first_year', 'last_year'):
+    for key in ('country', 'languages', 'mimetypes', 'original_name',
+            'first_year', 'last_year', 'aliases', 'abbrev', 'region',
+            'discipline'):
         if entity.extra.get(key):
             t[key] = entity.extra[key]
 
+    if 'country' in t:
+        t['country_code'] = t.pop('country')
+
+    t['issns'] = []
+    if entity.issnl:
+        t['issns'].append(entity.issnl)
+    for key in ('issnp', 'issne'):
+        if entity.extra.get(key):
+            t['issns'].append(entity.extra[key])
+
     in_doaj = None
     in_road = None
-    # TODO: not currently implemented
-    in_doi = None
-    # TODO: would be nice to have 'in_doaj_works', or maybe just "any_pid"
-    #in_doaj_works = None
-    in_sherpa_romeo = None
     is_oa = None
-    # TODO: not actually set/stored anywhere?
     is_longtail_oa = None
     any_kbart = None
     any_jstor = None
@@ -286,17 +366,15 @@ def container_to_elasticsearch(entity, force_bool=True):
         if extra.get('road'):
             if extra['road'].get('as_of'):
                 in_road = True
-        if extra.get('ezb'):
-            if extra['ezb'].get('color') == 'green':
-                is_oa = True
         if extra.get('szczepanski'):
             if extra['szczepanski'].get('as_of'):
                 is_oa = True
         if extra.get('default_license'):
             if extra['default_license'].startswith('CC-'):
                 is_oa = True
+        t['sherpa_romeo_color'] = None
         if extra.get('sherpa_romeo'):
-            in_sherpa_romeo = True
+            t['sherpa_romeo_color'] = extra['sherpa_romeo'].get('color')
             if extra['sherpa_romeo'].get('color') == 'white':
                 is_oa = False
         if extra.get('kbart'):
@@ -306,54 +384,128 @@ def container_to_elasticsearch(entity, force_bool=True):
         if extra.get('ia'):
             if extra['ia'].get('sim'):
                 any_ia_sim = True
+            if extra['ia'].get('longtail_oa'):
+                is_longtail_oa = True
 
     t['is_superceded'] = bool(extra.get('superceded'))
     t['in_doaj'] = bool(in_doaj)
     t['in_road'] = bool(in_road)
-    t['in_sherpa_romeo'] = bool(in_sherpa_romeo)
     t['any_kbart'] = bool(any_kbart)
-    t['is_longtail_oa'] = bool(is_longtail_oa)
     if force_bool:
-        t['in_doi'] = bool(in_doi)
-        t['is_oa'] = bool(in_doaj or in_road or is_longtail_oa or is_oa)
+        t['is_oa'] = bool(in_doaj or in_road or is_oa)
+        t['is_longtail_oa'] = bool(is_longtail_oa)
         t['any_jstor'] = bool(any_jstor)
         t['any_ia_sim'] = bool(any_ia_sim)
     else:
-        t['in_doi'] = in_doi
-        t['is_oa'] = in_doaj or in_road or is_longtail_oa or is_oa
+        t['is_oa'] = in_doaj or in_road or is_oa
+        t['is_longtail_oa'] = is_longtail_oa
         t['any_jstor'] = any_jstor
         t['any_ia_sim'] = any_ia_sim
     return t
 
 
+def _type_of_edit(edit):
+    if edit.revision == None and edit.redirect_ident == None:
+        return 'delete'
+    elif edit.redirect_ident:
+        # redirect
+        return 'update'
+    elif edit.prev_revision == None and edit.redirect_ident == None and edit.revision:
+        return 'create'
+    else:
+        return 'update'
+
+
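_type_of_edit() classifies an edit purely from which revision pointers are set. A quick sanity table (edit objects here are SimpleNamespace stand-ins, not real API models):

    from types import SimpleNamespace as NS

    assert _type_of_edit(NS(revision=None, redirect_ident=None, prev_revision=None)) == 'delete'
    assert _type_of_edit(NS(revision='rev2', redirect_ident='other-ident', prev_revision=None)) == 'update'  # redirect
    assert _type_of_edit(NS(revision='rev1', redirect_ident=None, prev_revision=None)) == 'create'
    assert _type_of_edit(NS(revision='rev2', redirect_ident=None, prev_revision='rev1')) == 'update'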
+ """ editgroup = entity.editgroup t = dict( index=entity.index, editgroup_id=entity.editgroup_id, - timestamp=entity.timestamp, + timestamp=entity.timestamp.isoformat(), editor_id=editgroup.editor_id, + username=editgroup.editor.username, + is_bot=editgroup.editor.is_bot, + is_admin=editgroup.editor.is_admin, ) extra = editgroup.extra or dict() if extra.get('agent'): t['agent'] = extra['agent'] - t['containers'] = len(editgroup.edits.containers) - t['creators'] = len(editgroup.edits.containers) - t['files'] = len(editgroup.edits.containers) - t['filesets'] = len(editgroup.edits.containers) - t['webcaptures'] = len(editgroup.edits.containers) - t['releases'] = len(editgroup.edits.containers) - t['works'] = len(editgroup.edits.containers) - - # TODO: parse and pull out counts - #created = 0 - #updated = 0 - #deleted = 0 - #t['created'] = created - #t['updated'] = updated - #t['deleted'] = deleted - #t['total'] = created + updated + deleted + containers = [_type_of_edit(e) for e in editgroup.edits.containers] + creators = [_type_of_edit(e) for e in editgroup.edits.creators] + files = [_type_of_edit(e) for e in editgroup.edits.files] + filesets = [_type_of_edit(e) for e in editgroup.edits.filesets] + webcaptures = [_type_of_edit(e) for e in editgroup.edits.webcaptures] + releases = [_type_of_edit(e) for e in editgroup.edits.releases] + works = [_type_of_edit(e) for e in editgroup.edits.works] + + t['containers'] = len(containers) + t['new_containers'] = len([e for e in containers if e == 'create']) + t['creators'] = len(creators) + t['new_creators'] = len([e for e in creators if e == 'create']) + t['files'] = len(files) + t['new_files'] = len([e for e in files if e == 'create']) + t['filesets'] = len(filesets) + t['new_filesets'] = len([e for e in filesets if e == 'create']) + t['webcaptures'] = len(webcaptures) + t['new_webcaptures'] = len([e for e in webcaptures if e == 'create']) + t['releases'] = len(releases) + t['new_releases'] = len([e for e in releases if e == 'create']) + t['works'] = len(works) + t['new_works'] = len([e for e in works if e == 'create']) + + all_edits = containers + creators + files + filesets + webcaptures + releases + works + + t['created'] = len([e for e in all_edits if e == 'create']) + t['updated'] = len([e for e in all_edits if e == 'update']) + t['deleted'] = len([e for e in all_edits if e == 'delete']) + t['total'] = len(all_edits) + return t + + +def file_to_elasticsearch(entity): + """ + Converts from an entity model/schema to elasticsearch oriented schema. 
+
+    Returns: dict
+    Raises exception on error (never returns None)
+    """
+
+    if entity.state in ('redirect', 'deleted'):
+        return dict(
+            ident = entity.ident,
+            state = entity.state,
+        )
+    elif entity.state != 'active':
+        raise ValueError("Unhandled entity state: {}".format(entity.state))
+
+    # First, the easy ones (direct copy)
+    t = dict(
+        ident = entity.ident,
+        state = entity.state,
+        revision = entity.revision,
+        release_ids = entity.release_ids,
+        release_count = len(entity.release_ids),
+        mimetype = entity.mimetype,
+        size_bytes = entity.size,
+        sha1 = entity.sha1,
+        sha256 = entity.sha256,
+        md5 = entity.md5,
+    )
+
+    parsed_urls = [tldextract.extract(u.url) for u in entity.urls]
+    t['hosts'] = list(set(['.'.join([seg for seg in pu if seg]) for pu in parsed_urls]))
+    t['domains'] = list(set([pu.registered_domain for pu in parsed_urls]))
+    t['rels'] = list(set([u.rel for u in entity.urls]))
+
+    t['in_ia'] = bool('archive.org' in t['domains'])
+    t['in_ia_petabox'] = bool('archive.org' in t['hosts'])
+    return t
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index 7a9a585d..b84d5e70 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -105,6 +105,8 @@ class EntityUpdatesWorker(FatcatWorker):
         self.live_pdf_ingest_doi_prefix_acceptlist = [
             # biorxiv and medrxiv
             "10.1101/",
+            # researchgate
+            "10.13140/",
         ]
 
     def want_live_ingest(self, release, ingest_request):
@@ -121,6 +123,33 @@ class EntityUpdatesWorker(FatcatWorker):
         ingest_type = ingest_request.get('ingest_type')
         doi = ingest_request.get('ext_ids', {}).get('doi')
 
+        is_document = release.release_type in (
+            'article-journal',
+            'paper-conference',
+            'article',
+            'report',
+            'chapter',
+            'manuscript',
+            'review',
+            'thesis',
+            'letter',
+            'editorial',
+            'abstract',
+            'entry',
+            'patent',
+            'post',
+            'review-book',
+        )
+        is_not_pdf = release.release_type in (
+            'dataset',
+            'stub',
+            'software',
+            'figure',
+            'graphic',
+        )
+
+        # accept list sets a default "crawl it" despite OA metadata for
+        # known-OA DOI prefixes
         in_acceptlist = False
         if doi:
             for prefix in self.live_pdf_ingest_doi_prefix_acceptlist:
@@ -129,9 +158,18 @@ class EntityUpdatesWorker(FatcatWorker):
 
         if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'):
             es = release_to_elasticsearch(release)
-            if not es['is_oa'] and not in_acceptlist:
+            # most datacite documents are in IRs and should be crawled
+            is_datacite_doc = False
+            if release.extra and ('datacite' in release.extra) and is_document:
+                is_datacite_doc = True
+            if not (es['is_oa'] or in_acceptlist or is_datacite_doc):
                 return False
 
+        # if ingest_type is pdf but release_type is almost certainly not a PDF,
+        # skip it. This is mostly a datacite thing.
+        if ingest_type == "pdf" and is_not_pdf:
+            return False
+
         if ingest_type == "pdf" and doi:
             for prefix in self.ingest_pdf_doi_prefix_blocklist:
                 if doi.startswith(prefix):
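Taken together, the want_live_ingest() changes mean a DataCite DOI only triggers a live PDF ingest when its release_type looks like a document, and a 'pdf' ingest is dropped outright for types that are almost certainly not PDFs. A condensed sketch of the gate (simplified: the real method also checks link_source, OA-only mode, and DOI prefix blocklists):

    def should_crawl_pdf(release_type, is_oa, in_acceptlist, is_datacite):
        # condensed from the hunks above, for illustration only
        is_document = release_type in ('article-journal', 'paper-conference', 'report')
        is_not_pdf = release_type in ('dataset', 'stub', 'software', 'figure', 'graphic')
        if not (is_oa or in_acceptlist or (is_datacite and is_document)):
            return False
        if is_not_pdf:
            return False
        return True

    assert should_crawl_pdf('article-journal', False, False, True) is True   # DataCite document
    assert should_crawl_pdf('dataset', True, False, False) is False          # never a PDF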