22 files changed, 768 insertions, 96 deletions
diff --git a/extra/stats/2020-02-19-prod-stats.json b/extra/stats/2020-02-19-prod-stats.json new file mode 100644 index 00000000..a2313233 --- /dev/null +++ b/extra/stats/2020-02-19-prod-stats.json @@ -0,0 +1 @@ +{"changelog":{"latest":{"index":3509511,"timestamp":"2020-02-20T01:42:50.980212+00:00"}},"container":{"total":148356},"papers":{"in_kbart":60523853,"in_web":19616767,"in_web_not_kbart":8937938,"is_oa":11524180,"total":105665352},"release":{"refs_total":889522285,"total":143709455}} diff --git a/extra/stats/2020-02-19-prod-table-sizes.txt b/extra/stats/2020-02-19-prod-table-sizes.txt new file mode 100644 index 00000000..cab2b52e --- /dev/null +++ b/extra/stats/2020-02-19-prod-table-sizes.txt @@ -0,0 +1,46 @@ +Size: 476.74G + + table_name | table_size | indexes_size | total_size +---------------------------------------+------------+--------------+------------ + "public"."release_contrib" | 53 GB | 43 GB | 96 GB + "public"."release_rev" | 58 GB | 33 GB | 91 GB + "public"."refs_blob" | 85 GB | 2884 MB | 88 GB + "public"."release_edit" | 14 GB | 20 GB | 34 GB + "public"."work_edit" | 13 GB | 20 GB | 34 GB + "public"."release_ident" | 9504 MB | 15 GB | 24 GB + "public"."work_ident" | 9302 MB | 15 GB | 24 GB + "public"."abstracts" | 16 GB | 1501 MB | 18 GB + "public"."file_rev_url" | 9980 MB | 3550 MB | 13 GB + "public"."work_rev" | 6038 MB | 5825 MB | 12 GB + "public"."release_ref" | 3997 MB | 5690 MB | 9686 MB + "public"."file_rev" | 3472 MB | 5103 MB | 8574 MB + "public"."file_edit" | 2934 MB | 3959 MB | 6893 MB + "public"."release_rev_abstract" | 2402 MB | 3339 MB | 5742 MB + "public"."file_ident" | 1795 MB | 2437 MB | 4231 MB + "public"."file_rev_release" | 1651 MB | 2428 MB | 4078 MB + "public"."creator_edit" | 702 MB | 942 MB | 1643 MB + "public"."creator_rev" | 695 MB | 719 MB | 1413 MB + "public"."editgroup" | 761 MB | 404 MB | 1164 MB + "public"."creator_ident" | 474 MB | 648 MB | 1121 MB + "public"."release_rev_extid" | 200 MB | 312 MB | 512 MB + "public"."changelog" | 218 MB | 214 MB | 432 MB + "public"."container_rev" | 75 MB | 23 MB | 98 MB + "public"."container_edit" | 25 MB | 31 MB | 56 MB + "public"."container_ident" | 11 MB | 19 MB | 30 MB + "public"."webcapture_rev_cdx" | 64 kB | 32 kB | 96 kB + "public"."fileset_rev_file" | 48 kB | 32 kB | 80 kB + "public"."auth_oidc" | 16 kB | 48 kB | 64 kB + "public"."fileset_edit" | 16 kB | 48 kB | 64 kB + "public"."editor" | 16 kB | 48 kB | 64 kB + "public"."webcapture_edit" | 16 kB | 48 kB | 64 kB + "public"."editgroup_annotation" | 16 kB | 48 kB | 64 kB + "public"."fileset_rev_url" | 16 kB | 32 kB | 48 kB + "public"."webcapture_rev_url" | 16 kB | 32 kB | 48 kB + "public"."fileset_rev_release" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_ident" | 8192 bytes | 32 kB | 40 kB + "public"."webcapture_rev_release" | 8192 bytes | 32 kB | 40 kB + "public"."webcapture_ident" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_rev" | 16 kB | 16 kB | 32 kB + "public"."webcapture_rev" | 16 kB | 16 kB | 32 kB + "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB | 24 kB +(41 rows) diff --git a/proposals/2020_sql_size_reduction.md b/proposals/2020_sql_size_reduction.md index f421e455..2fa39873 100644 --- a/proposals/2020_sql_size_reduction.md +++ b/proposals/2020_sql_size_reduction.md @@ -52,6 +52,8 @@ Other growth is expected to be much smaller, let's say a few GB of disk. This works out to a bit over 600 GByte total disk size. +NOTE: math was wrong? 
470 + 80 + 100 -> 650 GByte, call it 700 GByte + ## Idea: finish `ext_id` migration and drop columns+index from `release_rev` @@ -172,3 +174,17 @@ would drop ~20% of data size and ~20% of index size. Would it make more sense to use {ident, editgroup} as the primary key and UNIQ, then have a separate index on `editgroup`? On the assumption that `editgroup` cardinality is much smaller, thus the index disk usage would be smaller. + +## Idea: use binary for hashes + +We currently store file hashes (SHA-1, SHA-256, MD5) and abstracts/`ref_blobs` +keys as TEXT in lower-case hex encoding. Using binary instead could be as much +as a 50% size savings for both column and index storage. The difference becomes +more apparent when all files have all hashes populated. + +base32 encoded strings would be smaller (but non-negligable) savings. + +This change has a reasonable migration path, is entirely internal to postgres +and fatcatd, and would be no change to API schema. Postgres also allows `hex` +encoding on `bytea` data type, which can make reading/debugging reasonable. + diff --git a/python/fatcat_import.py b/python/fatcat_import.py index ad4de0e2..843685aa 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -166,6 +166,11 @@ def run_grobid_metadata(args): bezerk_mode=args.bezerk_mode) LinePusher(fmi, args.tsv_file).run() +def run_shadow_lib(args): + fmi = ShadowLibraryImporter(args.api, + edit_batch_size=100) + JsonLinePusher(fmi, args.json_file).run() + def run_wayback_static(args): api = args.api @@ -473,6 +478,16 @@ def main(): action='store_true', help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)") + sub_shadow_lib = subparsers.add_parser('shadow-lib', + help="create release and file entities based on GROBID PDF metadata extraction") + sub_shadow_lib.set_defaults( + func=run_shadow_lib, + auth_var="FATCAT_AUTH_WORKER_SHADOW", + ) + sub_shadow_lib.add_argument('json_file', + help="JSON file to import from (or stdin)", + default=sys.stdin, type=argparse.FileType('r')) + sub_wayback_static = subparsers.add_parser('wayback-static', help="crude crawl+ingest tool for single-page HTML docs from wayback") sub_wayback_static.set_defaults( diff --git a/python/fatcat_ingest.py b/python/fatcat_ingest.py index 6ce36974..6fda74c5 100755 --- a/python/fatcat_ingest.py +++ b/python/fatcat_ingest.py @@ -11,7 +11,7 @@ import argparse from collections import Counter import raven import elasticsearch -from elasticsearch_dsl import Search +from elasticsearch_dsl import Search, Q from fatcat_tools import public_api, simple_kafka_producer, kafka_fail_fast from fatcat_tools.transforms import release_ingest_request @@ -21,45 +21,54 @@ from fatcat_tools.transforms import release_ingest_request sentry_client = raven.Client() -def run_ingest_container(args): - """ - This command queries elasticsearch for releases from a given container (eg, - journal), and prepares ingest requests for them. - - By default it filters to releases which don't have any fulltext files - archived in IA, and dumps the ingest requests as JSON. 
- """ +def _init_search(args): # ensure API connection works args.api.get_changelog() + client = elasticsearch.Elasticsearch(args.elasticsearch_endpoint) + search = Search(using=client, index="fatcat_release") + return search + + +def _run_search_dump(args, search): + + if args.dry_run: + print("=== THIS IS A DRY RUN ===") + kafka_producer = None ingest_file_request_topic = "sandcrawler-{}.ingest-file-requests".format(args.env) if args.enqueue_kafka: print("Will send ingest requests to kafka topic: {}".format(ingest_file_request_topic), file=sys.stderr) kafka_producer = simple_kafka_producer(args.kafka_hosts) - client = elasticsearch.Elasticsearch(args.elasticsearch_endpoint) - - s = Search(using=client, index="fatcat_release") \ - .filter("term", in_ia=False) \ - .filter("term", is_oa=True) - - # filter/query by container - if args.container_id: - s = s.filter("term", container_id=args.container_id) - elif args.issnl: - s = s.filter("term", container_issnl=args.issnl) - elif args.publisher: - s = s.query("match", publisher=args.publisher) - elif args.name: - s = s.query("match", container_name=args.name) + if args.limit is not None: + search = search[:args.limit] + + if args.before_year: + search = search \ + .filter("exists", field="release_year") \ + .filter("range", release_date=dict(lt=args.before_year)) + if args.after_year: + search = search \ + .filter("exists", field="release_year") \ + .filter("range", release_date=dict(gte=args.after_year)) + + if not args.allow_non_oa: + search = search.filter("term", is_oa=True) + + if args.release_types: + release_types = args.release_types.split(',') + search = search \ + .filter("terms", release_type=release_types) else: - print("You must supply at least one query/filter parameter! Eg, ISSN-L", file=sys.stderr) - sys.exit(-1) + search = search \ + .filter("bool", must_not=[ + Q("terms", release_type=["stub", "component"]) + ]) counts = Counter({'ingest_request': 0, 'elasticsearch_release': 0, 'estimate': 0}) - counts['estimate'] = s.count() + counts['estimate'] = search.count() print("Expecting {} release objects in search queries".format(counts['estimate']), file=sys.stderr) # don't try to clean up scroll if we are connected to public server (behind @@ -67,19 +76,24 @@ def run_ingest_container(args): if args.elasticsearch_endpoint in ( 'https://search.fatcat.wiki', 'https://search.qa.fatcat.wiki'): - s = s.params(clear_scroll=False) + search = search.params(clear_scroll=False) - results = s.scan() + results = search.scan() for esr in results: + if args.limit and counts['ingest_request'] >= args.limit: + break counts['elasticsearch_release'] += 1 release = args.api.get_release(esr.ident) ingest_request = release_ingest_request( release, - ingest_request_source="fatcat-ingest-container", + ingest_request_source="fatcat-ingest", ) if not ingest_request: continue - if kafka_producer != None: + counts['ingest_request'] += 1 + if args.dry_run: + continue + if kafka_producer is not None: kafka_producer.produce( ingest_file_request_topic, json.dumps(ingest_request).encode('utf-8'), @@ -87,12 +101,73 @@ def run_ingest_container(args): on_delivery=kafka_fail_fast, ) counts['kafka'] += 1 - # also printing to stdout when in kafka mode; could skip? 
- print(json.dumps(ingest_request)) - counts['ingest_request'] += 1 - if kafka_producer != None: + else: + print(json.dumps(ingest_request)) + if kafka_producer is not None: kafka_producer.flush() print(counts, file=sys.stderr) + if args.dry_run: + print("=== THIS WAS A DRY RUN ===") + + +def run_ingest_container(args): + """ + This command queries elasticsearch for releases from a given container (eg, + journal), and prepares ingest requests for them. + + By default it filters to releases which don't have any fulltext files + archived in IA, and dumps the ingest requests as JSON. + """ + + search = _init_search(args).filter("term", in_ia=False) + + # filter/query by container + if args.container_id: + search = search.filter("term", container_id=args.container_id) + elif args.issnl: + search = search.filter("term", container_issnl=args.issnl) + elif args.publisher: + search = search.query("match", publisher=args.publisher) + elif args.name: + search = search.query("match", container_name=args.name) + else: + print("You must supply at least one query/filter parameter! Eg, ISSN-L", file=sys.stderr) + sys.exit(-1) + + return _run_search_dump(args, search) + + +def run_ingest_query(args): + """ + Accepts a free-form Lucene query language string. Intended to work the same + way as searches in the fatcat web interface. + """ + + search = _init_search(args) \ + .filter("term", in_ia=False) \ + .query( + "query_string", + query=args.query, + default_operator="AND", + analyze_wildcard=True, + lenient=True, + fields=["title^5", "contrib_names^2", "container_title"], + ) + + return _run_search_dump(args, search) + + +def run_ingest_extid(args): + """ + Selects release entities where the external identifier (extid) exists + """ + + search = _init_search(args) \ + .filter("term", in_ia=False) \ + .filter("exists", field=args.extid) + + return _run_search_dump(args, search) + def main(): parser = argparse.ArgumentParser( @@ -112,20 +187,51 @@ def main(): parser.add_argument('--env', default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)") + parser.add_argument('--limit', + default=None, + type=int, + help="Max number of search hits to return") + parser.add_argument('--dry-run', + action='store_true', + help="runs through creating all ingest requests, but doesn't actually output or enqueue") + parser.add_argument('--before-year', + type=str, + help="filters results to only with release_year before this (not inclusive)") + parser.add_argument('--after-year', + type=str, + help="filters results to only with release_year after this (inclusive)") + parser.add_argument('--release-types', + type=str, + help="filters results to specified release-types, separated by commas. By default, 'stub' is filtered out.") + parser.add_argument('--allow-non-oa', + action='store_true', + help="By default, we limit to OA releases. 
This removes that filter") subparsers = parser.add_subparsers() - sub_ingest_container = subparsers.add_parser('ingest-container', + sub_container = subparsers.add_parser('container', help="Create ingest requests for releases from a specific container") - sub_ingest_container.set_defaults(func=run_ingest_container) - sub_ingest_container.add_argument('--container-id', + sub_container.set_defaults(func=run_ingest_container) + sub_container.add_argument('--container-id', help="fatcat container entity ident") - sub_ingest_container.add_argument('--issnl', + sub_container.add_argument('--issnl', help="ISSN-L of container entity") - sub_ingest_container.add_argument('--publisher', + sub_container.add_argument('--publisher', help="publisher name") - sub_ingest_container.add_argument('--name', + sub_container.add_argument('--name', help="container name") + sub_query = subparsers.add_parser('query', + help="Create ingest requests for releases from a specific query") + sub_query.set_defaults(func=run_ingest_query) + sub_query.add_argument('query', + help="search query (same DSL as web interface search)") + + sub_extid = subparsers.add_parser('extid', + help="Create ingest requests for releases that have given extid defined") + sub_extid.set_defaults(func=run_ingest_extid) + sub_extid.add_argument('extid', + help="extid short name (as included in ES release schema)") + args = parser.parse_args() if not args.__dict__.get("func"): print("tell me what to do!") diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py index 33f44600..d2d71d3c 100644 --- a/python/fatcat_tools/harvest/doi_registrars.py +++ b/python/fatcat_tools/harvest/doi_registrars.py @@ -70,8 +70,8 @@ class HarvestCrossrefWorker: def fail_fast(err, msg): if err is not None: - print("Kafka producer delivery error: {}".format(err)) - print("Bailing out...") + print("Kafka producer delivery error: {}".format(err), file=sys.stderr) + print("Bailing out...", file=sys.stderr) # TODO: should it be sys.exit(-1)? raise KafkaException(err) @@ -117,7 +117,7 @@ class HarvestCrossrefWorker: if http_resp.status_code == 503: # crude backoff; now redundant with session exponential # backoff, but allows for longer backoff/downtime on remote end - print("got HTTP {}, pausing for 30 seconds".format(http_resp.status_code)) + print("got HTTP {}, pausing for 30 seconds".format(http_resp.status_code), file=sys.stderr) # keep kafka producer connection alive self.producer.poll(0) time.sleep(30.0) @@ -131,7 +131,7 @@ class HarvestCrossrefWorker: items = self.extract_items(resp) count += len(items) print("... 
got {} ({} of {}), HTTP fetch took {}".format(len(items), count, - self.extract_total(resp), http_resp.elapsed)) + self.extract_total(resp), http_resp.elapsed), file=sys.stderr) #print(json.dumps(resp)) for work in items: self.producer.produce( @@ -156,7 +156,7 @@ class HarvestCrossrefWorker: while True: current = self.state.next(continuous) if current: - print("Fetching DOIs updated on {} (UTC)".format(current)) + print("Fetching DOIs updated on {} (UTC)".format(current), file=sys.stderr) self.fetch_date(current) self.state.complete(current, kafka_topic=self.state_topic, @@ -164,11 +164,11 @@ class HarvestCrossrefWorker: continue if continuous: - print("Sleeping {} seconds...".format(self.loop_sleep)) + print("Sleeping {} seconds...".format(self.loop_sleep), file=sys.stderr) time.sleep(self.loop_sleep) else: break - print("{} DOI ingest caught up".format(self.name)) + print("{} DOI ingest caught up".format(self.name), file=sys.stderr) class HarvestDataciteWorker(HarvestCrossrefWorker): diff --git a/python/fatcat_tools/harvest/harvest_common.py b/python/fatcat_tools/harvest/harvest_common.py index 78830a1c..310366bd 100644 --- a/python/fatcat_tools/harvest/harvest_common.py +++ b/python/fatcat_tools/harvest/harvest_common.py @@ -57,6 +57,10 @@ class HarvestState: if catchup_days or start_date or end_date: self.enqueue_period(start_date, end_date, catchup_days) + def __str__(self): + return '<HarvestState to_process={}, completed={}>'.format( + len(self.to_process), len(self.completed)) + def enqueue_period(self, start_date=None, end_date=None, catchup_days=14): """ This function adds a time period to the "TODO" list, unless the dates @@ -129,7 +133,7 @@ class HarvestState: def fail_fast(err, msg): if err: raise KafkaException(err) - print("Commiting status to Kafka: {}".format(kafka_topic)) + print("Commiting status to Kafka: {}".format(kafka_topic), file=sys.stderr) producer_conf = kafka_config.copy() producer_conf.update({ 'delivery.report.only.error': True, @@ -154,7 +158,7 @@ class HarvestState: if not kafka_topic: return - print("Fetching state from kafka topic: {}".format(kafka_topic)) + print("Fetching state from kafka topic: {}".format(kafka_topic), file=sys.stderr) def fail_fast(err, msg): if err: raise KafkaException(err) @@ -191,4 +195,4 @@ class HarvestState: # verify that we got at least to HWM assert c >= hwm[1] - print("... got {} state update messages, done".format(c)) + print("... got {} state update messages, done".format(c), file=sys.stderr) diff --git a/python/fatcat_tools/harvest/oaipmh.py b/python/fatcat_tools/harvest/oaipmh.py index f908ba83..11b5fa0a 100644 --- a/python/fatcat_tools/harvest/oaipmh.py +++ b/python/fatcat_tools/harvest/oaipmh.py @@ -49,13 +49,14 @@ class HarvestOaiPmhWorker: self.name = "unnamed" self.state = HarvestState(start_date, end_date) self.state.initialize_from_kafka(self.state_topic, self.kafka_config) + print(self.state, file=sys.stderr) def fetch_date(self, date): def fail_fast(err, msg): if err is not None: - print("Kafka producer delivery error: {}".format(err)) - print("Bailing out...") + print("Kafka producer delivery error: {}".format(err), file=sys.stderr) + print("Bailing out...", file=sys.stderr) # TODO: should it be sys.exit(-1)? 
raise KafkaException(err) @@ -79,14 +80,14 @@ class HarvestOaiPmhWorker: 'until': date_str, }) except sickle.oaiexceptions.NoRecordsMatch: - print("WARN: no OAI-PMH records for this date: {} (UTC)".format(date_str)) + print("WARN: no OAI-PMH records for this date: {} (UTC)".format(date_str), file=sys.stderr) return count = 0 for item in records: count += 1 if count % 50 == 0: - print("... up to {}".format(count)) + print("... up to {}".format(count), file=sys.stderr) producer.produce( self.produce_topic, item.raw.encode('utf-8'), @@ -99,7 +100,7 @@ class HarvestOaiPmhWorker: while True: current = self.state.next(continuous) if current: - print("Fetching DOIs updated on {} (UTC)".format(current)) + print("Fetching DOIs updated on {} (UTC)".format(current), file=sys.stderr) self.fetch_date(current) self.state.complete(current, kafka_topic=self.state_topic, @@ -107,11 +108,11 @@ class HarvestOaiPmhWorker: continue if continuous: - print("Sleeping {} seconds...".format(self.loop_sleep)) + print("Sleeping {} seconds...".format(self.loop_sleep), file=sys.stderr) time.sleep(self.loop_sleep) else: break - print("{} OAI-PMH ingest caught up".format(self.name)) + print("{} OAI-PMH ingest caught up".format(self.name), file=sys.stderr) class HarvestArxivWorker(HarvestOaiPmhWorker): diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index d936605f..10557ef8 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -28,3 +28,4 @@ from .arabesque import ArabesqueMatchImporter, ARABESQUE_MATCH_WHERE_CLAUSE from .wayback_static import auto_wayback_static from .cdl_dash_dat import auto_cdl_dash_dat from .ingest import IngestFileResultImporter, SavePaperNowFileImporter +from .shadow import ShadowLibraryImporter diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 1ffbd6e7..a84ce90f 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -194,6 +194,8 @@ DOMAIN_REL_MAP = { "www.scielo.cl": "repository", "www.scielo.org.mx": "repository", "zenodo.org": "repository", + "www.biorxiv.org": "repository", + "www.medrxiv.org": "repository", "citeseerx.ist.psu.edu": "aggregator", "publisher-connector.core.ac.uk": "aggregator", @@ -220,6 +222,13 @@ DOMAIN_REL_MAP = { "www.nature.com": "publisher", "www.pnas.org": "publisher", "www.tandfonline.com": "publisher", + "www.frontiersin.org": "publisher", + "www.degruyter.com": "publisher", + "www.mdpi.com": "publisher", + "www.ahajournals.org": "publisher", + "ehp.niehs.nih.gov": "publisher", + "journals.tsu.ru": "publisher", + "www.cogentoa.com": "publisher", "www.researchgate.net": "academicsocial", "academia.edu": "academicsocial", diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 2f77481a..4e382348 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -1,11 +1,11 @@ """ Prototype importer for datacite.org data. -Example input document at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8. +Example input document: https://api.datacite.org/dois/10.7916/d8-f93n-rk51 -Datacite being an aggregator, the data is varied and exposes a couple of -problems in content and structure. A few fields habe their own parsing -functions (parse_datacite_...), which can be tested more easily. 
+Datacite being an aggregator, the data is heterogenous and exposes a couple of +problems in content and structure. A few fields have their own parsing +functions (parse_datacite_...), which may help testing. """ import collections @@ -311,6 +311,16 @@ class DataciteImporter(EntityImporter): release_date, release_month, release_year = parse_datacite_dates( attributes.get('dates', [])) + # Some records do not use the "dates" field (e.g. micropub), but: + # "attributes.published" or "attributes.publicationYear" + if not any((release_date, release_month, release_year)): + release_date, release_month, release_year = parse_single_date(attributes.get('publicationYear')) + if not any((release_date, release_month, release_year)): + release_date, release_month, release_year = parse_single_date(attributes.get('published')) + + if not any((release_date, release_month, release_year)): + print('[{}] record w/o date: {}'.format(doi, obj), file=sys.stderr) + # Start with clear stages, e.g. published. TODO(martin): we could # probably infer a bit more from the relations, e.g. # "IsPreviousVersionOf" or "IsNewVersionOf". @@ -380,6 +390,11 @@ class DataciteImporter(EntityImporter): len(container_name))) container_name = container_name[0] + # Exception: https://www.micropublication.org/, see: !MR24. + if container_id is None and container_name is None: + if publisher and publisher.lower().startswith('micropublication'): + container_name = publisher + # Volume and issue. volume = container.get('volume') issue = container.get('issue') @@ -490,7 +505,7 @@ class DataciteImporter(EntityImporter): if len(text) > MAX_ABSTRACT_LENGTH: text = text[:MAX_ABSTRACT_LENGTH] + " [...]" - # Detect language. + # Detect language. This is fuzzy and may be removed, if too unreliable. lang = None try: lang = langdetect.detect(text) @@ -719,8 +734,10 @@ class DataciteImporter(EntityImporter): if name: name = clean(name) - if not name: + if not any((name, given_name, surname)): continue + if not name: + name = "{} {}".format(given_name or '', surname or '').strip() if name in name_blacklist: continue if name.lower() in UNKNOWN_MARKERS_LOWER: @@ -924,6 +941,32 @@ def parse_datacite_titles(titles): return title, original_language_title, subtitle +def parse_single_date(value): + """ + Given a single string containing a date in arbitrary format, try to return + tuple (date: datetime.date, month: int, year: int). + """ + if not value: + return None, None, None + if isinstance(value, int): + value = str(value) + parser = dateparser.DateDataParser() + try: + # Results in a dict with keys: date_obj, period, locale. + parse_result = parser.get_date_data(value) + # A datetime object, later we need a date, only. 
+ result = parse_result['date_obj'] + if result is not None: + if parse_result['period'] == 'year': + return None, None, result.year + elif parse_result['period'] == 'month': + return None, result.month, result.year + else: + return result.date(), result.month, result.year + except TypeError as err: + print("{} date parsing failed with: {}".format(value, err), file=sys.stderr) + + return None, None, None def parse_datacite_dates(dates): """ @@ -966,7 +1009,7 @@ def parse_datacite_dates(dates): ) def parse_item(item): - result, value, year_only = None, item.get('date', ''), False + result, value, year_only = None, item.get('date', '') or '', False release_date, release_month, release_year = None, None, None for layout, granularity in common_patterns: @@ -981,23 +1024,7 @@ def parse_datacite_dates(dates): if result is None: print('fallback for {}'.format(value), file=sys.stderr) - parser = dateparser.DateDataParser() - try: - # Results in a dict with keys: date_obj, period, locale. - parse_result = parser.get_date_data(value) - - # A datetime object, later we need a date, only. - result = parse_result['date_obj'] - if result is not None: - if parse_result['period'] == 'year': - return None, None, result.year - elif parse_result['period'] == 'month': - return None, result.month, result.year - else: - return result.date(), result.month, result.year - except TypeError as err: - print("{} date parsing failed with: {}".format(value, err), - file=sys.stderr) + release_date, release_month, release_year = parse_single_date(value) if result is None: # Unparsable date. diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index bdfd2835..4772bfaa 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -31,6 +31,12 @@ class IngestFileResultImporter(EntityImporter): 'fatcat-ingest-container', 'fatcat-ingest', 'arabesque', + 'mag-corpus', + 'mag', + 'unpaywall-corpus', + 'unpaywall', + 's2-corpus', + 's2', ] if kwargs.get('skip_source_whitelist', False): self.ingest_request_source_whitelist = [] @@ -54,11 +60,14 @@ class IngestFileResultImporter(EntityImporter): self.counts['skip-hit'] += 1 return False source = row['request'].get('ingest_request_source') + if not source: + self.counts['skip-ingest_request_source'] += 1 + return False if self.ingest_request_source_whitelist and source not in self.ingest_request_source_whitelist: self.counts['skip-ingest_request_source'] += 1 return False if source.startswith('arabesque'): - if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi'): + if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi', 'mag', 's2'): self.counts['skip-arabesque-source'] += 1 return False if source.startswith('savepapernow'): @@ -131,7 +140,12 @@ class IngestFileResultImporter(EntityImporter): if not 'terminal_dt' in terminal: terminal['terminal_dt'] = terminal['dt'] assert len(terminal['terminal_dt']) == 14 - url = make_rel_url(terminal['terminal_url'], self.default_link_rel) + + default_rel = self.default_link_rel + if request.get('link_source') == 'doi': + default_rel = 'publisher' + default_rel = request.get('rel', default_rel) + url = make_rel_url(terminal['terminal_url'], default_rel) if not url: self.counts['skip-url'] += 1 @@ -152,8 +166,8 @@ class IngestFileResultImporter(EntityImporter): release_ids=[release_ident], urls=urls, ) - if fatcat and fatcat.get('edit_extra'): - fe.edit_extra = fatcat['edit_extra'] + if request.get('edit_extra'): + 
fe.edit_extra = request['edit_extra'] else: fe.edit_extra = dict() if request.get('ingest_request_source'): @@ -229,6 +243,9 @@ class SavePaperNowFileImporter(IngestFileResultImporter): def want(self, row): source = row['request'].get('ingest_request_source') + if not source: + self.counts['skip-ingest_request_source'] += 1 + return False if not source.startswith('savepapernow'): self.counts['skip-not-savepapernow'] += 1 return False diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py new file mode 100644 index 00000000..4cd22775 --- /dev/null +++ b/python/fatcat_tools/importers/shadow.py @@ -0,0 +1,195 @@ + +import sys +import json +import sqlite3 +import itertools +import fatcat_openapi_client + +from fatcat_tools.normal import * +from .common import EntityImporter, make_rel_url, SANE_MAX_RELEASES, SANE_MAX_URLS + + +class ShadowLibraryImporter(EntityImporter): + """ + Importer for shadow library files (matched to releases) + + Input format is JSON with keys: + - shadow + - shadow_corpus (string slug) + - shadow_id (string) + - doi + - pmid + - isbn13 + - file_meta + - sha1hex + - sha256hex + - md5hex + - size_bytes + - mimetype + - cdx (may be null) + - url + - datetime + """ + + def __init__(self, api, **kwargs): + + eg_desc = kwargs.pop('editgroup_description', None) or "Import of 'Shadow Library' file/release matches" + eg_extra = kwargs.pop('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ShadowLibraryImporter') + super().__init__(api, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs) + self.default_link_rel = kwargs.get("default_link_rel", "web") + + def want(self, raw_record): + """ + Only want to import records with complete file-level metadata + """ + fm = raw_record['file_meta'] + if not (fm['mimetype'] and fm['md5hex'] and fm['sha256hex'] and fm['size_bytes']): + self.counts['skip-file-meta-incomplete'] += 1 + return False + if fm['mimetype'] != 'application/pdf': + self.counts['skip-not-pdf'] += 1 + return False + return True + + def parse_record(self, obj): + """ + We do the release lookup in this method. Try DOI, then PMID, last ISBN13. 
+ """ + + shadow_corpus = obj['shadow']['shadow_corpus'] + assert shadow_corpus == shadow_corpus.strip().lower() + doi = clean_doi(obj['shadow'].get('doi')) + pmid = clean_pmid(obj['shadow'].get('pmid')) + isbn13 = clean_isbn13(obj['shadow'].get('isbn13')) + shadow_id = obj['shadow'].get('shadow_id').strip() + assert shadow_id + + extra = { '{}_id'.format(shadow_corpus): shadow_id } + for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]: + if not ext_id: + continue + extra['{}_{}'.format(shadow_corpus, ext_type)] = ext_id + + # lookup release via several idents + re = None + for (ext_type, ext_id) in [('doi', doi), ('pmid', pmid), ('isbn13', isbn13)]: + if not ext_id: + continue + try: + re = self.api.lookup_release(**{ext_type: ext_id}) + except fatcat_openapi_client.rest.ApiException as err: + if err.status not in (404, 400): + raise err + re = None + if re: + break + + if not re: + self.counts['skip-release-not-found'] += 1 + return None + + release_ids = [re.ident,] + + # parse single CDX into URLs (if exists) + urls = [] + if obj.get('cdx'): + url = make_rel_url(obj['cdx']['url'], default_link_rel=self.default_link_rel) + if url != None: + urls.append(url) + wayback = "https://web.archive.org/web/{}/{}".format( + obj['cdx']['datetime'], + obj['cdx']['url']) + urls.append(("webarchive", wayback)) + urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls] + + fe = fatcat_openapi_client.FileEntity( + md5=obj['file_meta']['md5hex'], + sha1=obj['file_meta']['sha1hex'], + sha256=obj['file_meta']['sha256hex'], + size=int(obj['file_meta']['size_bytes']), + mimetype=obj['file_meta']['mimetype'] or None, + release_ids=release_ids, + urls=urls, + extra=dict(shadows=extra), + ) + return fe + + def try_update(self, fe): + # lookup sha1, or create new entity + existing = None + try: + existing = self.api.lookup_file(sha1=fe.sha1) + except fatcat_openapi_client.rest.ApiException as err: + if err.status != 404: + raise err + + if not existing: + return True + + if not existing.extra: + existing.extra = {} + + if existing.extra.get('shadows') and list(fe.extra['shadows'].keys())[0] in existing.extra['shadows']: + # already imported from this shadow library; skip + self.counts['exists'] += 1 + return False + + # check for edit conflicts + if existing.ident in [e.ident for e in self._edits_inflight]: + self.counts['skip-update-inflight'] += 1 + return False + if fe.sha1 in [e.sha1 for e in self._edits_inflight]: + raise Exception("Inflight insert; shouldn't happen") + + # minimum viable "existing" URL cleanup to fix dupes and broken links: + # remove 'None' wayback URLs, and set archive.org rel 'archive' + existing.urls = [u for u in existing.urls if not ('://web.archive.org/web/None/' in u.url)] + for i in range(len(existing.urls)): + u = existing.urls[i] + if u.rel == 'repository' and '://archive.org/download/' in u.url: + existing.urls[i].rel = 'archive' + if u.rel == 'social': + u.rel = 'academicsocial' + + # merge the existing into this one and update + merged_urls = {} + for u in fe.urls + existing.urls: + merged_urls[u.url] = u + existing.urls = list(merged_urls.values()) + if not existing.extra.get('shadows'): + existing.extra['shadows'] = fe.extra['shadows'] + else: + existing.extra['shadows'].update(fe.extra['shadows']) + + # do these "plus ones" because we really want to do these updates when possible + if len(existing.urls) > SANE_MAX_URLS + 1: + self.counts['skip-update-too-many-url'] += 1 + return None + existing.release_ids = 
list(set(fe.release_ids + existing.release_ids)) + if len(existing.release_ids) > SANE_MAX_RELEASES + 1: + self.counts['skip-update-too-many-releases'] += 1 + return None + existing.mimetype = existing.mimetype or fe.mimetype + existing.size = existing.size or fe.size + existing.md5 = existing.md5 or fe.md5 + existing.sha1 = existing.sha1 or fe.sha1 + existing.sha256 = existing.sha256 or fe.sha256 + edit = self.api.update_file(self.get_editgroup_id(), existing.ident, existing) + # add sha1 to non-entity edit row, so we can do more aggressive + # group-level de-dupe + edit.sha1 = existing.sha1 + self._edits_inflight.append(edit) + self.counts['update'] += 1 + return False + + def insert_batch(self, batch): + self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, + extra=self.editgroup_extra), + entity_list=batch)) + diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py index 27a4fb93..22b5154e 100644 --- a/python/fatcat_tools/transforms/ingest.py +++ b/python/fatcat_tools/transforms/ingest.py @@ -23,16 +23,16 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type= url = "https://arxiv.org/pdf/{}.pdf".format(release.ext_ids.arxiv) link_source = "arxiv" link_source_id = release.ext_ids.arxiv - elif release.ext_ids.doi: - url = "https://doi.org/{}".format(release.ext_ids.doi) - link_source = "doi" - link_source_id = release.ext_ids.doi elif release.ext_ids.pmcid: # TODO: how to tell if an author manuscript in PMC vs. published? #url = "https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf/".format(release.ext_ids.pmcid) url = "http://europepmc.org/backend/ptpmcrender.fcgi?accid={}&blobtype=pdf".format(release.ext_ids.pmcid) link_source = "pmc" link_source_id = release.ext_ids.pmcid + elif release.ext_ids.doi: + url = "https://doi.org/{}".format(release.ext_ids.doi) + link_source = "doi" + link_source_id = release.ext_ids.doi if not url: return None diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index c8584ccf..b84d5e70 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -102,6 +102,12 @@ class EntityUpdatesWorker(FatcatWorker): # ccdc.cam.ac.uk: crystal structures "10.5517/", ] + self.live_pdf_ingest_doi_prefix_acceptlist = [ + # biorxiv and medrxiv + "10.1101/", + # researchgate + "10.13140/", + ] def want_live_ingest(self, release, ingest_request): """ @@ -115,13 +121,55 @@ class EntityUpdatesWorker(FatcatWorker): link_source = ingest_request.get('ingest_request') ingest_type = ingest_request.get('ingest_type') + doi = ingest_request.get('ext_ids', {}).get('doi') + + is_document = release.release_type in ( + 'article-journal', + 'paper-conference', + 'article', + 'report', + 'chapter', + 'manuscript', + 'review', + 'thesis', + 'letter', + 'editorial', + 'abstract', + 'entry', + 'patent', + 'post', + 'review-book', + ) + is_not_pdf = release.release_type in ( + 'dataset', + 'stub', + 'software', + 'figure', + 'graphic', + ) + + # accept list sets a default "crawl it" despite OA metadata for + # known-OA DOI prefixes + in_acceptlist = False + if doi: + for prefix in self.live_pdf_ingest_doi_prefix_acceptlist: + if doi.startswith(prefix): + in_acceptlist = True if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'): es = release_to_elasticsearch(release) - if not es['is_oa']: + # most datacite documents are in IRs and 
should be crawled + is_datacite_doc = False + if release.extra and ('datacite' in release.extra) and is_document: + is_datacite_doc = True + if not (es['is_oa'] or in_acceptlist or is_datacite_doc): return False - doi = ingest_request.get('ext_ids', {}).get('doi') + # if ingest_type is pdf but release_type is almost certainly not a PDF, + # skip it. This is mostly a datacite thing. + if ingest_type == "pdf" and is_not_pdf: + return False + if ingest_type == "pdf" and doi: for prefix in self.ingest_pdf_doi_prefix_blocklist: if doi.startswith(prefix): diff --git a/python/fatcat_web/entity_helpers.py b/python/fatcat_web/entity_helpers.py index af0fea83..591dda80 100644 --- a/python/fatcat_web/entity_helpers.py +++ b/python/fatcat_web/entity_helpers.py @@ -53,6 +53,10 @@ def enrich_release_entity(entity): entity._es = release_to_elasticsearch(entity, force_bool=False) if entity.container and entity.container.state == "active": entity.container._es = container_to_elasticsearch(entity.container, force_bool=False) + if entity.files: + # remove shadows-only files with no URLs + entity.files = [f for f in entity.files + if not (f.extra and f.extra.get('shadows') and not f.urls)] if entity.filesets: for fs in entity.filesets: fs._total_size = sum([f.size for f in fs.manifest]) diff --git a/python/fatcat_web/templates/release_view.html b/python/fatcat_web/templates/release_view.html index 83ecd1c8..961b4759 100644 --- a/python/fatcat_web/templates/release_view.html +++ b/python/fatcat_web/templates/release_view.html @@ -196,8 +196,9 @@ </tbody> </table> {% else %} -<p>There are no known files associated with this release (you could try -<a href="/work/{{ release.work_id }}">other releases for this work?</a>). +<p>There are no accessible files associated with this release. You could check +<a href="/work/{{ release.work_id }}">other releases for this work</a> for an +accessible version. 
{% endif %} {% endif %} diff --git a/python/tests/files/datacite/datacite_doc_30.json b/python/tests/files/datacite/datacite_doc_30.json new file mode 100644 index 00000000..5f851bbb --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_30.json @@ -0,0 +1,72 @@ +{ + "id": "10.17912/micropub.biology.000143", + "type": "dois", + "attributes": { + "doi": "10.17912/micropub.biology.000143", + "identifiers": null, + "creators": [ + { + "raw_name": "Celja J Uebel", + "givenName": "Celja J", + "familyName": "Uebel", + "affiliation": [], + "role": "author" + }, + { + "raw_name": "Carolyn M Phillips", + "givenName": "Carolyn M", + "familyName": "Phillips", + "affiliation": [], + "role": "author" + } + ], + "titles": [ + { + "title": "Phase-separated protein dynamics are affected by fluorescent tag choice" + } + ], + "publisher": "microPublication Biology", + "container": {}, + "publicationYear": 2019, + "subjects": [], + "contributors": [], + "dates": null, + "language": null, + "types": { + "resourceTypeGeneral": "DataPaper" + }, + "relatedIdentifiers": [], + "sizes": [], + "formats": [], + "version": null, + "rightsList": [], + "descriptions": [ + { + "description": "Biological liquid-liquid phase separation", + "descriptionType": "Abstract" + } + ], + "geoLocations": [], + "fundingReferences": [], + "url": "https://www.micropublication.org/journals/biology/micropub.biology.000143", + "contentUrl": null, + "metadataVersion": 0, + "schemaVersion": null, + "source": null, + "isActive": true, + "state": "findable", + "reason": null, + "created": "2019-08-19T14:43:08.000Z", + "registered": "2019-08-19T14:43:09.000Z", + "published": "2019", + "updated": "2019-11-09T12:32:02.000Z" + }, + "relationships": { + "client": { + "data": { + "id": "caltech.micropub", + "type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_result_30.json b/python/tests/files/datacite/datacite_result_30.json new file mode 100644 index 00000000..fc2c4dfc --- /dev/null +++ b/python/tests/files/datacite/datacite_result_30.json @@ -0,0 +1,39 @@ +{ + "abstracts": [ + { + "content": "Biological liquid-liquid phase separation", + "lang": "fr", + "mimetype": "text/plain" + } + ], + "contribs": [ + { + "index": 0, + "given_name": "Celja J", + "surname": "Uebel", + "raw_name": "Celja J Uebel", + "role": "author" + }, + { + "index": 1, + "given_name": "Carolyn M", + "raw_name": "Carolyn M Phillips", + "surname": "Phillips", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.17912/micropub.biology.000143" + }, + "extra": { + "datacite": { + "resourceTypeGeneral": "DataPaper" + }, + "container_name": "microPublication Biology" + }, + "refs": [], + "release_stage": "published", + "release_year": 2019, + "publisher": "microPublication Biology", + "title": "Phase-separated protein dynamics are affected by fluorescent tag choice" +} diff --git a/python/tests/files/example_shadow.json b/python/tests/files/example_shadow.json new file mode 100644 index 00000000..3386f481 --- /dev/null +++ b/python/tests/files/example_shadow.json @@ -0,0 +1,10 @@ 
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"12703034","sha1hex":"0000002922264275f11cca7b1c3fb662070d0dd7","doi":"10.1371/journal.pmed.0020124","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"0000002922264275f11cca7b1c3fb662070d0dd7","sha256hex":"b4728210cc0f70d8a8f8c39bd97fcbbab3eaca4309ac4bdfbce5df3b66c82f79","md5hex":"debd8db178fa08a7a0aaec6e42832a8e","size_bytes":206121,"mimetype":"application/pdf"},"cdx":{"url":"https://link.springer.com/content/pdf/10.1007%2Fs11626-008-9119-8.pdf","datetime":"20180729135948","sha1hex":"0000002922264275f11cca7b1c3fb662070d0dd7","cdx_sha1hex":null,"mimetype":"application/pdf","warc_path":"UNPAYWALL-PDF-CRAWL-2018-07-20180729132538992-15980-16048-wbgrp-svc281/UNPAYWALL-PDF-CRAWL-2018-07-20180729135708800-16009-11693~wbgrp-svc281.us.archive.org~8443.warc.gz","warc_csize":32497,"warc_offset":105265425,"row_created":"2019-08-09T23:25:44.571943+00:00"}} +{"shadow":{"shadow_corpus":"scimag","shadow_id":"51052483","sha1hex":"00000119fa780ce368ebd96563afdb3eebb90ad3","doi":"10.1191/0266355403gh289oa","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"00000119fa780ce368ebd96563afdb3eebb90ad3","sha256hex":"57ce460db4410b9bfaf500ed652fd29e64d46b40c17e28f1156ba03736edf91b","md5hex":"96133eec3a6c533993213e7bdf446251","size_bytes":164344,"mimetype":"application/pdf"},"cdx":null} +{"shadow":{"shadow_corpus":"scimag","shadow_id":"2476283","sha1hex":"0000017a31547caf347fab66282a40831b9ceb08","doi":"10.1016/0042-207x(62)90512-2","pmid":"54321","isbn13":null},"file_meta":{"sha1hex":"0000017a31547caf347fab66282a40831b9ceb08","sha256hex":"e8d0c607b024ff6ffd58a35f76c454844b70ad19fe3f78a573af1ae53f53ad9d","md5hex":"b53318522b9f35a42b7e53f150fe70b2","size_bytes":116735,"mimetype":"application/pdf"},"cdx":null} +{"shadow":{"shadow_corpus":"scimag","shadow_id":"8760871","sha1hex":"000001abf3dbf936d5053d14f41699722531b8c6","doi":"10.1016/s0042-207x(79)80945-8","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"000001abf3dbf936d5053d14f41699722531b8c6","sha256hex":"8a69b4a6dff98682ad43e7d4139221c1557c1bd202b615490af8a2c7dcbb71d2","md5hex":"29e1cfac8ecfbc8be57a1ec8b465c4be","size_bytes":138218,"mimetype":"application/pdf"},"cdx":null} +{"shadow":{"shadow_corpus":"scimag","shadow_id":"11473618","sha1hex":"0000022e387be46ef797f6686d36c9899cbd6856","doi":"10.1038/ng.2339","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"0000022e387be46ef797f6686d36c9899cbd6856","sha256hex":"a72517e8e72d78bc07a6ef7ff3a6d1d3e04325df986cb8f1bbb4e809f7a9dbdd","md5hex":"9cb8a6e056c9cc740d3bed0c50cd53dc","size_bytes":80992,"mimetype":"application/pdf"},"cdx":null} +{"shadow":{"shadow_corpus":"scimag","shadow_id":"47301218","sha1hex":"0000029209536bda5f22e5110e573c5bd8ceb43a","doi":"10.2307/23406551","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"0000029209536bda5f22e5110e573c5bd8ceb43a","sha256hex":"315f1d39a00ccf256fa15d92a14869dbda48d31500989aaacb11368f906a5827","md5hex":"8141b42ec3bb41fa87099633a1b61d93","size_bytes":305236,"mimetype":"application/pdf"},"cdx":null} 
+{"shadow":{"shadow_corpus":"scimag","shadow_id":"30603850","sha1hex":"000002c1abd521f18aa23d9e8f464e697e218ab1","doi":"10.1109/spire.1998.712983","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"000002c1abd521f18aa23d9e8f464e697e218ab1","sha256hex":"777e2c472e9d2fec3bbd26bad788562cf1e08e5850315c25cfb6e46d38e7e4af","md5hex":"3a3c92fabaf6cf437bb596d9e9255ff6","size_bytes":113768,"mimetype":"application/pdf"},"cdx":{"url":"http://proteomics.bioprojects.org/pavel/papers/SST_versus_EST_in_gene_recognition..pdf","datetime":"20081121222143","sha1hex":"000002c1abd521f18aa23d9e8f464e697e218ab1","cdx_sha1hex":null,"mimetype":"application/pdf","warc_path":"1227992340180_31-c/1227992509265_9.arc.gz","warc_csize":61212,"warc_offset":62956683,"row_created":"2020-01-07T02:06:33.965383+00:00"}} +{"shadow":{"shadow_corpus":"scimag","shadow_id":"9311918","sha1hex":"000002d4f7d4174451e4214475d5ba59f1f6a593","doi":"10.1111/j.1439-0507.2008.01572.x","pmid":"18721331","isbn13":null},"file_meta":{"sha1hex":"000002d4f7d4174451e4214475d5ba59f1f6a593","sha256hex":"713758ce0417f604c0a4b0bf5b5eea571a9b08ca4cc81a98d602c43f42abfe37","md5hex":"0df123e6305c617ffd38ebef90b1e318","size_bytes":178664,"mimetype":"application/pdf"},"cdx":null} +{"shadow":{"shadow_corpus":"scimag","shadow_id":"7757772","sha1hex":"000002f8966a4c5547f8a47f43661fcc3edc34ea","doi":"10.1007/s10464-011-9424-3","pmid":"21287262","isbn13":null},"file_meta":{"sha1hex":"000002f8966a4c5547f8a47f43661fcc3edc34ea","sha256hex":"ee1bce27134ae55b3d67f9b31f66571e41ac496fc3fb526dec2d53513b8f6deb","md5hex":"e72c5cf3d61635821e78ca0306c98887","size_bytes":337857,"mimetype":"application/pdf"},"cdx":null} +{"shadow":{"shadow_corpus":"scimag","shadow_id":"74272862","sha1hex":"000003a94022be58305ccc2a018a6359eeb226db","doi":"10.1002/slct.201802783","pmid":null,"isbn13":null},"file_meta":{"sha1hex":"000003a94022be58305ccc2a018a6359eeb226db","sha256hex":"f277eefc7b1466df814a7a892ab8e2e7f08db1faae0bf73b893211e5f5b37193","md5hex":"27534b8494f54ba5de47c16fb2590b04","size_bytes":1372272,"mimetype":"application/pdf"},"cdx":null} diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 669a6984..15650375 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -287,10 +287,9 @@ def test_datacite_conversions(datacite_importer): for now. """ datacite_importer.debug = True - for i in range(30): + for i in range(31): src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i) dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i) - print('testing mapping from {} => {}'.format(src, dst)) with open(src, 'r') as f: re = datacite_importer.parse_record(json.load(f)) result = entity_to_dict(re) diff --git a/python/tests/import_shadow.py b/python/tests/import_shadow.py new file mode 100644 index 00000000..70a918d2 --- /dev/null +++ b/python/tests/import_shadow.py @@ -0,0 +1,61 @@ + +import json +import pytest +from fatcat_tools.importers import ShadowLibraryImporter, JsonLinePusher +from fixtures import api + + +@pytest.fixture(scope="function") +def shadow_importer(api): + yield ShadowLibraryImporter(api) + +# TODO: use API to check that entities actually created... 
+def test_shadow_importer_basic(shadow_importer): + with open('tests/files/example_shadow.json', 'r') as f: + JsonLinePusher(shadow_importer, f).run() + +def test_shadow_importer(shadow_importer): + last_index = shadow_importer.api.get_changelog(limit=1)[0].index + with open('tests/files/example_shadow.json', 'r') as f: + shadow_importer.bezerk_mode = True + counts = JsonLinePusher(shadow_importer, f).run() + assert counts['insert'] == 2 + assert counts['exists'] == 0 + assert counts['skip'] == 8 + + # fetch most recent editgroup + change = shadow_importer.api.get_changelog_entry(index=last_index+1) + eg = change.editgroup + assert eg.description + assert "shadow library" in eg.description.lower() + assert eg.extra['git_rev'] + assert "fatcat_tools.ShadowLibraryImporter" in eg.extra['agent'] + + # re-insert; should skip + with open('tests/files/example_shadow.json', 'r') as f: + shadow_importer.reset() + shadow_importer.bezerk_mode = False + counts = JsonLinePusher(shadow_importer, f).run() + assert counts['insert'] == 0 + assert counts['exists'] == 2 + assert counts['skip'] == 8 + +def test_shadow_dict_parse(shadow_importer): + with open('tests/files/example_shadow.json', 'r') as f: + raw = json.loads(f.readline()) + f = shadow_importer.parse_record(raw) + + assert f.sha1 == "0000002922264275f11cca7b1c3fb662070d0dd7" + assert f.md5 == "debd8db178fa08a7a0aaec6e42832a8e" + assert f.sha256 == "b4728210cc0f70d8a8f8c39bd97fcbbab3eaca4309ac4bdfbce5df3b66c82f79" + assert f.mimetype == "application/pdf" + assert f.size == 206121 + assert len(f.urls) == 2 + for u in f.urls: + if u.rel == "publisher": + assert u.url.startswith("https://link.springer.com/content/pdf/10.1007%2Fs11626-008-9119-8.pdf") + if u.rel == "webarchive": + assert u.url.startswith("https://web.archive.org/") + assert "20180729135948" in u.url + assert len(f.release_ids) == 1 + |
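Taken together, the new `shadow-lib` subcommand, `ShadowLibraryImporter`, and the tests above boil down to roughly the following usage sketch. This is illustrative only: `import_shadow_matches` is a hypothetical helper name, and it assumes an already-authenticated fatcat API client (the real CLI wires this up via the FATCAT_AUTH_WORKER_SHADOW token) plus a newline-delimited JSON file matching the format documented in the ShadowLibraryImporter docstring.

import sys

from fatcat_tools.importers import ShadowLibraryImporter, JsonLinePusher

def import_shadow_matches(api, json_path):
    # api: authenticated fatcat API client (assumption: created elsewhere,
    # e.g. by the fatcat_import.py CLI plumbing)
    importer = ShadowLibraryImporter(api, edit_batch_size=100)
    with open(json_path, 'r') as f:
        # one JSON object per line; JsonLinePusher feeds each record to the
        # importer, which filters (want), parses (parse_record), and batches
        # the resulting file entity inserts/updates
        counts = JsonLinePusher(importer, f).run()
    # counts includes keys like 'insert', 'exists', 'skip' (see the tests above)
    print(counts, file=sys.stderr)
    return counts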