 python/fatcat_tools/importers/crossref.py    |  19 ++++++++++-----
 python/tests/files/huge_crossref_doi.json.gz | Bin 0 -> 12035 bytes
 python/tests/import_crossref.py              |  16 ++++++++----
 rust/src/entity_crud.rs                      |  29 +++++++++++++++----
 4 files changed, 51 insertions(+), 13 deletions(-)
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 00c719f1..4a0322e7 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -303,9 +303,12 @@ class CrossrefImporter(EntityImporter):
 
         # external identifiers
         extids = self.lookup_ext_ids(doi=obj['DOI'].lower())
 
-        # TODO: filter out huge releases; we'll get them later (and fix bug in
-        # fatcatd)
-        if max(len(contribs), len(refs), len(abstracts)) > 750:
+        # filter out unreasonably huge releases
+        if len(abstracts) > 100:
+            return None
+        if len(contribs) > 2000:
+            return None
+        if len(refs) > 5000:
             return None
 
         # release date parsing is amazingly complex
@@ -322,11 +325,17 @@
             release_year = raw_date[0]
             release_date = None
 
+        original_title = None
+        if obj.get('original-title'):
+            original_title = clean(obj.get('original-title')[0], force_xml=True)
+        title = None
+        if obj.get('title'):
+            title = clean(obj.get('title')[0], force_xml=True)
         re = fatcat_client.ReleaseEntity(
             work_id=None,
             container_id=container_id,
-            title=clean(obj.get('title', [None])[0], force_xml=True),
-            original_title=clean(obj.get('original-title', [None])[0]),
+            title=title,
+            original_title=original_title,
             release_type=release_type,
             release_status=release_status,
             release_date=release_date,
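
Note: the importer-side thresholds above (100 abstracts, 2,000 contribs, 5,000 refs) skip a record outright rather than truncating it, and they sit safely below the server-side sanity caps added in rust/src/entity_crud.rs further down. A minimal standalone sketch of the same skip logic, assuming plain lists; the function name and constants are illustrative, not part of the importer:

MAX_ABSTRACTS = 100   # mirrors the importer thresholds above
MAX_CONTRIBS = 2000
MAX_REFS = 5000

def is_reasonable_size(abstracts, contribs, refs):
    """Return False for records the importer should skip outright."""
    return (len(abstracts) <= MAX_ABSTRACTS
            and len(contribs) <= MAX_CONTRIBS
            and len(refs) <= MAX_REFS)

# the huge test record below (1014 contribs) passes; a 5001-ref record would not
assert is_reasonable_size([], [None] * 1014, [])
assert not is_reasonable_size([], [], [None] * 5001)
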
diff --git a/python/tests/files/huge_crossref_doi.json.gz b/python/tests/files/huge_crossref_doi.json.gz
new file mode 100644
index 00000000..48f58257
--- /dev/null
+++ b/python/tests/files/huge_crossref_doi.json.gz
Binary files differ
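
Note: the new fixture is a gzipped JSON-lines file holding one very large Crossref work record (1,014 contributors, per the test below). A hypothetical sketch of how such a fixture can be written and read back; the record here is a placeholder, not the real file's contents:

import gzip
import json

record = {"DOI": "10.1234/example", "title": ["Example"]}  # placeholder record

with gzip.open("/tmp/huge_crossref_doi.json.gz", "wt") as f:
    f.write(json.dumps(record) + "\n")

# mode "rt" decompresses and decodes to str lines, matching the test's usage
with gzip.open("/tmp/huge_crossref_doi.json.gz", "rt") as f:
    assert json.loads(f.readline())["DOI"] == "10.1234/example"
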
diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py
index 193f78f6..6e7f72c5 100644
--- a/python/tests/import_crossref.py
+++ b/python/tests/import_crossref.py
@@ -1,5 +1,5 @@
-import json
+import json, gzip
 import pytest
 from fatcat_tools.importers import CrossrefImporter, JsonLinePusher
 from fixtures import api
@@ -15,9 +15,17 @@ def crossref_importer_existing(api):
     with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
         yield CrossrefImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=False)
 
-def test_crossref_importer_batch(crossref_importer):
-    with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f:
-        JsonLinePusher(crossref_importer, f).run()
+def test_crossref_importer_huge(crossref_importer):
+    last_index = crossref_importer.api.get_changelog(limit=1)[0].index
+    with gzip.open('tests/files/huge_crossref_doi.json.gz', 'rt') as f:
+        crossref_importer.bezerk_mode = True
+        line = f.readline()
+        mega_blob = [line for _ in range(95)]
+        counts = JsonLinePusher(crossref_importer, mega_blob).run()
+    assert counts['insert'] == 95
+    change = crossref_importer.api.get_changelog_entry(index=last_index+1)
+    release = crossref_importer.api.get_release(change.editgroup.edits.releases[0].ident)
+    assert len(release.contribs) == 1014
 
 def test_crossref_importer(crossref_importer):
     last_index = crossref_importer.api.get_changelog(limit=1)[0].index
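
Note: the rewritten test relies on JsonLinePusher needing only an iterable of JSON lines, so a plain Python list of 95 copies of one line works in place of a file object. A simplified sketch of that pusher pattern; this is not the actual JsonLinePusher implementation, and the push_record()/finish() names are assumptions about the importer interface:

import json

class MinimalLinePusher:
    """Feed newline-delimited JSON records to an importer-like object."""
    def __init__(self, importer, json_source):
        self.importer = importer
        self.json_source = json_source  # file object, list of str, any iterable

    def run(self):
        for line in self.json_source:
            if not line.strip():
                continue
            self.importer.push_record(json.loads(line))
        return self.importer.finish()  # assumed to return a counts dict
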
diff --git a/rust/src/entity_crud.rs b/rust/src/entity_crud.rs
index a92c45a6..73e7aa58 100644
--- a/rust/src/entity_crud.rs
+++ b/rust/src/entity_crud.rs
@@ -1964,7 +1964,26 @@ impl EntityCrud for ReleaseEntity {
         if let Some(ref release_type) = entity.release_type {
             check_release_type(release_type)?;
         }
+        if let Some(ref abstracts) = entity.abstracts {
+            if abstracts.len() > 200 {
+                return Err(FatcatError::BadRequest(
+                    "too many abstracts (sanity cap is 200)".to_string(),
+                ).into());
+            }
+        }
+        if let Some(ref refs) = entity.refs {
+            if refs.len() > 10000 {
+                return Err(FatcatError::BadRequest(
+                    "too many refs (sanity cap is 10000)".to_string(),
+                ).into());
+            }
+        }
         if let Some(ref contribs) = entity.contribs {
+            if contribs.len() > 10000 {
+                return Err(FatcatError::BadRequest(
+                    "too many contributors (sanity cap is 10000)".to_string(),
+                ).into());
+            }
             for contrib in contribs {
                 if let Some(ref role) = contrib.role {
                     check_contrib_role(role)?;
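
Note: these server-side caps (200 abstracts, 10,000 refs, 10,000 contributors) deliberately sit well above the importer's client-side thresholds, so a well-behaved client skips a record before fatcatd would ever reject it. Restated as a small Python checker for reference; the cap values come from the Rust code above, but the function itself is illustrative:

SANITY_CAPS = {"abstracts": 200, "refs": 10000, "contribs": 10000}

def check_release_sizes(release):
    """Raise ValueError if any list field exceeds its sanity cap."""
    for field, cap in SANITY_CAPS.items():
        values = getattr(release, field, None) or []
        if len(values) > cap:
            raise ValueError("too many %s (sanity cap is %d)" % (field, cap))
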
@@ -2160,18 +2179,20 @@ impl EntityCrud for ReleaseEntity {
             }
         }
 
-        if !release_ref_rows.is_empty() {
+        // Postgres caps a statement at 65,535 bind parameters, so insert in chunks
+        for release_ref_batch in release_ref_rows.chunks(2000) {
             insert_into(release_ref::table)
-                .values(release_ref_rows)
+                .values(release_ref_batch)
                 .execute(conn)?;
         }
 
-        if !release_contrib_rows.is_empty() {
+        for release_contrib_batch in release_contrib_rows.chunks(2000) {
             insert_into(release_contrib::table)
-                .values(release_contrib_rows)
+                .values(release_contrib_batch)
                 .execute(conn)?;
         }
 
+        // abstracts are capped at a much smaller count, so no batching needed
         if !abstract_rows.is_empty() {
             // Sort of an "upsert"; only inserts new abstract rows if they don't already exist
             insert_into(abstracts::table)
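
Note: chunking at 2,000 rows keeps each INSERT comfortably under Postgres's limit of 65,535 bind parameters per statement, given a handful of columns per row; and iterating chunks of an empty vector yields nothing, which is why the old is_empty() guards could be dropped. The same batching pattern in Python, for reference (the helper is illustrative, not from the codebase):

def chunked(rows, size=2000):
    """Yield successive slices of at most `size` rows."""
    for i in range(0, len(rows), size):
        yield rows[i:i + size]

rows = list(range(4500))
assert [len(batch) for batch in chunked(rows)] == [2000, 2000, 500]
assert list(chunked([])) == []  # empty input: no batches, no empty INSERT
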