allow importing huge contrib/refs lists

The motivation here isn't really to support these gigantic lists on principle, but to be able to ingest large corpuses without having to decide whether to filter out or crop such lists.
author: Bryan Newbold <bnewbold@robocracy.org> 2019-01-24 15:21:43 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-01-24 15:21:48 -0800
commit: f955f66789b0078dcb973ce587d2d3b3184e73a7 (patch)
tree: e048502b461fdfa973d4697b3043f4a516694b2f
parent: 206acf1b37a1a34d5338c744e17ef2035cd2db58 (diff)
download: fatcat-f955f66789b0078dcb973ce587d2d3b3184e73a7.tar.gz
fatcat-f955f66789b0078dcb973ce587d2d3b3184e73a7.zip
4 files changed, 50 insertions, 13 deletions
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 00c719f1..4a0322e7 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -303,9 +303,12 @@ class CrossrefImporter(EntityImporter):
         # external identifiers
         extids = self.lookup_ext_ids(doi=obj['DOI'].lower())
 
-        # TODO: filter out huge releases; we'll get them later (and fix bug in
-        # fatcatd)
-        if max(len(contribs), len(refs), len(abstracts)) > 750:
+        # filter out unreasonably huge releases (separate sanity cap per list)
+        if len(abstracts) > 100:
+            return None
+        if len(contribs) > 2000:
+            return None
+        if len(refs) > 5000:
             return None
 
         # release date parsing is amazingly complex
@@ -322,11 +325,16 @@ class CrossrefImporter(EntityImporter):
             release_year = raw_date[0]
             release_date = None
 
+        title = original_title = None  # either field may be absent from the record
+        if obj.get('original-title'):
+            original_title = clean(obj.get('original-title')[0], force_xml=True)
+        if obj.get('title'):
+            title = clean(obj.get('title')[0], force_xml=True)
         re = fatcat_client.ReleaseEntity(
             work_id=None,
             container_id=container_id,
-            title=clean(obj.get('title', [None])[0], force_xml=True),
-            original_title=clean(obj.get('original-title', [None])[0]),
+            title=title,
+            original_title=original_title,
             release_type=release_type,
             release_status=release_status,
             release_date=release_date,
diff --git a/python/tests/files/huge_crossref_doi.json.gz b/python/tests/files/huge_crossref_doi.json.gz
new file mode 100644
index 00000000..48f58257
--- /dev/null
+++ b/python/tests/files/huge_crossref_doi.json.gz
diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py
index 193f78f6..6e7f72c5 100644
--- a/python/tests/import_crossref.py
+++ b/python/tests/import_crossref.py
@@ -1,5 +1,5 @@
 
-import json
+import json, gzip
 import pytest
 from fatcat_tools.importers import CrossrefImporter, JsonLinePusher
 from fixtures import api
@@ -15,9 +15,17 @@ def crossref_importer_existing(api):
     with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
         yield CrossrefImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=False)
 
-def test_crossref_importer_batch(crossref_importer):
-    with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f:
-        JsonLinePusher(crossref_importer, f).run()
+def test_crossref_importer_huge(crossref_importer):
+    last_index = crossref_importer.api.get_changelog(limit=1)[0].index
+    with gzip.open('tests/files/huge_crossref_doi.json.gz', 'rt') as f:
+        crossref_importer.bezerk_mode = True
+        line = f.readline()
+        mega_blob = [line for i in range(95)]
+        counts = JsonLinePusher(crossref_importer, mega_blob).run()
+    assert counts['insert'] == 95
+    change = crossref_importer.api.get_changelog_entry(index=last_index+1)
+    release = crossref_importer.api.get_release(change.editgroup.edits.releases[0].ident)
+    assert len(release.contribs) == 1014
 
 def test_crossref_importer(crossref_importer):
     last_index = crossref_importer.api.get_changelog(limit=1)[0].index
diff --git a/rust/src/entity_crud.rs b/rust/src/entity_crud.rs
index a92c45a6..73e7aa58 100644
--- a/rust/src/entity_crud.rs
+++ b/rust/src/entity_crud.rs
@@ -1964,7 +1964,26 @@ impl EntityCrud for ReleaseEntity {
             if let Some(ref release_type) = entity.release_type {
                 check_release_type(release_type)?;
             }
+            if let Some(ref abstracts) = entity.abstracts {
+                if abstracts.len() > 200 {
+                    return Err(FatcatError::BadRequest(
+                        "too many abstracts (sanity cap is 200)".to_string(),
+                    ).into())
+                }
+            }
+            if let Some(ref refs) = entity.refs {
+                if refs.len() > 10000 {
+                    return Err(FatcatError::BadRequest(
+                        "too many refs (sanity cap is 10000)".to_string(),
+                    ).into())
+                }
+            }
             if let Some(ref contribs) = entity.contribs {
+                if contribs.len() > 10000 {
+                    return Err(FatcatError::BadRequest(
+                        "too many contributors (sanity cap is 10000)".to_string(),
+                    ).into())
+                }
                 for contrib in contribs {
                     if let Some(ref role) = contrib.role {
                         check_contrib_role(role)?;
@@ -2160,18 +2179,20 @@ impl EntityCrud for ReleaseEntity {
             }
         }
 
-        if !release_ref_rows.is_empty() {
+        // a single statement can't bind more than 65535 parameters, so insert in chunks
+        for release_ref_batch in release_ref_rows.chunks(2000) {
             insert_into(release_ref::table)
-                .values(release_ref_rows)
+                .values(release_ref_batch)
                 .execute(conn)?;
         }
 
-        if !release_contrib_rows.is_empty() {
+        for release_contrib_batch in release_contrib_rows.chunks(2000) {
             insert_into(release_contrib::table)
-                .values(release_contrib_rows)
+                .values(release_contrib_batch)
                 .execute(conn)?;
         }
 
+        // limit is much smaller for abstracts, so don't need to batch
         if !abstract_rows.is_empty() {
             // Sort of an "upsert"; only inserts new abstract rows if they don't already exist
             insert_into(abstracts::table)
author	Bryan Newbold <bnewbold@robocracy.org>	2019-01-24 15:21:43 -0800
committer	Bryan Newbold <bnewbold@robocracy.org>	2019-01-24 15:21:48 -0800
commit	f955f66789b0078dcb973ce587d2d3b3184e73a7 (patch)
tree	e048502b461fdfa973d4697b3043f4a516694b2f
parent	206acf1b37a1a34d5338c744e17ef2035cd2db58 (diff)
download	fatcat-f955f66789b0078dcb973ce587d2d3b3184e73a7.tar.gz fatcat-f955f66789b0078dcb973ce587d2d3b3184e73a7.zip