 python/fatcat_tools/importers/crossref.py    |  19 ++++++++++-----
 python/tests/files/huge_crossref_doi.json.gz | Bin 0 -> 12035 bytes
 python/tests/import_crossref.py              |  16 ++++++++----
 rust/src/entity_crud.rs                      |  29 +++++++++++++++----
 4 files changed, 51 insertions(+), 13 deletions(-)
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 00c719f1..4a0322e7 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -303,9 +303,12 @@ class CrossrefImporter(EntityImporter):
 
         # external identifiers
         extids = self.lookup_ext_ids(doi=obj['DOI'].lower())
 
-        # TODO: filter out huge releases; we'll get them later (and fix bug in
-        # fatcatd)
-        if max(len(contribs), len(refs), len(abstracts)) > 750:
+        # filter out unreasonably huge releases
+        if len(abstracts) > 100:
+            return None
+        if len(contribs) > 2000:
+            return None
+        if len(refs) > 5000:
             return None
 
         # release date parsing is amazingly complex
@@ -322,11 +325,17 @@
             release_year = raw_date[0]
             release_date = None
 
+        original_title = None
+        if obj.get('original-title'):
+            original_title = clean(obj.get('original-title')[0], force_xml=True)
+        title = None
+        if obj.get('title'):
+            title = clean(obj.get('title')[0], force_xml=True)
         re = fatcat_client.ReleaseEntity(
             work_id=None,
             container_id=container_id,
-            title=clean(obj.get('title', [None])[0], force_xml=True),
-            original_title=clean(obj.get('original-title', [None])[0]),
+            title=title,
+            original_title=original_title,
             release_type=release_type,
             release_status=release_status,
             release_date=release_date,
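
Note: the importer-side thresholds above (100 abstracts, 2,000 contribs, 5,000 refs) skip a record outright rather than truncating it, and they sit safely below the server-side sanity caps added in rust/src/entity_crud.rs further down. A minimal standalone sketch of the same skip logic, assuming plain lists; the function name and constants are illustrative, not part of the importer:

MAX_ABSTRACTS = 100   # mirrors the importer thresholds above
MAX_CONTRIBS = 2000
MAX_REFS = 5000

def is_reasonable_size(abstracts, contribs, refs):
    """Return False for records the importer should skip outright."""
    return (len(abstracts) <= MAX_ABSTRACTS
            and len(contribs) <= MAX_CONTRIBS
            and len(refs) <= MAX_REFS)

# the huge test record below (1014 contribs) passes; a 5001-ref record would not
assert is_reasonable_size([], [None] * 1014, [])
assert not is_reasonable_size([], [], [None] * 5001)
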
diff --git a/python/tests/files/huge_crossref_doi.json.gz b/python/tests/files/huge_crossref_doi.json.gz
new file mode 100644
index 00000000..48f58257
--- /dev/null
+++ b/python/tests/files/huge_crossref_doi.json.gz
Binary files differ
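
Note: the new fixture is a gzipped JSON-lines file holding one very large Crossref work record (1,014 contributors, per the test below). A hypothetical sketch of how such a fixture can be written and read back; the record here is a placeholder, not the real file's contents:

import gzip
import json

record = {"DOI": "10.1234/example", "title": ["Example"]}  # placeholder record

with gzip.open("/tmp/huge_crossref_doi.json.gz", "wt") as f:
    f.write(json.dumps(record) + "\n")

# mode "rt" decompresses and decodes to str lines, matching the test's usage
with gzip.open("/tmp/huge_crossref_doi.json.gz", "rt") as f:
    assert json.loads(f.readline())["DOI"] == "10.1234/example"
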
diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py
index 193f78f6..6e7f72c5 100644
--- a/python/tests/import_crossref.py
+++ b/python/tests/import_crossref.py
@@ -1,5 +1,5 @@
-import json
+import json, gzip
 import pytest
 from fatcat_tools.importers import CrossrefImporter, JsonLinePusher
 from fixtures import api
@@ -15,9 +15,17 @@ def crossref_importer_existing(api):
     with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
         yield CrossrefImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=False)
 
-def test_crossref_importer_batch(crossref_importer):
-    with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f:
-        JsonLinePusher(crossref_importer, f).run()
+def test_crossref_importer_huge(crossref_importer):
+    last_index = crossref_importer.api.get_changelog(limit=1)[0].index
+    with gzip.open('tests/files/huge_crossref_doi.json.gz', 'rt') as f:
+        crossref_importer.bezerk_mode = True
+        line = f.readline()
+        mega_blob = [line for _ in range(95)]
+        counts = JsonLinePusher(crossref_importer, mega_blob).run()
+    assert counts['insert'] == 95
+    change = crossref_importer.api.get_changelog_entry(index=last_index+1)
+    release = crossref_importer.api.get_release(change.editgroup.edits.releases[0].ident)
+    assert len(release.contribs) == 1014
 
 def test_crossref_importer(crossref_importer):
     last_index = crossref_importer.api.get_changelog(limit=1)[0].index
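
Note: the rewritten test relies on JsonLinePusher needing only an iterable of JSON lines, so a plain Python list of 95 copies of one line works in place of a file object. A simplified sketch of that pusher pattern; this is not the actual JsonLinePusher implementation, and the push_record()/finish() names are assumptions about the importer interface:

import json

class MinimalLinePusher:
    """Feed newline-delimited JSON records to an importer-like object."""
    def __init__(self, importer, json_source):
        self.importer = importer
        self.json_source = json_source  # file object, list of str, any iterable

    def run(self):
        for line in self.json_source:
            if not line.strip():
                continue
            self.importer.push_record(json.loads(line))
        return self.importer.finish()  # assumed to return a counts dict
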
diff --git a/rust/src/entity_crud.rs b/rust/src/entity_crud.rs
index a92c45a6..73e7aa58 100644
--- a/rust/src/entity_crud.rs
+++ b/rust/src/entity_crud.rs
@@ -1964,7 +1964,26 @@ impl EntityCrud for ReleaseEntity {
         if let Some(ref release_type) = entity.release_type {
             check_release_type(release_type)?;
         }
+        if let Some(ref abstracts) = entity.abstracts {
+            if abstracts.len() > 200 {
+                return Err(FatcatError::BadRequest(
+                    "too many abstracts (sanity cap is 200)".to_string(),
+                ).into());
+            }
+        }
+        if let Some(ref refs) = entity.refs {
+            if refs.len() > 10000 {
+                return Err(FatcatError::BadRequest(
+                    "too many refs (sanity cap is 10000)".to_string(),
+                ).into());
+            }
+        }
         if let Some(ref contribs) = entity.contribs {
+            if contribs.len() > 10000 {
+                return Err(FatcatError::BadRequest(
+                    "too many contributors (sanity cap is 10000)".to_string(),
+                ).into());
+            }
             for contrib in contribs {
                 if let Some(ref role) = contrib.role {
                     check_contrib_role(role)?;
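
Note: these server-side caps (200 abstracts, 10,000 refs, 10,000 contributors) deliberately sit well above the importer's client-side thresholds, so a well-behaved client skips a record before fatcatd would ever reject it. Restated as a small Python checker for reference; the cap values come from the Rust code above, but the function itself is illustrative:

SANITY_CAPS = {"abstracts": 200, "refs": 10000, "contribs": 10000}

def check_release_sizes(release):
    """Raise ValueError if any list field exceeds its sanity cap."""
    for field, cap in SANITY_CAPS.items():
        values = getattr(release, field, None) or []
        if len(values) > cap:
            raise ValueError("too many %s (sanity cap is %d)" % (field, cap))
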
@@ -2160,18 +2179,20 @@ impl EntityCrud for ReleaseEntity {
             }
         }
 
-        if !release_ref_rows.is_empty() {
+        // Postgres caps a statement at 65,535 bind parameters, so insert in chunks
+        for release_ref_batch in release_ref_rows.chunks(2000) {
             insert_into(release_ref::table)
-                .values(release_ref_rows)
+                .values(release_ref_batch)
                 .execute(conn)?;
         }
 
-        if !release_contrib_rows.is_empty() {
+        for release_contrib_batch in release_contrib_rows.chunks(2000) {
             insert_into(release_contrib::table)
-                .values(release_contrib_rows)
+                .values(release_contrib_batch)
                 .execute(conn)?;
         }
 
+        // abstracts are capped at a much smaller count, so no batching needed
         if !abstract_rows.is_empty() {
             // Sort of an "upsert"; only inserts new abstract rows if they don't already exist
             insert_into(abstracts::table)
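
Note: chunking at 2,000 rows keeps each INSERT comfortably under Postgres's limit of 65,535 bind parameters per statement, given a handful of columns per row; and iterating chunks of an empty vector yields nothing, which is why the old is_empty() guards could be dropped. The same batching pattern in Python, for reference (the helper is illustrative, not from the codebase):

def chunked(rows, size=2000):
    """Yield successive slices of at most `size` rows."""
    for i in range(0, len(rows), size):
        yield rows[i:i + size]

rows = list(range(4500))
assert [len(batch) for batch in chunked(rows)] == [2000, 2000, 500]
assert list(chunked([])) == []  # empty input: no batches, no empty INSERT
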