-rw-r--r-- | python/fatcat_tools/importers/crossref.py    | 18
-rw-r--r-- | python/tests/files/huge_crossref_doi.json.gz | bin 0 -> 12035 bytes
-rw-r--r-- | python/tests/import_crossref.py              | 16
-rw-r--r-- | rust/src/entity_crud.rs                      | 29
4 files changed, 50 insertions, 13 deletions
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 00c719f1..4a0322e7 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -303,9 +303,12 @@ class CrossrefImporter(EntityImporter):
         # external identifiers
         extids = self.lookup_ext_ids(doi=obj['DOI'].lower())
 
-        # TODO: filter out huge releases; we'll get them later (and fix bug in
-        # fatcatd)
-        if max(len(contribs), len(refs), len(abstracts)) > 750:
+        # filter out unreasonably huge releases
+        if len(abstracts) > 100:
+            return None
+        if len(contribs) > 2000:
+            return None
+        if len(refs) > 5000:
             return None
 
         # release date parsing is amazingly complex
@@ -322,11 +325,16 @@ class CrossrefImporter(EntityImporter):
             release_year = raw_date[0]
             release_date = None
 
+        original_title = None
+        if obj.get('original-title'):
+            original_title = clean(obj.get('original-title')[0], force_xml=True)
+        if obj.get('title'):
+            title = clean(obj.get('title')[0], force_xml=True)
         re = fatcat_client.ReleaseEntity(
             work_id=None,
             container_id=container_id,
-            title=clean(obj.get('title', [None])[0], force_xml=True),
-            original_title=clean(obj.get('original-title', [None])[0]),
+            title=title,
+            original_title=original_title,
             release_type=release_type,
             release_status=release_status,
             release_date=release_date,
diff --git a/python/tests/files/huge_crossref_doi.json.gz b/python/tests/files/huge_crossref_doi.json.gz
new file mode 100644
index 00000000..48f58257
--- /dev/null
+++ b/python/tests/files/huge_crossref_doi.json.gz
Binary files differ
diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py
index 193f78f6..6e7f72c5 100644
--- a/python/tests/import_crossref.py
+++ b/python/tests/import_crossref.py
@@ -1,5 +1,5 @@
-import json
+import json, gzip
 import pytest
 
 from fatcat_tools.importers import CrossrefImporter, JsonLinePusher
 from fixtures import api
@@ -15,9 +15,17 @@ def crossref_importer_existing(api):
     with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
         yield CrossrefImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=False)
 
-def test_crossref_importer_batch(crossref_importer):
-    with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f:
-        JsonLinePusher(crossref_importer, f).run()
+def test_crossref_importer_huge(crossref_importer):
+    last_index = crossref_importer.api.get_changelog(limit=1)[0].index
+    with gzip.open('tests/files/huge_crossref_doi.json.gz', 'rt') as f:
+        crossref_importer.bezerk_mode = True
+        line = f.readline()
+        mega_blob = [line for i in range(95)]
+        counts = JsonLinePusher(crossref_importer, mega_blob).run()
+    assert counts['insert'] == 95
+    change = crossref_importer.api.get_changelog_entry(index=last_index+1)
+    release = crossref_importer.api.get_release(change.editgroup.edits.releases[0].ident)
+    assert len(release.contribs) == 1014
 
 def test_crossref_importer(crossref_importer):
     last_index = crossref_importer.api.get_changelog(limit=1)[0].index
diff --git a/rust/src/entity_crud.rs b/rust/src/entity_crud.rs
index a92c45a6..73e7aa58 100644
--- a/rust/src/entity_crud.rs
+++ b/rust/src/entity_crud.rs
@@ -1964,7 +1964,26 @@ impl EntityCrud for ReleaseEntity {
         if let Some(ref release_type) = entity.release_type {
             check_release_type(release_type)?;
         }
+        if let Some(ref abstracts) = entity.abstracts {
+            if abstracts.len() > 200 {
+                return Err(FatcatError::BadRequest(
+                    "too many abstracts (sanity cap is 200)".to_string(),
+                ).into())
+            }
+        }
+        if let Some(ref refs) = entity.refs {
+            if refs.len() > 10000 {
+                return Err(FatcatError::BadRequest(
+                    "too many refs (sanity cap is 10000)".to_string(),
+                ).into())
+            }
+        }
         if let Some(ref contribs) = entity.contribs {
+            if contribs.len() > 10000 {
+                return Err(FatcatError::BadRequest(
+                    "too many contributors (sanity cap is 10000)".to_string(),
+                ).into())
+            }
             for contrib in contribs {
                 if let Some(ref role) = contrib.role {
                     check_contrib_role(role)?;
@@ -2160,18 +2179,20 @@
             }
         }
 
-        if !release_ref_rows.is_empty() {
+        // can't insert more than 65k rows at a time, so take chunks
+        for release_ref_batch in release_ref_rows.chunks(2000) {
             insert_into(release_ref::table)
-                .values(release_ref_rows)
+                .values(release_ref_batch)
                 .execute(conn)?;
         }
 
-        if !release_contrib_rows.is_empty() {
+        for release_contrib_batch in release_contrib_rows.chunks(2000) {
             insert_into(release_contrib::table)
-                .values(release_contrib_rows)
+                .values(release_contrib_batch)
                 .execute(conn)?;
         }
 
+        // limit is much smaller for abstracts, so don't need to batch
         if !abstract_rows.is_empty() {
             // Sort of an "upsert"; only inserts new abstract rows if they don't already exist
             insert_into(abstracts::table)