From f955f66789b0078dcb973ce587d2d3b3184e73a7 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 24 Jan 2019 15:21:43 -0800 Subject: allow importing contrib/refs lists The motivation here isn't really to support these gigantic lists on principle, but to be able to ingest large corpuses without having to decide whether to filter out or crop such lists. --- python/tests/files/huge_crossref_doi.json.gz | Bin 0 -> 12035 bytes python/tests/import_crossref.py | 16 ++++++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) create mode 100644 python/tests/files/huge_crossref_doi.json.gz (limited to 'python/tests') diff --git a/python/tests/files/huge_crossref_doi.json.gz b/python/tests/files/huge_crossref_doi.json.gz new file mode 100644 index 00000000..48f58257 Binary files /dev/null and b/python/tests/files/huge_crossref_doi.json.gz differ diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py index 193f78f6..6e7f72c5 100644 --- a/python/tests/import_crossref.py +++ b/python/tests/import_crossref.py @@ -1,5 +1,5 @@ -import json +import json, gzip import pytest from fatcat_tools.importers import CrossrefImporter, JsonLinePusher from fixtures import api @@ -15,9 +15,17 @@ def crossref_importer_existing(api): with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: yield CrossrefImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=False) -def test_crossref_importer_batch(crossref_importer): - with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f: - JsonLinePusher(crossref_importer, f).run() +def test_crossref_importer_huge(crossref_importer): + last_index = crossref_importer.api.get_changelog(limit=1)[0].index + with gzip.open('tests/files/huge_crossref_doi.json.gz', 'rt') as f: + crossref_importer.bezerk_mode = True + line = f.readline() + mega_blob = [line for i in range(95)] + counts = JsonLinePusher(crossref_importer, mega_blob).run() + assert counts['insert'] == 95 + change = crossref_importer.api.get_changelog_entry(index=last_index+1) + release = crossref_importer.api.get_release(change.editgroup.edits.releases[0].ident) + assert len(release.contribs) == 1014 def test_crossref_importer(crossref_importer): last_index = crossref_importer.api.get_changelog(limit=1)[0].index -- cgit v1.2.3