summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r-- python/fatcat_tools/importers/crossref.py | 18
-rw-r--r-- python/tests/files/huge_crossref_doi.json.gz | bin 0 -> 12035 bytes
-rw-r--r-- python/tests/import_crossref.py | 16
3 files changed, 25 insertions, 9 deletions
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 00c719f1..4a0322e7 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -303,9 +303,12 @@ class CrossrefImporter(EntityImporter):
# external identifiers
extids = self.lookup_ext_ids(doi=obj['DOI'].lower())
- # TODO: filter out huge releases; we'll get them later (and fix bug in
- # fatcatd)
- if max(len(contribs), len(refs), len(abstracts)) > 750:
+        # filter out unreasonably huge releases (separate caps for abstracts, contribs, refs)
+        if len(abstracts) > 100:
+            return None
+        if len(contribs) > 2000:
+            return None
+        if len(refs) > 5000:
             return None
# release date parsing is amazingly complex
@@ -322,11 +325,16 @@ class CrossrefImporter(EntityImporter):
release_year = raw_date[0]
release_date = None
+        # bind both names unconditionally: a record lacking a field yields None, never an unbound name
+        titles = obj.get('title')
+        orig_titles = obj.get('original-title')
+        title = clean(titles[0], force_xml=True) if titles else None
+        original_title = clean(orig_titles[0], force_xml=True) if orig_titles else None
re = fatcat_client.ReleaseEntity(
work_id=None,
container_id=container_id,
- title=clean(obj.get('title', [None])[0], force_xml=True),
- original_title=clean(obj.get('original-title', [None])[0]),
+ title=title,
+ original_title=original_title,
release_type=release_type,
release_status=release_status,
release_date=release_date,
diff --git a/python/tests/files/huge_crossref_doi.json.gz b/python/tests/files/huge_crossref_doi.json.gz
new file mode 100644
index 00000000..48f58257
--- /dev/null
+++ b/python/tests/files/huge_crossref_doi.json.gz
Binary files differ
diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py
index 193f78f6..6e7f72c5 100644
--- a/python/tests/import_crossref.py
+++ b/python/tests/import_crossref.py
@@ -1,5 +1,5 @@
-import json
+import json, gzip
import pytest
from fatcat_tools.importers import CrossrefImporter, JsonLinePusher
from fixtures import api
@@ -15,9 +15,17 @@ def crossref_importer_existing(api):
with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
yield CrossrefImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=False)
-def test_crossref_importer_batch(crossref_importer):
- with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f:
- JsonLinePusher(crossref_importer, f).run()
+def test_crossref_importer_huge(crossref_importer):
+    """Push 95 copies of the first line of the huge-DOI fixture and check the inserted release."""
+    start_index = crossref_importer.api.get_changelog(limit=1)[0].index
+    with gzip.open('tests/files/huge_crossref_doi.json.gz', 'rt') as gz_file:
+        crossref_importer.bezerk_mode = True
+        record_line = gz_file.readline()
+        counts = JsonLinePusher(crossref_importer, [record_line] * 95).run()
+    assert counts['insert'] == 95
+    changelog_entry = crossref_importer.api.get_changelog_entry(index=start_index + 1)
+    release = crossref_importer.api.get_release(changelog_entry.editgroup.edits.releases[0].ident)
+    assert len(release.contribs) == 1014
def test_crossref_importer(crossref_importer):
last_index = crossref_importer.api.get_changelog(limit=1)[0].index