diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/fatcat_tools/importers/crossref.py | 18 | ||||
-rw-r--r-- | python/tests/files/huge_crossref_doi.json.gz | bin | 0 -> 12035 bytes | |||
-rw-r--r-- | python/tests/import_crossref.py | 16 |
3 files changed, 25 insertions, 9 deletions
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 00c719f1..4a0322e7 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -303,9 +303,12 @@ class CrossrefImporter(EntityImporter):
         # external identifiers
         extids = self.lookup_ext_ids(doi=obj['DOI'].lower())
 
-        # TODO: filter out huge releases; we'll get them later (and fix bug in
-        # fatcatd)
-        if max(len(contribs), len(refs), len(abstracts)) > 750:
+        # filter out unreasonably huge releases
+        if len(abstracts) > 100:
+            return None
+        if len(refs) > 2000:
+            return None
+        if len(refs) > 5000:
             return None
 
         # release date parsing is amazingly complex
@@ -322,11 +325,16 @@ class CrossrefImporter(EntityImporter):
             release_year = raw_date[0]
             release_date = None
 
+        original_title = None
+        if obj.get('original-title'):
+            original_title = clean(obj.get('original-title')[0], force_xml=True)
+        if obj.get('title'):
+            title = clean(obj.get('title')[0], force_xml=True)
         re = fatcat_client.ReleaseEntity(
             work_id=None,
             container_id=container_id,
-            title=clean(obj.get('title', [None])[0], force_xml=True),
-            original_title=clean(obj.get('original-title', [None])[0]),
+            title=title,
+            original_title=original_title,
             release_type=release_type,
             release_status=release_status,
             release_date=release_date,
diff --git a/python/tests/files/huge_crossref_doi.json.gz b/python/tests/files/huge_crossref_doi.json.gz
Binary files differ
new file mode 100644
index 00000000..48f58257
--- /dev/null
+++ b/python/tests/files/huge_crossref_doi.json.gz
diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py
index 193f78f6..6e7f72c5 100644
--- a/python/tests/import_crossref.py
+++ b/python/tests/import_crossref.py
@@ -1,5 +1,5 @@
 
-import json
+import json, gzip
 import pytest
 from fatcat_tools.importers import CrossrefImporter, JsonLinePusher
 from fixtures import api
@@ -15,9 +15,17 @@ def crossref_importer_existing(api):
     with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
         yield CrossrefImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=False)
 
-def test_crossref_importer_batch(crossref_importer):
-    with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f:
-        JsonLinePusher(crossref_importer, f).run()
+def test_crossref_importer_huge(crossref_importer):
+    last_index = crossref_importer.api.get_changelog(limit=1)[0].index
+    with gzip.open('tests/files/huge_crossref_doi.json.gz', 'rt') as f:
+        crossref_importer.bezerk_mode = True
+        line = f.readline()
+        mega_blob = [line for i in range(95)]
+        counts = JsonLinePusher(crossref_importer, mega_blob).run()
+    assert counts['insert'] == 95
+    change = crossref_importer.api.get_changelog_entry(index=last_index+1)
+    release = crossref_importer.api.get_release(change.editgroup.edits.releases[0].ident)
+    assert len(release.contribs) == 1014
 
 def test_crossref_importer(crossref_importer):
     last_index = crossref_importer.api.get_changelog(limit=1)[0].index