diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-01-24 15:21:43 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-01-24 15:21:48 -0800 |
commit | f955f66789b0078dcb973ce587d2d3b3184e73a7 (patch) | |
tree | e048502b461fdfa973d4697b3043f4a516694b2f /python | |
parent | 206acf1b37a1a34d5338c744e17ef2035cd2db58 (diff) | |
download | fatcat-f955f66789b0078dcb973ce587d2d3b3184e73a7.tar.gz fatcat-f955f66789b0078dcb973ce587d2d3b3184e73a7.zip |
allow importing huge contrib/refs lists
The motivation here isn't really to support these gigantic lists on
principle, but to be able to ingest large corpuses without having to
decide whether to filter out or crop such lists.
Diffstat (limited to 'python')
-rw-r--r-- | python/fatcat_tools/importers/crossref.py | 18 | ||||
-rw-r--r-- | python/tests/files/huge_crossref_doi.json.gz | bin | 0 -> 12035 bytes | |||
-rw-r--r-- | python/tests/import_crossref.py | 16 |
3 files changed, 25 insertions, 9 deletions
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index 00c719f1..4a0322e7 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -303,9 +303,12 @@ class CrossrefImporter(EntityImporter): # external identifiers extids = self.lookup_ext_ids(doi=obj['DOI'].lower()) - # TODO: filter out huge releases; we'll get them later (and fix bug in - # fatcatd) - if max(len(contribs), len(refs), len(abstracts)) > 750: + # filter out unreasonably huge releases + if len(abstracts) > 100: + return None + if len(refs) > 2000: + return None + if len(refs) > 5000: return None # release date parsing is amazingly complex @@ -322,11 +325,16 @@ class CrossrefImporter(EntityImporter): release_year = raw_date[0] release_date = None + original_title = None + if obj.get('original-title'): + original_title = clean(obj.get('original-title')[0], force_xml=True) + if obj.get('title'): + title = clean(obj.get('title')[0], force_xml=True) re = fatcat_client.ReleaseEntity( work_id=None, container_id=container_id, - title=clean(obj.get('title', [None])[0], force_xml=True), - original_title=clean(obj.get('original-title', [None])[0]), + title=title, + original_title=original_title, release_type=release_type, release_status=release_status, release_date=release_date, diff --git a/python/tests/files/huge_crossref_doi.json.gz b/python/tests/files/huge_crossref_doi.json.gz Binary files differ new file mode 100644 index 00000000..48f58257 --- /dev/null +++ b/python/tests/files/huge_crossref_doi.json.gz diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py index 193f78f6..6e7f72c5 100644 --- a/python/tests/import_crossref.py +++ b/python/tests/import_crossref.py @@ -1,5 +1,5 @@ -import json +import json, gzip import pytest from fatcat_tools.importers import CrossrefImporter, JsonLinePusher from fixtures import api @@ -15,9 +15,17 @@ def crossref_importer_existing(api): with 
open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: yield CrossrefImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=False) -def test_crossref_importer_batch(crossref_importer): - with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f: - JsonLinePusher(crossref_importer, f).run() +def test_crossref_importer_huge(crossref_importer): + last_index = crossref_importer.api.get_changelog(limit=1)[0].index + with gzip.open('tests/files/huge_crossref_doi.json.gz', 'rt') as f: + crossref_importer.bezerk_mode = True + line = f.readline() + mega_blob = [line for i in range(95)] + counts = JsonLinePusher(crossref_importer, mega_blob).run() + assert counts['insert'] == 95 + change = crossref_importer.api.get_changelog_entry(index=last_index+1) + release = crossref_importer.api.get_release(change.editgroup.edits.releases[0].ident) + assert len(release.contribs) == 1014 def test_crossref_importer(crossref_importer): last_index = crossref_importer.api.get_changelog(limit=1)[0].index |