diff options
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- | python/fatcat_tools/importers/crossref.py | 18 |
1 files changed, 13 insertions, 5 deletions
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index 00c719f1..4a0322e7 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -303,9 +303,12 @@ class CrossrefImporter(EntityImporter): # external identifiers extids = self.lookup_ext_ids(doi=obj['DOI'].lower()) - # TODO: filter out huge releases; we'll get them later (and fix bug in - # fatcatd) - if max(len(contribs), len(refs), len(abstracts)) > 750: + # filter out unreasonably huge releases + if len(abstracts) > 100: + return None + if len(refs) > 2000: + return None + if len(refs) > 5000: return None # release date parsing is amazingly complex @@ -322,11 +325,16 @@ class CrossrefImporter(EntityImporter): release_year = raw_date[0] release_date = None + original_title = None + if obj.get('original-title'): + original_title = clean(obj.get('original-title')[0], force_xml=True) + if obj.get('title'): + title = clean(obj.get('title')[0], force_xml=True) re = fatcat_client.ReleaseEntity( work_id=None, container_id=container_id, - title=clean(obj.get('title', [None])[0], force_xml=True), - original_title=clean(obj.get('original-title', [None])[0]), + title=title, + original_title=original_title, release_type=release_type, release_status=release_status, release_date=release_date, |