From ef7de452e81a8ccf70c1fa229f80b92bc65c46f0 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 4 Nov 2019 11:42:37 -0800 Subject: crossref: count why skip happened Records might be skipped based on release type (e.g. a container, not a paper/release), a missing title, or other reasons. Over 7 million DOIs are getting skipped, and it is not yet clear why, so count each skip reason separately. Also fixes the first "huge release" size check, which tested len(refs) > 2000 where len(contribs) > 2000 was intended. --- python/fatcat_tools/importers/crossref.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index 3705eb81..6bb233f3 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -174,10 +174,12 @@ class CrossrefImporter(EntityImporter): if obj.get('type') in (None, 'journal', 'proceedings', 'standard-series', 'report-series', 'book-series', 'book-set', 'book-track', 'proceedings-series'): + self.counts['skip-release-type'] += 1 return None # Do require the 'title' keys to exsit, as release entities do if (not 'title' in obj) or (not obj['title']): + self.counts['skip-blank-title'] += 1 return None release_type = self.map_release_type(obj['type']) @@ -376,10 +378,13 @@ class CrossrefImporter(EntityImporter): # filter out unreasonably huge releases if len(abstracts) > 100: + self.counts['skip-huge-abstracts'] += 1 return None - if len(refs) > 2000: + if len(contribs) > 2000: + self.counts['skip-huge-contribs'] += 1 return None if len(refs) > 5000: + self.counts['skip-huge-refs'] += 1 return None # release date parsing is amazingly complex @@ -406,6 +411,7 @@ class CrossrefImporter(EntityImporter): title = clean(obj.get('title')[0], force_xml=True) if not title or len(title) <= 1: # title can't be just a single character + self.counts['skip-blank-title'] += 1 return None subtitle = None -- cgit v1.2.3