2 files changed, 13 insertions, 4 deletions
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index faee6aac..d8abf3eb 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -11,12 +11,14 @@ from .common import EntityImporter, clean
 
 # The docs/guide should be the cannonical home for these mappings; update there
 # first
+# A list of Crossref types (with counts) can be fetched from the Crossref API:
+# https://api.crossref.org/works?rows=0&facet=type-name:*
 CROSSREF_TYPE_MAP = {
     'book': 'book',
     'book-chapter': 'chapter',
     'book-part': 'chapter',
     'book-section': 'chapter',
-    'component': None,
+    'component': 'component',
     'dataset': 'dataset',
     'dissertation': 'thesis',
     'edited-book': 'book',
@@ -158,6 +160,7 @@ class CrossrefImporter(EntityImporter):
 
     def want(self, obj):
         if not obj.get('title'):
+            self.counts['skip-blank-title'] += 1
             return False
 
         # do most of these checks in-line below
@@ -174,10 +177,12 @@ class CrossrefImporter(EntityImporter):
         if obj.get('type') in (None, 'journal', 'proceedings',
                 'standard-series', 'report-series', 'book-series', 'book-set',
                 'book-track', 'proceedings-series'):
+            self.counts['skip-release-type'] += 1
             return None
 
         # Do require the 'title' keys to exsit, as release entities do
         if (not 'title' in obj) or (not obj['title']):
+            self.counts['skip-blank-title'] += 1
             return None
 
         release_type = self.map_release_type(obj['type'])
@@ -376,10 +381,13 @@ class CrossrefImporter(EntityImporter):
 
         # filter out unreasonably huge releases
         if len(abstracts) > 100:
+            self.counts['skip-huge-abstracts'] += 1
             return None
-        if len(refs) > 2000:
+        if len(contribs) > 2000:
+            self.counts['skip-huge-contribs'] += 1
             return None
         if len(refs) > 5000:
+            self.counts['skip-huge-refs'] += 1
             return None
 
         # release date parsing is amazingly complex
@@ -406,6 +414,7 @@ class CrossrefImporter(EntityImporter):
             title = clean(obj.get('title')[0], force_xml=True)
             if not title or len(title) <= 1:
                 # title can't be just a single character
+                self.counts['skip-blank-title'] += 1
                 return None
 
         subtitle = None
@@ -413,7 +422,7 @@ class CrossrefImporter(EntityImporter):
             subtitle = clean(obj.get('subtitle')[0], force_xml=True)
             if not subtitle or len(subtitle) <= 1:
                 # subtitle can't be just a single character
-                return None
+                subtitle = None
 
         if extra_crossref:
             extra['crossref'] = extra_crossref
diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py
index 3954abe2..afa2410f 100644
--- a/python/tests/import_crossref.py
+++ b/python/tests/import_crossref.py
@@ -58,7 +58,7 @@ def test_crossref_importer(crossref_importer):
 def test_crossref_mappings(crossref_importer):
     assert crossref_importer.map_release_type('journal-article') == "article-journal"
     assert crossref_importer.map_release_type('asdf') is None
-    assert crossref_importer.map_release_type('component') is None
+    assert crossref_importer.map_release_type('book-series') is None
     assert crossref_importer.map_release_type('standard') == 'standard'
 
 def test_crossref_importer_create(crossref_importer):