diff options
Diffstat (limited to 'python')
| -rw-r--r-- | python/fatcat_tools/importers/crossref.py | 15 | ||||
| -rw-r--r-- | python/tests/import_crossref.py | 2 | 
2 files changed, 13 insertions, 4 deletions
| diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index faee6aac..d8abf3eb 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -11,12 +11,14 @@ from .common import EntityImporter, clean  # The docs/guide should be the cannonical home for these mappings; update there  # first +# Can get a list of Crossref types (with counts) via API: +# https://api.crossref.org/works?rows=0&facet=type-name:*  CROSSREF_TYPE_MAP = {      'book': 'book',      'book-chapter': 'chapter',      'book-part': 'chapter',      'book-section': 'chapter', -    'component': None, +    'component': 'component',      'dataset': 'dataset',      'dissertation': 'thesis',      'edited-book': 'book', @@ -158,6 +160,7 @@ class CrossrefImporter(EntityImporter):      def want(self, obj):          if not obj.get('title'): +            self.counts['skip-blank-title'] += 1              return False          # do most of these checks in-line below @@ -174,10 +177,12 @@ class CrossrefImporter(EntityImporter):          if obj.get('type') in (None, 'journal', 'proceedings',                  'standard-series', 'report-series', 'book-series', 'book-set',                  'book-track', 'proceedings-series'): +            self.counts['skip-release-type'] += 1              return None          # Do require the 'title' keys to exsit, as release entities do          if (not 'title' in obj) or (not obj['title']): +            self.counts['skip-blank-title'] += 1              return None          release_type = self.map_release_type(obj['type']) @@ -376,10 +381,13 @@ class CrossrefImporter(EntityImporter):          # filter out unreasonably huge releases          if len(abstracts) > 100: +            self.counts['skip-huge-abstracts'] += 1              return None -        if len(refs) > 2000: +        if len(contribs) > 2000: +            self.counts['skip-huge-contribs'] += 1              return None          if len(refs) > 5000: +            self.counts['skip-huge-refs'] += 1              return None          # release date parsing is amazingly complex @@ -406,6 +414,7 @@ class CrossrefImporter(EntityImporter):              title = clean(obj.get('title')[0], force_xml=True)              if not title or len(title) <= 1:                  # title can't be just a single character +                self.counts['skip-blank-title'] += 1                  return None          subtitle = None @@ -413,7 +422,7 @@ class CrossrefImporter(EntityImporter):              subtitle = clean(obj.get('subtitle')[0], force_xml=True)              if not subtitle or len(subtitle) <= 1:                  # subtitle can't be just a single character -                return None +                subtitle = None          if extra_crossref:              extra['crossref'] = extra_crossref diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py index 3954abe2..afa2410f 100644 --- a/python/tests/import_crossref.py +++ b/python/tests/import_crossref.py @@ -58,7 +58,7 @@ def test_crossref_importer(crossref_importer):  def test_crossref_mappings(crossref_importer):      assert crossref_importer.map_release_type('journal-article') == "article-journal"      assert crossref_importer.map_release_type('asdf') is None -    assert crossref_importer.map_release_type('component') is None +    assert crossref_importer.map_release_type('book-series') is None      assert crossref_importer.map_release_type('standard') == 'standard'  def test_crossref_importer_create(crossref_importer): | 
