diff options
-rw-r--r-- | TODO.md | 1 | ||||
-rw-r--r-- | guide/src/entity_release.md | 2 | ||||
-rw-r--r-- | python/fatcat_tools/importers/crossref.py | 15 | ||||
-rw-r--r-- | python/tests/import_crossref.py | 2 | ||||
-rw-r--r-- | rust/src/identifiers.rs | 1 |
5 files changed, 17 insertions, 4 deletions
@@ -165,6 +165,7 @@ new importers: ## Schema / Entity Fields +- file+fileset "first seen" datetime - file type/scope/coverage: "fulltext", "abstract", etc - elastic transform should only include authors, not editors (?) - `translation_of` field on releases (or similar/general). `retraction_of` to a diff --git a/guide/src/entity_release.md b/guide/src/entity_release.md index 27ce0f2c..1fd0a2f1 100644 --- a/guide/src/entity_release.md +++ b/guide/src/entity_release.md @@ -216,6 +216,8 @@ with a small number of (proposed) extensions: - `stub` (fatcat extension) for releases which have notable external identifiers, and thus are included "for completeness", but don't seem to represent a "full work". +- `component` (fatcat extension) for sub-components of a full paper (or other + work). Eg, figures or tables. An example of a `stub` might be a paper that gets an extra DOI by accident; the primary DOI should be a full release, and the accidental DOI can be a `stub` diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index faee6aac..d8abf3eb 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -11,12 +11,14 @@ from .common import EntityImporter, clean # The docs/guide should be the cannonical home for these mappings; update there # first +# Can get a list of Crossref types (with counts) via API: +# https://api.crossref.org/works?rows=0&facet=type-name:* CROSSREF_TYPE_MAP = { 'book': 'book', 'book-chapter': 'chapter', 'book-part': 'chapter', 'book-section': 'chapter', - 'component': None, + 'component': 'component', 'dataset': 'dataset', 'dissertation': 'thesis', 'edited-book': 'book', @@ -158,6 +160,7 @@ class CrossrefImporter(EntityImporter): def want(self, obj): if not obj.get('title'): + self.counts['skip-blank-title'] += 1 return False # do most of these checks in-line below @@ -174,10 +177,12 @@ class CrossrefImporter(EntityImporter): if obj.get('type') in (None, 'journal', 'proceedings', 'standard-series', 'report-series', 'book-series', 'book-set', 'book-track', 'proceedings-series'): + self.counts['skip-release-type'] += 1 return None # Do require the 'title' keys to exsit, as release entities do if (not 'title' in obj) or (not obj['title']): + self.counts['skip-blank-title'] += 1 return None release_type = self.map_release_type(obj['type']) @@ -376,10 +381,13 @@ class CrossrefImporter(EntityImporter): # filter out unreasonably huge releases if len(abstracts) > 100: + self.counts['skip-huge-abstracts'] += 1 return None - if len(refs) > 2000: + if len(contribs) > 2000: + self.counts['skip-huge-contribs'] += 1 return None if len(refs) > 5000: + self.counts['skip-huge-refs'] += 1 return None # release date parsing is amazingly complex @@ -406,6 +414,7 @@ class CrossrefImporter(EntityImporter): title = clean(obj.get('title')[0], force_xml=True) if not title or len(title) <= 1: # title can't be just a single character + self.counts['skip-blank-title'] += 1 return None subtitle = None @@ -413,7 +422,7 @@ class CrossrefImporter(EntityImporter): subtitle = clean(obj.get('subtitle')[0], force_xml=True) if not subtitle or len(subtitle) <= 1: # subtitle can't be just a single character - return None + subtitle = None if extra_crossref: extra['crossref'] = extra_crossref diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py index 3954abe2..afa2410f 100644 --- a/python/tests/import_crossref.py +++ b/python/tests/import_crossref.py @@ -58,7 +58,7 @@ def test_crossref_importer(crossref_importer): def test_crossref_mappings(crossref_importer): assert crossref_importer.map_release_type('journal-article') == "article-journal" assert crossref_importer.map_release_type('asdf') is None - assert crossref_importer.map_release_type('component') is None + assert crossref_importer.map_release_type('book-series') is None assert crossref_importer.map_release_type('standard') == 'standard' def test_crossref_importer_create(crossref_importer): diff --git a/rust/src/identifiers.rs b/rust/src/identifiers.rs index 597af338..180dc43b 100644 --- a/rust/src/identifiers.rs +++ b/rust/src/identifiers.rs @@ -540,6 +540,7 @@ pub fn check_release_type(raw: &str) -> Result<()> { "letter", "stub", "retraction", + "component", ]; for good in valid_types { if raw == good { |