summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- TODO.md 1
-rw-r--r-- guide/src/entity_release.md 2
-rw-r--r-- python/fatcat_tools/importers/crossref.py 15
-rw-r--r-- python/tests/import_crossref.py 2
-rw-r--r-- rust/src/identifiers.rs 1
5 files changed, 17 insertions, 4 deletions
diff --git a/TODO.md b/TODO.md
index 2fec5121..0c766204 100644
--- a/TODO.md
+++ b/TODO.md
@@ -165,6 +165,7 @@ new importers:
## Schema / Entity Fields
+- file+fileset "first seen" datetime
- file type/scope/coverage: "fulltext", "abstract", etc
- elastic transform should only include authors, not editors (?)
- `translation_of` field on releases (or similar/general). `retraction_of` to a
diff --git a/guide/src/entity_release.md b/guide/src/entity_release.md
index 27ce0f2c..1fd0a2f1 100644
--- a/guide/src/entity_release.md
+++ b/guide/src/entity_release.md
@@ -216,6 +216,8 @@ with a small number of (proposed) extensions:
- `stub` (fatcat extension) for releases which have notable external
identifiers, and thus are included "for completeness", but don't seem to
represent a "full work".
+- `component` (fatcat extension) for sub-components of a full paper (or other
+ work). Eg, figures or tables.
An example of a `stub` might be a paper that gets an extra DOI by accident; the
primary DOI should be a full release, and the accidental DOI can be a `stub`
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index faee6aac..d8abf3eb 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -11,12 +11,14 @@ from .common import EntityImporter, clean
# The docs/guide should be the cannonical home for these mappings; update there
# first
+# Can get a list of Crossref types (with counts) via API:
+# https://api.crossref.org/works?rows=0&facet=type-name:*
CROSSREF_TYPE_MAP = {
'book': 'book',
'book-chapter': 'chapter',
'book-part': 'chapter',
'book-section': 'chapter',
- 'component': None,
+ 'component': 'component',
'dataset': 'dataset',
'dissertation': 'thesis',
'edited-book': 'book',
@@ -158,6 +160,7 @@ class CrossrefImporter(EntityImporter):
def want(self, obj):
if not obj.get('title'):
+ self.counts['skip-blank-title'] += 1
return False
# do most of these checks in-line below
@@ -174,10 +177,12 @@ class CrossrefImporter(EntityImporter):
if obj.get('type') in (None, 'journal', 'proceedings',
'standard-series', 'report-series', 'book-series', 'book-set',
'book-track', 'proceedings-series'):
+ self.counts['skip-release-type'] += 1
return None
# Do require the 'title' keys to exsit, as release entities do
if (not 'title' in obj) or (not obj['title']):
+ self.counts['skip-blank-title'] += 1
return None
release_type = self.map_release_type(obj['type'])
@@ -376,10 +381,13 @@ class CrossrefImporter(EntityImporter):
# filter out unreasonably huge releases
if len(abstracts) > 100:
+ self.counts['skip-huge-abstracts'] += 1
return None
- if len(refs) > 2000:
+ if len(contribs) > 2000:
+ self.counts['skip-huge-contribs'] += 1
return None
if len(refs) > 5000:
+ self.counts['skip-huge-refs'] += 1
return None
# release date parsing is amazingly complex
@@ -406,6 +414,7 @@ class CrossrefImporter(EntityImporter):
title = clean(obj.get('title')[0], force_xml=True)
if not title or len(title) <= 1:
# title can't be just a single character
+ self.counts['skip-blank-title'] += 1
return None
subtitle = None
@@ -413,7 +422,7 @@ class CrossrefImporter(EntityImporter):
subtitle = clean(obj.get('subtitle')[0], force_xml=True)
if not subtitle or len(subtitle) <= 1:
# subtitle can't be just a single character
- return None
+ subtitle = None
if extra_crossref:
extra['crossref'] = extra_crossref
diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py
index 3954abe2..afa2410f 100644
--- a/python/tests/import_crossref.py
+++ b/python/tests/import_crossref.py
@@ -58,7 +58,7 @@ def test_crossref_importer(crossref_importer):
def test_crossref_mappings(crossref_importer):
assert crossref_importer.map_release_type('journal-article') == "article-journal"
assert crossref_importer.map_release_type('asdf') is None
- assert crossref_importer.map_release_type('component') is None
+ assert crossref_importer.map_release_type('book-series') is None
assert crossref_importer.map_release_type('standard') == 'standard'
def test_crossref_importer_create(crossref_importer):
diff --git a/rust/src/identifiers.rs b/rust/src/identifiers.rs
index 597af338..180dc43b 100644
--- a/rust/src/identifiers.rs
+++ b/rust/src/identifiers.rs
@@ -540,6 +540,7 @@ pub fn check_release_type(raw: &str) -> Result<()> {
"letter",
"stub",
"retraction",
+ "component",
];
for good in valid_types {
if raw == good {