summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- python/fatcat_tools/importers/crossref.py 33
-rw-r--r-- python/fatcat_tools/importers/grobid_metadata.py 2
-rw-r--r-- python/tests/import_crossref.py 8
3 files changed, 39 insertions, 4 deletions
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index c4695c7f..fe80c2d3 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -8,6 +8,28 @@ import fatcat_client
from fatcat_tools.importers.common import FatcatImporter
+CROSSREF_TYPE_MAP = {
+ 'book': 'book',
+ 'book-chapter': 'chapter',
+ 'book-part': 'chapter',
+ 'book-section': 'chapter',
+ 'component': None,
+ 'dataset': 'dataset',
+ 'dissertation': 'thesis',
+ 'edited-book': 'book',
+ 'journal-article': 'article-journal',
+ 'monograph': 'monograph',
+ 'other': None,
+ 'peer-review': 'peer_review',
+ 'posted-content': 'post',
+ 'proceedings-article': 'paper-conference',
+ 'reference-book': 'book',
+ 'reference-entry': 'entry',
+ 'report': 'report',
+ 'standard': 'standard',
+}
+
+
class FatcatCrossrefImporter(FatcatImporter):
def __init__(self, host_url, issn_map_file, extid_map_file=None, create_containers=True):
@@ -35,6 +57,9 @@ class FatcatCrossrefImporter(FatcatImporter):
pmcid=row[2],
wikidata_qid=row[3])
+ def map_release_type(self, crossref_type):
+ return CROSSREF_TYPE_MAP.get(crossref_type)
+
def parse_crossref_dict(self, obj):
"""
obj is a python dict (parsed from json).
@@ -46,7 +71,10 @@ class FatcatCrossrefImporter(FatcatImporter):
return None
# Other ways to be out of scope (provisionally)
- if (not 'type' in obj):
+ # journal-issue and journal-volume map to None, but allowed for now
+ if obj.get('type') in (None, 'journal', 'proceedings',
+ 'standard-series', 'report-series', 'book-series', 'book-set',
+ 'book-track', 'proceedings-series'):
return None
# contribs
@@ -76,6 +104,7 @@ class FatcatCrossrefImporter(FatcatImporter):
extra['sequence'] = am.get('sequence')
if not extra:
extra = None
+ assert(ctype in ("author", "editor", "translator"))
contribs.append(fatcat_client.ReleaseContrib(
creator_id=creator_id,
index=index,
@@ -216,7 +245,7 @@ class FatcatCrossrefImporter(FatcatImporter):
refs=refs,
container_id=container_id,
publisher=publisher,
- release_type=obj['type'],
+ release_type=self.map_release_type(obj['type']),
release_status=release_status,
doi=obj['DOI'].lower(),
isbn13=isbn13,
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index 6d635479..dedc9728 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -67,7 +67,7 @@ class FatcatGrobidMetadataImporter(FatcatImporter):
ref['extra'] = cite_extra
refs.append(ref)
- release_type = "journal-article"
+ release_type = "article-journal"
release_date = None
if obj.get('date'):
# TODO: only returns year, ever? how to handle?
diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py
index ab33d0fc..078db184 100644
--- a/python/tests/import_crossref.py
+++ b/python/tests/import_crossref.py
@@ -17,6 +17,12 @@ def test_crossref_importer(crossref_importer):
with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f:
crossref_importer.process_source(f)
+def test_crossref_mappings(crossref_importer):
+ assert crossref_importer.map_release_type('journal-article') == "article-journal"
+ assert crossref_importer.map_release_type('asdf') is None
+ assert crossref_importer.map_release_type('component') is None
+ assert crossref_importer.map_release_type('standard') == 'standard'
+
def test_crossref_importer_create(crossref_importer):
crossref_importer.create_containers = True
with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f:
@@ -33,7 +39,7 @@ def test_crossref_dict_parse(crossref_importer):
assert r.publisher == "Wiley-Blackwell"
print(extra)
assert extra['container-title'] == ["International Journal of Quantum Chemistry"]
- assert r.release_type == "journal-article"
+ assert r.release_type == "article-journal"
assert r.release_status == "published"
assert r.isbn13 == "978-3-16-148410-0"
assert 'subtitle' not in extra