From 30bdb1b0ba28b2e4a81aa7209d294c224d8a2245 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 14 Nov 2018 21:45:09 -0800 Subject: update crossref controlled vocab --- python/fatcat_tools/importers/crossref.py | 33 ++++++++++++++++++++++-- python/fatcat_tools/importers/grobid_metadata.py | 2 +- python/tests/import_crossref.py | 8 +++++- 3 files changed, 39 insertions(+), 4 deletions(-) (limited to 'python') diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index c4695c7f..fe80c2d3 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -8,6 +8,28 @@ import fatcat_client from fatcat_tools.importers.common import FatcatImporter +CROSSREF_TYPE_MAP = { + 'book': 'book', + 'book-chapter': 'chapter', + 'book-part': 'chapter', + 'book-section': 'chapter', + 'component': None, + 'dataset': 'dataset', + 'dissertation': 'thesis', + 'edited-book': 'book', + 'journal-article': 'article-journal', + 'monograph': 'monograph', + 'other': None, + 'peer-review': 'peer_review', + 'posted-content': 'post', + 'proceedings-article': 'paper-conference', + 'reference-book': 'book', + 'reference-entry': 'entry', + 'report': 'report', + 'standard': 'standard', +} + + class FatcatCrossrefImporter(FatcatImporter): def __init__(self, host_url, issn_map_file, extid_map_file=None, create_containers=True): @@ -35,6 +57,9 @@ class FatcatCrossrefImporter(FatcatImporter): pmcid=row[2], wikidata_qid=row[3]) + def map_release_type(self, crossref_type): + return CROSSREF_TYPE_MAP.get(crossref_type) + def parse_crossref_dict(self, obj): """ obj is a python dict (parsed from json). @@ -46,7 +71,10 @@ class FatcatCrossrefImporter(FatcatImporter): return None # Other ways to be out of scope (provisionally) - if (not 'type' in obj): + # journal-issue and journal-volume map to None, but allowed for now + if obj.get('type') in (None, 'journal', 'proceedings', + 'standard-series', 'report-series', 'book-series', 'book-set', + 'book-track', 'proceedings-series'): return None # contribs @@ -76,6 +104,7 @@ class FatcatCrossrefImporter(FatcatImporter): extra['sequence'] = am.get('sequence') if not extra: extra = None + assert(ctype in ("author", "editor", "translator")) contribs.append(fatcat_client.ReleaseContrib( creator_id=creator_id, index=index, @@ -216,7 +245,7 @@ class FatcatCrossrefImporter(FatcatImporter): refs=refs, container_id=container_id, publisher=publisher, - release_type=obj['type'], + release_type=self.map_release_type(obj['type']), release_status=release_status, doi=obj['DOI'].lower(), isbn13=isbn13, diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index 6d635479..dedc9728 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -67,7 +67,7 @@ class FatcatGrobidMetadataImporter(FatcatImporter): ref['extra'] = cite_extra refs.append(ref) - release_type = "journal-article" + release_type = "article-journal" release_date = None if obj.get('date'): # TODO: only returns year, ever? how to handle? diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py index ab33d0fc..078db184 100644 --- a/python/tests/import_crossref.py +++ b/python/tests/import_crossref.py @@ -17,6 +17,12 @@ def test_crossref_importer(crossref_importer): with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f: crossref_importer.process_source(f) +def test_crossref_mappings(crossref_importer): + assert crossref_importer.map_release_type('journal-article') == "article-journal" + assert crossref_importer.map_release_type('asdf') is None + assert crossref_importer.map_release_type('component') is None + assert crossref_importer.map_release_type('standard') == 'standard' + def test_crossref_importer_create(crossref_importer): crossref_importer.create_containers = True with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f: @@ -33,7 +39,7 @@ def test_crossref_dict_parse(crossref_importer): assert r.publisher == "Wiley-Blackwell" print(extra) assert extra['container-title'] == ["International Journal of Quantum Chemistry"] - assert r.release_type == "journal-article" + assert r.release_type == "article-journal" assert r.release_status == "published" assert r.isbn13 == "978-3-16-148410-0" assert 'subtitle' not in extra -- cgit v1.2.3