Diffstat (limited to 'python/fatcat_tools/importers/crossref.py')
-rw-r--r--  python/fatcat_tools/importers/crossref.py  263
1 file changed, 154 insertions(+), 109 deletions(-)
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 6365e491..00c719f1 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -6,7 +6,7 @@ import datetime
import itertools
import subprocess
import fatcat_client
-from .common import FatcatImporter
+from .common import EntityImporter, clean
# The docs/guide should be the canonical home for these mappings; update there
@@ -32,7 +32,32 @@ CROSSREF_TYPE_MAP = {
'standard': 'standard',
}
-class CrossrefImporter(FatcatImporter):
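+# Maps fatcat release types (the output of CROSSREF_TYPE_MAP above) to fatcat
+# container types, for containers created on the fly during import.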
+CONTAINER_TYPE_MAP = {
+ 'article-journal': 'journal',
+ 'paper-conference': 'conference',
+ 'book': 'book-series',
+}
+
+# TODO: add mappings for more license URLs
+LICENSE_SLUG_MAP = {
+ "http://creativecommons.org/licenses/by/3.0/": "CC-BY",
+ "http://creativecommons.org/licenses/by/4.0/": "CC-BY",
+ "http://creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
+ "http://creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
+ "http://creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND",
+ "http://creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND",
+ "http://creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC",
+ "http://creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
+ "http://creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA",
+ "http://creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA",
+ "http://creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND",
+ "http://creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND",
+ "http://www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0",
+ # http://onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license
+ # http://www.springer.com/tdm doesn't seem like a license
+}
+
+class CrossrefImporter(EntityImporter):
"""
Importer for Crossref metadata.
@@ -51,9 +76,9 @@ class CrossrefImporter(FatcatImporter):
issn_map_file=issn_map_file,
editgroup_description=eg_desc,
editgroup_extra=eg_extra)
+
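+        # optional: when set, containers not found by ISSN-L lookup are
+        # created during the import rather than left unset on releases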
+ self.create_containers = kwargs.get('create_containers')
extid_map_file = kwargs.get('extid_map_file')
- create_containers = kwargs.get('create_containers')
- check_existing = kwargs.get('check_existing')
self.extid_map_db = None
if extid_map_file:
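+            # sqlite file mapping DOIs to external identifiers, opened read-only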
db_uri = "file:{}?mode=ro".format(extid_map_file)
@@ -61,36 +86,46 @@ class CrossrefImporter(FatcatImporter):
self.extid_map_db = sqlite3.connect(db_uri, uri=True)
else:
print("Not using external ID map")
- self.create_containers = create_containers
- self.check_existing = check_existing
+
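+        # load the ISSN -> ISSN-L map used for container lookups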
+ self.read_issn_map_file(issn_map_file)
def lookup_ext_ids(self, doi):
if self.extid_map_db is None:
- return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None)
+ return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
[doi.lower()]).fetchone()
if row is None:
- return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None)
+ return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
row = [str(cell or '') or None for cell in row]
return dict(
core_id=row[0],
pmid=row[1],
pmcid=row[2],
- wikidata_qid=row[3])
+ wikidata_qid=row[3],
+            # TODO: arxiv and jstor identifiers are not in the extid map yet
+ arxiv_id=None,
+ jstor_id=None,
+ )
def map_release_type(self, crossref_type):
return CROSSREF_TYPE_MAP.get(crossref_type)
- def parse_crossref_dict(self, obj):
+ def map_container_type(self, crossref_type):
+ return CONTAINER_TYPE_MAP.get(crossref_type)
+
+ def want(self, obj):
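+        # release entities require a title, so records without one are skipped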
+ if not obj.get('title'):
+ return False
+
+ # do most of these checks in-line below
+ return True
+
+ def parse_record(self, obj):
"""
obj is a python dict (parsed from json).
returns a ReleaseEntity
"""
- # Do require the 'title' keys to exsit, as release entities do
- if (not 'title' in obj) or (not obj['title']):
- return None
-
# Ways to be out of scope (provisionally)
# journal-issue and journal-volume map to None, but allowed for now
if obj.get('type') in (None, 'journal', 'proceedings',
@@ -98,20 +133,12 @@ class CrossrefImporter(FatcatImporter):
'book-track', 'proceedings-series'):
return None
- # lookup existing DOI
- existing_release = None
- if self.check_existing:
- try:
- existing_release = self.api.lookup_release(doi=obj['DOI'].lower())
- except fatcat_client.rest.ApiException as err:
- if err.status != 404:
- raise err
-
- # eventually we'll want to support "updates", but for now just skip if
- # entity already exists
- if existing_release:
+        # Do require the 'title' key to exist, as release entities do
+        if ('title' not in obj) or (not obj['title']):
return None
+ release_type = self.map_release_type(obj['type'])
+
# contribs
def do_contribs(obj_list, ctype):
contribs = []
@@ -132,18 +159,23 @@ class CrossrefImporter(FatcatImporter):
index = i
else:
index = None
+ raw_affiliation = None
if am.get('affiliation'):
- # note: affiliation => affiliations
- extra['affiliations'] = am.get('affiliation')
+ if len(am.get('affiliation')) > 0:
+ raw_affiliation = am.get('affiliation')[0]['name']
+ if len(am.get('affiliation')) > 1:
+ # note: affiliation => more_affiliations
+ extra['more_affiliations'] = [clean(a['name']) for a in am.get('affiliation')[1:]]
if am.get('sequence') and am.get('sequence') != "additional":
- extra['sequence'] = am.get('sequence')
+ extra['seq'] = clean(am.get('sequence'))
if not extra:
extra = None
assert ctype in ("author", "editor", "translator")
contribs.append(fatcat_client.ReleaseContrib(
creator_id=creator_id,
index=index,
- raw_name=raw_name,
+ raw_name=clean(raw_name),
+ raw_affiliation=clean(raw_affiliation),
role=ctype,
extra=extra))
return contribs
@@ -159,28 +191,40 @@ class CrossrefImporter(FatcatImporter):
container_id = self.lookup_issnl(issnl)
publisher = obj.get('publisher')
- ce = None
if (container_id is None and self.create_containers and (issnl is not None)
and obj.get('container-title') and len(obj['container-title']) > 0):
ce = fatcat_client.ContainerEntity(
issnl=issnl,
- publisher=publisher,
- name=obj['container-title'][0])
+ publisher=clean(publisher),
+ container_type=self.map_container_type(release_type),
+ name=clean(obj['container-title'][0], force_xml=True))
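+            # the returned edit carries the ident of the newly created container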
+ ce_edit = self.create_container(ce)
+ container_id = ce_edit.ident
+
+ # license slug
+ license_slug = None
+ license_extra = []
+ for l in obj.get('license', []):
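+            # 'vor' is Crossref's content-version for the Version of Record;
+            # licenses scoped to other versions (accepted manuscript, TDM) are skipped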
+ if l['content-version'] not in ('vor', 'unspecified'):
+ continue
+ slug = LICENSE_SLUG_MAP.get(l['URL'])
+ if slug:
+ license_slug = slug
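+            # flatten the nested 'start' object to its 'date-time' string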
+ if 'start' in l:
+ l['start'] = l['start']['date-time']
+ license_extra.append(l)
# references
refs = []
for i, rm in enumerate(obj.get('reference', [])):
try:
year = int(rm.get('year'))
- # NOTE: will need to update/config in the future!
+ # TODO: will need to update/config in the future!
# NOTE: are there crossref works with year < 100?
if year > 2025 or year < 100:
year = None
except:
year = None
- extra = rm.copy()
- if rm.get('DOI'):
- extra['doi'] = rm.get('DOI').lower()
key = rm.get('key')
if key and key.startswith(obj['DOI'].upper()):
key = key.replace(obj['DOI'].upper() + "-", '')
@@ -188,14 +232,18 @@ class CrossrefImporter(FatcatImporter):
container_name = rm.get('volume-title')
if not container_name:
container_name = rm.get('journal-title')
- extra.pop('DOI', None)
- extra.pop('key', None)
- extra.pop('year', None)
- extra.pop('volume-name', None)
- extra.pop('journal-title', None)
- extra.pop('title', None)
- extra.pop('first-page', None)
- extra.pop('doi-asserted-by', None)
+            extra = dict()
+            if rm.get('volume-title') and rm.get('journal-title'):
+                # volume-title is used as container_name; preserve journal-title
+                extra['journal-title'] = rm['journal-title']
+            if rm.get('DOI'):
+                extra['doi'] = rm.get('DOI').lower()
+ # TODO: what fields here? CSL citation stuff
+ for k in ('author', 'editor', 'edition', 'authority', 'version',
+ 'genre', 'url', 'event', 'issue', 'volume', 'date',
+ 'accessed_date', 'issued', 'page', 'medium',
+ 'collection_title', 'chapter_number'):
+ if clean(rm.get(k)):
+ extra[k] = clean(rm[k])
if extra:
extra = dict(crossref=extra)
else:
@@ -206,9 +254,9 @@ class CrossrefImporter(FatcatImporter):
target_release_id=None,
key=key,
year=year,
- container_name=container_name,
- title=rm.get('title'),
- locator=rm.get('first-page'),
+ container_name=clean(container_name),
+ title=clean(rm.get('title')),
+ locator=clean(rm.get('first-page')),
# TODO: just dump JSON somewhere here?
extra=extra))
@@ -217,25 +265,24 @@ class CrossrefImporter(FatcatImporter):
if obj.get('abstract') != None:
abstracts.append(fatcat_client.ReleaseEntityAbstracts(
mimetype="application/xml+jats",
- content=obj.get('abstract')))
+ content=clean(obj.get('abstract'))))
# extra fields
extra = dict()
- for key in ('subject', 'type', 'license', 'alternative-id',
- 'container-title', 'original-title', 'subtitle', 'archive',
- 'funder', 'group-title'):
- # TODO: unpack "container-title" array
+ for key in ('subject', 'type', 'alternative-id', 'container-title',
+ 'subtitle', 'archive', 'funder', 'group-title'):
+ # TODO: unpack "container-title" array?
val = obj.get(key)
if val:
- extra[key] = val
- if 'license' in extra and extra['license']:
- for i in range(len(extra['license'])):
- if 'start' in extra['license'][i]:
- extra['license'][i]['start'] = extra['license'][i]['start']['date-time']
+                if isinstance(val, str):
+ extra[key] = clean(val)
+ else:
+ extra[key] = val
+ if license_extra:
+ extra['license'] = license_extra
+
if len(obj['title']) > 1:
- extra['other-titles'] = obj['title'][1:]
- # TODO: this should be top-level
- extra['is_kept'] = len(obj.get('archive', [])) > 0
+ extra['other-titles'] = [clean(t) for t in obj['title'][1:]]
# ISBN
isbn13 = None
@@ -277,59 +324,57 @@ class CrossrefImporter(FatcatImporter):
re = fatcat_client.ReleaseEntity(
work_id=None,
- title=obj.get('title', [None])[0],
- contribs=contribs,
- refs=refs,
container_id=container_id,
- publisher=publisher,
- release_type=self.map_release_type(obj['type']),
+ title=clean(obj.get('title', [None])[0], force_xml=True),
+ original_title=clean(obj.get('original-title', [None])[0]),
+ release_type=release_type,
release_status=release_status,
+ release_date=release_date,
+ release_year=release_year,
+ publisher=clean(publisher),
doi=obj['DOI'].lower(),
- isbn13=isbn13,
- core_id=extids['core_id'],
pmid=extids['pmid'],
pmcid=extids['pmcid'],
wikidata_qid=extids['wikidata_qid'],
- release_date=release_date,
- release_year=release_year,
- issue=obj.get('issue'),
- volume=obj.get('volume'),
- pages=obj.get('page'),
+ isbn13=isbn13,
+ core_id=extids['core_id'],
+ arxiv_id=extids['arxiv_id'],
+ jstor_id=extids['jstor_id'],
+ volume=clean(obj.get('volume')),
+ issue=clean(obj.get('issue')),
+ pages=clean(obj.get('page')),
+ language=None, # crossref doesn't supply language info
+ license_slug=license_slug,
+ extra=dict(crossref=extra),
abstracts=abstracts,
- extra=dict(crossref=extra))
- return (re, ce)
+ contribs=contribs,
+ refs=refs,
+ )
+ return re
+
+ def try_update(self, re):
+
+ # lookup existing DOI (don't need to try other ext idents for crossref)
+ existing = None
+ try:
+ existing = self.api.lookup_release(doi=re.doi)
+ except fatcat_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+            # doesn't exist yet, so proceed with insert
+ return True
+
+ # eventually we'll want to support "updates", but for now just skip if
+ # entity already exists
+ if existing:
+ self.counts['exists'] += 1
+ return False
+
+ return True
+
+ def insert_batch(self, batch):
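+        # create the whole batch under a single auto-accepted editgroup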
+ self.api.create_release_batch(batch,
+ autoaccept=True,
+ description=self.editgroup_description,
+ extra=json.dumps(self.editgroup_extra))
- def create_row(self, row, editgroup_id=None):
- if row is None:
- return
- obj = json.loads(row)
- entities = self.parse_crossref_dict(obj)
- if entities is not None:
- (re, ce) = entities
- if ce is not None:
- container = self.api.create_container(ce, editgroup_id=editgroup_id)
- re.container_id = container.ident
- self._issnl_id_map[ce.issnl] = container.ident
- self.api.create_release(re, editgroup_id=editgroup_id)
- self.counts['insert'] += 1
-
- def create_batch(self, batch):
- """Current work/release pairing disallows batch creation of releases.
- Could do batch work creation and then match against releases, but meh."""
- release_batch = []
- for row in batch:
- if row is None:
- continue
- obj = json.loads(row)
- entities = self.parse_crossref_dict(obj)
- if entities is not None:
- (re, ce) = entities
- if ce is not None:
- ce_eg = self.api.create_editgroup(fatcat_client.Editgroup())
- container = self.api.create_container(ce, editgroup_id=ce_eg.editgroup_id)
- self.api.accept_editgroup(ce_eg.editgroup_id)
- re.container_id = container.ident
- self._issnl_id_map[ce.issnl] = container.ident
- release_batch.append(re)
- self.api.create_release_batch(release_batch, autoaccept="true")
- self.counts['insert'] += len(release_batch)