import sys import json import itertools import fatcat_client from .common import EntityImporter, clean class ChoculaImporter(EntityImporter): """ Creates or updates container entities based on output of "chocula" script, which munges/processes journal metadata from several sources, including fatcat itself. See guide for details on the many 'extra' fields used here. """ def __init__(self, api, **kwargs): eg_desc = kwargs.get('editgroup_description', "Automated import of container-level metadata from Chocula tool.") eg_extra = kwargs.get('editgroup_extra', dict()) eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ChoculaImporter') super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) def want(self, raw_record): if not raw_record.get('fatcat_ident') and not raw_record.get('_known_issnl'): self.counts['skip-unknown-new-issnl'] += 1 return False if raw_record.get('issnl') and raw_record.get('name'): return True return False def parse_record(self, row): """ row is a python dict (parsed from JSON). returns a ContainerEntity (or None if invalid or couldn't parse) """ name = clean(row.get('name')) if not name: # Name is required (by schema) return None name = name.strip() if name.endswith(', Proceedings of the'): name = "Proceedings of the " + name.split(',')[0] if name.endswith('.'): name = name[:-1] extra = dict() for k in ('urls', 'webarchive_urls', 'issne', 'issnp', 'country', 'sherpa_romeo', 'ezb', 'szczepanski', 'doaj', 'languages'): if row['extra'].get(k): extra[k] = row['extra'][k] container_type = None if 'proceedings' in name.lower(): container_type = 'proceedings' elif 'journal ' in name.lower(): container_type = 'journal' ce = fatcat_client.ContainerEntity( issnl=row['issnl'], ident=row['ident'], name=name, container_type=container_type, publisher=clean(row.get('publisher')), wikidata_qid=row.get('wikidata_qid'), extra=extra) return ce def try_update(self, ce): existing = None if ce.ident: try: existing = self.api.get_container(ce.ident) except fatcat_client.rest.ApiException as err: if err.status != 404: raise err self.counts['exists'] += 1 self.counts['exists-not-found'] += 1 return False if existing.state != 'active': self.counts['exists'] += 1 self.counts['exists-inactive'] += 1 return False if not existing: # check if existing by ISSN-L try: existing = self.api.lookup_container(issnl=ce.issnl) except fatcat_client.rest.ApiException as err: if err.status != 404: raise err if existing: self.counts['exists'] += 1 self.counts['exists-by-issnl'] += 1 return False # doesn't exist, always create return True # decide whether to update do_update = False if not existing.extra: existing.extra = dict() if set(ce.extra.get('urls', [])) != set(existing.extra.get('urls', [])): do_update = True if set(ce.extra.get('webarchive_urls', [])) != set(existing.extra.get('webarchive_urls', [])): do_update = True for k in ('ezb', 'szczepanski', 'doaj'): if ce.extra.get(k) and not existing.extra.get(k): do_update = True if ce.publisher and not existing.publisher: do_update = True if ce.wikidata_qid and not existing.wikidata_qid: do_update = True if do_update: existing.wikidata_qid = ce.wikidata_qid existing.publisher = ce.publisher existing.container_type = existing.container_type or ce.container_type for k in ('urls', 'webarchive_urls'): # update, or clobber/remove any existing values. often # want/need to remove dead URLs if ce.extra.get(k): existing.extra[k] = ce.extra.get(k, []) elif k in existing.extra.keys(): existing.extra.pop(k) for k in ('issne', 'issnp', 'country', 'sherpa_romeo', 'ezb', 'szczepanski', 'doaj'): # update, but don't remove any existing value if ce.extra.get(k): existing.extra[k] = ce.extra[k] if ce.extra.get('languages'): if not existing.extra.get('languages'): existing.extra['languages'] = ce.extra['languages'] elif not ce.extra['languages'][0] in existing.extra['languages']: existing.extra['languages'].append(ce.extra['languages'][0]) self.api.update_container(self.get_editgroup_id(), existing.ident, existing) self.counts['update'] += 1 return False else: self.counts['exists'] += 1 self.counts['exists-skip-update'] += 1 return False # if we got this far, it's a bug raise NotImplementedError def insert_batch(self, batch): self.api.create_container_auto_batch(fatcat_client.ContainerAutoBatch( editgroup=fatcat_client.Editgroup( description=self.editgroup_description, extra=self.editgroup_extra), entity_list=batch))