import sys
import json
import itertools

import fatcat_openapi_client

from .common import EntityImporter, clean


def or_none(s):
    """Return `s` unchanged, or None when `s` is None or has zero length."""
    if s is None or len(s) == 0:
        return None
    return s


def truthy(s):
    """
    Interpret a string as a boolean flag (case-insensitive).

    Returns True for 'true', 't', 'yes', 'y', '1'; False for 'false', 'f',
    'no', 'n', '0'; and None for None or any other value.
    """
    if s is None:
        return None
    s = s.lower()
    if s in ('true', 't', 'yes', 'y', '1'):
        return True
    if s in ('false', 'f', 'no', 'n', '0'):
        return False
    return None


class JournalMetadataImporter(EntityImporter):
    """
    Imports journal metadata ("containers") by ISSN, currently from a custom
    munged JSON format (see ../extra/journal_metadata/).

    See guide for details on the many 'extra' fields used here.
    """

    def __init__(self, api, **kwargs):
        """
        Set default editgroup description/extra and defer to EntityImporter.

        `editgroup_description` and `editgroup_extra` may be supplied as
        keyword arguments; any remaining keyword arguments are passed through
        to the parent constructor untouched.
        """
        # pop (not get): these two options are forwarded explicitly below, so
        # leaving them in kwargs would pass them twice and raise TypeError
        # ("got multiple values for keyword argument").
        eg_desc = kwargs.pop(
            'editgroup_description',
            "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.")
        eg_extra = kwargs.pop('editgroup_extra', dict())
        # Only fill in the agent when the caller did not specify one.
        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JournalMetadataImporter')
        super().__init__(api,
            editgroup_description=eg_desc,
            editgroup_extra=eg_extra,
            **kwargs)

    def want(self, raw_record):
        """Return True only for records with both a truthy 'issnl' and 'name'."""
        return bool(raw_record.get('issnl') and raw_record.get('name'))

    def parse_record(self, row):
        """
        row is a python dict (parsed from JSON).

        returns a ContainerEntity (or None if invalid or couldn't parse)

        Note: row['issnl'] is accessed directly, so a row without that key
        raises KeyError; want() rejects such rows.
        """

        if not row.get('name'):
            # Name is required (by schema)
            return None

        # Copy over the simple pass-through fields, skipping empty/missing ones.
        extra = dict()
        for key in ('issne', 'issnp', 'languages', 'country', 'urls', 'abbrev',
                    'coden', 'aliases', 'original_name', 'first_year', 'last_year',
                    'platform', 'default_license', 'road', 'mimetypes',
                    'sherpa_romeo', 'kbart'):
            if row.get(key):
                extra[key] = row[key]
        # TODO: not including for now: norwegian, dois/crossref, ia

        # Only a subset of the DOAJ sub-record is retained.
        extra_doaj = dict()
        if row.get('doaj'):
            if row['doaj'].get('as_of'):
                extra_doaj['as_of'] = row['doaj']['as_of']
            if row['doaj'].get('works'):
                extra_doaj['works'] = row['doaj']['works']
        if extra_doaj:
            extra['doaj'] = extra_doaj

        extra_ia = dict()
        # TODO: would like an ia.longtail_ia flag
        if row.get('sim'):
            # NB: None case of the .get() here is blech, but otherwise
            # extra['ia'].get('sim') would be false-y, breaking 'any_ia_sim' later on
            extra_ia['sim'] = {
                'year_spans': row['sim'].get('year_spans'),
            }
        if extra_ia:
            extra['ia'] = extra_ia

        # clean() may reduce the name to something false-y; treat as invalid.
        name = clean(row.get('name'))
        if not name:
            return None

        ce = fatcat_openapi_client.ContainerEntity(
            issnl=row['issnl'],
            container_type=None,  # TODO
            name=name,
            publisher=clean(row.get('publisher')),
            wikidata_qid=None,  # TODO
            extra=extra)
        return ce

    def try_update(self, ce):
        """
        Look up an existing container by ISSN-L and update it if warranted.

        Returns True when no existing container was found (the lookup raised
        a 404), i.e. the entity doesn't exist and should be created. Returns
        False otherwise, after either updating the existing container's KBART
        metadata (counted under 'update') or leaving it alone (counted under
        'exists'). API errors other than 404 are re-raised.
        """
        existing = None
        try:
            existing = self.api.lookup_container(issnl=ce.issnl)
        except fatcat_openapi_client.rest.ApiException as err:
            if err.status != 404:
                raise
        if not existing:
            # doesn't exist, create it
            return True

        # for now, only update KBART, and only if there is new content
        if not existing.extra:
            existing.extra = dict()
        new_kbart = ce.extra.get('kbart')
        if new_kbart and existing.extra.get('kbart') != new_kbart:
            if not existing.extra.get('kbart'):
                existing.extra['kbart'] = {}
            # Merge rather than replace, so existing KBART keys not present
            # in the new record are kept.
            existing.extra['kbart'].update(new_kbart)
            self.api.update_container(self.get_editgroup_id(), existing.ident, existing)
            self.counts['update'] += 1
        else:
            self.counts['exists'] += 1
        return False

    def insert_batch(self, batch):
        """Create all entities in `batch` via one auto-batch API call."""
        self.api.create_container_auto_batch(fatcat_openapi_client.ContainerAutoBatch(
            editgroup=fatcat_openapi_client.Editgroup(
                description=self.editgroup_description,
                extra=self.editgroup_extra),
            entity_list=batch))