Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r--  python/fatcat_tools/importers/__init__.py |   1
-rw-r--r--  python/fatcat_tools/importers/chocula.py  | 136
2 files changed, 137 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 7e23ca8c..5e2948f4 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -20,6 +20,7 @@ from .arxiv import ArxivRawImporter
 from .pubmed import PubmedImporter
 from .grobid_metadata import GrobidMetadataImporter
 from .journal_metadata import JournalMetadataImporter
+from .chocula import ChoculaImporter
 from .matched import MatchedImporter
 from .orcid import OrcidImporter
 from .arabesque import ArabesqueMatchImporter, ARABESQUE_MATCH_WHERE_CLAUSE
diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py
new file mode 100644
index 00000000..4ca8bd21
--- /dev/null
+++ b/python/fatcat_tools/importers/chocula.py
@@ -0,0 +1,136 @@
+
+import sys
+import json
+import itertools
+import fatcat_client
+from .common import EntityImporter, clean
+
+
+class ChoculaImporter(EntityImporter):
+    """
+    Creates or updates container entities based on output of "chocula" script,
+    which munges/processes journal metadata from several sources, including
+    fatcat itself.
+
+    See guide for details on the many 'extra' fields used here.
+    """
+
+    def __init__(self, api, **kwargs):
+
+        eg_desc = kwargs.get('editgroup_description',
+            "Automated import of container-level metadata from Chocula tool.")
+        eg_extra = kwargs.get('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ChoculaImporter')
+        super().__init__(api,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            **kwargs)
+
+    def want(self, raw_record):
+        if not raw_record.get('fatcat_ident') and not raw_record.get('_known_issnl'):
+            self.counts['skip-unknown-new-issnl'] += 1
+            return False
+        if raw_record.get('issnl') and raw_record.get('name'):
+            return True
+        return False
+
+    def parse_record(self, row):
+        """
+        row is a python dict (parsed from JSON).
+
+        returns a ContainerEntity (or None if invalid or couldn't parse)
+        """
+
+        name = clean(row.get('name'))
+        if not name:
+            # Name is required (by schema)
+            return None
+
+        if name.endswith(', Proceedings of the'):
+            name = "Proceedings of the " + name.split(',')[0]
+
+        extra = dict()
+        for k in ('urls', 'webarchive_urls', 'issne', 'issnp', 'country', 'sherpa_romeo'):
+            if row['extra'].get(k):
+                extra[k] = row['extra'][k]
+
+        container_type = None
+        if 'proceedings' in name.lower():
+            container_type = 'proceedings'
+        elif 'journal ' in name.lower():
+            container_type = 'journal'
+
+        ce = fatcat_client.ContainerEntity(
+            issnl=row['issnl'],
+            ident=row['ident'],
+            name=name,
+            container_type=container_type,
+            publisher=clean(row.get('publisher')),
+            wikidata_qid=row.get('wikidata_qid'),
+            extra=extra)
+        return ce
+
+    def try_update(self, ce):
+
+        existing = None
+        if ce.ident:
+            try:
+                existing = self.api.get_container(ce.ident)
+            except fatcat_client.rest.ApiException as err:
+                if err.status != 404:
+                    raise err
+                self.counts['exists-not-found'] += 1
+                return False
+            if existing.state != 'active':
+                self.counts['exists-inactive'] += 1
+                return False
+
+        if not existing:
+            # check if existing by ISSN-L
+            try:
+                existing = self.api.lookup_container(issnl=ce.issnl)
+            except fatcat_client.rest.ApiException as err:
+                if err.status != 404:
+                    raise err
+            if existing:
+                self.counts['exists-by-issnl'] += 1
+                return False
+            # doesn't exist, always create
+            return True
+
+        # decide whether to update
+        do_update = False
+        if not existing.extra:
+            existing.extra = dict()
+        if set(ce.extra.get('urls', [])) != set(existing.extra.get('urls', [])):
+            do_update = True
+        if ce.publisher and not existing.publisher:
+            do_update = True
+        if ce.wikidata_qid and not existing.wikidata_qid:
+            do_update = True
+
+        if do_update:
+            existing.wikidata_qid = ce.wikidata_qid
+            existing.publisher = ce.publisher
+            existing.container_type = existing.container_type or ce.container_type
+            for k in ('urls', 'webarchive_urls', 'issne', 'issnp', 'country',
+                      'sherpa_romeo', 'ezb', 'szczepanski'):
+                if ce.extra.get(k):
+                    existing.extra[k] = ce.extra[k]
+
+            self.api.update_container(self.get_editgroup_id(), existing.ident, existing)
+            self.counts['update'] += 1
+            return False
+        else:
+            self.counts['skip-update'] += 1
+            return False
+
+        # if we got this far, it's a bug
+        raise NotImplementedError
+
+    def insert_batch(self, batch):
+        self.api.create_container_auto_batch(fatcat_client.ContainerAutoBatch(
+            editgroup=fatcat_client.Editgroup(
+                description=self.editgroup_description,
+                extra=self.editgroup_extra),
+            entity_list=batch))
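For reference, this is the approximate shape of a chocula record that would pass want() and parse successfully. This is an illustrative sketch only: the key names are inferred from what want() and parse_record() read above, and all values are invented.

# Hypothetical chocula output record; keys inferred from the importer
# above, values made up for illustration.
row = {
    "issnl": "1234-5678",
    "ident": None,               # existing fatcat container ident, if any
    "fatcat_ident": None,
    "_known_issnl": True,        # chocula marks the ISSN-L as already known
    "name": "Journal of Illustrative Examples",
    "publisher": "Example Press",
    "wikidata_qid": None,
    "extra": {
        "urls": ["https://journal.example.com/"],
        "country": "us",
    },
}

Note that parse_record() indexes row['extra'] directly, so the 'extra' key must always be present (even as an empty dict) for a record to parse without a KeyError.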
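A minimal sketch of driving the importer over a JSON-lines dump, assuming the JsonLinePusher helper exported alongside the other importers; the host URL, batch size, and unauthenticated client setup are placeholders, not part of this commit:

import sys
import fatcat_client
from fatcat_tools.importers import ChoculaImporter, JsonLinePusher

# placeholder client setup; a real import run needs an authenticated
# client with editing privileges
conf = fatcat_client.Configuration()
conf.host = "http://localhost:9411/v0"
api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))

importer = ChoculaImporter(api, edit_batch_size=50)
JsonLinePusher(importer, sys.stdin).run()   # one JSON record per line on stdin

Records that parse to a brand-new container are accumulated and created in batches via insert_batch(); records matching an existing container go through try_update() individually under the shared editgroup.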
