From 228f780f84249cfda9dddd5487c0966f242342c9 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 28 Aug 2019 17:34:15 +0200
Subject: implement ChoculaImporter

---
 python/fatcat_tools/importers/__init__.py |   1 +
 python/fatcat_tools/importers/chocula.py  | 136 ++++++++++++++++++++++++++++++
 2 files changed, 137 insertions(+)
 create mode 100644 python/fatcat_tools/importers/chocula.py

diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 7e23ca8c..5e2948f4 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -20,6 +20,7 @@ from .arxiv import ArxivRawImporter
 from .pubmed import PubmedImporter
 from .grobid_metadata import GrobidMetadataImporter
 from .journal_metadata import JournalMetadataImporter
+from .chocula import ChoculaImporter
 from .matched import MatchedImporter
 from .orcid import OrcidImporter
 from .arabesque import ArabesqueMatchImporter, ARABESQUE_MATCH_WHERE_CLAUSE
diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py
new file mode 100644
index 00000000..4ca8bd21
--- /dev/null
+++ b/python/fatcat_tools/importers/chocula.py
@@ -0,0 +1,136 @@
+
+import sys
+import json
+import itertools
+import fatcat_client
+from .common import EntityImporter, clean
+
+
+class ChoculaImporter(EntityImporter):
+    """
+    Creates or updates container entities based on output of "chocula" script,
+    which munges/processes journal metadata from several sources, including
+    fatcat itself.
+
+    See guide for details on the many 'extra' fields used here.
+    """
+
+    def __init__(self, api, **kwargs):
+
+        eg_desc = kwargs.get('editgroup_description',
+            "Automated import of container-level metadata from Chocula tool.")
+        eg_extra = kwargs.get('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ChoculaImporter')
+        super().__init__(api,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            **kwargs)
+
+    def want(self, raw_record):
+        if not raw_record.get('fatcat_ident') and not raw_record.get('_known_issnl'):
+            self.counts['skip-unknown-new-issnl'] += 1
+            return False
+        if raw_record.get('issnl') and raw_record.get('name'):
+            return True
+        return False
+
+    def parse_record(self, row):
+        """
+        row is a python dict (parsed from JSON).
+
+        returns a ContainerEntity (or None if invalid or couldn't parse)
+        """
+
+        name = clean(row.get('name'))
+        if not name:
+            # Name is required (by schema)
+            return None
+
+        if name.endswith(', Proceedings of the'):
+            name = "Proceedings of the " + name.split(',')[0]
+
+        extra = dict()
+        for k in ('urls', 'webarchive_urls', 'issne', 'issnp', 'country', 'sherpa_romeo'):
+            if row['extra'].get(k):
+                extra[k] = row['extra'][k]
+
+        container_type = None
+        if 'proceedings' in name.lower():
+            container_type = 'proceedings'
+        elif 'journal ' in name.lower():
+            container_type = 'journal'
+
+        ce = fatcat_client.ContainerEntity(
+            issnl=row['issnl'],
+            ident=row['ident'],
+            name=name,
+            container_type=container_type,
+            publisher=clean(row.get('publisher')),
+            wikidata_qid=row.get('wikidata_qid'),
+            extra=extra)
+        return ce
+
+    def try_update(self, ce):
+
+        existing = None
+        if ce.ident:
+            try:
+                existing = self.api.get_container(ce.ident)
+            except fatcat_client.rest.ApiException as err:
+                if err.status != 404:
+                    raise err
+                self.counts['exists-not-found'] += 1
+                return False
+            if existing.state != 'active':
+                self.counts['exists-inactive'] += 1
+                return False
+
+        if not existing:
+            # check if existing by ISSN-L
+            try:
+                existing = self.api.lookup_container(issnl=ce.issnl)
+            except fatcat_client.rest.ApiException as err:
+                if err.status != 404:
+                    raise err
+            if existing:
+                self.counts['exists-by-issnl'] += 1
+                return False
+            # doesn't exist, always create
+            return True
+
+        # decide whether to update
+        do_update = False
+        if not existing.extra:
+            existing.extra = dict()
+        if set(ce.extra.get('urls', [])) != set(existing.extra.get('urls', [])):
+            do_update = True
+        if ce.publisher and not existing.publisher:
+            do_update = True
+        if ce.wikidata_qid and not existing.wikidata_qid:
+            do_update = True
+
+        if do_update:
+            existing.wikidata_qid = ce.wikidata_qid
+            existing.publisher = ce.publisher
+            existing.container_type = existing.container_type or ce.container_type
+            for k in ('urls', 'webarchive_urls', 'issne', 'issnp', 'country',
+                    'sherpa_romeo', 'ezb', 'szczepanski'):
+                if ce.extra.get(k):
+                    existing.extra[k] = ce.extra[k]
+
+            self.api.update_container(self.get_editgroup_id(), existing.ident, existing)
+            self.counts['update'] += 1
+            return False
+        else:
+            self.counts['skip-update'] += 1
+            return False
+
+        # if we got this far, it's a bug
+        raise NotImplementedError
+
+    def insert_batch(self, batch):
+        self.api.create_container_auto_batch(fatcat_client.ContainerAutoBatch(
+            editgroup=fatcat_client.Editgroup(
+                description=self.editgroup_description,
+                extra=self.editgroup_extra),
+            entity_list=batch))
-- 
cgit v1.2.3
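
Editor's note (not part of the patch above): a minimal sketch of how this importer might be driven, following the JsonLinePusher pattern used by the other importers in fatcat_tools. The host URL, the edit_batch_size value, and the sample chocula record are illustrative assumptions, not taken from the commit; a real import would also need an authenticated API client rather than the public one.

    # Hedged usage sketch. Assumes the public_api() helper and the
    # JsonLinePusher driver exported alongside the other fatcat importers;
    # names and values below are illustrative only.
    import sys

    from fatcat_tools import public_api
    from fatcat_tools.importers import ChoculaImporter, JsonLinePusher

    # Each stdin line is one JSON object from a chocula export, e.g. this
    # hypothetical record:
    #   {"issnl": "1234-5678", "ident": null, "name": "Journal of Examples",
    #    "_known_issnl": true,
    #    "extra": {"urls": ["https://example.com/"], "country": "us"}}
    api = public_api('http://localhost:9411/v0')  # writes need an authenticated client
    importer = ChoculaImporter(api, edit_batch_size=50)
    JsonLinePusher(importer, sys.stdin).run()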