aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2019-08-28 17:34:15 +0200
committer Bryan Newbold <bnewbold@robocracy.org> 2019-09-03 13:31:49 -0700
commit 228f780f84249cfda9dddd5487c0966f242342c9 (patch)
tree 90b080c845e62a49fdd760e68216fb88581f6e5f
parent bea64958a70040a7fa44434d3a23078f9cf871a0 (diff)
download fatcat-228f780f84249cfda9dddd5487c0966f242342c9.tar.gz
fatcat-228f780f84249cfda9dddd5487c0966f242342c9.zip
implement ChoculaImporter
-rw-r--r--python/fatcat_tools/importers/__init__.py1
-rw-r--r--python/fatcat_tools/importers/chocula.py136
2 files changed, 137 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 7e23ca8c..5e2948f4 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -20,6 +20,7 @@ from .arxiv import ArxivRawImporter
from .pubmed import PubmedImporter
from .grobid_metadata import GrobidMetadataImporter
from .journal_metadata import JournalMetadataImporter
+from .chocula import ChoculaImporter
from .matched import MatchedImporter
from .orcid import OrcidImporter
from .arabesque import ArabesqueMatchImporter, ARABESQUE_MATCH_WHERE_CLAUSE
diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py
new file mode 100644
index 00000000..4ca8bd21
--- /dev/null
+++ b/python/fatcat_tools/importers/chocula.py
@@ -0,0 +1,136 @@
+
+import sys
+import json
+import itertools
+import fatcat_client
+from .common import EntityImporter, clean
+
+
+class ChoculaImporter(EntityImporter):
+    """
+    Creates or updates container entities based on output of "chocula" script,
+    which munges/processes journal metadata from several sources, including
+    fatcat itself.
+
+    See guide for details on the many 'extra' fields used here.
+
+    Methods defined here:
+
+    - want(): cheap filter on the raw (JSON-decoded) record
+    - parse_record(): raw record -> ContainerEntity, or None
+    - try_update(): True if the entity should be created; False if it was
+      updated in place or skipped
+    - insert_batch(): bulk-creates a list of entities in a new editgroup
+
+    NOTE(review): presumably these are hooks driven by EntityImporter; the
+    base class is not visible from this file.
+    """
+
+    def __init__(self, api, **kwargs):
+        """
+        api: API client object, passed straight through to EntityImporter.
+
+        Keyword arguments handled here (all others are forwarded untouched):
+
+        - editgroup_description: replaces the default description
+        - editgroup_extra: dict of editgroup metadata; an 'agent' key is added
+          if the caller did not provide one
+        """
+        # pop(), not get(): both values are passed explicitly below, so leaving
+        # them in kwargs would raise "got multiple values for keyword argument"
+        eg_desc = kwargs.pop('editgroup_description', None) or \
+            "Automated import of container-level metadata from Chocula tool."
+        # copy so that the caller's dict is never mutated
+        eg_extra = dict(kwargs.pop('editgroup_extra', None) or {})
+        eg_extra.setdefault('agent', 'fatcat_tools.ChoculaImporter')
+        super().__init__(api,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            **kwargs)
+
+    def want(self, raw_record):
+        """
+        Returns True if raw_record (a dict) should be processed at all.
+
+        Records that neither reference an existing entity nor carry a
+        '_known_issnl' flag are counted as 'skip-unknown-new-issnl' and
+        rejected. Everything else needs both an 'issnl' and a 'name'.
+        """
+        # NOTE(review): this check originally looked only at 'fatcat_ident',
+        # while parse_record() reads 'ident'. Both keys are accepted so the two
+        # methods agree -- confirm which key chocula actually emits.
+        has_ident = raw_record.get('ident') or raw_record.get('fatcat_ident')
+        if not has_ident and not raw_record.get('_known_issnl'):
+            self.counts['skip-unknown-new-issnl'] += 1
+            return False
+        return bool(raw_record.get('issnl') and raw_record.get('name'))
+
+    def parse_record(self, row):
+        """
+        row is a python dict (parsed from JSON).
+
+        returns a ContainerEntity (or None if invalid or couldn't parse)
+        """
+
+        name = clean(row.get('name'))
+        if not name:
+            # Name is required (by schema)
+            return None
+
+        issnl = row.get('issnl')
+        if not issnl:
+            # try_update() falls back to an ISSN-L lookup, so a record without
+            # one can't be matched; treat it as invalid instead of crashing
+            return None
+
+        # "Foo, Bar, Proceedings of the" -> "Proceedings of the Foo, Bar".
+        # Only the suffix is stripped, so commas inside the title survive.
+        suffix = ', Proceedings of the'
+        if name.endswith(suffix):
+            name = ("Proceedings of the " + name[:-len(suffix)]).strip()
+
+        # 'extra' may be missing or null in the input
+        row_extra = row.get('extra') or dict()
+        extra = dict()
+        for k in ('urls', 'webarchive_urls', 'issne', 'issnp', 'country', 'sherpa_romeo'):
+            if row_extra.get(k):
+                extra[k] = row_extra[k]
+
+        # crude guess from the title; left as None when neither word matches.
+        # 'journal ' (with trailing space) is matched as in the original code.
+        container_type = None
+        lowered = name.lower()
+        if 'proceedings' in lowered:
+            container_type = 'proceedings'
+        elif 'journal ' in lowered:
+            container_type = 'journal'
+
+        ce = fatcat_client.ContainerEntity(
+            issnl=issnl,
+            # see NOTE(review) in want() about the two ident keys
+            ident=row.get('ident') or row.get('fatcat_ident'),
+            name=name,
+            container_type=container_type,
+            publisher=clean(row.get('publisher')),
+            wikidata_qid=row.get('wikidata_qid'),
+            extra=extra)
+        return ce
+
+    def try_update(self, ce):
+        """
+        Decides what to do with a parsed ContainerEntity.
+
+        Returns True when no matching container exists and `ce` should be
+        created. Returns False in every other case: the referenced entity is
+        missing or inactive, a container with the same ISSN-L already exists,
+        the existing entity was updated in place, or no update was needed.
+        The outcome is tallied in self.counts.
+
+        Raises fatcat_client.rest.ApiException for any API error other than
+        a 404.
+        """
+
+        existing = None
+        if ce.ident:
+            try:
+                existing = self.api.get_container(ce.ident)
+            except fatcat_client.rest.ApiException as err:
+                if err.status != 404:
+                    raise
+                self.counts['exists-not-found'] += 1
+                return False
+            if existing.state != 'active':
+                self.counts['exists-inactive'] += 1
+                return False
+
+        if not existing:
+            # check if existing by ISSN-L
+            try:
+                existing = self.api.lookup_container(issnl=ce.issnl)
+            except fatcat_client.rest.ApiException as err:
+                if err.status != 404:
+                    raise
+            if existing:
+                self.counts['exists-by-issnl'] += 1
+                return False
+            # doesn't exist, always create
+            return True
+
+        # decide whether to update
+        new_extra = ce.extra or dict()
+        if not existing.extra:
+            existing.extra = dict()
+
+        do_update = False
+        # only a non-empty incoming URL list can justify an update; comparing
+        # an absent list against existing URLs would create no-op edits
+        new_urls = new_extra.get('urls')
+        if new_urls and set(new_urls) != set(existing.extra.get('urls') or []):
+            do_update = True
+        if ce.publisher and not existing.publisher:
+            do_update = True
+        if ce.wikidata_qid and not existing.wikidata_qid:
+            do_update = True
+
+        if not do_update:
+            self.counts['skip-update'] += 1
+            return False
+
+        # fill in missing top-level fields only; never replace a value the
+        # existing entity already has (in particular, never with None)
+        existing.wikidata_qid = existing.wikidata_qid or ce.wikidata_qid
+        existing.publisher = existing.publisher or ce.publisher
+        existing.container_type = existing.container_type or ce.container_type
+        # NOTE(review): 'ezb' and 'szczepanski' are listed here but are not
+        # copied by parse_record(), so they are never set via that path.
+        for k in ('urls', 'webarchive_urls', 'issne', 'issnp', 'country',
+                  'sherpa_romeo', 'ezb', 'szczepanski'):
+            if new_extra.get(k):
+                existing.extra[k] = new_extra[k]
+
+        self.api.update_container(self.get_editgroup_id(), existing.ident, existing)
+        self.counts['update'] += 1
+        # already written via update_container(); nothing left to create
+        return False
+
+    def insert_batch(self, batch):
+        """
+        Creates every ContainerEntity in `batch` with a single auto-batch API
+        call, using this importer's editgroup description and extra metadata.
+        """
+        self.api.create_container_auto_batch(fatcat_client.ContainerAutoBatch(
+            editgroup=fatcat_client.Editgroup(
+                description=self.editgroup_description,
+                extra=self.editgroup_extra),
+            entity_list=batch))