about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: bnewbold <bnewbold@archive.org> 2020-08-05 16:48:32 +0000
committer: bnewbold <bnewbold@archive.org> 2020-08-05 16:48:32 +0000
commit: 59b772fa9af05b35ce14d26bcabb66cc124255d4 (patch)
tree: 909a2590e748d07eb721142515763b142ec60859
parent: 4702bee24dde8bae64df76ad411a6d8329cc9bdf (diff)
parent: b675b60a183b57e276ba7044bd9ece0130f2c457 (diff)
download: fatcat-59b772fa9af05b35ce14d26bcabb66cc124255d4.tar.gz
fatcat-59b772fa9af05b35ce14d26bcabb66cc124255d4.zip
Merge branch 'bnewbold-chocula-import-tweaks' into 'master'
chocula import tweaks

See merge request webgroup/fatcat!74
-rw-r--r-- python/fatcat_tools/importers/chocula.py 34
1 files changed, 22 insertions, 12 deletions
diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py
index d5d1cce8..d08a98e0 100644
--- a/python/fatcat_tools/importers/chocula.py
+++ b/python/fatcat_tools/importers/chocula.py
@@ -24,7 +24,7 @@ class ChoculaImporter(EntityImporter):
**kwargs)
def want(self, raw_record):
- if not raw_record.get('fatcat_ident') and not raw_record.get('_known_issnl'):
+ if not raw_record.get('ident') and not raw_record.get('_known_issnl'):
self.counts['skip-unknown-new-issnl'] += 1
return False
if raw_record.get('issnl') and raw_record.get('name'):
@@ -53,7 +53,8 @@ class ChoculaImporter(EntityImporter):
extra = dict()
for k in ('urls', 'webarchive_urls', 'issne', 'issnp', 'country',
- 'sherpa_romeo', 'ezb', 'szczepanski', 'doaj', 'languages'):
+ 'sherpa_romeo', 'ezb', 'szczepanski', 'doaj', 'languages',
+ 'ia', 'scielo', 'kbart', 'publisher_type', 'platform'):
if row['extra'].get(k):
extra[k] = row['extra'][k]
@@ -115,30 +116,39 @@ class ChoculaImporter(EntityImporter):
do_update = True
if set(ce.extra.get('webarchive_urls', [])) != set(existing.extra.get('webarchive_urls', [])):
do_update = True
- for k in ('ezb', 'szczepanski', 'doaj'):
+ for k in ('ezb', 'szczepanski', 'publisher_type', 'platform'):
if ce.extra.get(k) and not existing.extra.get(k):
do_update = True
+ for k in ('kbart', 'ia', 'doaj'):
+ # always update these fields if not equal (chocula override)
+ if ce.extra.get(k) and ce.extra[k] != existing.extra.get(k):
+ do_update = True
if ce.publisher and not existing.publisher:
do_update = True
if ce.wikidata_qid and not existing.wikidata_qid:
do_update = True
if do_update:
- existing.wikidata_qid = ce.wikidata_qid
- existing.publisher = ce.publisher
+ existing.wikidata_qid = existing.wikidata_qid or ce.wikidata_qid
+ existing.publisher = existing.publisher or ce.publisher
existing.container_type = existing.container_type or ce.container_type
for k in ('urls', 'webarchive_urls'):
- # update, which might clobber, but won't remove
+ # always update if available. should probably make this more
+ # careful/subtle in the future!
+ # note: in some cases we might *want* to remove existing (if
+ # all URLs found to be bad), but being
+ # conservative/inclusionist for now
if ce.extra.get(k):
existing.extra[k] = ce.extra.get(k, [])
- # note: in some cases we might *want* to clobber existing (if
- # all URLs found to be bad), but being conservative for now so
- # we don't clobber human edits
- for k in ('issne', 'issnp', 'country', 'sherpa_romeo', 'ezb',
- 'szczepanski', 'doaj'):
- # update/overwrite, but don't remove any existing value
+ for k in ('sherpa_romeo', 'ezb', 'szczepanski', 'doaj', 'ia',
+ 'scielo', 'kbart', 'publisher_type', 'platform'):
+ # always update (chocula over-rides)
if ce.extra.get(k):
existing.extra[k] = ce.extra[k]
+ for k in ('issne', 'issnp', 'country'):
+ # only include if not set (don't clobber human edits)
+ if ce.extra.get(k) and not existing.extra.get(k):
+ existing.extra[k] = ce.extra[k]
if ce.extra.get('languages'):
if not existing.extra.get('languages'):
existing.extra['languages'] = ce.extra['languages']