From b289da087453f13571c5570d6be4a3fb4ac08acd Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 6 Jan 2020 16:54:30 -0800 Subject: importers: control update behavior with more-standard flag --- python/fatcat_import.py | 6 +++++- python/fatcat_tools/importers/arabesque.py | 2 +- python/fatcat_tools/importers/chocula.py | 3 +++ python/fatcat_tools/importers/common.py | 1 + python/fatcat_tools/importers/ingest.py | 2 +- python/fatcat_tools/importers/pubmed.py | 4 ++++ 6 files changed, 15 insertions(+), 3 deletions(-) (limited to 'python') diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 8d82dab3..184dcc0a 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -61,7 +61,8 @@ def run_journal_metadata(args): def run_chocula(args): fii = ChoculaImporter(args.api, - edit_batch_size=args.batch_size) + edit_batch_size=args.batch_size, + do_updates=args.do_updates) JsonLinePusher(fii, args.json_file).run() def run_matched(args): @@ -301,6 +302,9 @@ def main(): sub_chocula.add_argument('json_file', help="chocula JSON entities file (or stdin)", default=sys.stdin, type=argparse.FileType('r')) + sub_chocula.add_argument('--do-updates', + action='store_true', + help="update pre-existing container entities") sub_matched = subparsers.add_parser('matched', help="add file entities matched against existing releases; custom JSON format") diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py index acfc2b87..c71b33e9 100644 --- a/python/fatcat_tools/importers/arabesque.py +++ b/python/fatcat_tools/importers/arabesque.py @@ -47,6 +47,7 @@ class ArabesqueMatchImporter(EntityImporter): eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ArabesqueMatchImporter') if kwargs.get('crawl_id'): eg_extra['crawl_id'] = kwargs.get('crawl_id') + kwargs['do_updates'] = kwargs.get("do_updates", False) super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, @@ -56,7 +57,6 @@ class ArabesqueMatchImporter(EntityImporter): self.default_link_rel = kwargs.get("default_link_rel", "web") assert self.default_link_rel self.default_mimetype = kwargs.get("default_mimetype", None) - self.do_updates = kwargs.get("do_updates", False) self.require_grobid = require_grobid if self.require_grobid: print("Requiring GROBID status == 200") diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py index eea50314..375b6051 100644 --- a/python/fatcat_tools/importers/chocula.py +++ b/python/fatcat_tools/importers/chocula.py @@ -109,6 +109,9 @@ class ChoculaImporter(EntityImporter): # decide whether to update do_update = False + if not self.do_updates: + self.counts['exists'] += 1 + return False if not existing.extra: existing.extra = dict() if set(ce.extra.get('urls', [])) != set(existing.extra.get('urls', [])): diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index be5db8d8..8d103372 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -287,6 +287,7 @@ class EntityImporter: eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.EntityImporter') self.api = api + self.do_updates = bool(kwargs.get('do_updates', True)) self.bezerk_mode = kwargs.get('bezerk_mode', False) self.submit_mode = kwargs.get('submit_mode', False) self.edit_batch_size = kwargs.get('edit_batch_size', 100) diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index 33c40eff..16643eb5 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -14,13 +14,13 @@ class IngestFileResultImporter(EntityImporter): eg_desc = kwargs.pop('editgroup_description', None) or "Files crawled from web using sandcrawler ingest tool" eg_extra = kwargs.pop('editgroup_extra', dict()) eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFileResultImporter') + kwargs['do_updates'] = kwargs.get("do_updates", False) super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.default_link_rel = kwargs.get("default_link_rel", "web") assert self.default_link_rel - self.do_updates = kwargs.get("do_updates", False) self.require_grobid = require_grobid if self.require_grobid: print("Requiring GROBID status == 200") diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 3611a299..c32ce34a 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -715,6 +715,10 @@ class PubmedImporter(EntityImporter): re.ext_ids.doi = None re.work_id = existing.work_id + if existing and not self.do_updates: + self.counts['exists'] += 1 + return False + if existing and existing.ext_ids.pmid and (existing.refs or not re.refs): # TODO: any other reasons to do an update? # don't update if it already has PMID -- cgit v1.2.3