diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-01-06 16:54:30 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-01-06 16:54:30 -0800 |
commit | b289da087453f13571c5570d6be4a3fb4ac08acd (patch) | |
tree | 2dc3a46afa072be3a248d2115cf02f5dcbb7253c /python/fatcat_tools/importers | |
parent | 3a57c35ddcf794d7211d1649e74a9917bd1c9495 (diff) | |
download | fatcat-b289da087453f13571c5570d6be4a3fb4ac08acd.tar.gz fatcat-b289da087453f13571c5570d6be4a3fb4ac08acd.zip |
importers: control update behavior with more-standard flag
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- | python/fatcat_tools/importers/arabesque.py | 2 | ||||
-rw-r--r-- | python/fatcat_tools/importers/chocula.py | 3 | ||||
-rw-r--r-- | python/fatcat_tools/importers/common.py | 1 | ||||
-rw-r--r-- | python/fatcat_tools/importers/ingest.py | 2 | ||||
-rw-r--r-- | python/fatcat_tools/importers/pubmed.py | 4 |
5 files changed, 10 insertions, 2 deletions
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py index acfc2b87..c71b33e9 100644 --- a/python/fatcat_tools/importers/arabesque.py +++ b/python/fatcat_tools/importers/arabesque.py @@ -47,6 +47,7 @@ class ArabesqueMatchImporter(EntityImporter): eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ArabesqueMatchImporter') if kwargs.get('crawl_id'): eg_extra['crawl_id'] = kwargs.get('crawl_id') + kwargs['do_updates'] = kwargs.get("do_updates", False) super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, @@ -56,7 +57,6 @@ class ArabesqueMatchImporter(EntityImporter): self.default_link_rel = kwargs.get("default_link_rel", "web") assert self.default_link_rel self.default_mimetype = kwargs.get("default_mimetype", None) - self.do_updates = kwargs.get("do_updates", False) self.require_grobid = require_grobid if self.require_grobid: print("Requiring GROBID status == 200") diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py index eea50314..375b6051 100644 --- a/python/fatcat_tools/importers/chocula.py +++ b/python/fatcat_tools/importers/chocula.py @@ -109,6 +109,9 @@ class ChoculaImporter(EntityImporter): # decide whether to update do_update = False + if not self.do_updates: + self.counts['exists'] += 1 + return False if not existing.extra: existing.extra = dict() if set(ce.extra.get('urls', [])) != set(existing.extra.get('urls', [])): diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index be5db8d8..8d103372 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -287,6 +287,7 @@ class EntityImporter: eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.EntityImporter') self.api = api + self.do_updates = bool(kwargs.get('do_updates', True)) self.bezerk_mode = kwargs.get('bezerk_mode', False) self.submit_mode = kwargs.get('submit_mode', False) self.edit_batch_size = kwargs.get('edit_batch_size', 100) diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index 33c40eff..16643eb5 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -14,13 +14,13 @@ class IngestFileResultImporter(EntityImporter): eg_desc = kwargs.pop('editgroup_description', None) or "Files crawled from web using sandcrawler ingest tool" eg_extra = kwargs.pop('editgroup_extra', dict()) eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFileResultImporter') + kwargs['do_updates'] = kwargs.get("do_updates", False) super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.default_link_rel = kwargs.get("default_link_rel", "web") assert self.default_link_rel - self.do_updates = kwargs.get("do_updates", False) self.require_grobid = require_grobid if self.require_grobid: print("Requiring GROBID status == 200") diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 3611a299..c32ce34a 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -715,6 +715,10 @@ class PubmedImporter(EntityImporter): re.ext_ids.doi = None re.work_id = existing.work_id + if existing and not self.do_updates: + self.counts['exists'] += 1 + return False + if existing and existing.ext_ids.pmid and (existing.refs or not re.refs): # TODO: any other reasons to do an update? # don't update if it already has PMID |