diff options
Diffstat (limited to 'python/fatcat_tools/importers')
| -rw-r--r-- | python/fatcat_tools/importers/arabesque.py | 2 | ||||
| -rw-r--r-- | python/fatcat_tools/importers/chocula.py | 3 | ||||
| -rw-r--r-- | python/fatcat_tools/importers/common.py | 1 | ||||
| -rw-r--r-- | python/fatcat_tools/importers/ingest.py | 2 | ||||
| -rw-r--r-- | python/fatcat_tools/importers/pubmed.py | 4 | 
5 files changed, 10 insertions, 2 deletions
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py
index acfc2b87..c71b33e9 100644
--- a/python/fatcat_tools/importers/arabesque.py
+++ b/python/fatcat_tools/importers/arabesque.py
@@ -47,6 +47,7 @@ class ArabesqueMatchImporter(EntityImporter):
         eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ArabesqueMatchImporter')
         if kwargs.get('crawl_id'):
             eg_extra['crawl_id'] = kwargs.get('crawl_id')
+        kwargs['do_updates'] = kwargs.get("do_updates", False)
         super().__init__(api,
             editgroup_description=eg_desc,
             editgroup_extra=eg_extra,
@@ -56,7 +57,6 @@ class ArabesqueMatchImporter(EntityImporter):
         self.default_link_rel = kwargs.get("default_link_rel", "web")
         assert self.default_link_rel
         self.default_mimetype = kwargs.get("default_mimetype", None)
-        self.do_updates = kwargs.get("do_updates", False)
         self.require_grobid = require_grobid
         if self.require_grobid:
             print("Requiring GROBID status == 200")
diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py
index eea50314..375b6051 100644
--- a/python/fatcat_tools/importers/chocula.py
+++ b/python/fatcat_tools/importers/chocula.py
@@ -109,6 +109,9 @@ class ChoculaImporter(EntityImporter):
 
         # decide whether to update
         do_update = False
+        if not self.do_updates:
+            self.counts['exists'] += 1
+            return False
         if not existing.extra:
             existing.extra = dict()
         if set(ce.extra.get('urls', [])) != set(existing.extra.get('urls', [])):
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index be5db8d8..8d103372 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -287,6 +287,7 @@ class EntityImporter:
         eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.EntityImporter')
 
         self.api = api
+        self.do_updates = bool(kwargs.get('do_updates', True))
         self.bezerk_mode = kwargs.get('bezerk_mode', False)
         self.submit_mode = kwargs.get('submit_mode', False)
         self.edit_batch_size = kwargs.get('edit_batch_size', 100)
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index 33c40eff..16643eb5 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -14,13 +14,13 @@ class IngestFileResultImporter(EntityImporter):
         eg_desc = kwargs.pop('editgroup_description', None) or "Files crawled from web using sandcrawler ingest tool"
         eg_extra = kwargs.pop('editgroup_extra', dict())
         eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFileResultImporter')
+        kwargs['do_updates'] = kwargs.get("do_updates", False)
         super().__init__(api,
             editgroup_description=eg_desc,
             editgroup_extra=eg_extra,
             **kwargs)
         self.default_link_rel = kwargs.get("default_link_rel", "web")
         assert self.default_link_rel
-        self.do_updates = kwargs.get("do_updates", False)
         self.require_grobid = require_grobid
         if self.require_grobid:
             print("Requiring GROBID status == 200")
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index 3611a299..c32ce34a 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -715,6 +715,10 @@ class PubmedImporter(EntityImporter):
                 re.ext_ids.doi = None
                 re.work_id = existing.work_id
 
+        if existing and not self.do_updates:
+            self.counts['exists'] += 1
+            return False
+
         if existing and existing.ext_ids.pmid and (existing.refs or not re.refs):
             # TODO: any other reasons to do an update?
             # don't update if it already has PMID
