commit b289da087453f13571c5570d6be4a3fb4ac08acd
tree 2dc3a46afa072be3a248d2115cf02f5dcbb7253c
parent 3a57c35ddcf794d7211d1649e74a9917bd1c9495
author Bryan Newbold <bnewbold@robocracy.org> 2020-01-06 16:54:30 -0800
committer Bryan Newbold <bnewbold@robocracy.org> 2020-01-06 16:54:30 -0800
importers: control update behavior with more-standard flag
 python/fatcat_import.py                    | 6 +++++-
 python/fatcat_tools/importers/arabesque.py | 2 +-
 python/fatcat_tools/importers/chocula.py   | 3 +++
 python/fatcat_tools/importers/common.py    | 1 +
 python/fatcat_tools/importers/ingest.py    | 2 +-
 python/fatcat_tools/importers/pubmed.py    | 4 ++++
 6 files changed, 15 insertions(+), 3 deletions(-)
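
Taken together, the hunks below move the do_updates flag into the shared EntityImporter base class instead of each importer re-reading it after super().__init__(). A minimal sketch of the resulting pattern (class and attribute names are from the diff; the stub bodies here are illustrative, not the real constructors):

    class EntityImporter:
        def __init__(self, api, **kwargs):
            # base class now owns the flag; updating existing entities is the default
            self.do_updates = bool(kwargs.get('do_updates', True))

    class ArabesqueMatchImporter(EntityImporter):
        def __init__(self, api, **kwargs):
            # conservative importers flip the default to False before delegating;
            # an explicit do_updates=True from the caller still wins
            kwargs['do_updates'] = kwargs.get('do_updates', False)
            super().__init__(api, **kwargs)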
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 8d82dab3..184dcc0a 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -61,7 +61,8 @@ def run_journal_metadata(args):
 
 def run_chocula(args):
     fii = ChoculaImporter(args.api,
-        edit_batch_size=args.batch_size)
+        edit_batch_size=args.batch_size,
+        do_updates=args.do_updates)
     JsonLinePusher(fii, args.json_file).run()
 
 def run_matched(args):
@@ -301,6 +302,9 @@ def main():
     sub_chocula.add_argument('json_file',
         help="chocula JSON entities file (or stdin)",
         default=sys.stdin, type=argparse.FileType('r'))
+    sub_chocula.add_argument('--do-updates',
+        action='store_true',
+        help="update pre-existing container entities")
 
     sub_matched = subparsers.add_parser('matched',
         help="add file entities matched against existing releases; custom JSON format")
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py
index acfc2b87..c71b33e9 100644
--- a/python/fatcat_tools/importers/arabesque.py
+++ b/python/fatcat_tools/importers/arabesque.py
@@ -47,6 +47,7 @@ class ArabesqueMatchImporter(EntityImporter):
         eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ArabesqueMatchImporter')
         if kwargs.get('crawl_id'):
             eg_extra['crawl_id'] = kwargs.get('crawl_id')
+        kwargs['do_updates'] = kwargs.get("do_updates", False)
         super().__init__(api,
             editgroup_description=eg_desc,
             editgroup_extra=eg_extra,
@@ -56,7 +57,6 @@ class ArabesqueMatchImporter(EntityImporter):
         self.default_link_rel = kwargs.get("default_link_rel", "web")
         assert self.default_link_rel
         self.default_mimetype = kwargs.get("default_mimetype", None)
-        self.do_updates = kwargs.get("do_updates", False)
         self.require_grobid = require_grobid
         if self.require_grobid:
             print("Requiring GROBID status == 200")
diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py
index eea50314..375b6051 100644
--- a/python/fatcat_tools/importers/chocula.py
+++ b/python/fatcat_tools/importers/chocula.py
@@ -109,6 +109,9 @@ class ChoculaImporter(EntityImporter):
 
         # decide whether to update
         do_update = False
+        if not self.do_updates:
+            self.counts['exists'] += 1
+            return False
         if not existing.extra:
             existing.extra = dict()
         if set(ce.extra.get('urls', [])) != set(existing.extra.get('urls', [])):
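
Returning False from try_update tells the surrounding import machinery not to stage an edit for this entity, and the 'exists' counter records why it was skipped. A simplified sketch of that contract (a Counter-backed stand-in for the real importer; only the guard lines mirror the diff):

    from collections import Counter

    class StubImporter:
        def __init__(self, do_updates=False):
            self.do_updates = do_updates
            self.counts = Counter()

        def try_update(self, existing):
            if not self.do_updates:
                self.counts['exists'] += 1
                return False  # entity exists and updates are disabled: skip
            return True

    imp = StubImporter()
    assert imp.try_update(existing=object()) is False
    assert imp.counts['exists'] == 1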
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index be5db8d8..8d103372 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -287,6 +287,7 @@ class EntityImporter:
         eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.EntityImporter')
 
         self.api = api
+        self.do_updates = bool(kwargs.get('do_updates', True))
         self.bezerk_mode = kwargs.get('bezerk_mode', False)
         self.submit_mode = kwargs.get('submit_mode', False)
         self.edit_batch_size = kwargs.get('edit_batch_size', 100)
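
Note the base-class default is True, the opposite of the opt-out importers: any importer that says nothing about do_updates keeps updating existing entities, and the bool(...) wrapper coerces whatever a caller passes into a plain boolean. A quick illustration (hypothetical helper):

    def effective_do_updates(**kwargs):
        return bool(kwargs.get('do_updates', True))

    assert effective_do_updates() is True                   # base default
    assert effective_do_updates(do_updates=False) is False  # opt-out importers
    assert effective_do_updates(do_updates=0) is False      # falsy values coerce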
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index 33c40eff..16643eb5 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -14,13 +14,13 @@ class IngestFileResultImporter(EntityImporter):
         eg_desc = kwargs.pop('editgroup_description', None) or "Files crawled from web using sandcrawler ingest tool"
         eg_extra = kwargs.pop('editgroup_extra', dict())
         eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFileResultImporter')
+        kwargs['do_updates'] = kwargs.get("do_updates", False)
         super().__init__(api,
             editgroup_description=eg_desc,
             editgroup_extra=eg_extra,
             **kwargs)
         self.default_link_rel = kwargs.get("default_link_rel", "web")
         assert self.default_link_rel
-        self.do_updates = kwargs.get("do_updates", False)
         self.require_grobid = require_grobid
         if self.require_grobid:
             print("Requiring GROBID status == 200")
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index 3611a299..c32ce34a 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -715,6 +715,10 @@ class PubmedImporter(EntityImporter):
                 re.ext_ids.doi = None
                 re.work_id = existing.work_id
 
+        if existing and not self.do_updates:
+            self.counts['exists'] += 1
+            return False
+
         if existing and existing.ext_ids.pmid and (existing.refs or not re.refs):
             # TODO: any other reasons to do an update?
             # don't update if it already has PMID
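
Ordering matters in this hunk: the new guard sits above the PMID/refs heuristics, so when updates are disabled the flag wins before any heuristic can fall through into an update. A condensed restatement of the resulting control flow (simplified; only the first guard is verbatim from the diff, and the second branch's body is assumed):

    def should_update(self, re, existing):
        # simplified: assumes the `existing` lookup has already run
        if existing and not self.do_updates:
            self.counts['exists'] += 1
            return False  # updates disabled: never touch an existing release
        if existing and existing.ext_ids.pmid and (existing.refs or not re.refs):
            self.counts['exists'] += 1
            return False  # assumed: existing record with PMID is left alone
        return True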