summaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2019-12-03 16:38:08 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-12-03 16:38:10 -0800
commit: f1bf46cac6c59964c285f03bf93c2856eeb231ef (patch)
tree: 79810683c76b973c13e283b1c15b0c3ef0192bb2 /python
parent: 0d9d71038b8a77baaeb7e9118d5b191b60eed7cc (diff)
download: fatcat-f1bf46cac6c59964c285f03bf93c2856eeb231ef.tar.gz
          fatcat-f1bf46cac6c59964c285f03bf93c2856eeb231ef.zip
tweaks to file ingest importer
- allow overriding source filter whitelist (common case for CLI use)
- fix editgroup description env variable pass-through
Diffstat (limited to 'python')
-rwxr-xr-x python/fatcat_import.py 6
-rw-r--r-- python/fatcat_tools/importers/ingest.py 7
2 files changed, 10 insertions, 3 deletions
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index aa789dd9..656fe87d 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -77,6 +77,7 @@ def run_arabesque_match(args):
args.json_file):
print("Supply one of --sqlite-file or --json-file")
ami = ArabesqueMatchImporter(args.api,
+ editgroup_description=args.editgroup_description_override,
do_updates=args.do_updates,
require_grobid=(not args.no_require_grobid),
extid_type=args.extid_type,
@@ -91,6 +92,8 @@ def run_arabesque_match(args):
def run_ingest_file(args):
ifri = IngestFileResultImporter(args.api,
+ editgroup_description=args.editgroup_description_override,
+ skip_source_whitelist=args.skip_source_whitelist,
do_updates=args.do_updates,
default_link_rel=args.default_link_rel,
require_grobid=(not args.no_require_grobid),
@@ -333,6 +336,9 @@ def main():
sub_ingest_file.add_argument('json_file',
help="ingest_file JSON file to import from",
default=sys.stdin, type=argparse.FileType('r'))
+ sub_ingest_file.add_argument('--skip-source-whitelist',
+ action='store_true',
+ help="don't filter import based on request source whitelist")
sub_ingest_file.add_argument('--kafka-mode',
action='store_true',
help="consume from kafka topic (not stdin)")
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index bb410b63..7dad13ce 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -11,9 +11,9 @@ class IngestFileResultImporter(EntityImporter):
def __init__(self, api, require_grobid=True, **kwargs):
- eg_desc = kwargs.get('editgroup_description',
+ eg_desc = kwargs.pop('editgroup_description',
"Files crawled from web using sandcrawler ingest tool")
- eg_extra = kwargs.get('editgroup_extra', dict())
+ eg_extra = kwargs.pop('editgroup_extra', dict())
eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFileResultImporter')
super().__init__(api,
editgroup_description=eg_desc,
@@ -29,7 +29,8 @@ class IngestFileResultImporter(EntityImporter):
else:
print("NOT checking GROBID success")
self.ingest_request_source_whitelist = ['fatcat-changelog']
- #self.ingest_request_source_whitelist = []
+ if kwargs.get('skip_source_whitelist', False):
+ self.ingest_request_source_whitelist = []
def want(self, row):
"""