diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-12-03 16:38:08 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-12-03 16:38:10 -0800 |
commit | f1bf46cac6c59964c285f03bf93c2856eeb231ef (patch) | |
tree | 79810683c76b973c13e283b1c15b0c3ef0192bb2 /python | |
parent | 0d9d71038b8a77baaeb7e9118d5b191b60eed7cc (diff) | |
download | fatcat-f1bf46cac6c59964c285f03bf93c2856eeb231ef.tar.gz fatcat-f1bf46cac6c59964c285f03bf93c2856eeb231ef.zip |
tweaks to file ingest importer
- allow overriding source filter whitelist (common case for CLI use)
- fix editgroup description env variable pass-through
Diffstat (limited to 'python')
-rwxr-xr-x | python/fatcat_import.py | 6 | ||||
-rw-r--r-- | python/fatcat_tools/importers/ingest.py | 7 |
2 files changed, 10 insertions, 3 deletions
```diff
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index aa789dd9..656fe87d 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -77,6 +77,7 @@ def run_arabesque_match(args):
             args.json_file):
         print("Supply one of --sqlite-file or --json-file")
     ami = ArabesqueMatchImporter(args.api,
+        editgroup_description=args.editgroup_description_override,
         do_updates=args.do_updates,
         require_grobid=(not args.no_require_grobid),
         extid_type=args.extid_type,
@@ -91,6 +92,8 @@ def run_arabesque_match(args):
 
 def run_ingest_file(args):
     ifri = IngestFileResultImporter(args.api,
+        editgroup_description=args.editgroup_description_override,
+        skip_source_whitelist=args.skip_source_whitelist,
         do_updates=args.do_updates,
         default_link_rel=args.default_link_rel,
         require_grobid=(not args.no_require_grobid),
@@ -333,6 +336,9 @@ def main():
     sub_ingest_file.add_argument('json_file',
         help="ingest_file JSON file to import from",
         default=sys.stdin, type=argparse.FileType('r'))
+    sub_ingest_file.add_argument('--skip-source-whitelist',
+        action='store_true',
+        help="don't filter import based on request source whitelist")
     sub_ingest_file.add_argument('--kafka-mode',
         action='store_true',
         help="consume from kafka topic (not stdin)")
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index bb410b63..7dad13ce 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -11,9 +11,9 @@ class IngestFileResultImporter(EntityImporter):
 
     def __init__(self, api, require_grobid=True, **kwargs):
 
-        eg_desc = kwargs.get('editgroup_description',
+        eg_desc = kwargs.pop('editgroup_description',
             "Files crawled from web using sandcrawler ingest tool")
-        eg_extra = kwargs.get('editgroup_extra', dict())
+        eg_extra = kwargs.pop('editgroup_extra', dict())
         eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFileResultImporter')
         super().__init__(api,
             editgroup_description=eg_desc,
@@ -29,7 +29,8 @@ class IngestFileResultImporter(EntityImporter):
         else:
             print("NOT checking GROBID success")
         self.ingest_request_source_whitelist = ['fatcat-changelog']
-        #self.ingest_request_source_whitelist = []
+        if kwargs.get('skip_source_whitelist', False):
+            self.ingest_request_source_whitelist = []
 
     def want(self, row):
         """
```