diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-11-13 00:27:48 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-11-15 16:46:26 -0800 |
commit | a4db9ee2e18a18b23eb7ece484f95914421f877d (patch) | |
tree | 6d4856c8fb7854a75dbd43c983179d4492769039 /python/fatcat_import.py | |
parent | 169477c39dc772c0eb1d45f8097215e73f0f6044 (diff) | |
download | fatcat-a4db9ee2e18a18b23eb7ece484f95914421f877d.tar.gz fatcat-a4db9ee2e18a18b23eb7ece484f95914421f877d.zip |
ingest file result importer
Diffstat (limited to 'python/fatcat_import.py')
-rwxr-xr-x | python/fatcat_import.py | 34 |
1 file changed, 34 insertions, 0 deletions
diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 2239f179..400b1915 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -89,6 +89,19 @@ def run_arabesque_match(args): elif args.json_file: JsonLinePusher(ami, args.json_file).run() +def run_ingest_file(args): + ifri = IngestFileResultImporter(args.api, + do_updates=args.do_updates, + default_link_rel=args.default_link_rel, + require_grobid=(not args.no_require_grobid), + edit_batch_size=args.batch_size) + if args.kafka_mode: + KafkaJsonPusher(ifri, args.kafka_hosts, args.kafka_env, "ingest-file-results", + "fatcat-ingest-file-result", kafka_namespace="sandcrawler", + consume_batch_size=args.batch_size).run() + else: + JsonLinePusher(ifri, args.json_file).run() + def run_grobid_metadata(args): fmi = GrobidMetadataImporter(args.api, edit_batch_size=args.batch_size, @@ -312,6 +325,27 @@ def main(): default="web", help="default URL rel for matches (eg, 'publisher', 'web')") + sub_ingest_file = subparsers.add_parser('ingest-file-result') + sub_ingest_file.set_defaults( + func=run_ingest_file, + auth_var="FATCAT_AUTH_WORKER_SANDCRAWLER", + ) + sub_ingest_file.add_argument('json_file', + help="ingest_file JSON file to import from", + default=sys.stdin, type=argparse.FileType('r')) + sub_ingest_file.add_argument('--kafka-mode', + action='store_true', + help="consume from kafka topic (not stdin)") + sub_ingest_file.add_argument('--do-updates', + action='store_true', + help="update pre-existing file entities if new match (instead of skipping)") + sub_ingest_file.add_argument('--no-require-grobid', + action='store_true', + help="whether postproc_status column must be '200'") + sub_ingest_file.add_argument('--default-link-rel', + default="web", + help="default URL rel for matches (eg, 'publisher', 'web')") + sub_grobid_metadata = subparsers.add_parser('grobid-metadata') sub_grobid_metadata.set_defaults( func=run_grobid_metadata, |