author     Bryan Newbold <bnewbold@robocracy.org>  2019-11-13 00:27:48 -0800
committer  Bryan Newbold <bnewbold@robocracy.org>  2019-11-15 16:46:26 -0800
commit     a4db9ee2e18a18b23eb7ece484f95914421f877d (patch)
tree       6d4856c8fb7854a75dbd43c983179d4492769039 /python/fatcat_import.py
parent     169477c39dc772c0eb1d45f8097215e73f0f6044 (diff)
ingest file result importer
Diffstat (limited to 'python/fatcat_import.py')
-rwxr-xr-x  python/fatcat_import.py  34
1 file changed, 34 insertions, 0 deletions
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 2239f179..400b1915 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -89,6 +89,19 @@ def run_arabesque_match(args):
elif args.json_file:
JsonLinePusher(ami, args.json_file).run()
+def run_ingest_file(args):
+ ifri = IngestFileResultImporter(args.api,
+ do_updates=args.do_updates,
+ default_link_rel=args.default_link_rel,
+ require_grobid=(not args.no_require_grobid),
+ edit_batch_size=args.batch_size)
+ if args.kafka_mode:
+ KafkaJsonPusher(ifri, args.kafka_hosts, args.kafka_env, "ingest-file-results",
+ "fatcat-ingest-file-result", kafka_namespace="sandcrawler",
+ consume_batch_size=args.batch_size).run()
+ else:
+ JsonLinePusher(ifri, args.json_file).run()
+
def run_grobid_metadata(args):
fmi = GrobidMetadataImporter(args.api,
edit_batch_size=args.batch_size,
@@ -312,6 +325,27 @@ def main():
default="web",
help="default URL rel for matches (eg, 'publisher', 'web')")
+ sub_ingest_file = subparsers.add_parser('ingest-file-result')
+ sub_ingest_file.set_defaults(
+ func=run_ingest_file,
+ auth_var="FATCAT_AUTH_WORKER_SANDCRAWLER",
+ )
+ sub_ingest_file.add_argument('json_file',
+ help="ingest_file JSON file to import from",
+ default=sys.stdin, type=argparse.FileType('r'))
+ sub_ingest_file.add_argument('--kafka-mode',
+ action='store_true',
+ help="consume from kafka topic (not stdin)")
+ sub_ingest_file.add_argument('--do-updates',
+ action='store_true',
+ help="update pre-existing file entities if new match (instead of skipping)")
+ sub_ingest_file.add_argument('--no-require-grobid',
+ action='store_true',
+        help="don't require the postproc_status column to be '200' (GROBID success)")
+ sub_ingest_file.add_argument('--default-link-rel',
+ default="web",
+ help="default URL rel for matches (eg, 'publisher', 'web')")
+
sub_grobid_metadata = subparsers.add_parser('grobid-metadata')
sub_grobid_metadata.set_defaults(
func=run_grobid_metadata,
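
For context: the new `ingest-file-result` subcommand wires IngestFileResultImporter into the same
pusher pattern the other importers above use, with the worker token read from
FATCAT_AUTH_WORKER_SANDCRAWLER. Below is a minimal sketch of the equivalent programmatic flow.
The import paths, host URL, token lookup, and input file name are assumptions based on the
surrounding fatcat_import.py conventions, not part of this diff.

    # Sketch: drive the new importer directly, mirroring run_ingest_file() above.
    # Assumes authenticated_api and the importer/pusher classes are importable
    # from fatcat_tools / fatcat_tools.importers, as elsewhere in this script.
    import os

    from fatcat_tools import authenticated_api
    from fatcat_tools.importers import IngestFileResultImporter, JsonLinePusher

    # Worker token, matching the auth_var this subcommand sets as its default
    api = authenticated_api(
        "https://api.fatcat.wiki/v0",  # hypothetical host URL
        token=os.environ.get("FATCAT_AUTH_WORKER_SANDCRAWLER"),
    )

    ifri = IngestFileResultImporter(
        api,
        do_updates=False,        # skip file entities that already exist
        default_link_rel="web",  # rel for URLs with no more specific match
        require_grobid=True,     # only import results GROBID processed successfully
        edit_batch_size=100,     # hypothetical batch size (the CLI --batch-size)
    )

    # Stream one JSON object per line, e.g. sandcrawler ingest-file results
    with open("ingest_file_results.json", "r") as f:  # hypothetical input file
        JsonLinePusher(ifri, f).run()

In Kafka mode, run_ingest_file() instead feeds the same importer from KafkaJsonPusher, consuming
the "ingest-file-results" topic in the "sandcrawler" namespace under the
"fatcat-ingest-file-result" consumer group.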