diff options
author | Martin Czygan <martin@archive.org> | 2020-11-19 22:36:55 +0000 |
---|---|---|
committer | Martin Czygan <martin@archive.org> | 2020-11-19 22:36:55 +0000 |
commit | 03eadfc7e2bee4213345f6464378e87b8f741d20 (patch) | |
tree | 3e5b13af8ba46b240f9ae53d5f522fb7ee02c219 /python/fatcat_import.py | |
parent | 5afde4690a4653db53fe4962af5da3eb9188d9a2 (diff) | |
parent | a73b73c2944b3df2a62886c4e6b69c93f5e74222 (diff) | |
download | fatcat-03eadfc7e2bee4213345f6464378e87b8f741d20.tar.gz fatcat-03eadfc7e2bee4213345f6464378e87b8f741d20.zip |
Merge branch 'bnewbold-xml-html-ingest' into 'master'
HTML webcapture ingest (and XML file ingest)
See merge request webgroup/fatcat!88
Diffstat (limited to 'python/fatcat_import.py')
-rwxr-xr-x | python/fatcat_import.py | 48 |
1 files changed, 45 insertions, 3 deletions
```diff
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index e92b3106..19cf43ec 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -126,7 +126,7 @@ def run_arabesque_match(args):
 def run_ingest_file(args):
     ifri = IngestFileResultImporter(args.api,
         editgroup_description=args.editgroup_description_override,
-        skip_source_whitelist=args.skip_source_whitelist,
+        skip_source_allowlist=args.skip_source_allowlist,
         do_updates=args.do_updates,
         default_link_rel=args.default_link_rel,
         require_grobid=(not args.no_require_grobid),
@@ -144,6 +144,26 @@ def run_ingest_file(args):
     else:
         JsonLinePusher(ifri, args.json_file).run()
 
+def run_ingest_web(args):
+    iwri = IngestWebResultImporter(args.api,
+        editgroup_description=args.editgroup_description_override,
+        skip_source_allowlist=args.skip_source_allowlist,
+        do_updates=args.do_updates,
+        default_link_rel=args.default_link_rel,
+        edit_batch_size=args.batch_size)
+    if args.kafka_mode:
+        KafkaJsonPusher(
+            iwri,
+            args.kafka_hosts,
+            args.kafka_env,
+            "ingest-file-results",
+            "fatcat-{}-ingest-web-result".format(args.kafka_env),
+            kafka_namespace="sandcrawler",
+            consume_batch_size=args.batch_size,
+        ).run()
+    else:
+        JsonLinePusher(iwri, args.json_file).run()
+
 def run_savepapernow_file(args):
     ifri = SavePaperNowFileImporter(args.api,
         editgroup_description=args.editgroup_description_override,
@@ -442,9 +462,9 @@ def main():
     sub_ingest_file.add_argument('json_file',
         help="ingest_file JSON file to import from",
         default=sys.stdin, type=argparse.FileType('r'))
-    sub_ingest_file.add_argument('--skip-source-whitelist',
+    sub_ingest_file.add_argument('--skip-source-allowlist',
         action='store_true',
-        help="don't filter import based on request source whitelist")
+        help="don't filter import based on request source allowlist")
     sub_ingest_file.add_argument('--kafka-mode',
         action='store_true',
         help="consume from kafka topic (not stdin)")
@@ -458,6 +478,28 @@ def main():
         default="web",
         help="default URL rel for matches (eg, 'publisher', 'web')")
 
+    sub_ingest_web = subparsers.add_parser('ingest-web-results',
+        help="add/update web entities linked to releases based on sandcrawler ingest results")
+    sub_ingest_web.set_defaults(
+        func=run_ingest_web,
+        auth_var="FATCAT_AUTH_WORKER_CRAWL",
+    )
+    sub_ingest_web.add_argument('json_file',
+        help="ingest_web JSON file to import from",
+        default=sys.stdin, type=argparse.FileType('r'))
+    sub_ingest_web.add_argument('--skip-source-allowlist',
+        action='store_true',
+        help="don't filter import based on request source allowlist")
+    sub_ingest_web.add_argument('--kafka-mode',
+        action='store_true',
+        help="consume from kafka topic (not stdin)")
+    sub_ingest_web.add_argument('--do-updates',
+        action='store_true',
+        help="update pre-existing web entities if new match (instead of skipping)")
+    sub_ingest_web.add_argument('--default-link-rel',
+        default="web",
+        help="default URL rel for matches (eg, 'publisher', 'web')")
+
     sub_savepapernow_file = subparsers.add_parser('savepapernow-file-results',
         help="add file entities crawled due to async Save Paper Now request")
     sub_savepapernow_file.set_defaults(
```