diff options
Diffstat (limited to 'python/fatcat_import.py')
-rwxr-xr-x | python/fatcat_import.py | 85 |
1 files changed, 82 insertions, 3 deletions
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index e92b3106..ff6c94dc 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -126,7 +126,7 @@ def run_arabesque_match(args):
 def run_ingest_file(args):
     ifri = IngestFileResultImporter(args.api,
         editgroup_description=args.editgroup_description_override,
-        skip_source_whitelist=args.skip_source_whitelist,
+        skip_source_allowlist=args.skip_source_allowlist,
         do_updates=args.do_updates,
         default_link_rel=args.default_link_rel,
         require_grobid=(not args.no_require_grobid),
@@ -144,6 +144,26 @@ def run_ingest_file(args):
     else:
         JsonLinePusher(ifri, args.json_file).run()
 
+def run_ingest_web(args):
+    iwri = IngestWebResultImporter(args.api,
+        editgroup_description=args.editgroup_description_override,
+        skip_source_allowlist=args.skip_source_allowlist,
+        do_updates=args.do_updates,
+        default_link_rel=args.default_link_rel,
+        edit_batch_size=args.batch_size)
+    if args.kafka_mode:
+        KafkaJsonPusher(
+            iwri,
+            args.kafka_hosts,
+            args.kafka_env,
+            "ingest-file-results",
+            "fatcat-{}-ingest-web-result".format(args.kafka_env),
+            kafka_namespace="sandcrawler",
+            consume_batch_size=args.batch_size,
+        ).run()
+    else:
+        JsonLinePusher(iwri, args.json_file).run()
+
 def run_savepapernow_file(args):
     ifri = SavePaperNowFileImporter(args.api,
         editgroup_description=args.editgroup_description_override,
@@ -236,6 +256,24 @@ def run_datacite(args):
     else:
         JsonLinePusher(dci, args.json_file).run()
 
+def run_doaj_article(args):
+    dai = DoajArticleImporter(args.api,
+        args.issn_map_file,
+        edit_batch_size=args.batch_size,
+        do_updates=args.do_updates,
+    )
+    if args.kafka_mode:
+        KafkaJsonPusher(
+            dai,
+            args.kafka_hosts,
+            args.kafka_env,
+            "api-doaj",
+            "fatcat-{}-import-doaj".format(args.kafka_env),
+            consume_batch_size=args.batch_size,
+        ).run()
+    else:
+        JsonLinePusher(dai, args.json_file).run()
+
 def run_file_meta(args):
     # do_updates defaults to true for this importer
     fmi = FileMetaImporter(args.api,
@@ -442,9 +480,9 @@ def main():
     sub_ingest_file.add_argument('json_file',
         help="ingest_file JSON file to import from",
         default=sys.stdin, type=argparse.FileType('r'))
-    sub_ingest_file.add_argument('--skip-source-whitelist',
+    sub_ingest_file.add_argument('--skip-source-allowlist',
         action='store_true',
-        help="don't filter import based on request source whitelist")
+        help="don't filter import based on request source allowlist")
     sub_ingest_file.add_argument('--kafka-mode',
         action='store_true',
         help="consume from kafka topic (not stdin)")
@@ -458,6 +496,28 @@ def main():
         default="web",
         help="default URL rel for matches (eg, 'publisher', 'web')")
 
+    sub_ingest_web = subparsers.add_parser('ingest-web-results',
+        help="add/update web entities linked to releases based on sandcrawler ingest results")
+    sub_ingest_web.set_defaults(
+        func=run_ingest_web,
+        auth_var="FATCAT_AUTH_WORKER_CRAWL",
+    )
+    sub_ingest_web.add_argument('json_file',
+        help="ingest_web JSON file to import from",
+        default=sys.stdin, type=argparse.FileType('r'))
+    sub_ingest_web.add_argument('--skip-source-allowlist',
+        action='store_true',
+        help="don't filter import based on request source allowlist")
+    sub_ingest_web.add_argument('--kafka-mode',
+        action='store_true',
+        help="consume from kafka topic (not stdin)")
+    sub_ingest_web.add_argument('--do-updates',
+        action='store_true',
+        help="update pre-existing web entities if new match (instead of skipping)")
+    sub_ingest_web.add_argument('--default-link-rel',
+        default="web",
+        help="default URL rel for matches (eg, 'publisher', 'web')")
+
     sub_savepapernow_file = subparsers.add_parser('savepapernow-file-results',
         help="add file entities crawled due to async Save Paper Now request")
     sub_savepapernow_file.set_defaults(
@@ -564,6 +624,25 @@ def main():
         auth_var="FATCAT_AUTH_WORKER_DATACITE",
     )
 
+    sub_doaj_article = subparsers.add_parser('doaj-article',
+        help="import doaj.org article metadata")
+    sub_doaj_article.add_argument('json_file',
+        help="File with JSON lines from DOAJ API (or bulk dump) to import from",
+        default=sys.stdin, type=argparse.FileType('r'))
+    sub_doaj_article.add_argument('--issn-map-file',
+        help="ISSN to ISSN-L mapping file",
+        default=None, type=argparse.FileType('r'))
+    sub_doaj_article.add_argument('--kafka-mode',
+        action='store_true',
+        help="consume from kafka topic (not stdin)")
+    sub_doaj_article.add_argument('--do-updates',
+        action='store_true',
+        help="update any pre-existing release entities")
+    sub_doaj_article.set_defaults(
+        func=run_doaj_article,
+        auth_var="FATCAT_AUTH_WORKER_DOAJ",
+    )
+
     sub_file_meta = subparsers.add_parser('file-meta',
         help="simple update-only importer for file metadata")
     sub_file_meta.set_defaults(