diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-01-08 23:31:40 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-01-08 23:31:40 +0100 |
commit | 081746837a55bf5f34c96f12f1abb5a00d5b478c (patch) | |
tree | 88af1ade558ad6695918d36648b3ed4a5bea6954 /python/fatcat_import.py | |
parent | 27723a61bde5591bae8115d801d0d09b7ef01b03 (diff) | |
parent | 277bd183d7139bb1a8857bc2a48c0aa92012455d (diff) | |
download | fatcat-081746837a55bf5f34c96f12f1abb5a00d5b478c.tar.gz fatcat-081746837a55bf5f34c96f12f1abb5a00d5b478c.zip |
Merge branch 'martin-datacite-import'
Pipfile.lock is broken.
* martin-datacite-import: (68 commits)
datacite: pass in doi into factored out method
datacite: reformat test cases and use jq . --sort-keys
datacite: factor out contributor handling
datacite: catch type mismatch in language detection
datacite: adjust tests for release_month
datacite: name extra.month, extra.release_month
datacite: mark additional files as stub
datacite: CCDC are entries, mostly
datacite: use more specific release_type, if possible
datacite: ignore certain names
datacite: over 3% records have the same title: stub
datacite: fill a few more release_type gaps
datacite: adding datacite-specific extra metadata
datacite: apply pylint suggestions
datacite: fix typos
datacite: set release_stage to published by default
datacite: month field should be top-level
datacite: include month in extra
datacite: indicate mismatched file in test
datacite: clean abstracts, use unknown value tokens
...
Diffstat (limited to 'python/fatcat_import.py')
-rwxr-xr-x | python/fatcat_import.py | 43 |
1 files changed, 43 insertions, 0 deletions
```diff
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 184dcc0a..fb8830ca 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -167,6 +167,20 @@ def run_cdl_dash_dat(args):
     print("fileset id: {}".format(fs.ident))
     print("link: https://fatcat.wiki/fileset/{}".format(fs.ident))
 
+def run_datacite(args):
+    dci = DataciteImporter(args.api,
+        args.issn_map_file,
+        edit_batch_size=args.batch_size,
+        bezerk_mode=args.bezerk_mode,
+        debug=args.debug,
+        extid_map_file=args.extid_map_file,
+        insert_log_file=args.insert_log_file)
+    if args.kafka_mode:
+        KafkaJsonPusher(dci, args.kafka_hosts, args.kafka_env, "api-datacite",
+            "fatcat-import", consume_batch_size=args.batch_size).run()
+    else:
+        JsonLinePusher(dci, args.json_file).run()
+
 def main():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
@@ -443,6 +457,35 @@ def main():
         type=str,
         help="use existing editgroup (instead of creating a new one)")
 
+    sub_datacite = subparsers.add_parser('datacite',
+        help="import datacite.org metadata")
+    sub_datacite.add_argument('json_file',
+        help="File with jsonlines from datacite.org v2 API to import from",
+        default=sys.stdin, type=argparse.FileType('r'))
+    sub_datacite.add_argument('issn_map_file',
+        help="ISSN to ISSN-L mapping file",
+        default=None, type=argparse.FileType('r'))
+    sub_datacite.add_argument('--extid-map-file',
+        help="DOI-to-other-identifiers sqlite3 database",
+        default=None, type=str)
+    sub_datacite.add_argument('--kafka-mode',
+        action='store_true',
+        help="consume from kafka topic (not stdin)")
+    sub_datacite.add_argument('--bezerk-mode',
+        action='store_true',
+        help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)")
+    sub_datacite.add_argument('--debug',
+        action='store_true',
+        help="write converted JSON to stdout")
+    sub_datacite.add_argument('--insert-log-file',
+        default='',
+        type=str,
+        help="write inserted documents into file (for debugging)")
+    sub_datacite.set_defaults(
+        func=run_datacite,
+        auth_var="FATCAT_AUTH_WORKER_DATACITE",
+    )
+
     args = parser.parse_args()
     if not args.__dict__.get("func"):
         print("tell me what to do!")
```