diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-04-12 22:10:06 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-04-12 22:10:06 -0700 |
commit | 759867197348f93d04e695782e7e337c7a69ea85 (patch) | |
tree | 50b0ba54b7d63fbf1a566868c21e56b2a83db2eb /python/fatcat_import.py | |
parent | c63b663e2fcfb0c4544653f7f30f7a548103ef2b (diff) | |
download | fatcat-759867197348f93d04e695782e7e337c7a69ea85.tar.gz fatcat-759867197348f93d04e695782e7e337c7a69ea85.zip |
early version of arabesque importer
Diffstat (limited to 'python/fatcat_import.py')
-rwxr-xr-x | python/fatcat_import.py | 28 |
1 files changed, 28 insertions, 0 deletions
```diff
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index aea8c757..f04a63ef 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -31,6 +31,16 @@ def run_matched(args):
         edit_batch_size=args.batch_size)
     JsonLinePusher(fmi, args.json_file).run()
 
+def run_arabesque_matched(args):
+    ami = ArabesqueMatchImporter(args.api,
+        do_updates=args.do_updates,
+        extid_type=args.extid_type,
+        crawl_id=args.crawl_id,
+        default_link_rel=args.default_link_rel,
+        edit_batch_size=args.batch_size)
+    SqlitePusher(ami, args.db_file, "crawl_result",
+        ARABESQUE_MATCH_WHERE_CLAUSE).run()
+
 def run_grobid_metadata(args):
     fmi = GrobidMetadataImporter(args.api,
         edit_batch_size=args.batch_size,
@@ -151,6 +161,24 @@ def main():
         action='store_true',
         help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)")
 
+    sub_arabesque_matched = subparsers.add_parser('arabesque_matched')
+    sub_arabesque_matched.set_defaults(
+        func=run_arabesque_matched,
+        auth_var="FATCAT_API_AUTH_TOKEN",
+    )
+    sub_arabesque_matched.add_argument('db_file',
+        help="sqlite database file to import from")
+    sub_arabesque_matched.add_argument('--do-updates',
+        action='store_true',
+        help="update pre-existing file entities if new match (instead of skipping)")
+    sub_arabesque_matched.add_argument('--extid-type',
+        default="doi",
+        help="identifer type in the database (eg, 'doi', 'pmcid'")
+    sub_arabesque_matched.add_argument('--crawl-id',
+        help="crawl ID (optionally included in editgroup metadata)")
+    sub_arabesque_matched.add_argument('--default-link-rel',
+        help="default URL rel for matches (eg, 'publisher', 'web')")
+
     sub_grobid_metadata = subparsers.add_parser('grobid-metadata')
     sub_grobid_metadata.set_defaults(
         func=run_grobid_metadata,
```