diff options
Diffstat (limited to 'python/fatcat_import.py')
| -rwxr-xr-x | python/fatcat_import.py | 34 | 
1 files changed, 22 insertions, 12 deletions
| diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 4eae49ef..de5b4202 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -31,7 +31,10 @@ def run_matched(args):          edit_batch_size=args.batch_size)      JsonLinePusher(fmi, args.json_file).run() -def run_arabesque_matched(args): +def run_arabesque_match(args): +    if (args.sqlite_file and args.json_file) or not (args.sqlite_file or +            args.json_file): +        print("Supply one of --sqlite-file or --json-file")      ami = ArabesqueMatchImporter(args.api,          do_updates=args.do_updates,          require_grobid=(not args.no_require_grobid), @@ -39,8 +42,11 @@ def run_arabesque_matched(args):          crawl_id=args.crawl_id,          default_link_rel=args.default_link_rel,          edit_batch_size=args.batch_size) -    SqlitePusher(ami, args.db_file, "crawl_result", -        ARABESQUE_MATCH_WHERE_CLAUSE).run() +    if args.sqlite_file: +        SqlitePusher(ami, args.sqlite_file, "crawl_result", +            ARABESQUE_MATCH_WHERE_CLAUSE).run() +    elif args.json_file: +        JsonLinePusher(ami, args.json_file).run()  def run_grobid_metadata(args):      fmi = GrobidMetadataImporter(args.api, @@ -162,25 +168,29 @@ def main():          action='store_true',          help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)") -    sub_arabesque_matched = subparsers.add_parser('arabesque_matched') -    sub_arabesque_matched.set_defaults( -        func=run_arabesque_matched, +    sub_arabesque_match = subparsers.add_parser('arabesque') +    sub_arabesque_match.set_defaults( +        func=run_arabesque_match,          auth_var="FATCAT_AUTH_WORKER_CRAWL",      ) -    sub_arabesque_matched.add_argument('db_file', +    sub_arabesque_match.add_argument('--sqlite-file',          help="sqlite database file to import from") -    sub_arabesque_matched.add_argument('--do-updates', +    sub_arabesque_match.add_argument('--json-file', +        help="JSON file to import from (or stdin)", +        type=argparse.FileType('r')) +    sub_arabesque_match.add_argument('--do-updates',          action='store_true',          help="update pre-existing file entities if new match (instead of skipping)") -    sub_arabesque_matched.add_argument('--no-require-grobid', +    sub_arabesque_match.add_argument('--no-require-grobid',          action='store_true',          help="whether postproc_status column must be '200'") -    sub_arabesque_matched.add_argument('--extid-type', +    sub_arabesque_match.add_argument('--extid-type',          default="doi",          help="identifer type in the database (eg, 'doi', 'pmcid'") -    sub_arabesque_matched.add_argument('--crawl-id', +    sub_arabesque_match.add_argument('--crawl-id',          help="crawl ID (optionally included in editgroup metadata)") -    sub_arabesque_matched.add_argument('--default-link-rel', +    sub_arabesque_match.add_argument('--default-link-rel', +        default="web",          help="default URL rel for matches (eg, 'publisher', 'web')")      sub_grobid_metadata = subparsers.add_parser('grobid-metadata') | 
