summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_import.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2019-04-18 16:07:14 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-04-18 16:07:14 -0700
commit: bfc0d6597fa98e89615e6d82eed9488fd9a1e087 (patch)
tree: 769a7bb28fb32296ec49c0aa30d3e0ce115b1162 /python/fatcat_import.py
parent: 95fdfd31d73a2729eae3f20b3b6488d04782e0ce (diff)
download: fatcat-bfc0d6597fa98e89615e6d82eed9488fd9a1e087.tar.gz
fatcat-bfc0d6597fa98e89615e6d82eed9488fd9a1e087.zip
minor arabesque tweaks
Diffstat (limited to 'python/fatcat_import.py')
-rwxr-xr-x python/fatcat_import.py 34
1 files changed, 22 insertions, 12 deletions
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 4eae49ef..de5b4202 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -31,7 +31,10 @@ def run_matched(args):
edit_batch_size=args.batch_size)
JsonLinePusher(fmi, args.json_file).run()
-def run_arabesque_matched(args):
+def run_arabesque_match(args):
+ if (args.sqlite_file and args.json_file) or not (args.sqlite_file or
+ args.json_file):
+ print("Supply one of --sqlite-file or --json-file")
ami = ArabesqueMatchImporter(args.api,
do_updates=args.do_updates,
require_grobid=(not args.no_require_grobid),
@@ -39,8 +42,11 @@ def run_arabesque_matched(args):
crawl_id=args.crawl_id,
default_link_rel=args.default_link_rel,
edit_batch_size=args.batch_size)
- SqlitePusher(ami, args.db_file, "crawl_result",
- ARABESQUE_MATCH_WHERE_CLAUSE).run()
+ if args.sqlite_file:
+ SqlitePusher(ami, args.sqlite_file, "crawl_result",
+ ARABESQUE_MATCH_WHERE_CLAUSE).run()
+ elif args.json_file:
+ JsonLinePusher(ami, args.json_file).run()
def run_grobid_metadata(args):
fmi = GrobidMetadataImporter(args.api,
@@ -162,25 +168,29 @@ def main():
action='store_true',
help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)")
- sub_arabesque_matched = subparsers.add_parser('arabesque_matched')
- sub_arabesque_matched.set_defaults(
- func=run_arabesque_matched,
+ sub_arabesque_match = subparsers.add_parser('arabesque')
+ sub_arabesque_match.set_defaults(
+ func=run_arabesque_match,
auth_var="FATCAT_AUTH_WORKER_CRAWL",
)
- sub_arabesque_matched.add_argument('db_file',
+ sub_arabesque_match.add_argument('--sqlite-file',
help="sqlite database file to import from")
- sub_arabesque_matched.add_argument('--do-updates',
+ sub_arabesque_match.add_argument('--json-file',
+ help="JSON file to import from (or stdin)",
+ type=argparse.FileType('r'))
+ sub_arabesque_match.add_argument('--do-updates',
action='store_true',
help="update pre-existing file entities if new match (instead of skipping)")
- sub_arabesque_matched.add_argument('--no-require-grobid',
+ sub_arabesque_match.add_argument('--no-require-grobid',
action='store_true',
help="whether postproc_status column must be '200'")
- sub_arabesque_matched.add_argument('--extid-type',
+ sub_arabesque_match.add_argument('--extid-type',
default="doi",
help="identifer type in the database (eg, 'doi', 'pmcid'")
- sub_arabesque_matched.add_argument('--crawl-id',
+ sub_arabesque_match.add_argument('--crawl-id',
help="crawl ID (optionally included in editgroup metadata)")
- sub_arabesque_matched.add_argument('--default-link-rel',
+ sub_arabesque_match.add_argument('--default-link-rel',
+ default="web",
help="default URL rel for matches (eg, 'publisher', 'web')")
sub_grobid_metadata = subparsers.add_parser('grobid-metadata')