From a859fddb227872ce52f06af1dd9fb80987f348c4 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 29 Oct 2021 18:36:53 -0700
Subject: glue, utils, and worker code for crossref and grobid_refs

---
 python/persist_tool.py | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'python/persist_tool.py')

diff --git a/python/persist_tool.py b/python/persist_tool.py
index b124ddc..a4f9812 100755
--- a/python/persist_tool.py
+++ b/python/persist_tool.py
@@ -119,6 +119,22 @@ def run_ingest_request(args):
     pusher.run()
 
 
+def run_crossref(args):
+    grobid_client = GrobidClient(
+        host_url=args.grobid_host,
+    )
+    worker = PersistCrossrefWorker(
+        db_url=args.db_url,
+        grobid_client=grobid_client,
+    )
+    pusher = JsonLinePusher(
+        worker,
+        args.json_file,
+        batch_size=10,
+    )
+    pusher.run()
+
+
 def main():
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument(
@@ -238,6 +254,20 @@ def main():
         type=argparse.FileType("r"),
     )
 
+    sub_crossref = subparsers.add_parser(
+        "crossref",
+        help="backfill a crossref JSON dump into postgresql, and extract references at the same time",
+    )
+    sub_crossref.set_defaults(func=run_crossref)
+    sub_crossref.add_argument(
+        "json_file",
+        help="crossref file to import from (or '-' for stdin)",
+        type=argparse.FileType("r"),
+    )
+    sub_crossref.add_argument(
+        "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
+    )
+
     args = parser.parse_args()
     if not args.__dict__.get("func"):
         print("Tell me what to do!", file=sys.stderr)
-- 
cgit v1.2.3