about | summary | refs | log | tree | commit | diff | stats
path: root/python/persist_tool.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org>, 2021-10-29 18:36:53 -0700
committer: Bryan Newbold <bnewbold@archive.org>, 2021-11-04 17:19:52 -0700
commit: a859fddb227872ce52f06af1dd9fb80987f348c4 (patch)
tree: ced078e2d563feed196fdf00c33cec39a8b42031 /python/persist_tool.py
parent: 16f4b7f45ae8bdcd4018850efe164ed19069e9fe (diff)
download: sandcrawler-a859fddb227872ce52f06af1dd9fb80987f348c4.tar.gz
download: sandcrawler-a859fddb227872ce52f06af1dd9fb80987f348c4.zip
glue, utils, and worker code for crossref and grobid_refs
Diffstat (limited to 'python/persist_tool.py')
-rwxr-xr-x python/persist_tool.py 30
1 files changed, 30 insertions, 0 deletions
diff --git a/python/persist_tool.py b/python/persist_tool.py
index b124ddc..a4f9812 100755
--- a/python/persist_tool.py
+++ b/python/persist_tool.py
@@ -119,6 +119,22 @@ def run_ingest_request(args):
pusher.run()
+def run_crossref(args):
+ grobid_client = GrobidClient(
+ host_url=args.grobid_host,
+ )
+ worker = PersistCrossrefWorker(
+ db_url=args.db_url,
+ grobid_client=grobid_client,
+ )
+ pusher = JsonLinePusher(
+ worker,
+ args.json_file,
+ batch_size=10,
+ )
+ pusher.run()
+
+
def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
@@ -238,6 +254,20 @@ def main():
type=argparse.FileType("r"),
)
+ sub_crossref = subparsers.add_parser(
+ "crossref",
+ help="backfill a crossref JSON dump into postgresql, and extract references at the same time",
+ )
+ sub_crossref.set_defaults(func=run_crossref)
+ sub_crossref.add_argument(
+ "json_file",
+ help="crossref file to import from (or '-' for stdin)",
+ type=argparse.FileType("r"),
+ )
+ sub_crossref.add_argument(
+ "--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
+ )
+
args = parser.parse_args()
if not args.__dict__.get("func"):
print("Tell me what to do!", file=sys.stderr)