about summary refs log tree commit diff stats
path: root/python/ingest_tool.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/ingest_tool.py')
-rwxr-xr-x python/ingest_tool.py 63
1 files changed, 43 insertions, 20 deletions
diff --git a/python/ingest_tool.py b/python/ingest_tool.py
index ce3a59c..0b74f9f 100755
--- a/python/ingest_tool.py
+++ b/python/ingest_tool.py
@@ -2,10 +2,11 @@
import argparse
import json
+import subprocess
import sys
from http.server import HTTPServer
-import raven
+import sentry_sdk
from sandcrawler import GrobidClient, JsonLinePusher, KafkaCompressSink, KafkaSink
from sandcrawler.ingest_file import IngestFileRequestHandler, IngestFileWorker
@@ -43,12 +44,6 @@ def run_single_ingest(args):
def run_requests(args):
- if args.enable_sentry:
- try:
- git_sha = raven.fetch_git_sha("..")
- except Exception:
- git_sha = None
- sentry_client = raven.Client(release=git_sha) # noqa:
# TODO: switch to using JsonLinePusher
file_worker = IngestFileWorker(
try_spn2=not args.no_spn2,
@@ -75,11 +70,11 @@ def run_file_requests_backfill(args):
Can be used to batch re-process known files.
"""
- grobid_topic = "sandcrawler-{}.grobid-output-pg".format(args.kafka_env)
- pdftext_topic = "sandcrawler-{}.pdf-text".format(args.kafka_env)
- thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.kafka_env)
- xmldoc_topic = "sandcrawler-{}.xml-doc".format(args.kafka_env)
- htmlteixml_topic = "sandcrawler-{}.html-teixml".format(args.kafka_env)
+ grobid_topic = "sandcrawler-{}.grobid-output-pg".format(args.env)
+ pdftext_topic = "sandcrawler-{}.pdf-text".format(args.env)
+ thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.env)
+ xmldoc_topic = "sandcrawler-{}.xml-doc".format(args.env)
+ htmlteixml_topic = "sandcrawler-{}.html-teixml".format(args.env)
grobid_sink = KafkaSink(
kafka_hosts=args.kafka_hosts,
produce_topic=grobid_topic,
@@ -120,6 +115,20 @@ def run_file_requests_backfill(args):
pusher.run()
+def run_spn_status(args):
+ worker = IngestFileWorker(
+ sink=None,
+ try_spn2=False,
+ )
+
+ resp = worker.spn_client.v2_session.get("https://web.archive.org/save/status/system")
+ resp.raise_for_status()
+ print(f"System status: {json.dumps(resp.json(), sort_keys=True)}")
+ resp = worker.spn_client.v2_session.get("https://web.archive.org/save/status/user")
+ resp.raise_for_status()
+ print(f"User status: {json.dumps(resp.json(), sort_keys=True)}")
+
+
def run_api(args):
port = 8083
print("Listening on localhost:{}".format(port))
@@ -129,6 +138,12 @@ def run_api(args):
def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--enable-sentry",
+ action="store_true",
+ help="report exceptions to Sentry",
+ )
+ parser.add_argument("--env", default="dev", help="environment (eg, prod, qa, dev)")
subparsers = parser.add_subparsers()
sub_single = subparsers.add_parser("single", help="ingests a single base URL")
@@ -163,11 +178,6 @@ def main():
"--no-spn2", action="store_true", help="don't use live web (SPNv2)"
)
sub_requests.add_argument(
- "--enable-sentry",
- action="store_true",
- help="report exceptions to Sentry",
- )
- sub_requests.add_argument(
"--html-quick-mode",
action="store_true",
help="don't fetch individual sub-resources, just use CDX",
@@ -203,17 +213,30 @@ def main():
help="list of Kafka brokers (host/port) to use",
)
sub_file_requests_backfill.add_argument(
- "--kafka-env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)"
- )
- sub_file_requests_backfill.add_argument(
"--grobid-host", default="https://grobid.qa.fatcat.wiki", help="GROBID API host/port"
)
+ sub_spn_status = subparsers.add_parser(
+ "spn-status", help="checks save-page-now v2 API status for bot user"
+ )
+ sub_spn_status.set_defaults(func=run_spn_status)
+
args = parser.parse_args()
if not args.__dict__.get("func"):
parser.print_help(file=sys.stderr)
sys.exit(-1)
+ # configure sentry *after* parsing args
+ if args.enable_sentry:
+ try:
+ GIT_REVISION = (
+ subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")
+ )
+ except Exception:
+ print("failed to configure git revision", file=sys.stderr)
+ GIT_REVISION = None
+ sentry_sdk.init(release=GIT_REVISION, environment=args.env, max_breadcrumbs=10)
+
args.func(args)