summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_tools/cleanups
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2021-11-09 14:17:31 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2021-11-09 14:17:35 -0800
commit: 86e6850e70617e1609b79e0ee4bfe2a26f7f992e (patch)
tree: deb5724e1c611e9b6aa0a020d5c0ead34e74853b /python/fatcat_tools/cleanups
parent: ad050445ac4f3e218ec101790bbf187731646361 (diff)
download: fatcat-86e6850e70617e1609b79e0ee4bfe2a26f7f992e.tar.gz
          fatcat-86e6850e70617e1609b79e0ee4bfe2a26f7f992e.zip
cleanups: tweaks to wayback CDX cleanup scripts
Diffstat (limited to 'python/fatcat_tools/cleanups')
-rw-r--r-- python/fatcat_tools/cleanups/file_short_wayback_ts.py 18
1 files changed, 13 insertions, 5 deletions
diff --git a/python/fatcat_tools/cleanups/file_short_wayback_ts.py b/python/fatcat_tools/cleanups/file_short_wayback_ts.py
index 56a5c80e..a9b19921 100644
--- a/python/fatcat_tools/cleanups/file_short_wayback_ts.py
+++ b/python/fatcat_tools/cleanups/file_short_wayback_ts.py
@@ -22,7 +22,7 @@ class FileShortWaybackTimestampCleanup(EntityImporter):
is not integrated into the `fatcat_import` or `fatcat_cleanup` controller;
instead it has a __main__ function and is invoked like:
- python -m fatcat_tools.cleans.file_short_wayback-ts < blah.json
+ python -m fatcat_tools.cleans.file_short_wayback_ts - < blah.json
"""
def __init__(self, api: ApiClient, **kwargs):
@@ -77,10 +77,18 @@ class FileShortWaybackTimestampCleanup(EntityImporter):
if fe_url.url in url_expansions:
fix_url = url_expansions[fe_url.url]
# defensive checks
- assert f"/web/{partial_ts}" in fix_url
+ if not (
+ f"/web/{partial_ts}" in fix_url
+ and fe_url.url.endswith(original_url)
+ and fix_url.endswith(original_url)
+ ):
+ print(
+ f"bad replacement URL: partial_ts={partial_ts} original={original_url} fix_url={fix_url}",
+ file=sys.stderr,
+ )
+ self.counts["skip-bad-replacement"] += 1
+ return None
assert "://" in fix_url
- assert fe_url.url.endswith(original_url)
- assert fix_url.endswith(original_url)
fe_url.url = fix_url
any_fixed = True
@@ -305,7 +313,7 @@ def main() -> None:
)
parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int)
parser.set_defaults(
- auth_var="FATCAT_API_AUTH_TOKEN",
+ auth_var="FATCAT_AUTH_WORKER_CLEANUP",
)
parser.add_argument(
"json_file",