diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-09 14:17:31 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-09 14:17:35 -0800 |
commit | 86e6850e70617e1609b79e0ee4bfe2a26f7f992e (patch) | |
tree | deb5724e1c611e9b6aa0a020d5c0ead34e74853b /python | |
parent | ad050445ac4f3e218ec101790bbf187731646361 (diff) | |
download | fatcat-86e6850e70617e1609b79e0ee4bfe2a26f7f992e.tar.gz fatcat-86e6850e70617e1609b79e0ee4bfe2a26f7f992e.zip |
cleanups: tweaks to wayback CDX cleanup scripts
Diffstat (limited to 'python')
-rw-r--r-- | python/fatcat_tools/cleanups/file_short_wayback_ts.py | 18 |
1 files changed, 13 insertions, 5 deletions
diff --git a/python/fatcat_tools/cleanups/file_short_wayback_ts.py b/python/fatcat_tools/cleanups/file_short_wayback_ts.py
index 56a5c80e..a9b19921 100644
--- a/python/fatcat_tools/cleanups/file_short_wayback_ts.py
+++ b/python/fatcat_tools/cleanups/file_short_wayback_ts.py
@@ -22,7 +22,7 @@ class FileShortWaybackTimestampCleanup(EntityImporter):
     is not integrated into the `fatcat_import` or `fatcat_cleanup` controller;
     instead it has a __main__ function and is invoked like:
 
-        python -m fatcat_tools.cleans.file_short_wayback-ts < blah.json
+        python -m fatcat_tools.cleans.file_short_wayback_ts - < blah.json
     """
 
     def __init__(self, api: ApiClient, **kwargs):
@@ -77,10 +77,18 @@ class FileShortWaybackTimestampCleanup(EntityImporter):
                 if fe_url.url in url_expansions:
                     fix_url = url_expansions[fe_url.url]
                     # defensive checks
-                    assert f"/web/{partial_ts}" in fix_url
+                    if not (
+                        f"/web/{partial_ts}" in fix_url
+                        and fe_url.url.endswith(original_url)
+                        and fix_url.endswith(original_url)
+                    ):
+                        print(
+                            f"bad replacement URL: partial_ts={partial_ts} original={original_url} fix_url={fix_url}",
+                            file=sys.stderr,
+                        )
+                        self.counts["skip-bad-replacement"] += 1
+                        return None
                     assert "://" in fix_url
-                    assert fe_url.url.endswith(original_url)
-                    assert fix_url.endswith(original_url)
                     fe_url.url = fix_url
                     any_fixed = True
 
@@ -305,7 +313,7 @@ def main() -> None:
     )
     parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int)
     parser.set_defaults(
-        auth_var="FATCAT_API_AUTH_TOKEN",
+        auth_var="FATCAT_AUTH_WORKER_CLEANUP",
     )
     parser.add_argument(
         "json_file",