diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-09 14:17:31 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-09 14:17:35 -0800 | 
| commit | 86e6850e70617e1609b79e0ee4bfe2a26f7f992e (patch) | |
| tree | deb5724e1c611e9b6aa0a020d5c0ead34e74853b | |
| parent | ad050445ac4f3e218ec101790bbf187731646361 (diff) | |
| download | fatcat-86e6850e70617e1609b79e0ee4bfe2a26f7f992e.tar.gz fatcat-86e6850e70617e1609b79e0ee4bfe2a26f7f992e.zip  | |
cleanups: tweaks to wayback CDX cleanup scripts
| -rw-r--r-- | notes/cleanups/scripts/fetch_full_cdx_ts.py | 9 | ||||
| -rw-r--r-- | python/fatcat_tools/cleanups/file_short_wayback_ts.py | 18 | 
2 files changed, 21 insertions, 6 deletions
diff --git a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py index 6c6817ab..6f67c7e1 100644 --- a/notes/cleanups/scripts/fetch_full_cdx_ts.py +++ b/notes/cleanups/scripts/fetch_full_cdx_ts.py @@ -157,7 +157,14 @@ def process_file(fe, session) -> dict:          if short in full_urls:              continue -        cdx_record = get_api_cdx(original_url, partial_dt=ts, http_session=session) +        cdx_record = None +        try: +            cdx_record = get_api_cdx(original_url, partial_dt=ts, http_session=session) +        except requests.exceptions.HTTPError as e: +            if e.response.status_code == 403: +                return dict(file_entity=fe, full_urls=full_urls, status="fail-cdx-403") +            else: +                raise          if cdx_record:              if cdx_record['sha1hex'] == fe['sha1'] and cdx_record['url'] == original_url and cdx_record['datetime'].startswith(ts):                  assert len(cdx_record['datetime']) == 14 and cdx_record['datetime'].isdigit() diff --git a/python/fatcat_tools/cleanups/file_short_wayback_ts.py b/python/fatcat_tools/cleanups/file_short_wayback_ts.py index 56a5c80e..a9b19921 100644 --- a/python/fatcat_tools/cleanups/file_short_wayback_ts.py +++ b/python/fatcat_tools/cleanups/file_short_wayback_ts.py @@ -22,7 +22,7 @@ class FileShortWaybackTimestampCleanup(EntityImporter):      is not integrated into the `fatcat_import` or `fatcat_cleanup` controller;      instead it has a __main__ function and is invoked like: -        python -m fatcat_tools.cleans.file_short_wayback-ts < blah.json +        python -m fatcat_tools.cleans.file_short_wayback_ts - < blah.json      """      def __init__(self, api: ApiClient, **kwargs): @@ -77,10 +77,18 @@ class FileShortWaybackTimestampCleanup(EntityImporter):              if fe_url.url in url_expansions:                  fix_url = url_expansions[fe_url.url]                  # defensive checks -                assert f"/web/{partial_ts}" in fix_url +                if not ( +                    f"/web/{partial_ts}" in fix_url +                    and fe_url.url.endswith(original_url) +                    and fix_url.endswith(original_url) +                ): +                    print( +                        f"bad replacement URL: partial_ts={partial_ts} original={original_url} fix_url={fix_url}", +                        file=sys.stderr, +                    ) +                    self.counts["skip-bad-replacement"] += 1 +                    return None                  assert "://" in fix_url -                assert fe_url.url.endswith(original_url) -                assert fix_url.endswith(original_url)                  fe_url.url = fix_url                  any_fixed = True @@ -305,7 +313,7 @@ def main() -> None:      )      parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int)      parser.set_defaults( -        auth_var="FATCAT_API_AUTH_TOKEN", +        auth_var="FATCAT_AUTH_WORKER_CLEANUP",      )      parser.add_argument(          "json_file",  | 
