| author | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-04 17:07:54 -0700 |
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-09 14:17:35 -0800 |
| commit | c2cdf60d509e380029f6e2566fc4f98eff4b9f1a (patch) | |
| tree | 8feeaf39b540c6a224b6d26814a49f987dfb993a /notes/cleanups/scripts | |
| parent | 1927a7da466164010f0a6467f4df0c887ba00ad3 (diff) | |
wayback timestamps: updates to handle 4-digit case
Diffstat (limited to 'notes/cleanups/scripts')
-rw-r--r-- | notes/cleanups/scripts/fetch_full_cdx_ts.py | 16 |
1 file changed, 8 insertions, 8 deletions
```diff
diff --git a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py
index 5ffd11cb..6c6817ab 100644
--- a/notes/cleanups/scripts/fetch_full_cdx_ts.py
+++ b/notes/cleanups/scripts/fetch_full_cdx_ts.py
@@ -117,7 +117,7 @@ def process_file(fe, session) -> dict:
         assert seg[3] == "web"
         assert seg[4].isdigit()
         original_url = "/".join(seg[5:])
-        if len(seg[4]) == 12:
+        if len(seg[4]) == 12 or len(seg[4]) == 4:
             short_urls.append(u)
         elif len(seg[4]) == 14:
             self_urls[original_url] = u
@@ -131,14 +131,14 @@ def process_file(fe, session) -> dict:
     for short in list(set(short_urls)):
         seg = short.split('/')
         ts = seg[4]
-        assert len(ts) == 12 and ts.isdigit()
+        assert len(ts) in [12,4] and ts.isdigit()
         original_url = '/'.join(seg[5:])
 
-        if original_url in full_urls:
+        if short in full_urls:
             continue
 
         if original_url in self_urls:
-            full_urls[original_url] = self_urls[original_url]
+            full_urls[short] = self_urls[original_url]
             status = "success-self"
             continue
 
@@ -146,7 +146,7 @@ def process_file(fe, session) -> dict:
         for cdx_row in cdx_row_list:
             if cdx_row['sha1hex'] == fe['sha1'] and cdx_row['url'] == original_url and cdx_row['datetime'].startswith(ts):
                 assert len(cdx_row['datetime']) == 14 and cdx_row['datetime'].isdigit()
-                full_urls[original_url] = f"https://web.archive.org/web/{cdx_row['datetime']}/{original_url}"
+                full_urls[short] = f"https://web.archive.org/web/{cdx_row['datetime']}/{original_url}"
                 status = "success-db"
                 break
             else:
@@ -154,14 +154,14 @@ def process_file(fe, session) -> dict:
                 pass
         cdx_row = None
 
-        if original_url in full_urls:
+        if short in full_urls:
             continue
 
         cdx_record = get_api_cdx(original_url, partial_dt=ts, http_session=session)
         if cdx_record:
             if cdx_record['sha1hex'] == fe['sha1'] and cdx_record['url'] == original_url and cdx_record['datetime'].startswith(ts):
                 assert len(cdx_record['datetime']) == 14 and cdx_record['datetime'].isdigit()
-                full_urls[original_url] = f"https://web.archive.org/web/{cdx_record['datetime']}/{original_url}"
+                full_urls[short] = f"https://web.archive.org/web/{cdx_record['datetime']}/{original_url}"
                 status = "success-api"
                 break
             else:
@@ -169,7 +169,7 @@ def process_file(fe, session) -> dict:
         else:
             print(f"no CDX API record found: {original_url}", file=sys.stderr)
 
-    if original_url not in full_urls:
+    if short not in full_urls:
         return dict(file_entity=fe, full_urls=full_urls, status="fail-not-found")
 
     return dict(
```
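For context, the patch widens the set of "short" wayback timestamps from 12-digit values to also include bare 4-digit years, and keys the resolved results by the short URL rather than by the original URL. Below is a minimal sketch of the timestamp classification step only; `classify_wayback_url` is a hypothetical helper for illustration, not part of the script, and it assumes only the URL layout the script already checks (`https://web.archive.org/web/<timestamp>/<original_url>`).

```python
def classify_wayback_url(u: str) -> str:
    # Split the wayback URL into segments, mirroring the checks in process_file()
    seg = u.split('/')
    assert seg[2] == "web.archive.org"
    assert seg[3] == "web"
    assert seg[4].isdigit()
    ts = seg[4]

    if len(ts) == 14:
        # full YYYYMMDDHHMMSS timestamp: usable as-is
        return "full"
    elif len(ts) in (12, 4):
        # truncated timestamp (12-digit, or bare 4-digit year after this patch):
        # needs to be resolved against CDX records before it can be rewritten
        return "short"
    else:
        raise ValueError(f"unexpected wayback timestamp length: {ts}")


print(classify_wayback_url("https://web.archive.org/web/2017/https://example.com/paper.pdf"))
# -> "short"
```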