From 1927a7da466164010f0a6467f4df0c887ba00ad3 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 4 Nov 2021 14:00:56 -0700 Subject: start work on wayback short-timestamp cleanup --- notes/cleanups/scripts/fetch_full_cdx_ts.py | 193 ++++++++++++++++++++++++++++ 1 file changed, 193 insertions(+) create mode 100644 notes/cleanups/scripts/fetch_full_cdx_ts.py (limited to 'notes/cleanups/scripts') diff --git a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py new file mode 100644 index 00000000..5ffd11cb --- /dev/null +++ b/notes/cleanups/scripts/fetch_full_cdx_ts.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python3 + +import sys +import json +import base64 +from typing import Optional, List + +import requests +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error + +def requests_retry_session( + retries: int = 10, + backoff_factor: int = 3, + status_forcelist: List[int] = [500, 502, 504], + session: requests.Session = None, +) -> requests.Session: + """ + From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests + """ + session = session or requests.Session() + retry = Retry( + total=retries, + read=retries, + connect=retries, + backoff_factor=backoff_factor, + status_forcelist=status_forcelist, + ) + adapter = HTTPAdapter(max_retries=retry) + session.mount("http://", adapter) + session.mount("https://", adapter) + return session + +def b32_hex(s: str) -> str: + """ + Converts a base32-encoded SHA-1 checksum into hex-encoded + + base32 checksums are used by, eg, heritrix and in wayback CDX files + """ + s = s.strip().split()[0].lower() + if s.startswith("sha1:"): + s = s[5:] + if len(s) != 32: + if len(s) == 40: + return s + raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s)) + return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8") + + +SANDCRAWLER_POSTGREST_URL = "http://wbgrp-svc506.us.archive.org:3030" + +def get_db_cdx(url: str, http_session) -> List[dict]: + resp = http_session.get(SANDCRAWLER_POSTGREST_URL + "/cdx", params=dict(url="eq." 
+ url)) + resp.raise_for_status() + rows = resp.json() + return rows or [] + +CDX_API_URL = "https://web.archive.org/cdx/search/cdx" + +def get_api_cdx(url: str, partial_dt: str, http_session) -> Optional[dict]: + + params = { + "url": url, + "from": partial_dt, + "to": partial_dt, + "matchType": "exact", + "output": "json", + "limit": 20, + # can't filter status because might be warc/revisit + #"filter": "statuscode:200", + } + resp = http_session.get(CDX_API_URL, params=params) + resp.raise_for_status() + rows = resp.json() + + if not rows: + return None + #print(rows, file=sys.stderr) + if len(rows) < 2: + return None + + for raw in rows[1:]: + record = dict( + surt=raw[0], + datetime=raw[1], + url=raw[2], + mimetype=raw[3], + status_code=raw[4], + sha1b32=raw[5], + sha1hex=b32_hex(raw[5]), + ) + if record['url'] != url: + # TODO: could allow HTTP/HTTPS fuzzy match + print("CDX API near match: URL", file=sys.stderr) + continue + if not record['datetime'].startswith(partial_dt): + print(f"CDX API near match: datetime {partial_dt} {record['datetime']}", file=sys.stderr) + continue + if record['status_code'] == "200" or (record['status_code'] == '-' and record['mimetype'] == 'warc/revisit'): + return record + else: + print(f"CDX API near match: status {record['status_code']}", file=sys.stderr) + return None + +def process_file(fe, session) -> dict: + short_urls = [] + self_urls = dict() + full_urls = dict() + status = "unknown" + + for pair in fe['urls']: + u = pair['url'] + if not '://web.archive.org/web/' in u: + continue + seg = u.split('/') + assert seg[2] == "web.archive.org" + assert seg[3] == "web" + assert seg[4].isdigit() + original_url = "/".join(seg[5:]) + if len(seg[4]) == 12: + short_urls.append(u) + elif len(seg[4]) == 14: + self_urls[original_url] = u + else: + print(f"other bogus ts: {seg[4]}", file=sys.stderr) + return dict(file_entity=fe, full_urls=full_urls, status="fail-bogus-ts") + + if len(short_urls) == 0: + return dict(file_entity=fe, full_urls=[], status="skip-no-shorts") + + for short in list(set(short_urls)): + seg = short.split('/') + ts = seg[4] + assert len(ts) == 12 and ts.isdigit() + original_url = '/'.join(seg[5:]) + + if original_url in full_urls: + continue + + if original_url in self_urls: + full_urls[original_url] = self_urls[original_url] + status = "success-self" + continue + + cdx_row_list = get_db_cdx(original_url, http_session=session) + for cdx_row in cdx_row_list: + if cdx_row['sha1hex'] == fe['sha1'] and cdx_row['url'] == original_url and cdx_row['datetime'].startswith(ts): + assert len(cdx_row['datetime']) == 14 and cdx_row['datetime'].isdigit() + full_urls[original_url] = f"https://web.archive.org/web/{cdx_row['datetime']}/{original_url}" + status = "success-db" + break + else: + #print(f"cdx DB found, but no match", file=sys.stderr) + pass + cdx_row = None + + if original_url in full_urls: + continue + + cdx_record = get_api_cdx(original_url, partial_dt=ts, http_session=session) + if cdx_record: + if cdx_record['sha1hex'] == fe['sha1'] and cdx_record['url'] == original_url and cdx_record['datetime'].startswith(ts): + assert len(cdx_record['datetime']) == 14 and cdx_record['datetime'].isdigit() + full_urls[original_url] = f"https://web.archive.org/web/{cdx_record['datetime']}/{original_url}" + status = "success-api" + break + else: + print(f"cdx API found, but no match", file=sys.stderr) + else: + print(f"no CDX API record found: {original_url}", file=sys.stderr) + + if original_url not in full_urls: + return dict(file_entity=fe, 
full_urls=full_urls, status="fail-not-found") + + return dict( + file_entity=fe, + full_urls=full_urls, + status=status, + ) + +def main(): + session = requests_retry_session() + session.headers.update({ + "User-Agent": "Mozilla/5.0 fatcat.CdxFixupBot", + }) + for line in sys.stdin: + if not line.strip(): + continue + fe = json.loads(line) + print(json.dumps(process_file(fe, session=session))) + +if __name__=="__main__": + main() -- cgit v1.2.3 From c2cdf60d509e380029f6e2566fc4f98eff4b9f1a Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 4 Nov 2021 17:07:54 -0700 Subject: wayback timestamps: updates to handle 4-digit case --- notes/cleanups/scripts/fetch_full_cdx_ts.py | 16 ++--- notes/cleanups/wayback_timestamps.md | 103 +++++++++++++++++++++++++++- 2 files changed, 108 insertions(+), 11 deletions(-) (limited to 'notes/cleanups/scripts') diff --git a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py index 5ffd11cb..6c6817ab 100644 --- a/notes/cleanups/scripts/fetch_full_cdx_ts.py +++ b/notes/cleanups/scripts/fetch_full_cdx_ts.py @@ -117,7 +117,7 @@ def process_file(fe, session) -> dict: assert seg[3] == "web" assert seg[4].isdigit() original_url = "/".join(seg[5:]) - if len(seg[4]) == 12: + if len(seg[4]) == 12 or len(seg[4]) == 4: short_urls.append(u) elif len(seg[4]) == 14: self_urls[original_url] = u @@ -131,14 +131,14 @@ def process_file(fe, session) -> dict: for short in list(set(short_urls)): seg = short.split('/') ts = seg[4] - assert len(ts) == 12 and ts.isdigit() + assert len(ts) in [12,4] and ts.isdigit() original_url = '/'.join(seg[5:]) - if original_url in full_urls: + if short in full_urls: continue if original_url in self_urls: - full_urls[original_url] = self_urls[original_url] + full_urls[short] = self_urls[original_url] status = "success-self" continue @@ -146,7 +146,7 @@ def process_file(fe, session) -> dict: for cdx_row in cdx_row_list: if cdx_row['sha1hex'] == fe['sha1'] and cdx_row['url'] == original_url and cdx_row['datetime'].startswith(ts): assert len(cdx_row['datetime']) == 14 and cdx_row['datetime'].isdigit() - full_urls[original_url] = f"https://web.archive.org/web/{cdx_row['datetime']}/{original_url}" + full_urls[short] = f"https://web.archive.org/web/{cdx_row['datetime']}/{original_url}" status = "success-db" break else: @@ -154,14 +154,14 @@ def process_file(fe, session) -> dict: pass cdx_row = None - if original_url in full_urls: + if short in full_urls: continue cdx_record = get_api_cdx(original_url, partial_dt=ts, http_session=session) if cdx_record: if cdx_record['sha1hex'] == fe['sha1'] and cdx_record['url'] == original_url and cdx_record['datetime'].startswith(ts): assert len(cdx_record['datetime']) == 14 and cdx_record['datetime'].isdigit() - full_urls[original_url] = f"https://web.archive.org/web/{cdx_record['datetime']}/{original_url}" + full_urls[short] = f"https://web.archive.org/web/{cdx_record['datetime']}/{original_url}" status = "success-api" break else: @@ -169,7 +169,7 @@ def process_file(fe, session) -> dict: else: print(f"no CDX API record found: {original_url}", file=sys.stderr) - if original_url not in full_urls: + if short not in full_urls: return dict(file_entity=fe, full_urls=full_urls, status="fail-not-found") return dict( diff --git a/notes/cleanups/wayback_timestamps.md b/notes/cleanups/wayback_timestamps.md index c70ec5b2..81785992 100644 --- a/notes/cleanups/wayback_timestamps.md +++ b/notes/cleanups/wayback_timestamps.md @@ -26,14 +26,53 @@ Filter to files with problem of interest: 
Wow, this is a lot more than I thought! +There might also be some other short URL patterns, check for those: + + zcat file_export.json.gz \ + | pv -l \ + | rg 'web.archive.org/web/\d{1,11}/' \ + | gzip \ + > files_20211007_veryshortts.json.gz + # skipped, mergine with below + + zcat file_export.json.gz \ + | rg 'web.archive.org/web/None/' \ + | pv -l \ + > /dev/null + # 0.00 0:10:06 [0.00 /s] + # whew, that pattern has been fixed it seems + + zcat file_export.json.gz | rg '/None/' | pv -l > /dev/null + # 2.00 0:10:01 [3.33m/s] + + zcat file_export.json.gz \ + | rg 'web.archive.org/web/\d{13}/' \ + | pv -l \ + > /dev/null + # 0.00 0:10:09 [0.00 /s] + +Yes, 4-digit is a popular pattern as well, need to handle those: + + zcat file_export.json.gz \ + | pv -l \ + | rg 'web.archive.org/web/\d{4,12}/' \ + | gzip \ + > files_20211007_moreshortts.json.gz + # 111M 0:13:22 [ 139k/s] + + zcat files_20211007_moreshortts.json.gz | wc -l + + zcat files_20211007_moreshortts.json.gz | shuf -n10000 > files_20211007_moreshortts.10k_sample.json + # 9,958,854 + ## Fetch Complete URL Want to export JSON like: file_entity [existing file entity] - full_urls[] - : + full_urls[]: list of Dicts[str,str] + : status: str Status one of: @@ -41,5 +80,63 @@ Status one of: - 'success-self': the file already has a fixed URL internally - 'success-db': lookup URL against sandcrawler-db succeeded, and SHA1 matched - 'success-cdx': CDX API lookup succeeded, and SHA1 matched -- 'fail-hash': found a CDX record, but wrong hash - 'fail-not-found': no matching CDX record found + +Ran over a sample: + + cat files_20211007_shortts.10k_sample.json | ./fetch_full_cdx_ts.py > sample_out.json + + cat sample_out.json | jq .status | sort | uniq -c + 5 "fail-not-found" + 576 "success-api" + 7212 "success-db" + 2207 "success-self" + + head -n1000 | ./fetch_full_cdx_ts.py > sample_out.json + + zcat files_20211007_veryshortts.json.gz | head -n1000 | ./fetch_full_cdx_ts.py | jq .status | sort | uniq -c + 2 "fail-not-found" + 168 "success-api" + 208 "success-db" + 622 "success-self" + +Investigating the "fail-not-found", they look like http/https URL +not-exact-matches. Going to put off handling these for now because it is a +small fraction and more delicate. + +Again with the broader set: + + cat files_20211007_moreshortts.10k_sample.json | ./fetch_full_cdx_ts.py > sample_out.json + + cat sample_out.json | jq .status | sort | uniq -c + 9 "fail-not-found" + 781 "success-api" + 6175 "success-db" + 3035 "success-self" + + +## Cleanup Process + +Other possible cleanups to run at the same time, which would not require +external requests or other context: + +- URL has ://archive.org/ link with rel=repository => rel=archive +- mimetype is bogus => clean mimetype +- bogus file => set some new extra field, like scope=stub or scope=partial (?) + +It looks like the rel swap is already implemented in `generic_file_cleanups()`. +From sampling it seems like the mimetype issue is pretty small, so not going to +bite that off now. The "bogus file" issue requires thought, so also skipping. + +## Commands + +Running with 8x parallelism to not break things; expecting some errors along +the way, may need to add handlers for connection errors etc: + + zcat files_20211007_moreshortts.json.gz \ + | parallel -j8 --linebuffer --round-robin --pipe ./fetch_full_cdx_ts.py \ + | pv -l \ + | gzip \ + > files_20211007_moreshortts.fetched.json.gz + +At 300 records/sec, this should take around 9-10 hours to process. 
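The 9-10 hour figure follows directly from the record count and the observed rate; as a quick back-of-the-envelope check (a minimal Python sketch, using the ~9.96M record count from the `wc -l` above and assuming the ~300 records/sec holds as the aggregate rate across all 8 workers):

    total_records = 9_958_854   # from: zcat files_20211007_moreshortts.json.gz | wc -l
    records_per_sec = 300       # rough aggregate rate observed with parallel -j8
    hours = total_records / records_per_sec / 3600
    print(f"estimated runtime: {hours:.1f} hours")   # ~9.2 hours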
-- cgit v1.2.3 From 86e6850e70617e1609b79e0ee4bfe2a26f7f992e Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 9 Nov 2021 14:17:31 -0800 Subject: cleanups: tweaks to wayback CDX cleanup scripts --- notes/cleanups/scripts/fetch_full_cdx_ts.py | 9 ++++++++- python/fatcat_tools/cleanups/file_short_wayback_ts.py | 18 +++++++++++++----- 2 files changed, 21 insertions(+), 6 deletions(-) (limited to 'notes/cleanups/scripts') diff --git a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py index 6c6817ab..6f67c7e1 100644 --- a/notes/cleanups/scripts/fetch_full_cdx_ts.py +++ b/notes/cleanups/scripts/fetch_full_cdx_ts.py @@ -157,7 +157,14 @@ def process_file(fe, session) -> dict: if short in full_urls: continue - cdx_record = get_api_cdx(original_url, partial_dt=ts, http_session=session) + cdx_record = None + try: + cdx_record = get_api_cdx(original_url, partial_dt=ts, http_session=session) + except requests.exceptions.HTTPError as e: + if e.response.status_code == 403: + return dict(file_entity=fe, full_urls=full_urls, status="fail-cdx-403") + else: + raise if cdx_record: if cdx_record['sha1hex'] == fe['sha1'] and cdx_record['url'] == original_url and cdx_record['datetime'].startswith(ts): assert len(cdx_record['datetime']) == 14 and cdx_record['datetime'].isdigit() diff --git a/python/fatcat_tools/cleanups/file_short_wayback_ts.py b/python/fatcat_tools/cleanups/file_short_wayback_ts.py index 56a5c80e..a9b19921 100644 --- a/python/fatcat_tools/cleanups/file_short_wayback_ts.py +++ b/python/fatcat_tools/cleanups/file_short_wayback_ts.py @@ -22,7 +22,7 @@ class FileShortWaybackTimestampCleanup(EntityImporter): is not integrated into the `fatcat_import` or `fatcat_cleanup` controller; instead it has a __main__ function and is invoked like: - python -m fatcat_tools.cleans.file_short_wayback-ts < blah.json + python -m fatcat_tools.cleans.file_short_wayback_ts - < blah.json """ def __init__(self, api: ApiClient, **kwargs): @@ -77,10 +77,18 @@ class FileShortWaybackTimestampCleanup(EntityImporter): if fe_url.url in url_expansions: fix_url = url_expansions[fe_url.url] # defensive checks - assert f"/web/{partial_ts}" in fix_url + if not ( + f"/web/{partial_ts}" in fix_url + and fe_url.url.endswith(original_url) + and fix_url.endswith(original_url) + ): + print( + f"bad replacement URL: partial_ts={partial_ts} original={original_url} fix_url={fix_url}", + file=sys.stderr, + ) + self.counts["skip-bad-replacement"] += 1 + return None assert "://" in fix_url - assert fe_url.url.endswith(original_url) - assert fix_url.endswith(original_url) fe_url.url = fix_url any_fixed = True @@ -305,7 +313,7 @@ def main() -> None: ) parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int) parser.set_defaults( - auth_var="FATCAT_API_AUTH_TOKEN", + auth_var="FATCAT_AUTH_WORKER_CLEANUP", ) parser.add_argument( "json_file", -- cgit v1.2.3 From 996b2e2084c1798126bd91dd950c063982398bec Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 9 Nov 2021 15:46:20 -0800 Subject: more iteration on short wayback timestamp cleanup --- notes/cleanups/scripts/fetch_full_cdx_ts.py | 2 +- notes/cleanups/wayback_timestamps.md | 129 ++++++++++++++++++++- .../fatcat_tools/cleanups/file_short_wayback_ts.py | 2 +- 3 files changed, 129 insertions(+), 4 deletions(-) (limited to 'notes/cleanups/scripts') diff --git a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py index 6f67c7e1..d5b0c476 100644 --- 
a/notes/cleanups/scripts/fetch_full_cdx_ts.py +++ b/notes/cleanups/scripts/fetch_full_cdx_ts.py @@ -137,7 +137,7 @@ def process_file(fe, session) -> dict: if short in full_urls: continue - if original_url in self_urls: + if original_url in self_urls and ts in self_urls[original_url]: full_urls[short] = self_urls[original_url] status = "success-self" continue diff --git a/notes/cleanups/wayback_timestamps.md b/notes/cleanups/wayback_timestamps.md index 81785992..85e5f94f 100644 --- a/notes/cleanups/wayback_timestamps.md +++ b/notes/cleanups/wayback_timestamps.md @@ -61,9 +61,10 @@ Yes, 4-digit is a popular pattern as well, need to handle those: # 111M 0:13:22 [ 139k/s] zcat files_20211007_moreshortts.json.gz | wc -l + # 9,958,854 zcat files_20211007_moreshortts.json.gz | shuf -n10000 > files_20211007_moreshortts.10k_sample.json - # 9,958,854 + ## Fetch Complete URL @@ -114,6 +115,14 @@ Again with the broader set: 6175 "success-db" 3035 "success-self" +While running a larger batch, got a CDX API error: + + requests.exceptions.HTTPError: 403 Client Error: Forbidden for url: https://web.archive.org/cdx/search/cdx?url=https%3A%2F%2Fwww.psychologytoday.com%2Ffiles%2Fu47%2FHenry_et_al.pdf&from=2017&to=2017&matchType=exact&output=json&limit=20 + + org.archive.util.io.RuntimeIOException: org.archive.wayback.exception.AdministrativeAccessControlException: Blocked Site Error + +So maybe need to use credentials after all. + ## Cleanup Process @@ -128,11 +137,13 @@ It looks like the rel swap is already implemented in `generic_file_cleanups()`. From sampling it seems like the mimetype issue is pretty small, so not going to bite that off now. The "bogus file" issue requires thought, so also skipping. -## Commands + +## Commands (old) Running with 8x parallelism to not break things; expecting some errors along the way, may need to add handlers for connection errors etc: + # OLD SNAPSHOT zcat files_20211007_moreshortts.json.gz \ | parallel -j8 --linebuffer --round-robin --pipe ./fetch_full_cdx_ts.py \ | pv -l \ @@ -140,3 +151,117 @@ the way, may need to add handlers for connection errors etc: > files_20211007_moreshortts.fetched.json.gz At 300 records/sec, this should take around 9-10 hours to process. + + + +## Prep Again (2021-11-09) + +After fixing "sort" issue and re-dumping file entities (2021-11-05 snapshot). + +Filter again: + + # note: in the future use pigz instead of gzip here + zcat file_export.json.gz \ + | pv -l \ + | rg 'web.archive.org/web/\d{4,12}/' \ + | gzip \ + > files_20211105_moreshortts.json.gz + # 112M 0:13:27 [ 138k/s] + + zcat files_20211105_moreshortts.json.gz | wc -l + # 9,958,854 + # good, exact same number as previous snapshot + + zcat files_20211105_moreshortts.json.gz | shuf -n10000 > files_20211105_moreshortts.10k_sample.json + # done + + cat files_20211105_moreshortts.10k_sample.json \ + | ./fetch_full_cdx_ts.py \ + | pv -l \ + > files_20211105_moreshortts.10k_sample.fetched.json + # 10.0k 0:03:36 [46.3 /s] + + cat files_20211105_moreshortts.10k_sample.fetched.json | jq .status | sort | uniq -c + 13 "fail-not-found" + 774 "success-api" + 6193 "success-db" + 3020 "success-self" + +After tweaking `success-self` logic: + + 13 "fail-not-found" + 859 "success-api" + 6229 "success-db" + 2899 "success-self" + + +## Testing in QA + +Copied `sample_out.json` to fatcat QA instance and renamed as `files_20211007_moreshortts.10k_sample.fetched.json` + + # OLD ATTEMPT + export FATCAT_API_AUTH_TOKEN=[...] 
+ head -n10 /srv/fatcat/datasets/files_20211007_moreshortts.10k_sample.fetched.json \ + | python -m fatcat_tools.cleanups.file_short_wayback_ts - + +Ran in to issues, iterated above. + +Trying again with updated script and sample file: + + export FATCAT_AUTH_WORKER_CLEANUP=[...] + + head -n10 /srv/fatcat/datasets/files_20211105_moreshortts.10k_sample.fetched.json \ + | python -m fatcat_tools.cleanups.file_short_wayback_ts - + # Counter({'total': 10, 'update': 10, 'skip': 0, 'insert': 0, 'exists': 0}) + +Manually inspected and these look good. Trying some repeats and larger batched: + + head -n10 /srv/fatcat/datasets/files_20211105_moreshortts.10k_sample.fetched.json \ + | python -m fatcat_tools.cleanups.file_short_wayback_ts - + # Counter({'total': 10, 'skip-revision-changed': 10, 'skip': 0, 'insert': 0, 'update': 0, 'exists': 0}) + + head -n1000 /srv/fatcat/datasets/files_20211105_moreshortts.10k_sample.fetched.json \ + | python -m fatcat_tools.cleanups.file_short_wayback_ts - + + [...] + bad replacement URL: partial_ts=201807271139 original=http://www.scielo.br/pdf/qn/v20n1/4918.pdf fix_url=https://web.archive.org/web/20170819080342/http://www.scielo.br/pdf/qn/v20n1/4918.pdf + bad replacement URL: partial_ts=201904270207 original=https://www.matec-conferences.org/articles/matecconf/pdf/2018/62/matecconf_iccoee2018_03008.pdf fix_url=https://web.archive.org/web/20190501060839/https://www.matec-conferences.org/articles/matecconf/pdf/2018/62/matecconf_iccoee2018_03008.pdf + bad replacement URL: partial_ts=201905011445 original=https://cdn.intechopen.com/pdfs/5886.pdf fix_url=https://web.archive.org/web/20190502203832/https://cdn.intechopen.com/pdfs/5886.pdf + [...] + + # Counter({'total': 1000, 'update': 969, 'skip': 19, 'skip-bad-replacement': 18, 'skip-revision-changed': 10, 'skip-bad-wayback-timestamp': 2, 'skip-status': 1, 'insert': 0, 'exists': 0}) + + +It looks like these "bad replacement URLs" are due to timestamp mismatches. Eg, the partial timestamp is not part of the final timestamp. + +Tweaked fetch script and re-ran: + + # Counter({'total': 1000, 'skip-revision-changed': 979, 'update': 18, 'skip-bad-wayback-timestamp': 2, 'skip': 1, 'skip-status': 1, 'insert': 0, 'exists': 0}) + +Cool. Sort of curious what the deal is with those `skip-bad-wayback-timestamp`. + +Run the rest through: + + cat /srv/fatcat/datasets/files_20211105_moreshortts.10k_sample.fetched.json \ + | python -m fatcat_tools.cleanups.file_short_wayback_ts - + # Counter({'total': 10000, 'update': 8976, 'skip-revision-changed': 997, 'skip-bad-wayback-timestamp': 14, 'skip': 13, 'skip-status': 13, 'insert': 0, 'exists': 0}) + +Should tweak batch size to 100 (vs. 50). 
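To make the earlier "bad replacement URL" failures concrete, here is a minimal sketch of the prefix check that rejects them, using values copied from one of the logged cases above (illustrative only; the real defensive check lives in `file_short_wayback_ts.py`):

    # partial (12-digit) timestamp from the file entity's short wayback URL
    partial_ts = "201807271139"
    # full URL returned by the fetch script for the same original URL
    fix_url = "https://web.archive.org/web/20170819080342/http://www.scielo.br/pdf/qn/v20n1/4918.pdf"

    # the partial timestamp is not a prefix of the fetched timestamp, so the
    # replacement is rejected and counted as skip-bad-replacement
    print(f"/web/{partial_ts}" in fix_url)   # False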
+ +How to parallelize import: + + # from within pipenv + cat /srv/fatcat/datasets/files_20211105_moreshortts.10k_sample.fetched.json \ + | parallel -j8 --linebuffer --round-robin --pipe python -m fatcat_tools.cleanups.file_short_wayback_ts - + + +## Full Batch Commands + +Running in bulk again: + + zcat files_20211105_moreshortts.json.gz \ + | parallel -j8 --linebuffer --round-robin --pipe ./fetch_full_cdx_ts.py \ + | pv -l \ + | gzip \ + > files_20211105_moreshortts.fetched.json.gz + diff --git a/python/fatcat_tools/cleanups/file_short_wayback_ts.py b/python/fatcat_tools/cleanups/file_short_wayback_ts.py index a9b19921..e2595912 100644 --- a/python/fatcat_tools/cleanups/file_short_wayback_ts.py +++ b/python/fatcat_tools/cleanups/file_short_wayback_ts.py @@ -311,7 +311,7 @@ def main() -> None: parser.add_argument( "--host-url", default="http://localhost:9411/v0", help="connect to this host/port" ) - parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int) + parser.add_argument("--batch-size", help="size of batch to send", default=100, type=int) parser.set_defaults( auth_var="FATCAT_AUTH_WORKER_CLEANUP", ) -- cgit v1.2.3 From cd09c6d6bd4deef0627de4f8a8a301725db01e14 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 9 Nov 2021 22:55:58 -0800 Subject: wayback ts cleanup: one more filter tweak --- notes/cleanups/scripts/fetch_full_cdx_ts.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'notes/cleanups/scripts') diff --git a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py index d5b0c476..ebcf0d62 100644 --- a/notes/cleanups/scripts/fetch_full_cdx_ts.py +++ b/notes/cleanups/scripts/fetch_full_cdx_ts.py @@ -115,7 +115,8 @@ def process_file(fe, session) -> dict: seg = u.split('/') assert seg[2] == "web.archive.org" assert seg[3] == "web" - assert seg[4].isdigit() + if not seg[4].isdigit(): + continue original_url = "/".join(seg[5:]) if len(seg[4]) == 12 or len(seg[4]) == 4: short_urls.append(u) -- cgit v1.2.3
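For reference, a minimal sketch of the URL-segment handling that this last filter tweak guards (the example.com URLs are hypothetical; the real logic is in `process_file()` in `fetch_full_cdx_ts.py`). After splitting on `/`, `seg[4]` holds the (possibly short) timestamp, and entries where it is not purely digits are now skipped instead of tripping the old assert:

    for u in [
        "https://web.archive.org/web/2017/https://example.com/paper.pdf",             # 4-digit short ts
        "https://web.archive.org/web/20170819080342/https://example.com/paper.pdf",   # full 14-digit ts
        "https://web.archive.org/web/None/https://example.com/paper.pdf",             # non-digit segment
    ]:
        seg = u.split("/")
        ts = seg[4]
        original_url = "/".join(seg[5:])
        if not ts.isdigit():
            print("skip (non-digit timestamp):", u)
            continue
        print(len(ts), original_url)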