author:    Bryan Newbold <bnewbold@robocracy.org>  2021-11-04 14:00:56 -0700
committer: Bryan Newbold <bnewbold@robocracy.org>  2021-11-09 14:17:35 -0800
commit:    1927a7da466164010f0a6467f4df0c887ba00ad3 (patch)
tree:      53d6228a8cadb083942163585663acc275152830
parent:    a6d994fbc18debcf3860e6deb12eb54234a42839 (diff)
download:  fatcat-1927a7da466164010f0a6467f4df0c887ba00ad3.tar.gz
           fatcat-1927a7da466164010f0a6467f4df0c887ba00ad3.zip
start work on wayback short-timestamp cleanup
-rw-r--r--  notes/cleanups/scripts/fetch_full_cdx_ts.py  193
-rw-r--r--  notes/cleanups/wayback_timestamps.md          45
2 files changed, 238 insertions, 0 deletions
diff --git a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py
new file mode 100644
index 00000000..5ffd11cb
--- /dev/null
+++ b/notes/cleanups/scripts/fetch_full_cdx_ts.py
@@ -0,0 +1,193 @@
#!/usr/bin/env python3

import sys
import json
import base64
from typing import Optional, List

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry  # pylint: disable=import-error


def requests_retry_session(
    retries: int = 10,
    backoff_factor: int = 3,
    status_forcelist: List[int] = [500, 502, 504],
    session: Optional[requests.Session] = None,
) -> requests.Session:
    """
    From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
    """
    session = session or requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session


def b32_hex(s: str) -> str:
    """
    Converts a base32-encoded SHA-1 checksum into a hex-encoded one.

    base32 checksums are used by, eg, heritrix and in wayback CDX files
    """
    s = s.strip().split()[0].lower()
    if s.startswith("sha1:"):
        s = s[5:]
    if len(s) != 32:
        if len(s) == 40:
            # already hex-encoded
            return s
        raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))
    return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8")


SANDCRAWLER_POSTGREST_URL = "http://wbgrp-svc506.us.archive.org:3030"

def get_db_cdx(url: str, http_session) -> List[dict]:
    resp = http_session.get(SANDCRAWLER_POSTGREST_URL + "/cdx", params=dict(url="eq." + url))
    resp.raise_for_status()
    rows = resp.json()
    return rows or []

CDX_API_URL = "https://web.archive.org/cdx/search/cdx"

def get_api_cdx(url: str, partial_dt: str, http_session) -> Optional[dict]:

    params = {
        "url": url,
        "from": partial_dt,
        "to": partial_dt,
        "matchType": "exact",
        "output": "json",
        "limit": 20,
        # can't filter status because might be warc/revisit
        #"filter": "statuscode:200",
    }
    resp = http_session.get(CDX_API_URL, params=params)
    resp.raise_for_status()
    rows = resp.json()

    if not rows:
        return None
    #print(rows, file=sys.stderr)
    # first row is the header row, so at least two rows are needed
    if len(rows) < 2:
        return None

    for raw in rows[1:]:
        record = dict(
            surt=raw[0],
            datetime=raw[1],
            url=raw[2],
            mimetype=raw[3],
            status_code=raw[4],
            sha1b32=raw[5],
            sha1hex=b32_hex(raw[5]),
        )
        if record['url'] != url:
            # TODO: could allow HTTP/HTTPS fuzzy match
            print("CDX API near match: URL", file=sys.stderr)
            continue
        if not record['datetime'].startswith(partial_dt):
            print(f"CDX API near match: datetime {partial_dt} {record['datetime']}", file=sys.stderr)
            continue
        if record['status_code'] == "200" or (record['status_code'] == '-' and record['mimetype'] == 'warc/revisit'):
            return record
        else:
            print(f"CDX API near match: status {record['status_code']}", file=sys.stderr)
    return None
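
# Note: the CDX API's JSON output is a list of rows, with the first row being a
# header. The positional indexing in get_api_cdx() above assumes the standard
# column order; the example data row below is illustrative, not a real capture:
#
#   [["urlkey", "timestamp", "original", "mimetype", "statuscode", "digest", "length"],
#    ["org,example)/paper.pdf", "20180501120000", "http://example.org/paper.pdf",
#     "application/pdf", "200", "PZMJAF3BIFCBNBPP2AH7PJ4COUCMKN75", "12345"]]
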
def process_file(fe, session) -> dict:
    short_urls = []
    self_urls = dict()
    full_urls = dict()
    status = "unknown"

    for pair in fe['urls']:
        u = pair['url']
        if '://web.archive.org/web/' not in u:
            continue
        seg = u.split('/')
        assert seg[2] == "web.archive.org"
        assert seg[3] == "web"
        assert seg[4].isdigit()
        original_url = "/".join(seg[5:])
        if len(seg[4]) == 12:
            short_urls.append(u)
        elif len(seg[4]) == 14:
            self_urls[original_url] = u
        else:
            print(f"other bogus ts: {seg[4]}", file=sys.stderr)
            return dict(file_entity=fe, full_urls=full_urls, status="fail-bogus-ts")

    if len(short_urls) == 0:
        return dict(file_entity=fe, full_urls={}, status="skip-no-shorts")

    for short in list(set(short_urls)):
        seg = short.split('/')
        ts = seg[4]
        assert len(ts) == 12 and ts.isdigit()
        original_url = '/'.join(seg[5:])

        if original_url in full_urls:
            continue

        if original_url in self_urls:
            full_urls[original_url] = self_urls[original_url]
            status = "success-self"
            continue

        cdx_row_list = get_db_cdx(original_url, http_session=session)
        for cdx_row in cdx_row_list:
            if cdx_row['sha1hex'] == fe['sha1'] and cdx_row['url'] == original_url and cdx_row['datetime'].startswith(ts):
                assert len(cdx_row['datetime']) == 14 and cdx_row['datetime'].isdigit()
                full_urls[original_url] = f"https://web.archive.org/web/{cdx_row['datetime']}/{original_url}"
                status = "success-db"
                break
            # else: cdx DB row found, but hash/URL/datetime didn't match; keep looking

        if original_url in full_urls:
            continue

        cdx_record = get_api_cdx(original_url, partial_dt=ts, http_session=session)
        if cdx_record:
            if cdx_record['sha1hex'] == fe['sha1'] and cdx_record['url'] == original_url and cdx_record['datetime'].startswith(ts):
                assert len(cdx_record['datetime']) == 14 and cdx_record['datetime'].isdigit()
                full_urls[original_url] = f"https://web.archive.org/web/{cdx_record['datetime']}/{original_url}"
                status = "success-api"
                # move on to the next short URL (a bare `break` here would skip
                # any remaining short URLs for this file entity)
                continue
            else:
                print("cdx API found, but no match", file=sys.stderr)
        else:
            print(f"no CDX API record found: {original_url}", file=sys.stderr)

        if original_url not in full_urls:
            return dict(file_entity=fe, full_urls=full_urls, status="fail-not-found")

    return dict(
        file_entity=fe,
        full_urls=full_urls,
        status=status,
    )


def main():
    session = requests_retry_session()
    session.headers.update({
        "User-Agent": "Mozilla/5.0 fatcat.CdxFixupBot",
    })
    for line in sys.stdin:
        if not line.strip():
            continue
        fe = json.loads(line)
        print(json.dumps(process_file(fe, session=session)))

if __name__ == "__main__":
    main()
diff --git a/notes/cleanups/wayback_timestamps.md b/notes/cleanups/wayback_timestamps.md
new file mode 100644
index 00000000..c70ec5b2
--- /dev/null
+++ b/notes/cleanups/wayback_timestamps.md
@@ -0,0 +1,45 @@

At some point, using the arabesque importer (from targeted crawling), we
accidentally imported a bunch of files with wayback URLs that have 12-digit
timestamps, instead of the full canonical 14-digit timestamps.


## Prep (2021-11-04)

Download the most recent file export:

    wget https://archive.org/download/fatcat_bulk_exports_2021-10-07/file_export.json.gz

Filter to files with the problem of interest:

    zcat file_export.json.gz \
        | pv -l \
        | rg 'web.archive.org/web/\d{12}/' \
        | gzip \
        > files_20211007_shortts.json.gz
    # 111M 0:12:35

    zcat files_20211007_shortts.json.gz | wc -l
    # 7,935,009

    zcat files_20211007_shortts.json.gz | shuf -n10000 > files_20211007_shortts.10k_sample.json

Wow, this is a lot more than I thought!
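
For reference, a short-timestamp URL and its canonical 14-digit equivalent look
like this (example URL and timestamps are illustrative):

    https://web.archive.org/web/201805011200/https://example.org/paper.pdf
    https://web.archive.org/web/20180501120000/https://example.org/paper.pdf
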

## Fetch Complete URL

Want to export JSON like:

    file_entity
        [existing file entity]
    full_urls[]
        <short>: <long>
    status: str

Status is one of:

- 'success-self': the file already has a fixed URL internally
- 'success-db': lookup URL against sandcrawler-db succeeded, and SHA1 matched
- 'success-api': CDX API lookup succeeded, and SHA1 matched
- 'skip-no-shorts': file has no short-timestamp URLs to fix
- 'fail-bogus-ts': a wayback URL has a timestamp that is neither 12 nor 14 digits
- 'fail-hash': found a CDX record, but wrong hash
- 'fail-not-found': no matching CDX record found
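
To spot-check a run, a quick tally of the status field over the script's JSON
output might look like this (a minimal sketch; the `fixed.json` filename is
hypothetical):

    import json
    import sys
    from collections import Counter

    # usage: python3 tally_status.py < fixed.json
    counts = Counter()
    for line in sys.stdin:
        if line.strip():
            counts[json.loads(line)["status"]] += 1
    for status, n in counts.most_common():
        print(f"{n}\t{status}")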