From c5ea2dba358624f4c14da0a1a988ae14d0edfd59 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 29 Nov 2021 14:33:14 -0800
Subject: move 'cleanups' directory from notes to extra/

---
 notes/cleanups/scripts/container_dupe_to_json.py |  55 -------
 notes/cleanups/scripts/fetch_full_cdx_ts.py      | 201 -----------------------
 notes/cleanups/scripts/file2ingestrequest.py     |  44 -----
 notes/cleanups/scripts/file_dupe_to_json.py      |  72 --------
 4 files changed, 372 deletions(-)
 delete mode 100755 notes/cleanups/scripts/container_dupe_to_json.py
 delete mode 100644 notes/cleanups/scripts/fetch_full_cdx_ts.py
 delete mode 100755 notes/cleanups/scripts/file2ingestrequest.py
 delete mode 100755 notes/cleanups/scripts/file_dupe_to_json.py

(limited to 'notes/cleanups/scripts')

diff --git a/notes/cleanups/scripts/container_dupe_to_json.py b/notes/cleanups/scripts/container_dupe_to_json.py
deleted file mode 100755
index 2e841c69..00000000
--- a/notes/cleanups/scripts/container_dupe_to_json.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-This script can be used to transform duplicate container entity rows into JSON
-objects which can be passed to the container entity merger.
-
-It is initially used to de-dupe ISSN-Ls. The script is based on
-`file_dupe_to_json.py`.
-"""
-
-import json, sys
-from typing import Optional
-
-EXTID_TYPE = "issnl"
-
-
-def print_group(extid, dupe_ids):
-    if len(dupe_ids) < 2:
-        return
-    group = dict(
-        entity_type="container",
-        primary_id=None,
-        duplicate_ids=dupe_ids,
-        evidence=dict(
-            extid=extid,
-            extid_type=EXTID_TYPE,
-        ),
-    )
-    print(json.dumps(group, sort_keys=True))
-
-def run():
-    last_extid = None
-    dupe_ids = []
-    for l in sys.stdin:
-        l = l.strip()
-        if not l:
-            continue
-        (row_extid, row_id) = l.split("\t")[0:2]
-        if EXTID_TYPE == "issnl":
-            assert len(row_extid) == 9
-        else:
-            raise Exception(f"extid type not supported yet: {EXTID_TYPE}")
-        if row_extid == last_extid:
-            dupe_ids.append(row_id)
-            continue
-        elif dupe_ids:
-            print_group(last_extid, dupe_ids)
-        last_extid = row_extid
-        dupe_ids = [row_id]
-    if last_extid and dupe_ids:
-        print_group(last_extid, dupe_ids)
-
-
-if __name__=="__main__":
-    run()
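Note: the script above emits one JSON object per duplicated ISSN-L, in the shape the container entity merger expects. As a rough sketch of one input pair and the matching output group (the ISSN-L and both container idents below are made-up placeholders, not values from the actual cleanup):

    import json

    # Made-up example: two container rows sharing one ISSN-L, as they would
    # appear in the sorted TSV input (the ident strings are placeholders).
    tsv_rows = [
        "1234-5678\taaaaaaaaaaaaaaaaaaaaaaaaaa",
        "1234-5678\tbbbbbbbbbbbbbbbbbbbbbbbbbb",
    ]

    # The corresponding merge group, matching what print_group() emits:
    group = {
        "entity_type": "container",
        "primary_id": None,
        "duplicate_ids": [row.split("\t")[1] for row in tsv_rows],
        "evidence": {"extid": "1234-5678", "extid_type": "issnl"},
    }
    print(json.dumps(group, sort_keys=True))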
diff --git a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py
deleted file mode 100644
index ebcf0d62..00000000
--- a/notes/cleanups/scripts/fetch_full_cdx_ts.py
+++ /dev/null
@@ -1,201 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import json
-import base64
-from typing import Optional, List
-
-import requests
-from requests.adapters import HTTPAdapter
-from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
-
-def requests_retry_session(
-    retries: int = 10,
-    backoff_factor: int = 3,
-    status_forcelist: List[int] = [500, 502, 504],
-    session: requests.Session = None,
-) -> requests.Session:
-    """
-    From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
-    """
-    session = session or requests.Session()
-    retry = Retry(
-        total=retries,
-        read=retries,
-        connect=retries,
-        backoff_factor=backoff_factor,
-        status_forcelist=status_forcelist,
-    )
-    adapter = HTTPAdapter(max_retries=retry)
-    session.mount("http://", adapter)
-    session.mount("https://", adapter)
-    return session
-
-def b32_hex(s: str) -> str:
-    """
-    Converts a base32-encoded SHA-1 checksum into hex-encoded
-
-    base32 checksums are used by, eg, heritrix and in wayback CDX files
-    """
-    s = s.strip().split()[0].lower()
-    if s.startswith("sha1:"):
-        s = s[5:]
-    if len(s) != 32:
-        if len(s) == 40:
-            return s
-        raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))
-    return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8")
-
-
-SANDCRAWLER_POSTGREST_URL = "http://wbgrp-svc506.us.archive.org:3030"
-
-def get_db_cdx(url: str, http_session) -> List[dict]:
-    resp = http_session.get(SANDCRAWLER_POSTGREST_URL + "/cdx", params=dict(url="eq." + url))
-    resp.raise_for_status()
-    rows = resp.json()
-    return rows or []
-
-CDX_API_URL = "https://web.archive.org/cdx/search/cdx"
-
-def get_api_cdx(url: str, partial_dt: str, http_session) -> Optional[dict]:
-
-    params = {
-        "url": url,
-        "from": partial_dt,
-        "to": partial_dt,
-        "matchType": "exact",
-        "output": "json",
-        "limit": 20,
-        # can't filter status because might be warc/revisit
-        #"filter": "statuscode:200",
-    }
-    resp = http_session.get(CDX_API_URL, params=params)
-    resp.raise_for_status()
-    rows = resp.json()
-
-    if not rows:
-        return None
-    #print(rows, file=sys.stderr)
-    if len(rows) < 2:
-        return None
-
-    for raw in rows[1:]:
-        record = dict(
-            surt=raw[0],
-            datetime=raw[1],
-            url=raw[2],
-            mimetype=raw[3],
-            status_code=raw[4],
-            sha1b32=raw[5],
-            sha1hex=b32_hex(raw[5]),
-        )
-        if record['url'] != url:
-            # TODO: could allow HTTP/HTTPS fuzzy match
-            print("CDX API near match: URL", file=sys.stderr)
-            continue
-        if not record['datetime'].startswith(partial_dt):
-            print(f"CDX API near match: datetime {partial_dt} {record['datetime']}", file=sys.stderr)
-            continue
-        if record['status_code'] == "200" or (record['status_code'] == '-' and record['mimetype'] == 'warc/revisit'):
-            return record
-        else:
-            print(f"CDX API near match: status {record['status_code']}", file=sys.stderr)
-    return None
-
-def process_file(fe, session) -> dict:
-    short_urls = []
-    self_urls = dict()
-    full_urls = dict()
-    status = "unknown"
-
-    for pair in fe['urls']:
-        u = pair['url']
-        if not '://web.archive.org/web/' in u:
-            continue
-        seg = u.split('/')
-        assert seg[2] == "web.archive.org"
-        assert seg[3] == "web"
-        if not seg[4].isdigit():
-            continue
-        original_url = "/".join(seg[5:])
-        if len(seg[4]) == 12 or len(seg[4]) == 4:
-            short_urls.append(u)
-        elif len(seg[4]) == 14:
-            self_urls[original_url] = u
-        else:
-            print(f"other bogus ts: {seg[4]}", file=sys.stderr)
-            return dict(file_entity=fe, full_urls=full_urls, status="fail-bogus-ts")
-
-    if len(short_urls) == 0:
-        return dict(file_entity=fe, full_urls=[], status="skip-no-shorts")
-
-    for short in list(set(short_urls)):
-        seg = short.split('/')
-        ts = seg[4]
-        assert len(ts) in [12,4] and ts.isdigit()
-        original_url = '/'.join(seg[5:])
-
-        if short in full_urls:
-            continue
-
-        if original_url in self_urls and ts in self_urls[original_url]:
-            full_urls[short] = self_urls[original_url]
-            status = "success-self"
-            continue
-
-        cdx_row_list = get_db_cdx(original_url, http_session=session)
-        for cdx_row in cdx_row_list:
-            if cdx_row['sha1hex'] == fe['sha1'] and cdx_row['url'] == original_url and cdx_row['datetime'].startswith(ts):
-                assert len(cdx_row['datetime']) == 14 and cdx_row['datetime'].isdigit()
-                full_urls[short] = f"https://web.archive.org/web/{cdx_row['datetime']}/{original_url}"
-                status = "success-db"
-                break
-            else:
-                #print(f"cdx DB found, but no match", file=sys.stderr)
-                pass
-        cdx_row = None
-
-        if short in full_urls:
-            continue
-
-        cdx_record = None
-        try:
-            cdx_record = get_api_cdx(original_url, partial_dt=ts, http_session=session)
-        except requests.exceptions.HTTPError as e:
-            if e.response.status_code == 403:
-                return dict(file_entity=fe, full_urls=full_urls, status="fail-cdx-403")
-            else:
-                raise
-        if cdx_record:
-            if cdx_record['sha1hex'] == fe['sha1'] and cdx_record['url'] == original_url and cdx_record['datetime'].startswith(ts):
-                assert len(cdx_record['datetime']) == 14 and cdx_record['datetime'].isdigit()
-                full_urls[short] = f"https://web.archive.org/web/{cdx_record['datetime']}/{original_url}"
-                status = "success-api"
-                break
-            else:
-                print(f"cdx API found, but no match", file=sys.stderr)
-        else:
-            print(f"no CDX API record found: {original_url}", file=sys.stderr)
-
-        if short not in full_urls:
-            return dict(file_entity=fe, full_urls=full_urls, status="fail-not-found")
-
-    return dict(
-        file_entity=fe,
-        full_urls=full_urls,
-        status=status,
-    )
-
-def main():
-    session = requests_retry_session()
-    session.headers.update({
-        "User-Agent": "Mozilla/5.0 fatcat.CdxFixupBot",
-    })
-    for line in sys.stdin:
-        if not line.strip():
-            continue
-        fe = json.loads(line)
-        print(json.dumps(process_file(fe, session=session)))
-
-if __name__=="__main__":
-    main()
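Note: b32_hex() above is the piece most likely to be reused elsewhere; it normalizes the base32 SHA-1 digests that appear in CDX lines to the hex form used in fatcat file entities. A self-contained round-trip check, duplicating the helper rather than importing the deleted module (the sample digest is the well-known SHA-1 of the empty string):

    import base64

    def b32_hex(s: str) -> str:
        # same behavior as the helper in fetch_full_cdx_ts.py above
        s = s.strip().split()[0].lower()
        if s.startswith("sha1:"):
            s = s[5:]
        if len(s) != 32:
            if len(s) == 40:
                return s
            raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))
        return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8")

    def hex_to_b32(hex_sha1: str) -> str:
        # inverse direction, roughly how a digest appears in a CDX line
        return base64.b32encode(base64.b16decode(hex_sha1.upper())).decode("utf-8")

    hex_digest = "da39a3ee5e6b4b0d3255bfef95601890afd80709"
    assert b32_hex(hex_to_b32(hex_digest)) == hex_digest
    assert b32_hex("sha1:" + hex_to_b32(hex_digest)) == hex_digest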
diff --git a/notes/cleanups/scripts/file2ingestrequest.py b/notes/cleanups/scripts/file2ingestrequest.py
deleted file mode 100755
index a005837f..00000000
--- a/notes/cleanups/scripts/file2ingestrequest.py
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/usr/bin/env python3
-
-from typing import Optional
-import json, sys
-
-
-def transform(row: dict) -> Optional[dict]:
-    if row.get('mimetype') not in [None, 'application/pdf']:
-        return None
-    if row.get('state') != 'active':
-        return None
-    base_url = None
-    for url in (row.get('urls') or []):
-        url = url['url']
-        if '://web.archive.org/' not in url and '://archive.org/' not in url:
-            base_url = url
-            break
-    if not base_url:
-        return None
-    if not row.get('sha1'):
-        return None
-    return dict(
-        base_url=base_url,
-        ingest_type="pdf",
-        link_source="fatcat",
-        link_source_id=f"file_{row['ident']}",
-        ingest_request_source="file-backfill",
-        ext_ids=dict(
-            sha1=row['sha1'],
-        ),
-    )
-
-
-def run():
-    for l in sys.stdin:
-        if not l.strip():
-            continue
-        row = json.loads(l)
-        request = transform(row)
-        if request:
-            print(json.dumps(request, sort_keys=True))
-
-if __name__=="__main__":
-    run()
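Note: transform() above only emits a request for active file entities with a PDF (or missing) mimetype, a sha1, and at least one URL that is not an archive.org capture. A sketch of the expected input row and output request (the ident, hash, and URLs are made-up placeholders):

    import json

    # Illustrative file entity row, shaped like one line of a fatcat file export:
    row = {
        "ident": "aaaaaaaaaaaaaaaaaaaaaaaaaa",
        "state": "active",
        "mimetype": "application/pdf",
        "sha1": "da39a3ee5e6b4b0d3255bfef95601890afd80709",
        "urls": [
            {"url": "https://web.archive.org/web/2017/https://example.com/paper.pdf"},
            {"url": "https://example.com/paper.pdf"},
        ],
    }

    # transform(row) would skip the wayback capture, pick the second URL, and emit:
    expected = {
        "base_url": "https://example.com/paper.pdf",
        "ingest_type": "pdf",
        "link_source": "fatcat",
        "link_source_id": "file_aaaaaaaaaaaaaaaaaaaaaaaaaa",
        "ingest_request_source": "file-backfill",
        "ext_ids": {"sha1": "da39a3ee5e6b4b0d3255bfef95601890afd80709"},
    }
    print(json.dumps(expected, sort_keys=True))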
diff --git a/notes/cleanups/scripts/file_dupe_to_json.py b/notes/cleanups/scripts/file_dupe_to_json.py
deleted file mode 100755
index 2064dc1c..00000000
--- a/notes/cleanups/scripts/file_dupe_to_json.py
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-This script can be used to transform duplicate file entity hash export rows
-into JSON objects which can be passed to the file entity merger.
-
-The input is expected to be a TSV with two columns: a hash value in the first
-column, and a fatcat file entity ident (in UUID format, not "fatcat ident"
-encoded) in the second column. The rows are assumed to be sorted by hash value
-(the first column), and duplicate values (same hash, differing UUID) are
-contiguous.
-
-File hashes aren't really "external identifiers" (ext_id), but we treat them as
-such here.
-
-Script is pretty simple, should be possible to copy and reuse for release,
-container, creator entity duplicates.
-"""
-
-import json, sys
-from typing import Optional
-import base64, uuid
-
-EXTID_TYPE = "sha1"
-
-def uuid2fcid(s: str) -> str:
-    """
-    Converts a uuid.UUID object to a fatcat identifier (base32 encoded string)
-    """
-    raw = uuid.UUID(s).bytes
-    return base64.b32encode(raw)[:26].lower().decode("utf-8")
-
-def print_group(extid, dupe_ids):
-    if len(dupe_ids) < 2:
-        return
-    group = dict(
-        entity_type="file",
-        primary_id=None,
-        duplicate_ids=dupe_ids,
-        evidence=dict(
-            extid=extid,
-            extid_type=EXTID_TYPE,
-        ),
-    )
-    print(json.dumps(group, sort_keys=True))
-
-def run():
-    last_extid = None
-    dupe_ids = []
-    for l in sys.stdin:
-        l = l.strip()
-        if not l:
-            continue
-        (row_extid, row_uuid) = l.split("\t")[0:2]
-        if EXTID_TYPE == "sha1":
-            assert len(row_extid) == 40
-        else:
-            raise Exception(f"extid type not supported yet: {EXTID_TYPE}")
-        row_id = uuid2fcid(row_uuid)
-        if row_extid == last_extid:
-            dupe_ids.append(row_id)
-            continue
-        elif dupe_ids:
-            print_group(last_extid, dupe_ids)
-        last_extid = row_extid
-        dupe_ids = [row_id]
-    if last_extid and dupe_ids:
-        print_group(last_extid, dupe_ids)
-
-
-if __name__=="__main__":
-    run()
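Note: the main differences from the container variant earlier in this patch are the 40-character hash check and the uuid2fcid() conversion, since the file hash dump carries raw UUIDs rather than encoded fatcat idents. A small sketch of that conversion on its own (the UUID is random, not a real file entity):

    import base64
    import uuid

    def uuid2fcid(s: str) -> str:
        # same conversion as in file_dupe_to_json.py above: UUID -> fatcat ident
        raw = uuid.UUID(s).bytes
        return base64.b32encode(raw)[:26].lower().decode("utf-8")

    some_uuid = str(uuid.uuid4())
    ident = uuid2fcid(some_uuid)
    assert len(ident) == 26
    assert set(ident) <= set("abcdefghijklmnopqrstuvwxyz234567")
    print(some_uuid, "->", ident)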