diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-29 14:33:14 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-29 14:33:14 -0800 |
commit | c5ea2dba358624f4c14da0a1a988ae14d0edfd59 (patch) | |
tree | 7d3934e4922439402f882a374fe477906fd41aae /extra/cleanups/scripts | |
parent | ec2809ef2ac51c992463839c1e3451927f5e1661 (diff) | |
download | fatcat-c5ea2dba358624f4c14da0a1a988ae14d0edfd59.tar.gz fatcat-c5ea2dba358624f4c14da0a1a988ae14d0edfd59.zip |
move 'cleanups' directory from notes to extra/
Diffstat (limited to 'extra/cleanups/scripts')
-rwxr-xr-x | extra/cleanups/scripts/container_dupe_to_json.py | 55 | ||||
-rw-r--r-- | extra/cleanups/scripts/fetch_full_cdx_ts.py | 201 | ||||
-rwxr-xr-x | extra/cleanups/scripts/file2ingestrequest.py | 44 | ||||
-rwxr-xr-x | extra/cleanups/scripts/file_dupe_to_json.py | 72 |
4 files changed, 372 insertions, 0 deletions
diff --git a/extra/cleanups/scripts/container_dupe_to_json.py b/extra/cleanups/scripts/container_dupe_to_json.py new file mode 100755 index 00000000..2e841c69 --- /dev/null +++ b/extra/cleanups/scripts/container_dupe_to_json.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 + +""" +This script can be used to transform duplicate container entity rows into JSON +objects which can be passed to the container entity merger. + +It is initially used to de-dupe ISSN-Ls. The script is based on +`file_dupe_to_json.py`. +""" + +import json, sys +from typing import Optional + +EXTID_TYPE = "issnl" + + +def print_group(extid, dupe_ids): + if len(dupe_ids) < 2: + return + group = dict( + entity_type="container", + primary_id=None, + duplicate_ids=dupe_ids, + evidence=dict( + extid=extid, + extid_type=EXTID_TYPE, + ), + ) + print(json.dumps(group, sort_keys=True)) + +def run(): + last_extid = None + dupe_ids = [] + for l in sys.stdin: + l = l.strip() + if not l: + continue + (row_extid, row_id) = l.split("\t")[0:2] + if EXTID_TYPE == "issnl": + assert len(row_extid) == 9 + else: + raise Exception(f"extid type not supported yet: {EXTID_TYPE}") + if row_extid == last_extid: + dupe_ids.append(row_id) + continue + elif dupe_ids: + print_group(last_extid, dupe_ids) + last_extid = row_extid + dupe_ids = [row_id] + if last_extid and dupe_ids: + print_group(last_extid, dupe_ids) + + +if __name__=="__main__": + run() diff --git a/extra/cleanups/scripts/fetch_full_cdx_ts.py b/extra/cleanups/scripts/fetch_full_cdx_ts.py new file mode 100644 index 00000000..ebcf0d62 --- /dev/null +++ b/extra/cleanups/scripts/fetch_full_cdx_ts.py @@ -0,0 +1,201 @@ +#!/usr/bin/env python3 + +import sys +import json +import base64 +from typing import Optional, List + +import requests +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error + +def requests_retry_session( + retries: int = 10, + backoff_factor: int = 3, + status_forcelist: List[int] = [500, 502, 504], + session: requests.Session = None, +) -> requests.Session: + """ + From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests + """ + session = session or requests.Session() + retry = Retry( + total=retries, + read=retries, + connect=retries, + backoff_factor=backoff_factor, + status_forcelist=status_forcelist, + ) + adapter = HTTPAdapter(max_retries=retry) + session.mount("http://", adapter) + session.mount("https://", adapter) + return session + +def b32_hex(s: str) -> str: + """ + Converts a base32-encoded SHA-1 checksum into hex-encoded + + base32 checksums are used by, eg, heritrix and in wayback CDX files + """ + s = s.strip().split()[0].lower() + if s.startswith("sha1:"): + s = s[5:] + if len(s) != 32: + if len(s) == 40: + return s + raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s)) + return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8") + + +SANDCRAWLER_POSTGREST_URL = "http://wbgrp-svc506.us.archive.org:3030" + +def get_db_cdx(url: str, http_session) -> List[dict]: + resp = http_session.get(SANDCRAWLER_POSTGREST_URL + "/cdx", params=dict(url="eq." + url)) + resp.raise_for_status() + rows = resp.json() + return rows or [] + +CDX_API_URL = "https://web.archive.org/cdx/search/cdx" + +def get_api_cdx(url: str, partial_dt: str, http_session) -> Optional[dict]: + + params = { + "url": url, + "from": partial_dt, + "to": partial_dt, + "matchType": "exact", + "output": "json", + "limit": 20, + # can't filter status because might be warc/revisit + #"filter": "statuscode:200", + } + resp = http_session.get(CDX_API_URL, params=params) + resp.raise_for_status() + rows = resp.json() + + if not rows: + return None + #print(rows, file=sys.stderr) + if len(rows) < 2: + return None + + for raw in rows[1:]: + record = dict( + surt=raw[0], + datetime=raw[1], + url=raw[2], + mimetype=raw[3], + status_code=raw[4], + sha1b32=raw[5], + sha1hex=b32_hex(raw[5]), + ) + if record['url'] != url: + # TODO: could allow HTTP/HTTPS fuzzy match + print("CDX API near match: URL", file=sys.stderr) + continue + if not record['datetime'].startswith(partial_dt): + print(f"CDX API near match: datetime {partial_dt} {record['datetime']}", file=sys.stderr) + continue + if record['status_code'] == "200" or (record['status_code'] == '-' and record['mimetype'] == 'warc/revisit'): + return record + else: + print(f"CDX API near match: status {record['status_code']}", file=sys.stderr) + return None + +def process_file(fe, session) -> dict: + short_urls = [] + self_urls = dict() + full_urls = dict() + status = "unknown" + + for pair in fe['urls']: + u = pair['url'] + if not '://web.archive.org/web/' in u: + continue + seg = u.split('/') + assert seg[2] == "web.archive.org" + assert seg[3] == "web" + if not seg[4].isdigit(): + continue + original_url = "/".join(seg[5:]) + if len(seg[4]) == 12 or len(seg[4]) == 4: + short_urls.append(u) + elif len(seg[4]) == 14: + self_urls[original_url] = u + else: + print(f"other bogus ts: {seg[4]}", file=sys.stderr) + return dict(file_entity=fe, full_urls=full_urls, status="fail-bogus-ts") + + if len(short_urls) == 0: + return dict(file_entity=fe, full_urls=[], status="skip-no-shorts") + + for short in list(set(short_urls)): + seg = short.split('/') + ts = seg[4] + assert len(ts) in [12,4] and ts.isdigit() + original_url = '/'.join(seg[5:]) + + if short in full_urls: + continue + + if original_url in self_urls and ts in self_urls[original_url]: + full_urls[short] = self_urls[original_url] + status = "success-self" + continue + + cdx_row_list = get_db_cdx(original_url, http_session=session) + for cdx_row in cdx_row_list: + if cdx_row['sha1hex'] == fe['sha1'] and cdx_row['url'] == original_url and cdx_row['datetime'].startswith(ts): + assert len(cdx_row['datetime']) == 14 and cdx_row['datetime'].isdigit() + full_urls[short] = f"https://web.archive.org/web/{cdx_row['datetime']}/{original_url}" + status = "success-db" + break + else: + #print(f"cdx DB found, but no match", file=sys.stderr) + pass + cdx_row = None + + if short in full_urls: + continue + + cdx_record = None + try: + cdx_record = get_api_cdx(original_url, partial_dt=ts, http_session=session) + except requests.exceptions.HTTPError as e: + if e.response.status_code == 403: + return dict(file_entity=fe, full_urls=full_urls, status="fail-cdx-403") + else: + raise + if cdx_record: + if cdx_record['sha1hex'] == fe['sha1'] and cdx_record['url'] == original_url and cdx_record['datetime'].startswith(ts): + assert len(cdx_record['datetime']) == 14 and cdx_record['datetime'].isdigit() + full_urls[short] = f"https://web.archive.org/web/{cdx_record['datetime']}/{original_url}" + status = "success-api" + break + else: + print(f"cdx API found, but no match", file=sys.stderr) + else: + print(f"no CDX API record found: {original_url}", file=sys.stderr) + + if short not in full_urls: + return dict(file_entity=fe, full_urls=full_urls, status="fail-not-found") + + return dict( + file_entity=fe, + full_urls=full_urls, + status=status, + ) + +def main(): + session = requests_retry_session() + session.headers.update({ + "User-Agent": "Mozilla/5.0 fatcat.CdxFixupBot", + }) + for line in sys.stdin: + if not line.strip(): + continue + fe = json.loads(line) + print(json.dumps(process_file(fe, session=session))) + +if __name__=="__main__": + main() diff --git a/extra/cleanups/scripts/file2ingestrequest.py b/extra/cleanups/scripts/file2ingestrequest.py new file mode 100755 index 00000000..a005837f --- /dev/null +++ b/extra/cleanups/scripts/file2ingestrequest.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 + +from typing import Optional +import json, sys + + +def transform(row: dict) -> Optional[dict]: + if row.get('mimetype') not in [None, 'application/pdf']: + return None + if row.get('state') != 'active': + return None + base_url = None + for url in (row.get('urls') or []): + url = url['url'] + if '://web.archive.org/' not in url and '://archive.org/' not in url: + base_url = url + break + if not base_url: + return None + if not row.get('sha1'): + return None + return dict( + base_url=base_url, + ingest_type="pdf", + link_source="fatcat", + link_source_id=f"file_{row['ident']}", + ingest_request_source="file-backfill", + ext_ids=dict( + sha1=row['sha1'], + ), + ) + + +def run(): + for l in sys.stdin: + if not l.strip(): + continue + row = json.loads(l) + request = transform(row) + if request: + print(json.dumps(request, sort_keys=True)) + +if __name__=="__main__": + run() diff --git a/extra/cleanups/scripts/file_dupe_to_json.py b/extra/cleanups/scripts/file_dupe_to_json.py new file mode 100755 index 00000000..2064dc1c --- /dev/null +++ b/extra/cleanups/scripts/file_dupe_to_json.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 + +""" +This script can be used to transform duplicate file entity hash export rows +into JSON objects which can be passed to the file entity merger. + +The input is expected to be a TSV with two columns: a hash value in the first +column, and a fatcat file entity ident (in UUID format, not "fatcat ident" +encoded) in the second column. The rows are assumed to be sorted by hash value +(the first column), and duplicate values (same hash, differing UUID) are +contiguous. + +File hashes aren't really "external identifiers" (ext_id), but we treat them as +such here. + +Script is pretty simple, should be possible to copy and reuse for release, +container, creator entity duplicates. +""" + +import json, sys +from typing import Optional +import base64, uuid + +EXTID_TYPE = "sha1" + +def uuid2fcid(s: str) -> str: + """ + Converts a uuid.UUID object to a fatcat identifier (base32 encoded string) + """ + raw = uuid.UUID(s).bytes + return base64.b32encode(raw)[:26].lower().decode("utf-8") + +def print_group(extid, dupe_ids): + if len(dupe_ids) < 2: + return + group = dict( + entity_type="file", + primary_id=None, + duplicate_ids=dupe_ids, + evidence=dict( + extid=extid, + extid_type=EXTID_TYPE, + ), + ) + print(json.dumps(group, sort_keys=True)) + +def run(): + last_extid = None + dupe_ids = [] + for l in sys.stdin: + l = l.strip() + if not l: + continue + (row_extid, row_uuid) = l.split("\t")[0:2] + if EXTID_TYPE == "sha1": + assert len(row_extid) == 40 + else: + raise Exception(f"extid type not supported yet: {EXTID_TYPE}") + row_id = uuid2fcid(row_uuid) + if row_extid == last_extid: + dupe_ids.append(row_id) + continue + elif dupe_ids: + print_group(last_extid, dupe_ids) + last_extid = row_extid + dupe_ids = [row_id] + if last_extid and dupe_ids: + print_group(last_extid, dupe_ids) + + +if __name__=="__main__": + run() |