aboutsummaryrefslogtreecommitdiffstats
path: root/extra/cleanups/scripts
diff options
context:
space:
mode:
Diffstat (limited to 'extra/cleanups/scripts')
-rwxr-xr-xextra/cleanups/scripts/container_dupe_to_json.py55
-rw-r--r--extra/cleanups/scripts/fetch_full_cdx_ts.py201
-rwxr-xr-xextra/cleanups/scripts/file2ingestrequest.py44
-rwxr-xr-xextra/cleanups/scripts/file_dupe_to_json.py72
4 files changed, 372 insertions, 0 deletions
diff --git a/extra/cleanups/scripts/container_dupe_to_json.py b/extra/cleanups/scripts/container_dupe_to_json.py
new file mode 100755
index 00000000..2e841c69
--- /dev/null
+++ b/extra/cleanups/scripts/container_dupe_to_json.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+
+"""
+This script can be used to transform duplicate container entity rows into JSON
+objects which can be passed to the container entity merger.
+
+It is initially used to de-dupe ISSN-Ls. The script is based on
+`file_dupe_to_json.py`.
+"""
+
+import json, sys
+from typing import Optional
+
+EXTID_TYPE = "issnl"
+
+
+def print_group(extid, dupe_ids):
+ if len(dupe_ids) < 2:
+ return
+ group = dict(
+ entity_type="container",
+ primary_id=None,
+ duplicate_ids=dupe_ids,
+ evidence=dict(
+ extid=extid,
+ extid_type=EXTID_TYPE,
+ ),
+ )
+ print(json.dumps(group, sort_keys=True))
+
+def run():
+ last_extid = None
+ dupe_ids = []
+ for l in sys.stdin:
+ l = l.strip()
+ if not l:
+ continue
+ (row_extid, row_id) = l.split("\t")[0:2]
+ if EXTID_TYPE == "issnl":
+ assert len(row_extid) == 9
+ else:
+ raise Exception(f"extid type not supported yet: {EXTID_TYPE}")
+ if row_extid == last_extid:
+ dupe_ids.append(row_id)
+ continue
+ elif dupe_ids:
+ print_group(last_extid, dupe_ids)
+ last_extid = row_extid
+ dupe_ids = [row_id]
+ if last_extid and dupe_ids:
+ print_group(last_extid, dupe_ids)
+
+
+if __name__=="__main__":
+ run()
diff --git a/extra/cleanups/scripts/fetch_full_cdx_ts.py b/extra/cleanups/scripts/fetch_full_cdx_ts.py
new file mode 100644
index 00000000..ebcf0d62
--- /dev/null
+++ b/extra/cleanups/scripts/fetch_full_cdx_ts.py
@@ -0,0 +1,201 @@
+#!/usr/bin/env python3
+
+import sys
+import json
+import base64
+from typing import Optional, List
+
+import requests
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
+
+def requests_retry_session(
+ retries: int = 10,
+ backoff_factor: int = 3,
+ status_forcelist: List[int] = [500, 502, 504],
+ session: requests.Session = None,
+) -> requests.Session:
+ """
+ From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
+ """
+ session = session or requests.Session()
+ retry = Retry(
+ total=retries,
+ read=retries,
+ connect=retries,
+ backoff_factor=backoff_factor,
+ status_forcelist=status_forcelist,
+ )
+ adapter = HTTPAdapter(max_retries=retry)
+ session.mount("http://", adapter)
+ session.mount("https://", adapter)
+ return session
+
+def b32_hex(s: str) -> str:
+ """
+ Converts a base32-encoded SHA-1 checksum into hex-encoded
+
+ base32 checksums are used by, eg, heritrix and in wayback CDX files
+ """
+ s = s.strip().split()[0].lower()
+ if s.startswith("sha1:"):
+ s = s[5:]
+ if len(s) != 32:
+ if len(s) == 40:
+ return s
+ raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))
+ return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8")
+
+
+SANDCRAWLER_POSTGREST_URL = "http://wbgrp-svc506.us.archive.org:3030"
+
+def get_db_cdx(url: str, http_session) -> List[dict]:
+ resp = http_session.get(SANDCRAWLER_POSTGREST_URL + "/cdx", params=dict(url="eq." + url))
+ resp.raise_for_status()
+ rows = resp.json()
+ return rows or []
+
+CDX_API_URL = "https://web.archive.org/cdx/search/cdx"
+
+def get_api_cdx(url: str, partial_dt: str, http_session) -> Optional[dict]:
+
+ params = {
+ "url": url,
+ "from": partial_dt,
+ "to": partial_dt,
+ "matchType": "exact",
+ "output": "json",
+ "limit": 20,
+ # can't filter status because might be warc/revisit
+ #"filter": "statuscode:200",
+ }
+ resp = http_session.get(CDX_API_URL, params=params)
+ resp.raise_for_status()
+ rows = resp.json()
+
+ if not rows:
+ return None
+ #print(rows, file=sys.stderr)
+ if len(rows) < 2:
+ return None
+
+ for raw in rows[1:]:
+ record = dict(
+ surt=raw[0],
+ datetime=raw[1],
+ url=raw[2],
+ mimetype=raw[3],
+ status_code=raw[4],
+ sha1b32=raw[5],
+ sha1hex=b32_hex(raw[5]),
+ )
+ if record['url'] != url:
+ # TODO: could allow HTTP/HTTPS fuzzy match
+ print("CDX API near match: URL", file=sys.stderr)
+ continue
+ if not record['datetime'].startswith(partial_dt):
+ print(f"CDX API near match: datetime {partial_dt} {record['datetime']}", file=sys.stderr)
+ continue
+ if record['status_code'] == "200" or (record['status_code'] == '-' and record['mimetype'] == 'warc/revisit'):
+ return record
+ else:
+ print(f"CDX API near match: status {record['status_code']}", file=sys.stderr)
+ return None
+
+def process_file(fe, session) -> dict:
+ short_urls = []
+ self_urls = dict()
+ full_urls = dict()
+ status = "unknown"
+
+ for pair in fe['urls']:
+ u = pair['url']
+ if not '://web.archive.org/web/' in u:
+ continue
+ seg = u.split('/')
+ assert seg[2] == "web.archive.org"
+ assert seg[3] == "web"
+ if not seg[4].isdigit():
+ continue
+ original_url = "/".join(seg[5:])
+ if len(seg[4]) == 12 or len(seg[4]) == 4:
+ short_urls.append(u)
+ elif len(seg[4]) == 14:
+ self_urls[original_url] = u
+ else:
+ print(f"other bogus ts: {seg[4]}", file=sys.stderr)
+ return dict(file_entity=fe, full_urls=full_urls, status="fail-bogus-ts")
+
+ if len(short_urls) == 0:
+ return dict(file_entity=fe, full_urls=[], status="skip-no-shorts")
+
+ for short in list(set(short_urls)):
+ seg = short.split('/')
+ ts = seg[4]
+ assert len(ts) in [12,4] and ts.isdigit()
+ original_url = '/'.join(seg[5:])
+
+ if short in full_urls:
+ continue
+
+ if original_url in self_urls and ts in self_urls[original_url]:
+ full_urls[short] = self_urls[original_url]
+ status = "success-self"
+ continue
+
+ cdx_row_list = get_db_cdx(original_url, http_session=session)
+ for cdx_row in cdx_row_list:
+ if cdx_row['sha1hex'] == fe['sha1'] and cdx_row['url'] == original_url and cdx_row['datetime'].startswith(ts):
+ assert len(cdx_row['datetime']) == 14 and cdx_row['datetime'].isdigit()
+ full_urls[short] = f"https://web.archive.org/web/{cdx_row['datetime']}/{original_url}"
+ status = "success-db"
+ break
+ else:
+ #print(f"cdx DB found, but no match", file=sys.stderr)
+ pass
+ cdx_row = None
+
+ if short in full_urls:
+ continue
+
+ cdx_record = None
+ try:
+ cdx_record = get_api_cdx(original_url, partial_dt=ts, http_session=session)
+ except requests.exceptions.HTTPError as e:
+ if e.response.status_code == 403:
+ return dict(file_entity=fe, full_urls=full_urls, status="fail-cdx-403")
+ else:
+ raise
+ if cdx_record:
+ if cdx_record['sha1hex'] == fe['sha1'] and cdx_record['url'] == original_url and cdx_record['datetime'].startswith(ts):
+ assert len(cdx_record['datetime']) == 14 and cdx_record['datetime'].isdigit()
+ full_urls[short] = f"https://web.archive.org/web/{cdx_record['datetime']}/{original_url}"
+ status = "success-api"
+ break
+ else:
+ print(f"cdx API found, but no match", file=sys.stderr)
+ else:
+ print(f"no CDX API record found: {original_url}", file=sys.stderr)
+
+ if short not in full_urls:
+ return dict(file_entity=fe, full_urls=full_urls, status="fail-not-found")
+
+ return dict(
+ file_entity=fe,
+ full_urls=full_urls,
+ status=status,
+ )
+
+def main():
+ session = requests_retry_session()
+ session.headers.update({
+ "User-Agent": "Mozilla/5.0 fatcat.CdxFixupBot",
+ })
+ for line in sys.stdin:
+ if not line.strip():
+ continue
+ fe = json.loads(line)
+ print(json.dumps(process_file(fe, session=session)))
+
+if __name__=="__main__":
+ main()
diff --git a/extra/cleanups/scripts/file2ingestrequest.py b/extra/cleanups/scripts/file2ingestrequest.py
new file mode 100755
index 00000000..a005837f
--- /dev/null
+++ b/extra/cleanups/scripts/file2ingestrequest.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+
+from typing import Optional
+import json, sys
+
+
+def transform(row: dict) -> Optional[dict]:
+ if row.get('mimetype') not in [None, 'application/pdf']:
+ return None
+ if row.get('state') != 'active':
+ return None
+ base_url = None
+ for url in (row.get('urls') or []):
+ url = url['url']
+ if '://web.archive.org/' not in url and '://archive.org/' not in url:
+ base_url = url
+ break
+ if not base_url:
+ return None
+ if not row.get('sha1'):
+ return None
+ return dict(
+ base_url=base_url,
+ ingest_type="pdf",
+ link_source="fatcat",
+ link_source_id=f"file_{row['ident']}",
+ ingest_request_source="file-backfill",
+ ext_ids=dict(
+ sha1=row['sha1'],
+ ),
+ )
+
+
+def run():
+ for l in sys.stdin:
+ if not l.strip():
+ continue
+ row = json.loads(l)
+ request = transform(row)
+ if request:
+ print(json.dumps(request, sort_keys=True))
+
+if __name__=="__main__":
+ run()
diff --git a/extra/cleanups/scripts/file_dupe_to_json.py b/extra/cleanups/scripts/file_dupe_to_json.py
new file mode 100755
index 00000000..2064dc1c
--- /dev/null
+++ b/extra/cleanups/scripts/file_dupe_to_json.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+
+"""
+This script can be used to transform duplicate file entity hash export rows
+into JSON objects which can be passed to the file entity merger.
+
+The input is expected to be a TSV with two columns: a hash value in the first
+column, and a fatcat file entity ident (in UUID format, not "fatcat ident"
+encoded) in the second column. The rows are assumed to be sorted by hash value
+(the first column), and duplicate values (same hash, differing UUID) are
+contiguous.
+
+File hashes aren't really "external identifiers" (ext_id), but we treat them as
+such here.
+
+Script is pretty simple, should be possible to copy and reuse for release,
+container, creator entity duplicates.
+"""
+
+import json, sys
+from typing import Optional
+import base64, uuid
+
+EXTID_TYPE = "sha1"
+
+def uuid2fcid(s: str) -> str:
+ """
+ Converts a uuid.UUID object to a fatcat identifier (base32 encoded string)
+ """
+ raw = uuid.UUID(s).bytes
+ return base64.b32encode(raw)[:26].lower().decode("utf-8")
+
+def print_group(extid, dupe_ids):
+ if len(dupe_ids) < 2:
+ return
+ group = dict(
+ entity_type="file",
+ primary_id=None,
+ duplicate_ids=dupe_ids,
+ evidence=dict(
+ extid=extid,
+ extid_type=EXTID_TYPE,
+ ),
+ )
+ print(json.dumps(group, sort_keys=True))
+
+def run():
+ last_extid = None
+ dupe_ids = []
+ for l in sys.stdin:
+ l = l.strip()
+ if not l:
+ continue
+ (row_extid, row_uuid) = l.split("\t")[0:2]
+ if EXTID_TYPE == "sha1":
+ assert len(row_extid) == 40
+ else:
+ raise Exception(f"extid type not supported yet: {EXTID_TYPE}")
+ row_id = uuid2fcid(row_uuid)
+ if row_extid == last_extid:
+ dupe_ids.append(row_id)
+ continue
+ elif dupe_ids:
+ print_group(last_extid, dupe_ids)
+ last_extid = row_extid
+ dupe_ids = [row_id]
+ if last_extid and dupe_ids:
+ print_group(last_extid, dupe_ids)
+
+
+if __name__=="__main__":
+ run()