aboutsummaryrefslogtreecommitdiffstats
path: root/notes/cleanups/scripts
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2021-11-29 14:33:14 -0800
committerBryan Newbold <bnewbold@robocracy.org>2021-11-29 14:33:14 -0800
commitc5ea2dba358624f4c14da0a1a988ae14d0edfd59 (patch)
tree7d3934e4922439402f882a374fe477906fd41aae /notes/cleanups/scripts
parentec2809ef2ac51c992463839c1e3451927f5e1661 (diff)
downloadfatcat-c5ea2dba358624f4c14da0a1a988ae14d0edfd59.tar.gz
fatcat-c5ea2dba358624f4c14da0a1a988ae14d0edfd59.zip
move 'cleanups' directory from notes to extra/
Diffstat (limited to 'notes/cleanups/scripts')
-rwxr-xr-xnotes/cleanups/scripts/container_dupe_to_json.py55
-rw-r--r--notes/cleanups/scripts/fetch_full_cdx_ts.py201
-rwxr-xr-xnotes/cleanups/scripts/file2ingestrequest.py44
-rwxr-xr-xnotes/cleanups/scripts/file_dupe_to_json.py72
4 files changed, 0 insertions, 372 deletions
diff --git a/notes/cleanups/scripts/container_dupe_to_json.py b/notes/cleanups/scripts/container_dupe_to_json.py
deleted file mode 100755
index 2e841c69..00000000
--- a/notes/cleanups/scripts/container_dupe_to_json.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-This script can be used to transform duplicate container entity rows into JSON
-objects which can be passed to the container entity merger.
-
-It is initially used to de-dupe ISSN-Ls. The script is based on
-`file_dupe_to_json.py`.
-"""
-
-import json, sys
-from typing import Optional
-
-EXTID_TYPE = "issnl"
-
-
-def print_group(extid, dupe_ids):
- if len(dupe_ids) < 2:
- return
- group = dict(
- entity_type="container",
- primary_id=None,
- duplicate_ids=dupe_ids,
- evidence=dict(
- extid=extid,
- extid_type=EXTID_TYPE,
- ),
- )
- print(json.dumps(group, sort_keys=True))
-
-def run():
- last_extid = None
- dupe_ids = []
- for l in sys.stdin:
- l = l.strip()
- if not l:
- continue
- (row_extid, row_id) = l.split("\t")[0:2]
- if EXTID_TYPE == "issnl":
- assert len(row_extid) == 9
- else:
- raise Exception(f"extid type not supported yet: {EXTID_TYPE}")
- if row_extid == last_extid:
- dupe_ids.append(row_id)
- continue
- elif dupe_ids:
- print_group(last_extid, dupe_ids)
- last_extid = row_extid
- dupe_ids = [row_id]
- if last_extid and dupe_ids:
- print_group(last_extid, dupe_ids)
-
-
-if __name__=="__main__":
- run()
diff --git a/notes/cleanups/scripts/fetch_full_cdx_ts.py b/notes/cleanups/scripts/fetch_full_cdx_ts.py
deleted file mode 100644
index ebcf0d62..00000000
--- a/notes/cleanups/scripts/fetch_full_cdx_ts.py
+++ /dev/null
@@ -1,201 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import json
-import base64
-from typing import Optional, List
-
-import requests
-from requests.adapters import HTTPAdapter
-from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
-
-def requests_retry_session(
- retries: int = 10,
- backoff_factor: int = 3,
- status_forcelist: List[int] = [500, 502, 504],
- session: requests.Session = None,
-) -> requests.Session:
- """
- From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
- """
- session = session or requests.Session()
- retry = Retry(
- total=retries,
- read=retries,
- connect=retries,
- backoff_factor=backoff_factor,
- status_forcelist=status_forcelist,
- )
- adapter = HTTPAdapter(max_retries=retry)
- session.mount("http://", adapter)
- session.mount("https://", adapter)
- return session
-
-def b32_hex(s: str) -> str:
- """
- Converts a base32-encoded SHA-1 checksum into hex-encoded
-
- base32 checksums are used by, eg, heritrix and in wayback CDX files
- """
- s = s.strip().split()[0].lower()
- if s.startswith("sha1:"):
- s = s[5:]
- if len(s) != 32:
- if len(s) == 40:
- return s
- raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))
- return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8")
-
-
-SANDCRAWLER_POSTGREST_URL = "http://wbgrp-svc506.us.archive.org:3030"
-
-def get_db_cdx(url: str, http_session) -> List[dict]:
- resp = http_session.get(SANDCRAWLER_POSTGREST_URL + "/cdx", params=dict(url="eq." + url))
- resp.raise_for_status()
- rows = resp.json()
- return rows or []
-
-CDX_API_URL = "https://web.archive.org/cdx/search/cdx"
-
-def get_api_cdx(url: str, partial_dt: str, http_session) -> Optional[dict]:
-
- params = {
- "url": url,
- "from": partial_dt,
- "to": partial_dt,
- "matchType": "exact",
- "output": "json",
- "limit": 20,
- # can't filter status because might be warc/revisit
- #"filter": "statuscode:200",
- }
- resp = http_session.get(CDX_API_URL, params=params)
- resp.raise_for_status()
- rows = resp.json()
-
- if not rows:
- return None
- #print(rows, file=sys.stderr)
- if len(rows) < 2:
- return None
-
- for raw in rows[1:]:
- record = dict(
- surt=raw[0],
- datetime=raw[1],
- url=raw[2],
- mimetype=raw[3],
- status_code=raw[4],
- sha1b32=raw[5],
- sha1hex=b32_hex(raw[5]),
- )
- if record['url'] != url:
- # TODO: could allow HTTP/HTTPS fuzzy match
- print("CDX API near match: URL", file=sys.stderr)
- continue
- if not record['datetime'].startswith(partial_dt):
- print(f"CDX API near match: datetime {partial_dt} {record['datetime']}", file=sys.stderr)
- continue
- if record['status_code'] == "200" or (record['status_code'] == '-' and record['mimetype'] == 'warc/revisit'):
- return record
- else:
- print(f"CDX API near match: status {record['status_code']}", file=sys.stderr)
- return None
-
-def process_file(fe, session) -> dict:
- short_urls = []
- self_urls = dict()
- full_urls = dict()
- status = "unknown"
-
- for pair in fe['urls']:
- u = pair['url']
- if not '://web.archive.org/web/' in u:
- continue
- seg = u.split('/')
- assert seg[2] == "web.archive.org"
- assert seg[3] == "web"
- if not seg[4].isdigit():
- continue
- original_url = "/".join(seg[5:])
- if len(seg[4]) == 12 or len(seg[4]) == 4:
- short_urls.append(u)
- elif len(seg[4]) == 14:
- self_urls[original_url] = u
- else:
- print(f"other bogus ts: {seg[4]}", file=sys.stderr)
- return dict(file_entity=fe, full_urls=full_urls, status="fail-bogus-ts")
-
- if len(short_urls) == 0:
- return dict(file_entity=fe, full_urls=[], status="skip-no-shorts")
-
- for short in list(set(short_urls)):
- seg = short.split('/')
- ts = seg[4]
- assert len(ts) in [12,4] and ts.isdigit()
- original_url = '/'.join(seg[5:])
-
- if short in full_urls:
- continue
-
- if original_url in self_urls and ts in self_urls[original_url]:
- full_urls[short] = self_urls[original_url]
- status = "success-self"
- continue
-
- cdx_row_list = get_db_cdx(original_url, http_session=session)
- for cdx_row in cdx_row_list:
- if cdx_row['sha1hex'] == fe['sha1'] and cdx_row['url'] == original_url and cdx_row['datetime'].startswith(ts):
- assert len(cdx_row['datetime']) == 14 and cdx_row['datetime'].isdigit()
- full_urls[short] = f"https://web.archive.org/web/{cdx_row['datetime']}/{original_url}"
- status = "success-db"
- break
- else:
- #print(f"cdx DB found, but no match", file=sys.stderr)
- pass
- cdx_row = None
-
- if short in full_urls:
- continue
-
- cdx_record = None
- try:
- cdx_record = get_api_cdx(original_url, partial_dt=ts, http_session=session)
- except requests.exceptions.HTTPError as e:
- if e.response.status_code == 403:
- return dict(file_entity=fe, full_urls=full_urls, status="fail-cdx-403")
- else:
- raise
- if cdx_record:
- if cdx_record['sha1hex'] == fe['sha1'] and cdx_record['url'] == original_url and cdx_record['datetime'].startswith(ts):
- assert len(cdx_record['datetime']) == 14 and cdx_record['datetime'].isdigit()
- full_urls[short] = f"https://web.archive.org/web/{cdx_record['datetime']}/{original_url}"
- status = "success-api"
- break
- else:
- print(f"cdx API found, but no match", file=sys.stderr)
- else:
- print(f"no CDX API record found: {original_url}", file=sys.stderr)
-
- if short not in full_urls:
- return dict(file_entity=fe, full_urls=full_urls, status="fail-not-found")
-
- return dict(
- file_entity=fe,
- full_urls=full_urls,
- status=status,
- )
-
-def main():
- session = requests_retry_session()
- session.headers.update({
- "User-Agent": "Mozilla/5.0 fatcat.CdxFixupBot",
- })
- for line in sys.stdin:
- if not line.strip():
- continue
- fe = json.loads(line)
- print(json.dumps(process_file(fe, session=session)))
-
-if __name__=="__main__":
- main()
diff --git a/notes/cleanups/scripts/file2ingestrequest.py b/notes/cleanups/scripts/file2ingestrequest.py
deleted file mode 100755
index a005837f..00000000
--- a/notes/cleanups/scripts/file2ingestrequest.py
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/usr/bin/env python3
-
-from typing import Optional
-import json, sys
-
-
-def transform(row: dict) -> Optional[dict]:
- if row.get('mimetype') not in [None, 'application/pdf']:
- return None
- if row.get('state') != 'active':
- return None
- base_url = None
- for url in (row.get('urls') or []):
- url = url['url']
- if '://web.archive.org/' not in url and '://archive.org/' not in url:
- base_url = url
- break
- if not base_url:
- return None
- if not row.get('sha1'):
- return None
- return dict(
- base_url=base_url,
- ingest_type="pdf",
- link_source="fatcat",
- link_source_id=f"file_{row['ident']}",
- ingest_request_source="file-backfill",
- ext_ids=dict(
- sha1=row['sha1'],
- ),
- )
-
-
-def run():
- for l in sys.stdin:
- if not l.strip():
- continue
- row = json.loads(l)
- request = transform(row)
- if request:
- print(json.dumps(request, sort_keys=True))
-
-if __name__=="__main__":
- run()
diff --git a/notes/cleanups/scripts/file_dupe_to_json.py b/notes/cleanups/scripts/file_dupe_to_json.py
deleted file mode 100755
index 2064dc1c..00000000
--- a/notes/cleanups/scripts/file_dupe_to_json.py
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-This script can be used to transform duplicate file entity hash export rows
-into JSON objects which can be passed to the file entity merger.
-
-The input is expected to be a TSV with two columns: a hash value in the first
-column, and a fatcat file entity ident (in UUID format, not "fatcat ident"
-encoded) in the second column. The rows are assumed to be sorted by hash value
-(the first column), and duplicate values (same hash, differing UUID) are
-contiguous.
-
-File hashes aren't really "external identifiers" (ext_id), but we treat them as
-such here.
-
-Script is pretty simple, should be possible to copy and reuse for release,
-container, creator entity duplicates.
-"""
-
-import json, sys
-from typing import Optional
-import base64, uuid
-
-EXTID_TYPE = "sha1"
-
-def uuid2fcid(s: str) -> str:
- """
- Converts a uuid.UUID object to a fatcat identifier (base32 encoded string)
- """
- raw = uuid.UUID(s).bytes
- return base64.b32encode(raw)[:26].lower().decode("utf-8")
-
-def print_group(extid, dupe_ids):
- if len(dupe_ids) < 2:
- return
- group = dict(
- entity_type="file",
- primary_id=None,
- duplicate_ids=dupe_ids,
- evidence=dict(
- extid=extid,
- extid_type=EXTID_TYPE,
- ),
- )
- print(json.dumps(group, sort_keys=True))
-
-def run():
- last_extid = None
- dupe_ids = []
- for l in sys.stdin:
- l = l.strip()
- if not l:
- continue
- (row_extid, row_uuid) = l.split("\t")[0:2]
- if EXTID_TYPE == "sha1":
- assert len(row_extid) == 40
- else:
- raise Exception(f"extid type not supported yet: {EXTID_TYPE}")
- row_id = uuid2fcid(row_uuid)
- if row_extid == last_extid:
- dupe_ids.append(row_id)
- continue
- elif dupe_ids:
- print_group(last_extid, dupe_ids)
- last_extid = row_extid
- dupe_ids = [row_id]
- if last_extid and dupe_ids:
- print_group(last_extid, dupe_ids)
-
-
-if __name__=="__main__":
- run()