#!/usr/bin/env python3
# Provenance: recovered from a flattened git patch.
#   commit:  c5ea2dba358624f4c14da0a1a988ae14d0edfd59
#   author:  Bryan Newbold
#   date:    Mon, 29 Nov 2021 14:33:14 -0800
#   subject: move 'cleanups' directory from notes to extra/
# That commit deleted notes/cleanups/scripts/file2ingestrequest.py
# (blob a005837f, mode 100755, 44 lines). Per the subject line the script was
# presumably re-added under extra/ -- verify against the repository.
"""Convert file rows (JSON lines) into PDF ingest requests (JSON lines).

Each non-blank input line is parsed as a JSON object.  Rows that pass the
filters in :func:`transform` are written out as one JSON object per line with
sorted keys; all other rows are dropped silently.
"""

import json
import sys
from typing import Iterable, Optional, TextIO

# URLs containing any of these substrings point at archive.org / the Wayback
# Machine and are never used as the ``base_url`` of a request.
_ARCHIVE_URL_MARKERS = ("://web.archive.org/", "://archive.org/")


def transform(row: dict) -> Optional[dict]:
    """Build an ingest request from a single file row.

    Args:
        row: Decoded JSON object.  The keys read are ``mimetype``, ``state``,
            ``urls`` (a list of objects with a ``url`` key), ``sha1`` and
            ``ident``.

    Returns:
        The ingest request dict, or ``None`` when the row is filtered out:
        mimetype set to something other than ``application/pdf``, state not
        ``active``, no usable non-archive.org URL, or no ``sha1``.

    Raises:
        KeyError: If an otherwise acceptable row has no ``ident`` key.
    """
    if row.get("mimetype") not in (None, "application/pdf"):
        return None
    if row.get("state") != "active":
        return None

    base_url = None
    for entry in row.get("urls") or []:
        candidate = entry.get("url")
        if not candidate:
            # A missing or empty URL can never be a base_url; keep looking
            # instead of raising or ending the search early.
            continue
        if not any(marker in candidate for marker in _ARCHIVE_URL_MARKERS):
            # First URL that is not an archive.org copy wins.
            base_url = candidate
            break
    if not base_url:
        return None
    if not row.get("sha1"):
        return None

    return {
        "base_url": base_url,
        "ingest_type": "pdf",
        "link_source": "fatcat",
        "link_source_id": f"file_{row['ident']}",
        "ingest_request_source": "file-backfill",
        "ext_ids": {
            "sha1": row["sha1"],
        },
    }


def run(
    in_stream: Optional[Iterable[str]] = None,
    out_stream: Optional[TextIO] = None,
) -> None:
    """Read JSON lines, transform each row, and print the resulting requests.

    Args:
        in_stream: Iterable of text lines; defaults to ``sys.stdin``.
        out_stream: Writable text stream; defaults to ``sys.stdout``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Resolve the defaults at call time so that later rebinding of
    # sys.stdin / sys.stdout is honoured.
    if in_stream is None:
        in_stream = sys.stdin
    if out_stream is None:
        out_stream = sys.stdout

    for line in in_stream:
        if not line.strip():
            continue
        request = transform(json.loads(line))
        if request:
            print(json.dumps(request, sort_keys=True), file=out_stream)


if __name__ == "__main__":
    run()