diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-29 14:33:14 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-29 14:33:14 -0800 |
commit | c5ea2dba358624f4c14da0a1a988ae14d0edfd59 (patch) | |
tree | 7d3934e4922439402f882a374fe477906fd41aae /notes/cleanups/scripts/file2ingestrequest.py | |
parent | ec2809ef2ac51c992463839c1e3451927f5e1661 (diff) | |
download | fatcat-c5ea2dba358624f4c14da0a1a988ae14d0edfd59.tar.gz fatcat-c5ea2dba358624f4c14da0a1a988ae14d0edfd59.zip |
move 'cleanups' directory from notes to extra/
Diffstat (limited to 'notes/cleanups/scripts/file2ingestrequest.py')
-rwxr-xr-x | notes/cleanups/scripts/file2ingestrequest.py | 44 |
1 files changed, 0 insertions, 44 deletions
```diff
diff --git a/notes/cleanups/scripts/file2ingestrequest.py b/notes/cleanups/scripts/file2ingestrequest.py
deleted file mode 100755
index a005837f..00000000
--- a/notes/cleanups/scripts/file2ingestrequest.py
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/usr/bin/env python3
-
-from typing import Optional
-import json, sys
-
-
-def transform(row: dict) -> Optional[dict]:
-    if row.get('mimetype') not in [None, 'application/pdf']:
-        return None
-    if row.get('state') != 'active':
-        return None
-    base_url = None
-    for url in (row.get('urls') or []):
-        url = url['url']
-        if '://web.archive.org/' not in url and '://archive.org/' not in url:
-            base_url = url
-            break
-    if not base_url:
-        return None
-    if not row.get('sha1'):
-        return None
-    return dict(
-        base_url=base_url,
-        ingest_type="pdf",
-        link_source="fatcat",
-        link_source_id=f"file_{row['ident']}",
-        ingest_request_source="file-backfill",
-        ext_ids=dict(
-            sha1=row['sha1'],
-        ),
-    )
-
-
-def run():
-    for l in sys.stdin:
-        if not l.strip():
-            continue
-        row = json.loads(l)
-        request = transform(row)
-        if request:
-            print(json.dumps(request, sort_keys=True))
-
-if __name__=="__main__":
-    run()
```