diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-29 14:33:14 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-29 14:33:14 -0800 |
commit | c5ea2dba358624f4c14da0a1a988ae14d0edfd59 (patch) | |
tree | 7d3934e4922439402f882a374fe477906fd41aae /extra/cleanups/scripts/file2ingestrequest.py | |
parent | ec2809ef2ac51c992463839c1e3451927f5e1661 (diff) | |
download | fatcat-c5ea2dba358624f4c14da0a1a988ae14d0edfd59.tar.gz fatcat-c5ea2dba358624f4c14da0a1a988ae14d0edfd59.zip |
move 'cleanups' directory from notes to extra/
Diffstat (limited to 'extra/cleanups/scripts/file2ingestrequest.py')
-rwxr-xr-x | extra/cleanups/scripts/file2ingestrequest.py | 44 |
1 files changed, 44 insertions, 0 deletions
#!/usr/bin/env python3
# Provenance: extra/cleanups/scripts/file2ingestrequest.py
# (new file, mode 100755, index 00000000..a005837f), reflowed from the diff.
"""
Convert file rows into PDF ingest requests.

Reads one JSON object per line from stdin and, for every row that passes the
filters in `transform()`, prints one ingest request as a JSON line on stdout.
"""

import json
import sys
from typing import Optional

# A URL containing any of these substrings is never chosen as the base URL.
_ARCHIVE_URL_MARKERS = ("://web.archive.org/", "://archive.org/")


def transform(row: dict) -> Optional[dict]:
    """Build an ingest request dict from a single file row.

    Args:
        row: parsed JSON object. Keys read here: 'mimetype', 'state', 'urls'
            (a list of dicts, each with a 'url' key), 'sha1' and 'ident'.

    Returns:
        The ingest request dict, or None when the row is skipped: mimetype is
        set to something other than 'application/pdf', state is not 'active',
        no non-archive.org URL is present, or the sha1 is missing/empty.

    Raises:
        KeyError: if a 'urls' entry has no 'url' key, or an otherwise
            acceptable row has no 'ident' key.
    """
    if row.get("mimetype") not in (None, "application/pdf"):
        return None
    if row.get("state") != "active":
        return None

    base_url = None
    for entry in row.get("urls") or []:
        candidate = entry["url"]
        # An empty URL must not end the search: keep scanning so that a later,
        # usable URL is still found.
        if not candidate:
            continue
        if any(marker in candidate for marker in _ARCHIVE_URL_MARKERS):
            continue
        base_url = candidate
        break

    if not base_url:
        return None
    if not row.get("sha1"):
        return None

    return {
        "base_url": base_url,
        "ingest_type": "pdf",
        "link_source": "fatcat",
        "link_source_id": f"file_{row['ident']}",
        "ingest_request_source": "file-backfill",
        "ext_ids": {
            "sha1": row["sha1"],
        },
    }


def run():
    """Read JSON lines from stdin and print the resulting requests to stdout."""
    for line in sys.stdin:
        # Tolerate blank lines in the input stream.
        if not line.strip():
            continue
        request = transform(json.loads(line))
        if request:
            print(json.dumps(request, sort_keys=True))


if __name__ == "__main__":
    run()