diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-24 19:58:20 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-24 19:58:20 -0800 |
commit | eb60449cdc9614ec7eda79b8481d1d8487b9a5f6 (patch) | |
tree | 28e396669b9758447bc35bd2190608ce5c4116c1 /notes/cleanups/scripts/file2ingestrequest.py | |
parent | 75bde4ad3970e8e63b04009cfd16ed4b9a924ce7 (diff) | |
download | fatcat-eb60449cdc9614ec7eda79b8481d1d8487b9a5f6.tar.gz fatcat-eb60449cdc9614ec7eda79b8481d1d8487b9a5f6.zip |
notes on file_meta partial cleanup
Diffstat (limited to 'notes/cleanups/scripts/file2ingestrequest.py')
-rwxr-xr-x | notes/cleanups/scripts/file2ingestrequest.py | 44 |
1 files changed, 44 insertions, 0 deletions
#!/usr/bin/env python3
"""Turn file rows into PDF ingest requests.

Reads one JSON object per line from stdin, converts each suitable row into an
ingest request dict, and prints the request as a JSON line (keys sorted) on
stdout. Rows that are not suitable produce no output.

NOTE(review): the rows are presumably fatcat file entity exports (the requests
are tagged ``link_source="fatcat"``) -- confirm against the producer.
"""

from typing import Optional
import json
import sys

# Substrings that mark a URL as pointing at an archive host; such URLs are
# never chosen as the ``base_url`` of a request.
_ARCHIVE_MARKERS = ("://web.archive.org/", "://archive.org/")


def _pick_base_url(urls) -> Optional[str]:
    """Return the first usable URL that does not point at an archive host.

    Entries that are not dicts, or whose ``url`` value is missing, empty, or
    not a string, are skipped rather than raising, so one malformed entry
    does not hide a valid URL later in the list.
    """
    for entry in urls or []:
        candidate = entry.get("url") if isinstance(entry, dict) else None
        if not candidate or not isinstance(candidate, str):
            continue
        if not any(marker in candidate for marker in _ARCHIVE_MARKERS):
            return candidate
    return None


def transform(row: dict) -> Optional[dict]:
    """Build an ingest request from a single file row.

    Args:
        row: dict with (at least) the keys ``mimetype``, ``state``, ``urls``,
            ``sha1`` and ``ident``; any of them may be absent.

    Returns:
        The ingest request dict, or ``None`` when the row should be skipped:
        the mimetype is set to something other than ``application/pdf``, the
        state is not ``active``, there is no non-archive URL, or ``sha1`` or
        ``ident`` is missing/empty.
    """
    if row.get("mimetype") not in (None, "application/pdf"):
        return None
    if row.get("state") != "active":
        return None

    sha1 = row.get("sha1")
    ident = row.get("ident")
    # Both identifiers are required to build a meaningful request; treat a
    # missing ident like a missing sha1 instead of raising KeyError.
    if not sha1 or not ident:
        return None

    base_url = _pick_base_url(row.get("urls"))
    if base_url is None:
        return None

    return {
        "base_url": base_url,
        "ingest_type": "pdf",
        "link_source": "fatcat",
        "link_source_id": f"file_{ident}",
        "ingest_request_source": "file-backfill",
        "ext_ids": {"sha1": sha1},
    }


def run() -> None:
    """Stream JSON lines from stdin through ``transform`` to stdout."""
    for line in sys.stdin:
        # Tolerate blank lines (e.g. a trailing newline at end of input).
        if not line.strip():
            continue
        request = transform(json.loads(line))
        if request:
            print(json.dumps(request, sort_keys=True))


if __name__ == "__main__":
    run()