summary | refs | log | tree | commit | diff | stats
path: root/notes/cleanups/scripts/file2ingestrequest.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2021-11-24 19:58:20 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2021-11-24 19:58:20 -0800
commit: eb60449cdc9614ec7eda79b8481d1d8487b9a5f6 (patch)
tree: 28e396669b9758447bc35bd2190608ce5c4116c1 /notes/cleanups/scripts/file2ingestrequest.py
parent: 75bde4ad3970e8e63b04009cfd16ed4b9a924ce7 (diff)
download: fatcat-eb60449cdc9614ec7eda79b8481d1d8487b9a5f6.tar.gz
download: fatcat-eb60449cdc9614ec7eda79b8481d1d8487b9a5f6.zip
notes on file_meta partial cleanup
Diffstat (limited to 'notes/cleanups/scripts/file2ingestrequest.py')
-rwxr-xr-x notes/cleanups/scripts/file2ingestrequest.py 44
1 files changed, 44 insertions, 0 deletions
diff --git a/notes/cleanups/scripts/file2ingestrequest.py b/notes/cleanups/scripts/file2ingestrequest.py
new file mode 100755
index 00000000..a005837f
--- /dev/null
+++ b/notes/cleanups/scripts/file2ingestrequest.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+
+from typing import Optional
+import json, sys
+
+
+def transform(row: dict) -> Optional[dict]:
+    """
+    Build a PDF ingest request dict from one file row.
+
+    Returns None (meaning "skip this row") when any of these hold:
+
+    - 'mimetype' is set to something other than 'application/pdf'
+    - 'state' is not 'active'
+    - no entry in 'urls' carries a non-empty URL that lies outside
+      web.archive.org / archive.org
+    - 'sha1' is missing or empty
+
+    Otherwise the first qualifying URL (in list order) becomes `base_url`.
+
+    Raises KeyError if the row passes every check but has no 'ident' key.
+    """
+    if row.get('mimetype') not in [None, 'application/pdf']:
+        return None
+    if row.get('state') != 'active':
+        return None
+    base_url = None
+    for entry in (row.get('urls') or []):
+        candidate = entry.get('url')
+        # An entry with a missing or empty URL must not abort the search:
+        # a later entry may still hold a usable URL.
+        if not candidate:
+            continue
+        if '://web.archive.org/' not in candidate and '://archive.org/' not in candidate:
+            base_url = candidate
+            break
+    if not base_url:
+        return None
+    if not row.get('sha1'):
+        return None
+    return dict(
+        base_url=base_url,
+        ingest_type="pdf",
+        link_source="fatcat",
+        link_source_id=f"file_{row['ident']}",
+        ingest_request_source="file-backfill",
+        ext_ids=dict(
+            sha1=row['sha1'],
+        ),
+    )
+
+
+def run():
+    """
+    Stream JSON lines from stdin and print one ingest request per kept row.
+
+    Whitespace-only lines are ignored. Each remaining line is parsed as JSON
+    and passed to transform(); rows for which it returns a falsy value
+    produce no output. Requests are printed as JSON with sorted keys, one
+    per line.
+    """
+    non_blank = (line for line in sys.stdin if line.strip())
+    for line in non_blank:
+        request = transform(json.loads(line))
+        if not request:
+            continue
+        print(json.dumps(request, sort_keys=True))
+
+# Only process stdin when executed as a script, not when imported.
+if __name__ == "__main__":
+    run()