summary | refs | log | tree | commit | diff | stats
path: root/notes/cleanups/scripts/file2ingestrequest.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2021-11-29 14:33:14 -0800
committer Bryan Newbold <bnewbold@robocracy.org> 2021-11-29 14:33:14 -0800
commit c5ea2dba358624f4c14da0a1a988ae14d0edfd59 (patch)
tree 7d3934e4922439402f882a374fe477906fd41aae /notes/cleanups/scripts/file2ingestrequest.py
parent ec2809ef2ac51c992463839c1e3451927f5e1661 (diff)
download fatcat-c5ea2dba358624f4c14da0a1a988ae14d0edfd59.tar.gz
fatcat-c5ea2dba358624f4c14da0a1a988ae14d0edfd59.zip
move 'cleanups' directory from notes to extra/
Diffstat (limited to 'notes/cleanups/scripts/file2ingestrequest.py')
-rwxr-xr-x notes/cleanups/scripts/file2ingestrequest.py 44
1 files changed, 0 insertions, 44 deletions
diff --git a/notes/cleanups/scripts/file2ingestrequest.py b/notes/cleanups/scripts/file2ingestrequest.py
deleted file mode 100755
index a005837f..00000000
--- a/notes/cleanups/scripts/file2ingestrequest.py
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/usr/bin/env python3
-
-from typing import Optional
-import json, sys
-
-
-def transform(row: dict) -> Optional[dict]:
- if row.get('mimetype') not in [None, 'application/pdf']:
- return None
- if row.get('state') != 'active':
- return None
- base_url = None
- for url in (row.get('urls') or []):
- url = url['url']
- if '://web.archive.org/' not in url and '://archive.org/' not in url:
- base_url = url
- break
- if not base_url:
- return None
- if not row.get('sha1'):
- return None
- return dict(
- base_url=base_url,
- ingest_type="pdf",
- link_source="fatcat",
- link_source_id=f"file_{row['ident']}",
- ingest_request_source="file-backfill",
- ext_ids=dict(
- sha1=row['sha1'],
- ),
- )
-
-
-def run():
- for l in sys.stdin:
- if not l.strip():
- continue
- row = json.loads(l)
- request = transform(row)
- if request:
- print(json.dumps(request, sort_keys=True))
-
-if __name__=="__main__":
- run()