-rw-r--r--  extra/container_count_update/README.md  | 41
-rwxr-xr-x  python/fatcat_ingest.py                 |  8
2 files changed, 41 insertions, 8 deletions
diff --git a/extra/container_count_update/README.md b/extra/container_count_update/README.md
new file mode 100644
index 00000000..3e9a4315
--- /dev/null
+++ b/extra/container_count_update/README.md
@@ -0,0 +1,41 @@
+
+Here are the fields we want to populate:
+
+    "releases_total": { "type": "integer" },
+    "preservation_bright": { "type": "integer" },
+    "preservation_dark": { "type": "integer" },
+    "preservation_shadows_only": { "type": "integer" },
+    "preservation_none": { "type": "integer" },
+
+Populate local index for testing:
+
+ fatcat-cli search container --index-json --limit 100 state:active \
+ | pv -l \
+ > container_es_docs.json
+
+ cat container_es_docs.json \
+ | esbulk -verbose -index fatcat_container_v03c -id ident
+
+ cat container_es_docs.json \
+ | jq .ident -r \
+ > container_idents.tsv
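+
+To sanity-check the load, a document count against the local index should
+come back at 100 (this assumes Elasticsearch is listening on the default
+localhost:9200):
+
+    curl -s localhost:9200/fatcat_container_v03c/_count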
+
+Quick way to dump all idents in the current index:
+
+ fatcat-cli search container --index-json --limit 0 state:active \
+ | jq .ident -r \
+ | pv -l \
+ > container_idents.tsv
+
+ cat container_idents.tsv \
+ | parallel -j10 curl --fail -s 'https://fatcat.wiki/container/{}/stats.json' \
+ | jq -c . \
+ | pv -l \
+ > container_stats.json
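+
+For reference, each line of container_stats.json is one JSON object per
+container; the fields used in the next step look roughly like this (values
+made up, other fields omitted):
+
+    {
+      "ident": "aaaaaaaaaaaaaeiraaaaaaaaam",
+      "total": 12345,
+      "preservation": {
+        "bright": 10000,
+        "dark": 1500,
+        "shadows_only": 500,
+        "none": 345
+      }
+    }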
+
+ cat container_stats.json \
+ | jq '{ ident: .ident, releases_total: .total, preservation_bright: .preservation.bright, preservation_dark: .preservation.dark, preservation_shadows_only: .preservation.shadows_only, preservation_none: .preservation.none }' -c \
+ | esbulk -verbose -index fatcat_container_v03c -optype update -id ident
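+
+Each line piped into esbulk is then a flat partial-update document matching
+the fields listed at the top, roughly:
+
+    {"ident":"aaaaaaaaaaaaaeiraaaaaaaaam","releases_total":12345,"preservation_bright":10000,"preservation_dark":1500,"preservation_shadows_only":500,"preservation_none":345}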
+
+This requires a recent version of esbulk (v0.7.5+).
+
diff --git a/python/fatcat_ingest.py b/python/fatcat_ingest.py
index 964907c8..3f8666ca 100755
--- a/python/fatcat_ingest.py
+++ b/python/fatcat_ingest.py
@@ -79,14 +79,6 @@ def _run_search_dump(args: argparse.Namespace, search: Search) -> None:
file=sys.stderr,
)
- # don't try to clean up scroll if we are connected to public server (behind
- # nginx proxy that disallows DELETE)
- if args.elasticsearch_endpoint in (
- "https://search.fatcat.wiki",
- "https://search.qa.fatcat.wiki",
- ):
- search = search.params(clear_scroll=False)
-
results = search.scan()
for esr in results:
if args.limit and counts["ingest_request"] >= args.limit: