diff options
-rw-r--r-- | extra/container_count_update/README.md | 41 | ||||
-rwxr-xr-x | python/fatcat_ingest.py | 8 |
2 files changed, 41 insertions, 8 deletions
diff --git a/extra/container_count_update/README.md b/extra/container_count_update/README.md
new file mode 100644
index 00000000..3e9a4315
--- /dev/null
+++ b/extra/container_count_update/README.md
@@ -0,0 +1,41 @@
+
+Here are the fields we want to populate:
+
+    "releases_total": { "type": "integer" },
+    "preservation_bright": { "type": "integer" },
+    "preservation_dark": { "type": "integer" },
+    "preservation_shadows_only":{ "type": "integer" },
+    "preservation_none": { "type": "integer" },
+
+Populate local index for testing:
+
+    fatcat-cli search container --index-json --limit 100 state:active \
+        | pv -l \
+        > container_es_docs.json
+
+    cat container_es_docs.json \
+        | esbulk -verbose -index fatcat_container_v03c -id ident
+
+    cat container_es_docs.json \
+        | jq .ident -r \
+        > container_idents.tsv
+
+Quick way to dump all idents in the current index:
+
+    fatcat-cli search container --index-json --limit 0 state:active \
+        | jq .ident -r \
+        | pv -l \
+        > container_idents.tsv
+
+    cat container_idents.tsv \
+        | parallel -j10 curl --fail -s 'https://fatcat.wiki/container/{}/stats.json' \
+        | jq -c . \
+        | pv -l \
+        > container_stats.json
+
+    cat container_stats.json \
+        | jq '{ ident: .ident, releases_total: .total, preservation_bright: .preservation.bright, preservation_dark: .preservation.dark, preservation_shadows_only: .preservation.shadows_only, preservation_none: .preservation.none }' -c \
+        | esbulk -verbose -index fatcat_container_v03c -optype update -id ident
+
+This requires a recent version of esbulk (v0.7.5+)
+
diff --git a/python/fatcat_ingest.py b/python/fatcat_ingest.py
index 964907c8..3f8666ca 100755
--- a/python/fatcat_ingest.py
+++ b/python/fatcat_ingest.py
@@ -79,14 +79,6 @@ def _run_search_dump(args: argparse.Namespace, search: Search) -> None:
         file=sys.stderr,
     )
 
-    # don't try to clean up scroll if we are connected to public server (behind
-    # nginx proxy that disallows DELETE)
-    if args.elasticsearch_endpoint in (
-        "https://search.fatcat.wiki",
-        "https://search.qa.fatcat.wiki",
-    ):
-        search = search.params(clear_scroll=False)
-
     results = search.scan()
     for esr in results:
         if args.limit and counts["ingest_request"] >= args.limit: