From 2280af9a1d0849c41950b44df18fe76e3b7c52c8 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 20 Apr 2022 16:06:05 -0700 Subject: bulk edits: docs on initial dataset/fileset ingest --- extra/bulk_edits/2022-04-07_initial_datasets.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 extra/bulk_edits/2022-04-07_initial_datasets.md diff --git a/extra/bulk_edits/2022-04-07_initial_datasets.md b/extra/bulk_edits/2022-04-07_initial_datasets.md new file mode 100644 index 00000000..90827a38 --- /dev/null +++ b/extra/bulk_edits/2022-04-07_initial_datasets.md @@ -0,0 +1,22 @@ + +Importing fileset and file entities from initial sandcrawler ingests. + +Git commit: `ede98644a89afd15d903061e0998dbd08851df6d` + +Filesets: + + export FATCAT_AUTH_SANDCRAWLER=[...] + cat /tmp/ingest_dataset_combined_results.2022-04-04.partial.json \ + | ./fatcat_import.py ingest-fileset-results - + # editgroup_5l47i7bscvfmpf4ddytauoekea + # Counter({'total': 195, 'skip': 176, 'skip-hit': 160, 'insert': 19, 'skip-single-file': 14, 'skip-partial-file-info': 2, 'update': 0, 'exists': 0}) + + cat /srv/fatcat/datasets/ingest_dataset_combined_results.2022-04-04.partial.json \ + | ./fatcat_import.py ingest-fileset-file-results - + # editgroup_i2k2ucon7nap3gui3z7amuiug4 + # Counter({'total': 195, 'skip': 184, 'skip-hit': 160, 'skip-status': 24, 'insert': 11, 'update': 0, 'exists': 0}) + +Tried running again to ensure that there are no duplicate inserts, and that +worked (entities were counted as 'exists' instead of 'insert'). + +Finally! -- cgit v1.2.3