author     Bryan Newbold <bnewbold@archive.org>  2022-04-26 15:29:57 -0700
committer  Bryan Newbold <bnewbold@archive.org>  2022-04-26 15:29:57 -0700
commit     d3e30483fbfba5c57f86240d351de3580f5ae6fa (patch)
tree       4ee61fb0f762e43c1c0a398fdafccfe6edbe8628
parent     c0db231f1eebcf3acd78f0bf759e3df84e1d3b79 (diff)
more dataset crawl notes
-rw-r--r--  notes/ingest/2021-12-13_datasets.md  53
1 file changed, 53 insertions, 0 deletions
diff --git a/notes/ingest/2021-12-13_datasets.md b/notes/ingest/2021-12-13_datasets.md
index 1df633f..786c3b2 100644
--- a/notes/ingest/2021-12-13_datasets.md
+++ b/notes/ingest/2021-12-13_datasets.md
@@ -336,6 +336,43 @@ Need to update fatcat file worker to support single-file filesets... was that th
# Counter({'total': 10, 'insert': 10, 'skip': 0, 'update': 0, 'exists': 0})
+Trying again 2022-03-23:
+
+ git log | head -n1
+ # commit 134cb050988be2c545af89e0a67c4998307bb819
+
+ head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success-file.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 10, 'skip': 10, 'skip-single-file': 10, 'insert': 0, 'update': 0, 'exists': 0})
+
+ head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-file-results -
+ # Counter({'total': 10, 'skip': 10, 'skip-status': 10, 'insert': 0, 'update': 0, 'exists': 0})
+
+ head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 10, 'exists': 10, 'skip': 0, 'insert': 0, 'update': 0})
+
+ head -n30 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 30, 'skip': 20, 'skip-release-has-fileset': 20, 'exists': 10, 'insert': 0, 'update': 0})
+
+ head -n200 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 172, 'skip': 162, 'skip-release-has-fileset': 162, 'exists': 10, 'insert': 0, 'update': 0})
+
+ head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success-file.json \
+ | ./fatcat_import.py ingest-fileset-file-results -
+ # Counter({'total': 10, 'insert': 8, 'skip': 2, 'skip-bad-hashes': 2, 'update': 0, 'exists': 0})
+
+Fixed a small logic error in the insert path.
+
+ head -n30 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 30, 'insert': 20, 'exists': 10, 'skip': 0, 'update': 0})
+
+archive.org datasets are *not* getting uploaded with the correct paths: the
+directory prefixes are getting clobbered.
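+
+One way to spot-check this (not a command from these notes; the item
+identifier below is a placeholder): list the files in one of the uploaded
+fileset items with the `ia` CLI and count names that still contain a
+directory separator.
+
+    # hypothetical item identifier, just illustrating the check
+    ia list some-dataset-item-EXAMPLE | grep -c /
+    # a count of 0 would mean every directory prefix got flattened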
## Summary
@@ -449,3 +486,19 @@ These are ready to crawl, in the existing dataset crawl.
| awk '{print "F+ " $1}' \
> /srv/sandcrawler/tasks/dataset_seedlist_retries5.manifest_terminal.schedule
+## Running Uploads Again
+
+Looks like the temporary download files got wiped on `wbgrp-svc263`. This is a
+big bummer! Will need to download many of these over again.
+
+ # sandcrawler git: c69a8dadb0426fec10fe38474c2f37ceaebdf316
+ # skip_cleanup_local_files=True is still default
+
+ zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+ | shuf \
+ | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py --enable-sentry requests --no-spn2 - \
+ | pv -l \
+ > /srv/sandcrawler/tasks/ingest_dataset_combined_results.2022-04-04.json
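+
+    # (not a command from these notes; once the run finishes, a routine
+    # status breakdown of the output file would look roughly like this)
+    # jq -r .status /srv/sandcrawler/tasks/ingest_dataset_combined_results.2022-04-04.json \
+    #     | sort | uniq -c | sort -nr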
+
+ # filter out zenodo, very slow:
+ # rg -v 10.5281 \