diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-04-26 15:29:57 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-04-26 15:29:57 -0700 |
commit | d3e30483fbfba5c57f86240d351de3580f5ae6fa (patch) | |
tree | 4ee61fb0f762e43c1c0a398fdafccfe6edbe8628 /notes | |
parent | c0db231f1eebcf3acd78f0bf759e3df84e1d3b79 (diff) | |
download | sandcrawler-d3e30483fbfba5c57f86240d351de3580f5ae6fa.tar.gz sandcrawler-d3e30483fbfba5c57f86240d351de3580f5ae6fa.zip |
more dataset crawl notes
Diffstat (limited to 'notes')
-rw-r--r-- | notes/ingest/2021-12-13_datasets.md | 53 |
1 file changed, 53 insertions, 0 deletions
diff --git a/notes/ingest/2021-12-13_datasets.md b/notes/ingest/2021-12-13_datasets.md index 1df633f..786c3b2 100644 --- a/notes/ingest/2021-12-13_datasets.md +++ b/notes/ingest/2021-12-13_datasets.md @@ -336,6 +336,43 @@ Need to update fatcat file worker to support single-file filesets... was that th # Counter({'total': 10, 'insert': 10, 'skip': 0, 'update': 0, 'exists': 0}) +Trying again 2022-03-23: + + git log | head -n1 + # commit 134cb050988be2c545af89e0a67c4998307bb819 + + head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success-file.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 10, 'skip': 10, 'skip-single-file': 10, 'insert': 0, 'update': 0, 'exists': 0}) + + head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \ + | ./fatcat_import.py ingest-fileset-file-results - + # Counter({'total': 10, 'skip': 10, 'skip-status': 10, 'insert': 0, 'update': 0, 'exists': 0}) + + head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 10, 'exists': 10, 'skip': 0, 'insert': 0, 'update': 0}) + + head -n30 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 30, 'skip': 20, 'skip-release-has-fileset': 20, 'exists': 10, 'insert': 0, 'update': 0}) + + head -n200 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 172, 'skip': 162, 'skip-release-has-fileset': 162, 'exists': 10, 'insert': 0, 'update': 0}) + + head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success-file.json \ + | ./fatcat_import.py ingest-fileset-file-results - + # Counter({'total': 10, 'insert': 8, 'skip': 2, 'skip-bad-hashes': 2, 'update': 0, 'exists': 0}) + +Fixed a small logic error in insert path. 
+ + head -n30 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 30, 'insert': 20, 'exists': 10, 'skip': 0, 'update': 0}) + +archive.org datasets are *not* getting uploaded with the correct path. path +directory prefixes are getting clobbered. ## Summary @@ -449,3 +486,19 @@ These are ready to crawl, in the existing dataset crawl. | awk '{print "F+ " $1}' \ > /srv/sandcrawler/tasks/dataset_seedlist_retries5.manifest_terminal.schedule +## Running Uploads Again + +Looks like the temporary download files got wiped on `wbgrp-svc263`. This is a +big bummer! Will need to download many of these over again. + + # sandcrawler git: c69a8dadb0426fec10fe38474c2f37ceaebdf316 + # skip_cleanup_local_files=True is still default + + zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \ + | shuf \ + | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py --enable-sentry requests --no-spn2 - \ + | pv -l \ + > /srv/sandcrawler/tasks/ingest_dataset_combined_results.2022-04-04.json + + # filter out zenodo, very slow: + # rg -v 10.5281 \ |