| author    | Bryan Newbold <bnewbold@archive.org> | 2022-04-26 15:29:57 -0700 |
|-----------|--------------------------------------|---------------------------|
| committer | Bryan Newbold <bnewbold@archive.org> | 2022-04-26 15:29:57 -0700 |
| commit    | d3e30483fbfba5c57f86240d351de3580f5ae6fa | |
| tree      | 4ee61fb0f762e43c1c0a398fdafccfe6edbe8628 | |
| parent    | c0db231f1eebcf3acd78f0bf759e3df84e1d3b79 | |
more dataset crawl notes
| -rw-r--r-- | notes/ingest/2021-12-13_datasets.md | 53 | 
1 file changed, 53 insertions, 0 deletions
diff --git a/notes/ingest/2021-12-13_datasets.md b/notes/ingest/2021-12-13_datasets.md
index 1df633f..786c3b2 100644
--- a/notes/ingest/2021-12-13_datasets.md
+++ b/notes/ingest/2021-12-13_datasets.md
@@ -336,6 +336,43 @@ Need to update fatcat file worker to support single-file filesets... was that th
     # Counter({'total': 10, 'insert': 10, 'skip': 0, 'update': 0, 'exists': 0})
+
+Trying again 2022-03-23:
+
+    git log | head -n1
+    # commit 134cb050988be2c545af89e0a67c4998307bb819
+
+    head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success-file.json \
+        | ./fatcat_import.py ingest-fileset-results -
+    # Counter({'total': 10, 'skip': 10, 'skip-single-file': 10, 'insert': 0, 'update': 0, 'exists': 0})
+
+    head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+        | ./fatcat_import.py ingest-fileset-file-results -
+    # Counter({'total': 10, 'skip': 10, 'skip-status': 10, 'insert': 0, 'update': 0, 'exists': 0})
+
+    head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+        | ./fatcat_import.py ingest-fileset-results -
+    # Counter({'total': 10, 'exists': 10, 'skip': 0, 'insert': 0, 'update': 0})
+
+    head -n30 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+        | ./fatcat_import.py ingest-fileset-results -
+    # Counter({'total': 30, 'skip': 20, 'skip-release-has-fileset': 20, 'exists': 10, 'insert': 0, 'update': 0})
+
+    head -n200 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+        | ./fatcat_import.py ingest-fileset-results -
+    # Counter({'total': 172, 'skip': 162, 'skip-release-has-fileset': 162, 'exists': 10, 'insert': 0, 'update': 0})
+
+    head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success-file.json \
+        | ./fatcat_import.py ingest-fileset-file-results -
+    # Counter({'total': 10, 'insert': 8, 'skip': 2, 'skip-bad-hashes': 2, 'update': 0, 'exists': 0})
+
+Fixed a small logic error in the insert path.
+
+    head -n30 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+        | ./fatcat_import.py ingest-fileset-results -
+    # Counter({'total': 30, 'insert': 20, 'exists': 10, 'skip': 0, 'update': 0})
+
+archive.org datasets are *not* getting uploaded with the correct path:
+directory prefixes are getting clobbered.
 
 ## Summary
@@ -449,3 +486,19 @@ These are ready to crawl, in the existing dataset crawl.
         | awk '{print "F+ " $1}' \
         > /srv/sandcrawler/tasks/dataset_seedlist_retries5.manifest_terminal.schedule
 
+## Running Uploads Again
+
+Looks like the temporary download files got wiped on `wbgrp-svc263`. This is a
+big bummer! Will need to download many of these over again.
+
+    # sandcrawler git: c69a8dadb0426fec10fe38474c2f37ceaebdf316
+    # skip_cleanup_local_files=True is still default
+
+    zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+        | shuf \
+        | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py --enable-sentry requests --no-spn2 - \
+        | pv -l \
+        > /srv/sandcrawler/tasks/ingest_dataset_combined_results.2022-04-04.json
+
+    # filter out zenodo, very slow:
+    # rg -v 10.5281 \
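The trailing `rg -v 10.5281` comment is cut off mid-pipeline in the notes. As
a sketch only (this placement is an assumption, not something the commit
shows): 10.5281 is the Zenodo DOI prefix, so the filter would presumably drop
Zenodo lines before `shuf`:

    # Assumed placement of the zenodo filter; 10.5281 is the Zenodo DOI
    # prefix, and rg -v passes through only non-matching JSON lines. Not
    # part of the commit, just one plausible completion of the truncated
    # comment above.
    zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
        | rg -v 10.5281 \
        | shuf \
        | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py --enable-sentry requests --no-spn2 - \
        | pv -l \
        > /srv/sandcrawler/tasks/ingest_dataset_combined_results.2022-04-04.json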

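On the "directory prefixes are getting clobbered" note in the first hunk: the
commit doesn't include the buggy upload code, so the sketch below only
illustrates the general failure mode using the `ia` CLI; the item name and
file paths are made-up examples, not sandcrawler's actual code path:

    # Hypothetical illustration of the clobbering failure mode, NOT the
    # actual sandcrawler upload code. Both manifest entries share a
    # basename, so uploading by basename drops the directory prefix and
    # the second upload overwrites the first inside the item.
    for f in data/run1/results.csv data/run2/results.csv; do
        ia upload SOME_ITEM "$f" --remote-name="$(basename "$f")"   # buggy
    done

    # Preserving the manifest's relative path as the remote name keeps
    # the directory structure of the fileset intact:
    for f in data/run1/results.csv data/run2/results.csv; do
        ia upload SOME_ITEM "$f" --remote-name="$f"
    done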