From d3e30483fbfba5c57f86240d351de3580f5ae6fa Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 26 Apr 2022 15:29:57 -0700 Subject: more dataset crawl notes --- notes/ingest/2021-12-13_datasets.md | 53 +++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/notes/ingest/2021-12-13_datasets.md b/notes/ingest/2021-12-13_datasets.md index 1df633f..786c3b2 100644 --- a/notes/ingest/2021-12-13_datasets.md +++ b/notes/ingest/2021-12-13_datasets.md @@ -336,6 +336,43 @@ Need to update fatcat file worker to support single-file filesets... was that th # Counter({'total': 10, 'insert': 10, 'skip': 0, 'update': 0, 'exists': 0}) +Trying again 2022-03-23: + + git log | head -n1 + # commit 134cb050988be2c545af89e0a67c4998307bb819 + + head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success-file.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 10, 'skip': 10, 'skip-single-file': 10, 'insert': 0, 'update': 0, 'exists': 0}) + + head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \ + | ./fatcat_import.py ingest-fileset-file-results - + # Counter({'total': 10, 'skip': 10, 'skip-status': 10, 'insert': 0, 'update': 0, 'exists': 0}) + + head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 10, 'exists': 10, 'skip': 0, 'insert': 0, 'update': 0}) + + head -n30 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 30, 'skip': 20, 'skip-release-has-fileset': 20, 'exists': 10, 'insert': 0, 'update': 0}) + + head -n200 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 172, 'skip': 162, 'skip-release-has-fileset': 162, 'exists': 10, 'insert': 0, 'update': 0}) + + head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success-file.json \ + | ./fatcat_import.py ingest-fileset-file-results - + # Counter({'total': 10, 'insert': 8, 'skip': 2, 'skip-bad-hashes': 2, 'update': 0, 'exists': 0}) + +Fixed a small logic error in insert path. + + head -n30 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 30, 'insert': 20, 'exists': 10, 'skip': 0, 'update': 0}) + +archive.org datasets are *not* getting uploaded with the correct path. path +directory prefixes are getting clobbered. ## Summary @@ -449,3 +486,19 @@ These are ready to crawl, in the existing dataset crawl. | awk '{print "F+ " $1}' \ > /srv/sandcrawler/tasks/dataset_seedlist_retries5.manifest_terminal.schedule +## Running Uploads Again + +Looks like the temporary download files got wiped on `wbgrp-svc263`. This is a +big bummer! Will need to download many of these over again. + + # sandcrawler git: c69a8dadb0426fec10fe38474c2f37ceaebdf316 + # skip_cleanup_local_files=True is still default + + zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \ + | shuf \ + | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py --enable-sentry requests --no-spn2 - \ + | pv -l \ + > /srv/sandcrawler/tasks/ingest_dataset_combined_results.2022-04-04.json + + # filter out zenodo, very slow: + # rg -v 10.5281 \ -- cgit v1.2.3