author     Bryan Newbold <bnewbold@archive.org>  2022-04-26 15:29:57 -0700
committer  Bryan Newbold <bnewbold@archive.org>  2022-04-26 15:29:57 -0700
commit     d3e30483fbfba5c57f86240d351de3580f5ae6fa (patch)
tree       4ee61fb0f762e43c1c0a398fdafccfe6edbe8628
parent     c0db231f1eebcf3acd78f0bf759e3df84e1d3b79 (diff)
more dataset crawl notes
-rw-r--r--  notes/ingest/2021-12-13_datasets.md  53
1 file changed, 53 insertions, 0 deletions
diff --git a/notes/ingest/2021-12-13_datasets.md b/notes/ingest/2021-12-13_datasets.md
index 1df633f..786c3b2 100644
--- a/notes/ingest/2021-12-13_datasets.md
+++ b/notes/ingest/2021-12-13_datasets.md
@@ -336,6 +336,43 @@ Need to update fatcat file worker to support single-file filesets... was that th
# Counter({'total': 10, 'insert': 10, 'skip': 0, 'update': 0, 'exists': 0})
+Trying again 2022-03-23:
+
+ git log | head -n1
+ # commit 134cb050988be2c545af89e0a67c4998307bb819
+
+ head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success-file.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 10, 'skip': 10, 'skip-single-file': 10, 'insert': 0, 'update': 0, 'exists': 0})
+
+ head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-file-results -
+ # Counter({'total': 10, 'skip': 10, 'skip-status': 10, 'insert': 0, 'update': 0, 'exists': 0})
+
+ head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 10, 'exists': 10, 'skip': 0, 'insert': 0, 'update': 0})
+
+ head -n30 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 30, 'skip': 20, 'skip-release-has-fileset': 20, 'exists': 10, 'insert': 0, 'update': 0})
+
+ head -n200 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 172, 'skip': 162, 'skip-release-has-fileset': 162, 'exists': 10, 'insert': 0, 'update': 0})
+
+ head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success-file.json \
+ | ./fatcat_import.py ingest-fileset-file-results -
+ # Counter({'total': 10, 'insert': 8, 'skip': 2, 'skip-bad-hashes': 2, 'update': 0, 'exists': 0})
+
+Fixed a small logic error in the insert path.
+
+ head -n30 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \
+ | ./fatcat_import.py ingest-fileset-results -
+ # Counter({'total': 30, 'insert': 20, 'exists': 10, 'skip': 0, 'update': 0})
+
+archive.org datasets are *not* getting uploaded with the correct paths: the
+directory prefixes are getting clobbered.
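+
+One way to spot-check this (not a command from these notes; the item
+identifier below is a placeholder): list the files in one of the uploaded
+fileset items with the `ia` CLI and count names that still contain a
+directory separator.
+
+    # hypothetical item identifier, just illustrating the check
+    ia list some-dataset-item-EXAMPLE | grep -c /
+    # a count of 0 would mean every directory prefix got flattened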
## Summary
@@ -449,3 +486,19 @@ These are ready to crawl, in the existing dataset crawl.
| awk '{print "F+ " $1}' \
> /srv/sandcrawler/tasks/dataset_seedlist_retries5.manifest_terminal.schedule
+## Running Uploads Again
+
+Looks like the temporary download files got wiped on `wbgrp-svc263`. This is a
+big bummer! Will need to download many of these over again.
+
+ # sandcrawler git: c69a8dadb0426fec10fe38474c2f37ceaebdf316
+ # skip_cleanup_local_files=True is still default
+
+ zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \
+ | shuf \
+ | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py --enable-sentry requests --no-spn2 - \
+ | pv -l \
+ > /srv/sandcrawler/tasks/ingest_dataset_combined_results.2022-04-04.json
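+
+    # (not a command from these notes; once the run finishes, a routine
+    # status breakdown of the output file would look roughly like this)
+    # jq -r .status /srv/sandcrawler/tasks/ingest_dataset_combined_results.2022-04-04.json \
+    #     | sort | uniq -c | sort -nr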
+
+ # filter out zenodo, very slow:
+ # rg -v 10.5281 \