aboutsummaryrefslogtreecommitdiffstats
path: root/notes/crawl_cdx_merge.md
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2018-08-24 13:39:02 -0700
committerBryan Newbold <bnewbold@archive.org>2018-08-24 13:39:02 -0700
commit1ae7fd2f0c5661560b15be86614c2c4d41b21205 (patch)
tree71ed116cfbc65562bfcbd2d913402c098c23c1df /notes/crawl_cdx_merge.md
parentf21bf5c66382a475a5127e449d05a75ba41a9a25 (diff)
downloadsandcrawler-1ae7fd2f0c5661560b15be86614c2c4d41b21205.tar.gz
sandcrawler-1ae7fd2f0c5661560b15be86614c2c4d41b21205.zip
commit notes from my laptop
Diffstat (limited to 'notes/crawl_cdx_merge.md')
-rw-r--r--notes/crawl_cdx_merge.md16
1 files changed, 16 insertions, 0 deletions
diff --git a/notes/crawl_cdx_merge.md b/notes/crawl_cdx_merge.md
new file mode 100644
index 0000000..a843a8d
--- /dev/null
+++ b/notes/crawl_cdx_merge.md
@@ -0,0 +1,16 @@
+
+## Old Way
+
+
+Use metamgr to export an items list.
+
+Download all the CDX files, then merge, sort, and de-duplicate them:
+
+ mkdir CRAWL-2000 && cd CRAWL-2000
+ cat ../CRAWL-2000.items | shuf | parallel --bar -j6 ia download {} {}.cdx.gz
+ ls */*.cdx.gz | parallel --bar -j1 zcat {} > CRAWL-2000.unsorted.cdx
+ sort -u CRAWL-2000.unsorted.cdx > CRAWL-2000.cdx
+ wc -l CRAWL-2000.cdx
+ rm CRAWL-2000.unsorted.cdx
+
+ # gzip and upload to petabox, or send to HDFS, or whatever