From b1a68eefd96025ffe81a4e8e05b23dd71b9a6d92 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Wed, 15 Apr 2020 12:39:12 -0700
Subject: partial notes on S2 crawl ingest

---
 notes/ingest/2020-03_s2_ingest.md | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 notes/ingest/2020-03_s2_ingest.md

(limited to 'notes')

diff --git a/notes/ingest/2020-03_s2_ingest.md b/notes/ingest/2020-03_s2_ingest.md
new file mode 100644
index 0000000..fedaba0
--- /dev/null
+++ b/notes/ingest/2020-03_s2_ingest.md
@@ -0,0 +1,35 @@
+
+Crawled some 6 million new PDFs from pdfs.semanticscholar.org. Should get these
+ingested, as well as any previous existing content.
+
+Also, there are a bunch of PDF outlinks to the web; should do S2-specific
+matching and ingest of those.
+
+There are a few categories of paper from pdfs.s.o:
+
+1. we had previous GWB crawl, didn't re-crawl
+2. we had PDF from elsewhere on the web, didn't re-crawl
+3. crawled successfully
+4. crawl failed
+
+In this ingest, want to get all of categories 1 and 3. Could try to do this by
+dumping sandcrawler CDX table matching pdfs.s.o (which includes recent crawl),
+and join that against the ingest request list.
+
+For other random web URLs, can do the usual persist/backfill/recrawl pipeline.
+
+## Create Seedlist
+
+    zcat s2-corpus-pdfUrls.json.gz | parallel -j5 --linebuffer --round-robin --pipe ./s2_ingestrequest.py - | pv -l | gzip > s2-corpus-pdfUrls.2019.ingest_request.json.gz
+    zcat s2-corpus-s2PdfUrl.json.gz | parallel -j5 --linebuffer --round-robin --pipe ./s2_ingestrequest.py - | pv -l | gzip > s2-corpus-s2PdfUrl.2019.ingest_request.json.gz
+
+    zcat s2-corpus-s2PdfUrl.json.gz | jq .id -r | sort -u -S 2G > s2-corpus-s2PdfUrl.id_list
+    zcat s2-corpus-pdfUrls.json.gz | jq .id -r | sort -u -S 2G > s2-corpus-pdfUrls.id_list
+
+    zcat s2-corpus-pdfUrls.2019.ingest_request.json.gz s2-corpus-s2PdfUrl.2019.ingest_request.json.gz | rg pdfs.semanticscholar.org | sort -u -S 3G | gzip > s2_hosted_ingestrequest.json.gz
+    zcat s2-corpus-pdfUrls.2019.ingest_request.json.gz s2-corpus-s2PdfUrl.2019.ingest_request.json.gz | rg -v pdfs.semanticscholar.org | sort -u -S 3G | gzip > s2_external_ingestrequest.json.gz
+
+    zcat s2_external_ingestrequest.json.gz | wc -l
+    41201427
+    zcat s2_hosted_ingestrequest.json.gz | wc -l
+    23345761
-- 
cgit v1.2.3