diff options
Diffstat (limited to 'notes/ingest')
-rw-r--r-- | notes/ingest/2020-03_s2_ingest.md | 35 |
1 files changed, 35 insertions, 0 deletions
diff --git a/notes/ingest/2020-03_s2_ingest.md b/notes/ingest/2020-03_s2_ingest.md new file mode 100644 index 0000000..fedaba0 --- /dev/null +++ b/notes/ingest/2020-03_s2_ingest.md @@ -0,0 +1,35 @@ + +Crawled some 6 million new PDFs from pdfs.semanticscholar.org. Should get these +ingested, as well as any previous existing content. + +Also, there are a bunch of PDF outlinks to the web; should do S2-specific +matching and ingest of those. + +There are a few categories of paper from pdfs.s.o: + +1. we had previous GWB crawl, didn't re-crawl +2. we had PDF from elsewhere on the web, didn't re-crawl +3. crawled successfully +4. crawl failed + +In this ingest, want to get all of categories 1 and 3. Could try to do this by +dumping sandcrawler CDX table matching pdfs.s.o (which includes recent crawl), +and join that against the ingest request list. + +For other random web URLs, can do the usual persist/backfill/recrawl pipeline. + +## Create Seedlist + + zcat s2-corpus-pdfUrls.json.gz | parallel -j5 --linebuffer --round-robin --pipe ./s2_ingestrequest.py - | pv -l | gzip > s2-corpus-pdfUrls.2019.ingest_request.json.gz + zcat s2-corpus-s2PdfUrl.json.gz | parallel -j5 --linebuffer --round-robin --pipe ./s2_ingestrequest.py - | pv -l | gzip > s2-corpus-s2PdfUrl.2019.ingest_request.json.gz + + zcat s2-corpus-s2PdfUrl.json.gz | jq .id -r | sort -u -S 2G > s2-corpus-s2PdfUrl.id_list + zcat s2-corpus-pdfUrls.json.gz | jq .id -r | sort -u -S 2G > s2-corpus-pdfUrls.id_list + + zcat s2-corpus-pdfUrls.2019.ingest_request.json.gz s2-corpus-s2PdfUrl.2019.ingest_request.json.gz | rg pdfs.semanticscholar.org | sort -u -S 3G | gzip > s2_hosted_ingestrequest.json.gz + zcat s2-corpus-pdfUrls.2019.ingest_request.json.gz s2-corpus-s2PdfUrl.2019.ingest_request.json.gz | rg -v pdfs.semanticscholar.org | sort -u -S 3G | gzip > s2_external_ingestrequest.json.gz + + zcat s2_external_ingestrequest.json.gz | wc -l + 41201427 + zcat s2_hosted_ingestrequest.json.gz | wc -l + 23345761 |