From 4e8407758618bece136addffe301ba8357366de3 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Fri, 7 Jan 2022 18:02:09 -0800
Subject: enqueue PLATFORM PDFs for crawl

---
 notes/tasks/2022-01-07_grobid_platform_pdfs.md | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 notes/tasks/2022-01-07_grobid_platform_pdfs.md

diff --git a/notes/tasks/2022-01-07_grobid_platform_pdfs.md b/notes/tasks/2022-01-07_grobid_platform_pdfs.md
new file mode 100644
index 0000000..b5422c2
--- /dev/null
+++ b/notes/tasks/2022-01-07_grobid_platform_pdfs.md
@@ -0,0 +1,23 @@
+
+Martin crawled more than 10 million new PDFs from various platform domains. We
+should get these processed with GROBID and included in sandcrawler-db.
+
+## Select CDX Rows
+
+    COPY (
+        SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+        FROM cdx
+        LEFT JOIN grobid ON grobid.sha1hex = cdx.sha1hex
+        WHERE
+            grobid.sha1hex IS NULL
+            AND cdx.sha1hex IS NOT NULL
+            AND cdx.warc_path LIKE 'PLATFORM-CRAWL-2020%'
+        -- LIMIT 5;
+    )
+    TO '/srv/sandcrawler/tasks/ungrobided_platform_crawl.2022-01-07.cdx.json'
+    WITH NULL '';
+    => COPY 8801527
+
+    cat /srv/sandcrawler/tasks/ungrobided_platform_crawl.2022-01-07.cdx.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+    # for pdfextract, the Kafka topic would instead be: sandcrawler-prod.unextracted
-- 
cgit v1.2.3