diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-01-07 18:02:09 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-01-07 18:02:09 -0800 |
commit | 4e8407758618bece136addffe301ba8357366de3 (patch) | |
tree | f9c9e186a68793b7e37ffe042bc01f823922931c | |
parent | 65a0d38bcedca0610ca6fa8e053199f324062ace (diff) | |
download | sandcrawler-4e8407758618bece136addffe301ba8357366de3.tar.gz sandcrawler-4e8407758618bece136addffe301ba8357366de3.zip |
enqueue PLATFORM PDFs for crawl
-rw-r--r-- | notes/tasks/2022-01-07_grobid_platform_pdfs.md | 23 |
1 file changed, 23 insertions, 0 deletions
diff --git a/notes/tasks/2022-01-07_grobid_platform_pdfs.md b/notes/tasks/2022-01-07_grobid_platform_pdfs.md new file mode 100644 index 0000000..b5422c2 --- /dev/null +++ b/notes/tasks/2022-01-07_grobid_platform_pdfs.md @@ -0,0 +1,23 @@ + +Martin crawled more than 10 million new PDFs from various platform domains. We +should get these processed and included in sandcrawler-db. + +## Select CDX Rows + + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM cdx + LEFT JOIN grobid ON grobid.sha1hex = cdx.sha1hex + WHERE + grobid.sha1hex IS NULL + AND cdx.sha1hex IS NOT NULL + AND cdx.warc_path LIKE 'PLATFORM-CRAWL-2020%' + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_platform_crawl.2022-01-07.cdx.json' + WITH NULL ''; + => COPY 8801527 + + cat /srv/sandcrawler/tasks/ungrobided_platform_crawl.2022-01-07.cdx.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + + # for pdfextract, would be: sandcrawler-prod.unextracted |