From eebacc4fb395d829e342dab02f42591b29b942ae Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Mon, 4 Oct 2021 12:56:29 -0700
Subject: notes on dumping PDF URL lists for partners

---
 notes/tasks/2021-09-09_pdf_url_lists.md | 66 +++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 notes/tasks/2021-09-09_pdf_url_lists.md

(limited to 'notes')

diff --git a/notes/tasks/2021-09-09_pdf_url_lists.md b/notes/tasks/2021-09-09_pdf_url_lists.md
new file mode 100644
index 0000000..52a3264
--- /dev/null
+++ b/notes/tasks/2021-09-09_pdf_url_lists.md
@@ -0,0 +1,66 @@
+
+Want to dump a URL list to share with partners, filtered to content we think is
+likely to be scholarly.
+
+Columns to include:
+
+- original URL
+- capture timestamp
+- SHA1
+
+## Stats Overview
+
+file_meta table, mimetype=application/pdf: 173,816,433
+
+cdx table, mimetype=application/pdf: 131,346,703
+
+ingest_file_result table, pdf, success: 66,487,928
+
+## Ingested PDF URLs
+
+"Ingested" URLs: ingest_file_result table, pdf and hit=true; include base URL also?
+
+    COPY (
+        SELECT
+            base_url as start_url,
+            terminal_url as pdf_url,
+            terminal_dt as pdf_url_timestamp,
+            terminal_sha1hex as pdf_sha1hex
+        FROM ingest_file_result
+        WHERE
+            ingest_type = 'pdf'
+            AND status = 'success'
+    )
+    TO '/srv/sandcrawler/tasks/wayback_pdf_targeted.2021-09-09.tsv'
+    WITH NULL '';
+    => 77,892,849
+
+## CDX PDFs
+
+"All web PDFs": CDX query; left join file_meta, but don't require
+
+    COPY (
+        SELECT
+            cdx.url as pdf_url,
+            cdx.datetime as pdf_url_timestamp,
+            cdx.sha1hex as pdf_sha1hex
+        FROM cdx
+        LEFT JOIN file_meta
+        ON
+            cdx.sha1hex = file_meta.sha1hex
+        WHERE
+            file_meta.mimetype = 'application/pdf'
+            OR (
+                file_meta.mimetype IS NULL
+                AND cdx.mimetype = 'application/pdf'
+            )
+    )
+    TO '/srv/sandcrawler/tasks/wayback_pdf_speculative.2021-09-09.tsv'
+    WITH NULL '';
+    => 147,837,935
+
+## Processed web PDFs
+
+"Parsed web PDFs": `file_meta`, left join CDX
+
+(didn't do this one)
-- 
cgit v1.2.3