aboutsummaryrefslogtreecommitdiffstats
path: root/notes/tasks
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2021-10-04 12:56:29 -0700
committerBryan Newbold <bnewbold@archive.org>2021-10-04 12:56:29 -0700
commiteebacc4fb395d829e342dab02f42591b29b942ae (patch)
tree6f06e588c770ba042ad6ff08da09ee4374107acc /notes/tasks
parent001ea86c59d10c9bd8f8bf6b8980ae23d52f9216 (diff)
downloadsandcrawler-eebacc4fb395d829e342dab02f42591b29b942ae.tar.gz
sandcrawler-eebacc4fb395d829e342dab02f42591b29b942ae.zip
notes on dumping PDF URL lists for partners
Diffstat (limited to 'notes/tasks')
-rw-r--r--notes/tasks/2021-09-09_pdf_url_lists.md66
1 files changed, 66 insertions, 0 deletions
diff --git a/notes/tasks/2021-09-09_pdf_url_lists.md b/notes/tasks/2021-09-09_pdf_url_lists.md
new file mode 100644
index 0000000..52a3264
--- /dev/null
+++ b/notes/tasks/2021-09-09_pdf_url_lists.md
@@ -0,0 +1,66 @@
+
+Want to dump a URL list to share with partners, filtered to content we think is
+likely to be scholarly.
+
+Columns to include:
+
+- original URL
+- capture timestamp
+- SHA1
+
+## Stats Overview
+
+file_meta table, mimetype=application/pdf: 173,816,433
+
+cdx table, mimetype=application/pdf: 131,346,703
+
+ingest_file_result table, pdf, success: 66,487,928
+
+## Ingested PDF URLs
+
+"Ingested" URLs: ingest_file_result table, pdf and hit=true; include base URL also?
+
+ COPY (
+ SELECT
+ base_url as start_url,
+ terminal_url as pdf_url,
+ terminal_dt as pdf_url_timestamp,
+ terminal_sha1hex as pdf_sha1hex
+ FROM ingest_file_result
+ WHERE
+ ingest_type = 'pdf'
+ AND status = 'success'
+ )
+ TO '/srv/sandcrawler/tasks/wayback_pdf_targeted.2021-09-09.tsv'
+ WITH NULL '';
+ => 77,892,849
+
+## CDX PDFs
+
+"All web PDFs": CDX query; left join file_meta, but don't require
+
+ COPY (
+ SELECT
+ cdx.url as pdf_url,
+ cdx.datetime as pdf_url_timestamp,
+ cdx.sha1hex as pdf_sha1hex
+ FROM cdx
+ LEFT JOIN file_meta
+ ON
+ cdx.sha1hex = file_meta.sha1hex
+ WHERE
+ file_meta.mimetype = 'application/pdf'
+ OR (
+ file_meta.mimetype IS NULL
+ AND cdx.mimetype = 'application/pdf'
+ )
+ )
+ TO '/srv/sandcrawler/tasks/wayback_pdf_speculative.2021-09-09.tsv'
+ WITH NULL '';
+ => 147,837,935
+
+## Processed web PDFs
+
+"Parsed web PDFs": `file_meta`, left join CDX
+
+(didn't do this one)