diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-04 12:56:29 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-04 12:56:29 -0700 |
commit | eebacc4fb395d829e342dab02f42591b29b942ae (patch) | |
tree | 6f06e588c770ba042ad6ff08da09ee4374107acc /notes/tasks | |
parent | 001ea86c59d10c9bd8f8bf6b8980ae23d52f9216 (diff) | |
download | sandcrawler-eebacc4fb395d829e342dab02f42591b29b942ae.tar.gz sandcrawler-eebacc4fb395d829e342dab02f42591b29b942ae.zip |
notes on dumping PDF URL lists for partners
Diffstat (limited to 'notes/tasks')
-rw-r--r-- | notes/tasks/2021-09-09_pdf_url_lists.md | 66 |
1 files changed, 66 insertions, 0 deletions
diff --git a/notes/tasks/2021-09-09_pdf_url_lists.md b/notes/tasks/2021-09-09_pdf_url_lists.md new file mode 100644 index 0000000..52a3264 --- /dev/null +++ b/notes/tasks/2021-09-09_pdf_url_lists.md @@ -0,0 +1,66 @@ + +Want to dump a URL list to share with partners, filtered to content we think is +likely to be scholarly. + +Columns to include: + +- original URL +- capture timestamp +- SHA1 + +## Stats Overview + +file_meta table, mimetype=application/pdf: 173,816,433 + +cdx table, mimetype=application/pdf: 131,346,703 + +ingest_file_result table, pdf, success: 66,487,928 + +## Ingested PDF URLs + +"Ingested" URLs: ingest_file_result table, pdf and hit=true; include base URL also? + + COPY ( + SELECT + base_url as start_url, + terminal_url as pdf_url, + terminal_dt as pdf_url_timestamp, + terminal_sha1hex as pdf_sha1hex + FROM ingest_file_result + WHERE + ingest_type = 'pdf' + AND status = 'success' + ) + TO '/srv/sandcrawler/tasks/wayback_pdf_targeted.2021-09-09.tsv' + WITH NULL ''; + => 77,892,849 + +## CDX PDFs + +"All web PDFs": CDX query; left join file_meta, but don't require + + COPY ( + SELECT + cdx.url as pdf_url, + cdx.datetime as pdf_url_timestamp, + cdx.sha1hex as pdf_sha1hex + FROM cdx + LEFT JOIN file_meta + ON + cdx.sha1hex = file_meta.sha1hex + WHERE + file_meta.mimetype = 'application/pdf' + OR ( + file_meta.mimetype IS NULL + AND cdx.mimetype = 'application/pdf' + ) + ) + TO '/srv/sandcrawler/tasks/wayback_pdf_speculative.2021-09-09.tsv' + WITH NULL ''; + => 147,837,935 + +## Processed web PDFs + +"Parsed web PDFs": `file_meta`, left join CDX + +(didn't do this one) |