author | Bryan Newbold <bnewbold@archive.org> | 2022-05-03 17:14:08 -0700
---|---|---
committer | Bryan Newbold <bnewbold@archive.org> | 2022-05-03 17:15:18 -0700
commit | ac7c44d332fcba83faae6a3e732c3415f6ab78a6 (patch) |
tree | 3cf258f75b301d93e01552b59d439cd10e2c8a13 /notes |
parent | 6dd9bc8d3312107796344341e43044907677bf85 (diff) |
download | sandcrawler-ac7c44d332fcba83faae6a3e732c3415f6ab78a6.tar.gz, sandcrawler-ac7c44d332fcba83faae6a3e732c3415f6ab78a6.zip |
PDF URL lists update
Diffstat (limited to 'notes')
-rw-r--r-- | notes/tasks/2021-09-09_pdf_url_lists.md | 4
-rw-r--r-- | notes/tasks/2022-04-27_pdf_url_lists.md | 72
2 files changed, 76 insertions, 0 deletions
diff --git a/notes/tasks/2021-09-09_pdf_url_lists.md b/notes/tasks/2021-09-09_pdf_url_lists.md
index 52a3264..cd8176e 100644
--- a/notes/tasks/2021-09-09_pdf_url_lists.md
+++ b/notes/tasks/2021-09-09_pdf_url_lists.md
@@ -64,3 +64,7 @@ ingest_file_result table, pdf, success: 66,487,928
 "Parsed web PDFs": `file_meta`, left join CDX
 
 (didn't do this one)
+
+---
+
+Uploaded all these to <https://archive.org/download/ia_scholarly_urls_2021-09-09>
diff --git a/notes/tasks/2022-04-27_pdf_url_lists.md b/notes/tasks/2022-04-27_pdf_url_lists.md
new file mode 100644
index 0000000..273ff32
--- /dev/null
+++ b/notes/tasks/2022-04-27_pdf_url_lists.md
@@ -0,0 +1,72 @@
+
+Another dump of PDF URLs for partners. This time we want to provide a TSV with
+full wayback download URLs, as well as "access" URLs.
+
+    export TASKDATE=2022-04-27
+
+## "Ingested", AKA, "Targeted" PDF URLs
+
+These are URLs where we did a successful ingest run.
+
+    COPY (
+        SELECT
+            terminal_sha1hex as pdf_sha1hex,
+            ('https://web.archive.org/web/' || terminal_dt || 'id_/' || terminal_url) as crawl_url,
+            ('https://web.archive.org/web/' || terminal_dt || '/' || terminal_url) as display_url
+        FROM ingest_file_result
+        WHERE
+            ingest_type = 'pdf'
+            AND status = 'success'
+            AND hit = true
+        ORDER BY terminal_sha1hex ASC
+        -- LIMIT 10;
+    )
+    TO '/srv/sandcrawler/tasks/ia_wayback_pdf_ingested.2022-04-27.tsv'
+    WITH NULL '';
+    => COPY 85712674
+
+May contain duplicates by sha1hex, by URL, or both (a dedup sketch follows the
+diff).
+
+Note that this could also be filtered by timestamp, to make the lists monthly
+or annual (a sketch follows the diff).
+
+## All CDX PDFs
+
+"All web PDFs": CDX query; left join `file_meta`, but don't require a match.
+
+    COPY (
+        SELECT
+            cdx.sha1hex as pdf_sha1hex,
+            ('https://web.archive.org/web/' || cdx.datetime || 'id_/' || cdx.url) as crawl_url,
+            ('https://web.archive.org/web/' || cdx.datetime || '/' || cdx.url) as display_url
+        FROM cdx
+        LEFT JOIN file_meta
+            ON cdx.sha1hex = file_meta.sha1hex
+        WHERE
+            file_meta.mimetype = 'application/pdf'
+            OR (
+                file_meta.mimetype IS NULL
+                AND cdx.mimetype = 'application/pdf'
+            )
+        ORDER BY cdx.sha1hex ASC
+        -- LIMIT 10;
+    )
+    TO '/srv/sandcrawler/tasks/ia_wayback_pdf_speculative.2022-04-27.tsv'
+    WITH NULL '';
+    => COPY 161504070
+
+Should be unique by wayback URL; may contain near-duplicates or duplicates by
+sha1hex.
+
+## Upload to archive.org
+
+TODO: next time, compress these files first (gzip/pigz; a sketch follows the
+diff)
+
+    ia upload ia_scholarly_urls_$TASKDATE \
+        -m collection:ia_biblio_metadata \
+        -m title:"IA Scholarly URLs ($TASKDATE)" \
+        -m date:$TASKDATE \
+        -m creator:"Internet Archive Web Group" \
+        -m description:"URL lists to PDFs on the web (and preserved in the wayback machine) which are likely to contain research materials." \
+        /srv/sandcrawler/tasks/ia_wayback_pdf_ingested.$TASKDATE.tsv \
+        /srv/sandcrawler/tasks/ia_wayback_pdf_speculative.$TASKDATE.tsv
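
On the dedup caveat in the notes ("may contain duplicates by sha1hex, by URL, or both"): if a single row per PDF were wanted, a minimal sketch using PostgreSQL's `DISTINCT ON`. Keeping the lexicographically lowest `terminal_url` per sha1hex is an assumption about the desired semantics, not what these dumps actually did:

    -- sketch: one row per sha1hex, keeping the lowest terminal_url;
    -- DISTINCT ON is PostgreSQL-specific and requires matching leading
    -- ORDER BY expressions
    SELECT DISTINCT ON (terminal_sha1hex)
        terminal_sha1hex as pdf_sha1hex,
        ('https://web.archive.org/web/' || terminal_dt || 'id_/' || terminal_url) as crawl_url,
        ('https://web.archive.org/web/' || terminal_dt || '/' || terminal_url) as display_url
    FROM ingest_file_result
    WHERE
        ingest_type = 'pdf'
        AND status = 'success'
        AND hit = true
    ORDER BY terminal_sha1hex ASC, terminal_url ASC;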
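
The monthly/annual filtering mentioned in the notes could be a simple range condition. A sketch, assuming `terminal_dt` is the usual 14-digit wayback timestamp string (`YYYYMMDDHHMMSS`), so lexicographic comparison matches chronological order; the year bounds and output path are illustrative:

    COPY (
        SELECT
            terminal_sha1hex as pdf_sha1hex,
            ('https://web.archive.org/web/' || terminal_dt || 'id_/' || terminal_url) as crawl_url,
            ('https://web.archive.org/web/' || terminal_dt || '/' || terminal_url) as display_url
        FROM ingest_file_result
        WHERE
            ingest_type = 'pdf'
            AND status = 'success'
            AND hit = true
            -- restrict to calendar year 2021 (illustrative bounds)
            AND terminal_dt >= '20210101000000'
            AND terminal_dt < '20220101000000'
        ORDER BY terminal_sha1hex ASC
    )
    TO '/srv/sandcrawler/tasks/ia_wayback_pdf_ingested.2021.tsv'
    WITH NULL '';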
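
For the compression TODO, one possible pre-upload step; this assumes pigz is available (falling back to plain gzip otherwise) and a gzip new enough (>= 1.6) to support `--keep`:

    # use pigz (parallel gzip) if available, else plain gzip
    command -v pigz >/dev/null && COMPRESS=pigz || COMPRESS=gzip
    # --keep retains the original .tsv files alongside the .tsv.gz output
    $COMPRESS --keep /srv/sandcrawler/tasks/ia_wayback_pdf_ingested.$TASKDATE.tsv
    $COMPRESS --keep /srv/sandcrawler/tasks/ia_wayback_pdf_speculative.$TASKDATE.tsv

The `ia upload` invocation would then pass the `.tsv.gz` paths instead.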
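
Finally, a quick sanity check on the (uncompressed) TSVs before upload, using only coreutils: `COPY ... TO` in text format writes one line per row, so line counts should match the `COPY` totals reported in the notes:

    # expect 85712674 and 161504070 lines, matching the COPY output above
    wc -l /srv/sandcrawler/tasks/ia_wayback_pdf_ingested.$TASKDATE.tsv \
          /srv/sandcrawler/tasks/ia_wayback_pdf_speculative.$TASKDATE.tsv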