blob: 273ff326ccd6bf7d12499cccd86a8db3857810c3 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
|
Another dump of PDF URLs for partners. This time we want to provide a TSV with
full wayback download URLs, as well as "access" URLs.
# Date stamp for this task. The `ia upload` command further down reads it, and it
# must match the date hard-coded in the COPY ... TO output paths.
export TASKDATE=2022-04-27
## "Ingested", AKA "Targeted", PDF URLs
These are URLs where we did a successful ingest run.
-- Dump every successful PDF ingest as one TSV row: the SHA-1 of the terminal
-- capture plus two wayback URLs assembled from the terminal timestamp and URL.
-- Rows are sorted by SHA-1 and are not de-duplicated.
COPY (
    SELECT
        ifr.terminal_sha1hex AS pdf_sha1hex,
        -- "id_" suffix on the timestamp: the download ("crawl") form of the URL
        'https://web.archive.org/web/' || ifr.terminal_dt || 'id_/' || ifr.terminal_url AS crawl_url,
        -- bare timestamp: the "access" (display) form of the URL
        'https://web.archive.org/web/' || ifr.terminal_dt || '/' || ifr.terminal_url AS display_url
    FROM ingest_file_result AS ifr
    WHERE ifr.ingest_type = 'pdf'
        AND ifr.status = 'success'
        AND ifr.hit
    ORDER BY pdf_sha1hex
    -- For a quick dry run, add "LIMIT 10" here.
)
TO '/srv/sandcrawler/tasks/ia_wayback_pdf_ingested.2022-04-27.tsv'
WITH (FORMAT text, NULL '');
=> COPY 85712674
May contain duplicates by sha1hex, by URL, or both.
Note that this could be filtered by timestamp, to make it monthly/annual.
## All CDX PDFs
"All web PDFs": CDX query; left join file_meta, but don't require a file_meta
match (fall back to the CDX mimetype when file_meta has none).
-- Dump every CDX row that looks like a PDF as one TSV row: the capture's SHA-1
-- plus the download ("id_") and display forms of its wayback URL.
-- Rows are sorted by SHA-1.
COPY (
    SELECT
        c.sha1hex AS pdf_sha1hex,
        'https://web.archive.org/web/' || c.datetime || 'id_/' || c.url AS crawl_url,
        'https://web.archive.org/web/' || c.datetime || '/' || c.url AS display_url
    FROM cdx AS c
    LEFT JOIN file_meta AS fm
        ON fm.sha1hex = c.sha1hex
    -- Use file_meta's mimetype when it is non-NULL; otherwise fall back to the
    -- mimetype recorded in CDX (also covers CDX rows with no file_meta match).
    WHERE COALESCE(fm.mimetype, c.mimetype) = 'application/pdf'
    ORDER BY pdf_sha1hex
    -- For a quick dry run, add "LIMIT 10" here.
)
TO '/srv/sandcrawler/tasks/ia_wayback_pdf_speculative.2022-04-27.tsv'
WITH (FORMAT text, NULL '');
=> COPY 161504070
Should be unique by wayback URL; may contain near-duplicates or duplicates by
sha1hex.
## Upload to archive.org
TODO: next time compress these files first (gzip/pigz)
# Upload both TSV dumps to a single archive.org item named after the task date.
# ${TASKDATE:?...} makes the shell refuse to run the command when TASKDATE is
# unset or empty (e.g. in a fresh shell), instead of uploading to a mis-named
# item from file paths that do not exist. Expansions are quoted so the value is
# always passed as a single word.
ia upload "ia_scholarly_urls_${TASKDATE:?TASKDATE must be set, e.g. 2022-04-27}" \
    -m collection:ia_biblio_metadata \
    -m title:"IA Scholarly URLs (${TASKDATE})" \
    -m date:"${TASKDATE}" \
    -m creator:"Internet Archive Web Group" \
    -m description:"URL lists to PDFs on the web (and preserved in the wayback machine) which are likely to contain research materials." \
    "/srv/sandcrawler/tasks/ia_wayback_pdf_ingested.${TASKDATE}.tsv" \
    "/srv/sandcrawler/tasks/ia_wayback_pdf_speculative.${TASKDATE}.tsv"
|