blob: e65edd5e33c2e90585b3e257be6fe55feef24a62 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
|
-- Dump, as one JSON object per line, every 'application/pdf' row of the cdx
-- table whose sha1hex has no grobid row with a non-NULL status.
--
-- Run like:
--   psql sandcrawler < dump_ungrobid_pdf.sql
--
-- The dump runs inside a read-only transaction that is rolled back at the end.
-- COPY ... TO '<path>' writes the file on the database server host, not on the
-- machine running psql.
BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;

COPY (
    -- One output row per distinct sha1hex. Only sha1hex is ordered, so which of
    -- several cdx rows sharing a hash is kept remains arbitrary; append a
    -- tiebreaker column to the ORDER BY if a specific row is wanted.
    SELECT DISTINCT ON (cdx.sha1hex)
        row_to_json(cdx)
    FROM cdx
    WHERE cdx.mimetype = 'application/pdf'
        AND NOT EXISTS (
            SELECT 1
            FROM grobid
            WHERE grobid.sha1hex = cdx.sha1hex
                AND grobid.status IS NOT NULL
        )
        -- uncomment/comment this to control whether only fatcat files are included
        -- NOTE(review): the output file name below contains ".fatcat." -- keep
        -- it in sync with whichever way this filter is toggled.
        --AND EXISTS (SELECT fatcat_file.sha1hex FROM fatcat_file WHERE cdx.sha1hex = fatcat_file.sha1hex)
    ORDER BY cdx.sha1hex
)
TO '/grande/snapshots/dump_ungrobided_pdf.fatcat.2020-08-04.json'
-- The default text format doubles every backslash, which corrupts JSON escapes
-- such as \" or \n. CSV format with a delimiter and quote character that never
-- appear literally in JSON text (JSON escapes control characters as \u00XX)
-- writes each row_to_json() value verbatim.
WITH (FORMAT csv, DELIMITER E'\x02', QUOTE E'\x01', NULL '');

ROLLBACK;
|