blob: b846834849c0c7e76a6b3ea97cc9503b9d910810 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
|
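-- Run like:
-- psql sandcrawler < dump_regrobid_pdf.sql | sort -S 4G | uniq -w 40 | cut -f2 > dump_regrobid_pdf.2019-11-12.json
--
-- Pipeline: each output line is "<sha1hex> TAB <row JSON>". `sort` groups lines
-- by their leading sha1hex, `uniq -w 40` keeps one line per 40-character
-- sha1hex prefix, and `cut -f2` drops the key so only the JSON column remains.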
-- Export, as tab-separated "sha1hex, row JSON" lines on STDOUT, every cdx row
-- with a PDF mimetype whose sha1hex has a grobid row with no grobid_version.
--
-- The whole export runs inside a read-only transaction that is rolled back at
-- the end, so the script never modifies the database.
BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;

COPY (
    SELECT
        cdx.sha1hex,
        row_to_json(cdx)
    FROM cdx
    WHERE
        cdx.mimetype = 'application/pdf'
        -- Semi-join: a cdx row is emitted once no matter how many grobid rows match.
        AND EXISTS (
            SELECT 1
            FROM grobid
            WHERE
                grobid.sha1hex = cdx.sha1hex
                AND grobid.grobid_version IS NULL
        )
)
TO STDOUT
-- SQL NULLs are written as empty strings.
-- NOTE(review): text-format COPY backslash-escapes special characters, so any
-- backslash inside the JSON column is doubled in the output; confirm that
-- downstream consumers expect this.
WITH (FORMAT text, NULL '');

ROLLBACK;
|