blob: e7c48f318dd0f4fef363e2356c6e3841701fddbc (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
|
-- Run like:
-- psql sandcrawler < dump_regrobid_pdf_petabox.sql
-- cat dump_regrobid_pdf_petabox.2020-02-03.json | sort -S 4G | uniq -w 40 | cut -f2 > dump_regrobid_pdf_petabox.2020-02-03.uniq.json
-- Dump every petabox row whose sha1hex has a grobid row with no recorded
-- grobid_version. Each output line is: sha1hex <TAB> row-as-JSON.
--
-- The transaction is read-only; SERIALIZABLE + READ ONLY + DEFERRABLE lets
-- PostgreSQL wait for a safe snapshot before starting, so this long-running
-- dump sees one consistent view of both tables.
BEGIN ISOLATION LEVEL SERIALIZABLE, READ ONLY, DEFERRABLE;

-- NOTE(review): COPY text format doubles backslashes in the output, so JSON
-- string escapes (e.g. \" or \\) may not survive as valid JSON downstream;
-- confirm consumers tolerate this.
COPY (
    SELECT
        petabox.sha1hex,
        row_to_json(petabox)
    FROM petabox
    WHERE EXISTS (
        -- Only existence matters here, so the subquery selects a constant.
        SELECT 1
        FROM grobid AS g
        WHERE g.sha1hex = petabox.sha1hex
          AND g.grobid_version IS NULL
    )
)
TO '/srv/sandcrawler/tasks/dump_regrobid_pdf_petabox.2020-02-03.json'
WITH (FORMAT text, NULL '');

-- Nothing was modified; end the read-only transaction without committing.
ROLLBACK;
|