aboutsummaryrefslogtreecommitdiffstats
path: root/sql/dump_ungrobid_pdf.sql
blob: e65edd5e33c2e90585b3e257be6fe55feef24a62 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18

-- Dump cdx rows, serialized with row_to_json, for PDFs that have not been
-- through GROBID yet.
--
-- A cdx row qualifies when its mimetype is 'application/pdf' and there is no
-- grobid row with the same sha1hex and a non-NULL status. At most one cdx row
-- is emitted per sha1hex.
--
-- Run like:
--   psql sandcrawler < dump_ungrobid_pdf.sql

-- Read-only snapshot for the whole dump. With DEFERRABLE the transaction may
-- wait when it first takes its snapshot, in exchange for running without
-- serialization failures.
BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;

COPY (
  SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
  FROM cdx
  WHERE cdx.mimetype = 'application/pdf'
  AND NOT EXISTS (
    SELECT 1
    FROM grobid
    WHERE grobid.sha1hex = cdx.sha1hex
    AND grobid.status IS NOT NULL
  )
  -- uncomment/comment this to control whether only fatcat files are included
  --AND EXISTS (SELECT 1 FROM fatcat_file WHERE fatcat_file.sha1hex = cdx.sha1hex)
  -- DISTINCT ON should be paired with an ORDER BY whose leftmost expression
  -- matches it; this makes the output order (by sha1hex) explicit.
  -- NOTE(review): which cdx row is kept for a given sha1hex is still
  -- arbitrary. Append a tiebreaker column after cdx.sha1hex if a specific
  -- row (e.g. the newest capture) should win.
  ORDER BY cdx.sha1hex
)
-- The file is written by the database server process, so this path must be
-- writable on the server host.
-- NOTE(review): the file name says "fatcat" while the fatcat_file filter above
-- is commented out; confirm the name matches the filter state before running.
-- NOTE(review): COPY text format doubles backslashes, so JSON escapes such as
-- \" are written as \\"; confirm downstream consumers account for that.
TO '/grande/snapshots/dump_ungrobided_pdf.fatcat.2020-08-04.json'
WITH NULL '';

-- Nothing was modified; just end the read-only transaction.
ROLLBACK;