diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-10-21 12:38:09 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-10-21 12:38:09 -0700 |
commit | 4a9fba8005e0a65c03198c674d2c65f7440d71a6 (patch) | |
tree | d424ca51049632386aaf5762c2b45685d304cd1f /sql/dump_unextracted_pdf.sql | |
parent | f1936476985231286ad1abc74318cc06e20e2627 (diff) | |
download | sandcrawler-4a9fba8005e0a65c03198c674d2c65f7440d71a6.tar.gz sandcrawler-4a9fba8005e0a65c03198c674d2c65f7440d71a6.zip |
SQL: update weekly/quarterly ingest retry scripts
Diffstat (limited to 'sql/dump_unextracted_pdf.sql')
-rw-r--r-- | sql/dump_unextracted_pdf.sql | 4 |
1 files changed, 3 insertions, 1 deletions
diff --git a/sql/dump_unextracted_pdf.sql b/sql/dump_unextracted_pdf.sql
index 7b5e823..fb4b0af 100644
--- a/sql/dump_unextracted_pdf.sql
+++ b/sql/dump_unextracted_pdf.sql
@@ -9,12 +9,14 @@ COPY (
     FROM grobid
     LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
     --LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+    LEFT JOIN ingest_file_result ON grobid.sha1hex = ingest_file_result.terminal_sha1hex
     LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex
     WHERE cdx.sha1hex IS NOT NULL
         --AND fatcat_file.sha1hex IS NOT NULL
+        AND ingest_file_result.terminal_sha1hex IS NOT NULL
         AND pdf_meta.sha1hex IS NULL
 )
-TO '/grande/snapshots/dump_unextracted_pdf.fatcat.2020-07-22.json'
+TO '/grande/snapshots/dump_unextracted_pdf.ingest.2020-10-21.json'
 WITH NULL '';
 
 ROLLBACK;