Diffstat (limited to 'notes/tasks')
-rw-r--r--  notes/tasks/2021-09-09_pdf_url_lists.md               4
-rw-r--r--  notes/tasks/2021-12-06_regrobid.md                   91
-rw-r--r--  notes/tasks/2022-01-07_grobid_platform_pdfs.md       23
-rw-r--r--  notes/tasks/2022-03-07_ukraine_firedrill.md         225
-rw-r--r--  notes/tasks/2022-04-27_pdf_url_lists.md              72
-rw-r--r--  notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md   132
6 files changed, 547 insertions, 0 deletions
diff --git a/notes/tasks/2021-09-09_pdf_url_lists.md b/notes/tasks/2021-09-09_pdf_url_lists.md
index 52a3264..cd8176e 100644
--- a/notes/tasks/2021-09-09_pdf_url_lists.md
+++ b/notes/tasks/2021-09-09_pdf_url_lists.md
@@ -64,3 +64,7 @@ ingest_file_result table, pdf, success: 66,487,928
"Parsed web PDFs": `file_meta`, left join CDX
(didn't do this one)
+
+---
+
+Uploaded all these to <https://archive.org/download/ia_scholarly_urls_2021-09-09>
diff --git a/notes/tasks/2021-12-06_regrobid.md b/notes/tasks/2021-12-06_regrobid.md
index 65e9fe3..5fb69d1 100644
--- a/notes/tasks/2021-12-06_regrobid.md
+++ b/notes/tasks/2021-12-06_regrobid.md
@@ -191,6 +191,84 @@ And some earlier files of interest on `aitio`:
| pv -l \
| kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+
+## Ancient Fatcat Files
+
+Files from an era where we didn't record GROBID version or status, even for
+success.
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE
+ grobid.status_code = 200
+ AND grobid.status IS NULL
+ AND cdx.sha1hex IS NOT NULL
+ AND fatcat_file.sha1hex IS NOT NULL
+        -- sort of arbitrary "not recently" date filter
+ AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15')
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_status_null.json'
+ WITH NULL '';
+
+ cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_status_null.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+ # 107k 0:00:03 [29.9k/s]
+
+
+## Start Re-Processing Old GROBID Versions
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE
+ grobid.status = 'success'
+ AND grobid.grobid_version NOT LIKE '0.7.%'
+ AND cdx.sha1hex IS NOT NULL
+ AND fatcat_file.sha1hex IS NOT NULL
+        -- sort of arbitrary "not recently" date filter
+ AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15')
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.json'
+ WITH NULL '';
+
+This one is huge, and we want to process it in batches/chunks of a few million rows at a time (5 million lines per split file below).
+
+ cd /srv/sandcrawler/tasks/
+ cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.json \
+ | split --lines 5000000 - ungrobided_fatcat.2021-12-11.grobid_old.split_ -d --additional-suffix .json
+
+Submit individual batches like:
+
+ cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.split_01.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+Overall progress:
+
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_00.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_01.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_02.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_03.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_04.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_05.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_06.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_07.json
+ x ungrobided_fatcat.2021-12-11.grobid_old.split_08.json (small)
+
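+Progress could be spot-checked along the way with a GROBID version breakdown;
+a quick sanity query (a sketch, not necessarily run as part of this task):
+
+    SELECT grobid_version, COUNT(*)
+    FROM grobid
+    WHERE status = 'success'
+    GROUP BY grobid_version
+    ORDER BY COUNT(*) DESC;
+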
+This finally finished on 2022-04-26. Hooray!
+
## General Counts
How many fatcat files of what mimetype (reported in sandcrawler-db)?
@@ -287,3 +365,16 @@ What are the GROBID status codes for fatcat files? Narrowed down:
error | 200 | 3
(7 rows)
+Ran the same query again on 2021-12-15:
+
+ status | status_code | count
+ ----------------+-------------+----------
+ success | 200 | 45092915
+ error | 500 | 302373
+ | | 250335
+ | 200 | 53352
+ bad-grobid-xml | 200 | 39
+ error-timeout | -4 | 37
+ error | 200 | 34
+ error | 503 | 2
+ (8 rows)
diff --git a/notes/tasks/2022-01-07_grobid_platform_pdfs.md b/notes/tasks/2022-01-07_grobid_platform_pdfs.md
new file mode 100644
index 0000000..b5422c2
--- /dev/null
+++ b/notes/tasks/2022-01-07_grobid_platform_pdfs.md
@@ -0,0 +1,23 @@
+
+Martin crawled more than 10 million new PDFs from various platform domains. We
+should get these processed and included in sandcrawler-db.
+
+## Select CDX Rows
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM cdx
+ LEFT JOIN grobid ON grobid.sha1hex = cdx.sha1hex
+ WHERE
+ grobid.sha1hex IS NULL
+ AND cdx.sha1hex IS NOT NULL
+ AND cdx.warc_path LIKE 'PLATFORM-CRAWL-2020%'
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_platform_crawl.2022-01-07.cdx.json'
+ WITH NULL '';
+ => COPY 8801527
+
+ cat /srv/sandcrawler/tasks/ungrobided_platform_crawl.2022-01-07.cdx.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+ # for pdfextract, would be: sandcrawler-prod.unextracted
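+
+For reference, the equivalent PDF text extraction submission would presumably be
+the same pipeline pointed at that topic (a sketch; not run as part of this task):
+
+    cat /srv/sandcrawler/tasks/ungrobided_platform_crawl.2022-01-07.cdx.json \
+        | rg -v "\\\\" \
+        | jq . -c \
+        | pv -l \
+        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1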
diff --git a/notes/tasks/2022-03-07_ukraine_firedrill.md b/notes/tasks/2022-03-07_ukraine_firedrill.md
new file mode 100644
index 0000000..c727a57
--- /dev/null
+++ b/notes/tasks/2022-03-07_ukraine_firedrill.md
@@ -0,0 +1,225 @@
+
+Want to do priority crawling of Ukrainian web content, plus Russia and Belarus.
+
+
+## What is Missing?
+
+ (country_code:ua OR lang:uk)
+ => 2022-03-08, before ingests: 470,986 total, 170,987 missing, almost all article-journal, peak in 2019, 55k explicitly OA
+    later in the day, some 22k of the missing were already found! wow
+ => 2022-04-04, after ingests: 476,174 total, 131,063 missing, 49k OA missing
+
+## Metadata Prep
+
+- container metadata update (no code changes)
+ x wikidata SPARQL update
+ x chocula run
+ x journal metadata update (fatcat)
+ x update journal stats (fatcat extra)
+- DOAJ article metadata import
+ x prep and upload single JSON file
+
+
+## Journal Homepage URL Crawl
+
+x dump ukraine-related journal homepages from chocula DB
+x create crawl config
+x start crawl
+x repeat for belarus and russia
+
+
+ python3 -m chocula export_urls > homepage_urls.2022-03-08.tsv
+ cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.ua/' | sort -u > homepage_urls.2022-03-08.ua_tld.tsv
+ wc -l homepage_urls.2022-03-08.ua_tld.tsv
+ 1550 homepage_urls.2022-03-08.ua_tld.tsv
+
+ cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.by/' | sort -u > homepage_urls.2022-03-08.by_tld.tsv
+ cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.ru/' | sort -u > homepage_urls.2022-03-08.ru_tld.tsv
+
+sqlite3:
+
+    select count(*) from journal where country = 'ua' or lang = 'uk' or name like '%ukrain%' or publisher like '%ukrain%';
+    1952
+
+ SELECT COUNT(*) FROM homepage
+ LEFT JOIN journal ON homepage.issnl = journal.issnl
+ WHERE
+ journal.country = 'ua'
+ OR journal.lang = 'uk'
+ OR journal.name like '%ukrain%'
+ OR journal.publisher like '%ukrain%';
+ => 1970
+
+ .mode csv
+ .once homepage_urls_ukraine.tsv
+ SELECT homepage.url FROM homepage
+ LEFT JOIN journal ON homepage.issnl = journal.issnl
+ WHERE
+ journal.country = 'ua'
+ OR journal.lang = 'uk'
+ OR journal.name like '%ukrain%'
+ OR journal.publisher like '%ukrain%';
+
+ .mode csv
+ .once homepage_urls_russia.tsv
+ SELECT homepage.url FROM homepage
+ LEFT JOIN journal ON homepage.issnl = journal.issnl
+ WHERE
+ journal.country = 'ru'
+ OR journal.lang = 'ru'
+ OR journal.name like '%russ%'
+ OR journal.publisher like '%russ%';
+
+ .mode csv
+ .once homepage_urls_belarus.tsv
+ SELECT homepage.url FROM homepage
+ LEFT JOIN journal ON homepage.issnl = journal.issnl
+ WHERE
+ journal.country = 'by'
+ OR journal.lang = 'be'
+ OR journal.name like '%belarus%'
+ OR journal.publisher like '%belarus%';
+
+ cat homepage_urls_ukraine.tsv homepage_urls.2022-03-08.ua_tld.tsv | sort -u > homepage_urls_ukraine_combined.2022-03-08.tsv
+
+ wc -l homepage_urls.2022-03-08.ua_tld.tsv homepage_urls_ukraine.tsv homepage_urls_ukraine_combined.2022-03-08.tsv
+ 1550 homepage_urls.2022-03-08.ua_tld.tsv
+ 1971 homepage_urls_ukraine.tsv
+ 3482 homepage_urls_ukraine_combined.2022-03-08.tsv
+
+ cat homepage_urls_russia.tsv homepage_urls.2022-03-08.ru_tld.tsv | sort -u > homepage_urls_russia_combined.2022-03-08.tsv
+
+ wc -l homepage_urls_russia.tsv homepage_urls.2022-03-08.ru_tld.tsv homepage_urls_russia_combined.2022-03-08.tsv
+ 3728 homepage_urls_russia.tsv
+ 2420 homepage_urls.2022-03-08.ru_tld.tsv
+ 6030 homepage_urls_russia_combined.2022-03-08.tsv
+
+
+ cat homepage_urls_belarus.tsv homepage_urls.2022-03-08.by_tld.tsv | sort -u > homepage_urls_belarus_combined.2022-03-08.tsv
+
+ wc -l homepage_urls_belarus.tsv homepage_urls.2022-03-08.by_tld.tsv homepage_urls_belarus_combined.2022-03-08.tsv
+ 138 homepage_urls_belarus.tsv
+ 85 homepage_urls.2022-03-08.by_tld.tsv
+ 222 homepage_urls_belarus_combined.2022-03-08.tsv
+
+
+## Landing Page Crawl
+
+x create crawl config
+x fatcat ingest query for related URLs
+ => special request code/label?
+x finish .by and .ru article URL dump, start crawling
+x URL list filtered from new OAI-PMH feed
+ => do we need to do full bulk load/dump, or not?
+- URL list from partner (google)
+- do we need to do alternative thing of iterating over containers, ingesting each?
+
+ ./fatcat_ingest.py --env prod \
+ --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-bulk \
+ --ingest-type pdf \
+ --allow-non-oa \
+ query "country_code:ua OR lang:uk"
+
+ # around Tue 08 Mar 2022 01:07:37 PM PST
+ # Expecting 185659 release objects in search queries
+ # didn't complete successfully? hrm
+
+ # ok, retry "manually" (with kafkacat)
+ ./fatcat_ingest.py --env prod \
+ --ingest-type pdf \
+ --allow-non-oa \
+ query "country_code:ua OR lang:uk" \
+ | pv -l \
+ | gzip \
+ > /srv/fatcat/ingest_ua_pdfs.2022-03-08.requests.json
+ # Counter({'elasticsearch_release': 172881, 'estimate': 172881, 'ingest_request': 103318})
+ # 103k 0:25:04 [68.7 /s]
+
+ zcat /srv/fatcat/ingest_ua_pdfs.2022-03-08.requests.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+ zcat ingest_ua_pdfs.2022-03-08.requests.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_ua_pdfs.2022-03-08.txt.gz
+ # 103k 0:00:02 [38.1k/s]
+
+ ./fatcat_ingest.py --env prod \
+ --ingest-type pdf \
+ --allow-non-oa \
+ query "country_code:by OR lang:be" \
+ | pv -l \
+ | gzip \
+ > /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz
+ # Expecting 2266 release objects in search queries
+ # 1.29k 0:00:34 [37.5 /s]
+
+ zcat /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+ zcat ingest_by_pdfs.2022-03-09.requests.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_by_pdfs.2022-03-09.txt.gz
+
+ ./fatcat_ingest.py --env prod \
+ --ingest-type pdf \
+ --allow-non-oa \
+ query "country_code:ru OR lang:ru" \
+ | pv -l \
+ | gzip \
+ > /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.json.gz
+ # Expecting 1515246 release objects in search queries
+
+ zcat /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.partial.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+ zcat ingest_ru_pdfs.2022-03-09.requests.partial.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_ru_pdfs.2022-03-09.txt.gz
+
+
+ zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.ua/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.ua_tld.txt
+ # 309k 0:00:03 [81.0k/s]
+
+ zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.by/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.by_tld.txt
+ # 71.2k 0:00:03 [19.0k/s]
+
+ zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.ru/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.ru_tld.txt
+ # 276k 0:00:03 [72.9k/s]
+
+
+### Landing Page Bulk Ingest
+
+Running these on 2022-03-24, after the targeted crawl completed:
+
+ zcat /srv/fatcat/tasks/ingest_ua_pdfs.2022-03-08.requests.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # 103k 0:00:02 [36.1k/s]
+
+ zcat /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # 1.29k 0:00:00 [15.8k/s]
+
+ zcat /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.partial.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # 546k 0:00:13 [40.6k/s]
+
+It will probably take a week or more for these to complete.
+
+
+## Outreach
+
+- openalex
+- sucho.org
+- ceeol.com
diff --git a/notes/tasks/2022-04-27_pdf_url_lists.md b/notes/tasks/2022-04-27_pdf_url_lists.md
new file mode 100644
index 0000000..273ff32
--- /dev/null
+++ b/notes/tasks/2022-04-27_pdf_url_lists.md
@@ -0,0 +1,72 @@
+
+Another dump of PDF URLs for partners. This time we want to provide a TSV with
+full wayback download ("crawl") URLs, as well as "access" (display) URLs.
+
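+The two URL forms differ only in the `id_` modifier: with it, wayback serves the
+raw archived bytes (good for downloading the PDF); without it, the capture is
+shown in the usual wayback replay UI. A quick shell sketch with made-up example
+values:
+
+    # hypothetical capture timestamp and original URL, for illustration only
+    TERMINAL_DT=20220427123456
+    TERMINAL_URL=https://example.com/paper.pdf
+
+    # "crawl"/download URL: raw bytes, no wayback rewriting
+    echo "https://web.archive.org/web/${TERMINAL_DT}id_/${TERMINAL_URL}"
+
+    # "access"/display URL: normal wayback replay
+    echo "https://web.archive.org/web/${TERMINAL_DT}/${TERMINAL_URL}"
+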
+ export TASKDATE=2022-04-27
+
+## "Ingested", AKA, "Targetted" PDF URLs
+
+These are URLs where we did a successful ingest run.
+
+ COPY (
+ SELECT
+ terminal_sha1hex as pdf_sha1hex,
+ ('https://web.archive.org/web/' || terminal_dt || 'id_/' || terminal_url) as crawl_url,
+ ('https://web.archive.org/web/' || terminal_dt || '/' || terminal_url) as display_url
+ FROM ingest_file_result
+ WHERE
+ ingest_type = 'pdf'
+ AND status = 'success'
+ AND hit = true
+ ORDER BY terminal_sha1hex ASC
+ -- LIMIT 10;
+ )
+ TO '/srv/sandcrawler/tasks/ia_wayback_pdf_ingested.2022-04-27.tsv'
+ WITH NULL '';
+ => COPY 85712674
+
+May contain duplicates, by sha1hex, by URL, or both.
+
+Note that this could be filtered by timestamp, to make it monthly/annual.
+
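+For example, a monthly dump would presumably just add a date window to the WHERE
+clause above (assuming the `updated` column on `ingest_file_result` is the right
+timestamp to filter on):
+
+    -- hypothetical sketch: restrict to ingests updated in April 2022
+    AND updated >= '2022-04-01'
+    AND updated <  '2022-05-01'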
+
+## All CDX PDFs
+
+"All web PDFs": CDX query; left join file_meta, but don't require
+
+ COPY (
+ SELECT
+ cdx.sha1hex as pdf_sha1hex,
+ ('https://web.archive.org/web/' || cdx.datetime || 'id_/' || cdx.url) as crawl_url,
+ ('https://web.archive.org/web/' || cdx.datetime || '/' || cdx.url) as display_url
+ FROM cdx
+ LEFT JOIN file_meta
+ ON
+ cdx.sha1hex = file_meta.sha1hex
+ WHERE
+ file_meta.mimetype = 'application/pdf'
+ OR (
+ file_meta.mimetype IS NULL
+ AND cdx.mimetype = 'application/pdf'
+ )
+ ORDER BY cdx.sha1hex ASC
+ -- LIMIT 10;
+ )
+ TO '/srv/sandcrawler/tasks/ia_wayback_pdf_speculative.2022-04-27.tsv'
+ WITH NULL '';
+ => COPY 161504070
+
+Should be unique by wayback URL; may contain near-duplicates or duplicates by sha1hex.
+
+## Upload to archive.org
+
+TODO: next time compress these files first (gzip/pigz)
+
+ia upload ia_scholarly_urls_$TASKDATE \
+ -m collection:ia_biblio_metadata \
+ -m title:"IA Scholarly URLs ($TASKDATE)" \
+ -m date:$TASKDATE \
+ -m creator:"Internet Archive Web Group" \
+ -m description:"URL lists to PDFs on the web (and preserved in the wayback machine) which are likely to contain research materials." \
+ /srv/sandcrawler/tasks/ia_wayback_pdf_ingested.$TASKDATE.tsv /srv/sandcrawler/tasks/ia_wayback_pdf_speculative.$TASKDATE.tsv
+
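+For the compression TODO above, a minimal sketch (assuming pigz is installed on
+the host; plain gzip would also work, just slower):
+
+    pigz --keep /srv/sandcrawler/tasks/ia_wayback_pdf_ingested.$TASKDATE.tsv
+    pigz --keep /srv/sandcrawler/tasks/ia_wayback_pdf_speculative.$TASKDATE.tsv
+
+    # then pass the resulting .tsv.gz files to `ia upload` instead
+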
diff --git a/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md b/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md
new file mode 100644
index 0000000..74d3857
--- /dev/null
+++ b/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md
@@ -0,0 +1,132 @@
+
+Had a huge number of SPN requests for the andrzejklimczuk.com domain,
+presumably from the author.
+
+Many were duplicates (same file, multiple releases, often things like zenodo
+duplication). Many were also GROBID 500s, due to truncated common crawl
+captures.
+
+Needed to clean up! Basically sorted through a few editgroups manually, then
+rejected all the rest and manually re-submitted with the below queries and
+commands:
+
+ SELECT COUNT(*) from ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ LEFT JOIN grobid ON
+ grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ WHERE
+ ingest_request.link_source = 'spn'
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.base_url like 'https://andrzejklimczuk.com/%';
+ => 589
+
+ SELECT ingest_file_result.status, COUNT(*) from ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ LEFT JOIN grobid ON
+ grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ WHERE
+ ingest_request.link_source = 'spn'
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
+ GROUP BY ingest_file_result.status;
+
+ status | count
+ ----------------+-------
+ cdx-error | 1
+ success | 587
+ wrong-mimetype | 1
+ (3 rows)
+
+
+ SELECT grobid.status_code, COUNT(*) from ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ LEFT JOIN grobid ON
+ grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ WHERE
+ ingest_request.link_source = 'spn'
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
+ GROUP BY grobid.status_code;
+
+ status_code | count
+ -------------+-------
+ 200 | 385
+ 500 | 202
+ | 2
+ (3 rows)
+
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ LEFT JOIN grobid ON
+ grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ WHERE
+ ingest_request.link_source = 'spn'
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
+ AND ingest_file_result.status = 'success'
+ AND grobid.status_code = 500
+ ) TO '/srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json';
+ => COPY 202
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ LEFT JOIN grobid ON
+ grobid.sha1hex = ingest_file_result.terminal_sha1hex
+ WHERE
+ ingest_request.link_source = 'spn'
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
+ AND ingest_file_result.status = 'success'
+ AND grobid.status_code = 200
+ ) TO '/srv/sandcrawler/tasks/andrzejklimczuk_good_spn.rows.json';
+ => COPY 385
+
+sudo -u sandcrawler pipenv run \
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.rows.json \
+ > /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json
+
+sudo -u sandcrawler pipenv run \
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json \
+ | jq '. + {force_recrawl: true}' -c \
+ > /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.json
+
+cat /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.json \
+ | shuf \
+ | head -n60000 \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
+
+cat /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json \
+ | shuf \
+ | head -n100 \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
+
+cat /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json \
+ | shuf \
+ | head -n10000 \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
+
+sudo -u sandcrawler pipenv run \
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json \
+ > /srv/sandcrawler/tasks/andrzejklimczuk_bad2_spn.json
+
+cat /srv/sandcrawler/tasks/andrzejklimczuk_bad2_spn.json \
+ | shuf \
+ | head -n60000 \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
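+
+Once the priority re-ingests work through the queue, a reasonable follow-up
+sanity check would be to count how many GROBID 500s remain for the domain
+(same joins as the queries above):
+
+    SELECT COUNT(*) FROM ingest_request
+    LEFT JOIN ingest_file_result ON
+        ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    LEFT JOIN grobid ON
+        grobid.sha1hex = ingest_file_result.terminal_sha1hex
+    WHERE
+        ingest_request.link_source = 'spn'
+        AND ingest_request.ingest_type = 'pdf'
+        AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
+        AND grobid.status_code = 500;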