diff options
Diffstat (limited to 'notes/tasks')
| -rw-r--r-- | notes/tasks/2021-09-09_pdf_url_lists.md | 4 | ||||
| -rw-r--r-- | notes/tasks/2021-12-06_regrobid.md | 91 | ||||
| -rw-r--r-- | notes/tasks/2022-01-07_grobid_platform_pdfs.md | 23 | ||||
| -rw-r--r-- | notes/tasks/2022-03-07_ukraine_firedrill.md | 225 | ||||
| -rw-r--r-- | notes/tasks/2022-04-27_pdf_url_lists.md | 72 | ||||
| -rw-r--r-- | notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md | 132 | 
6 files changed, 547 insertions, 0 deletions
| diff --git a/notes/tasks/2021-09-09_pdf_url_lists.md b/notes/tasks/2021-09-09_pdf_url_lists.md index 52a3264..cd8176e 100644 --- a/notes/tasks/2021-09-09_pdf_url_lists.md +++ b/notes/tasks/2021-09-09_pdf_url_lists.md @@ -64,3 +64,7 @@ ingest_file_result table, pdf, success: 66,487,928  "Parsed web PDFs": `file_meta`, left join CDX  (didn't do this one) + +--- + +Uploaded all these to <https://archive.org/download/ia_scholarly_urls_2021-09-09> diff --git a/notes/tasks/2021-12-06_regrobid.md b/notes/tasks/2021-12-06_regrobid.md index 65e9fe3..5fb69d1 100644 --- a/notes/tasks/2021-12-06_regrobid.md +++ b/notes/tasks/2021-12-06_regrobid.md @@ -191,6 +191,84 @@ And some earlier files of interest on `aitio`:          | pv -l \          | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 + +## Ancient Fatcat Files + +Files from an era where we didn't record GROBID version or status, even for +success. + +    COPY ( +        SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) +        FROM grobid +        LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex +        LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex +        WHERE +            grobid.status_code = 200 +            AND grobid.status IS NULL +            AND cdx.sha1hex IS NOT NULL +            AND fatcat_file.sha1hex IS NOT NULL +            -- sort of arbitary "not recently" date filter +            AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15') +        -- LIMIT 5; +    ) +    TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_status_null.json' +    WITH NULL ''; + +    cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_status_null.json \ +        | rg -v "\\\\" \ +        | jq . 
-c \ +        | pv -l \ +        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 +    # 107k 0:00:03 [29.9k/s] + + +## Start Re-Processing Old GROBID Versions + +    COPY ( +        SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) +        FROM grobid +        LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex +        LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex +        WHERE +            grobid.status = 'success' +            AND grobid.grobid_version NOT LIKE '0.7.%' +            AND cdx.sha1hex IS NOT NULL +            AND fatcat_file.sha1hex IS NOT NULL +            -- sort of arbitary "not recently" date filter +            AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15') +        -- LIMIT 5; +    ) +    TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.json' +    WITH NULL ''; + +This one is huge, and want to process in batches/chunks of ~8 million at a time. + +    cd /srv/sandcrawler/tasks/ +    cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.json \ +        | split --lines 5000000 - ungrobided_fatcat.2021-12-11.grobid_old.split_ -d --additional-suffix .json + +Submit individual batches like: + +    cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.split_01.json \ +        | rg -v "\\\\" \ +        | jq . 
-c \ +        | pv -l \ +        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + +Overall progress: + +    x  ungrobided_fatcat.2021-12-11.grobid_old.split_00.json +    x  ungrobided_fatcat.2021-12-11.grobid_old.split_01.json +    x  ungrobided_fatcat.2021-12-11.grobid_old.split_02.json +    x  ungrobided_fatcat.2021-12-11.grobid_old.split_03.json +    x  ungrobided_fatcat.2021-12-11.grobid_old.split_04.json +    x  ungrobided_fatcat.2021-12-11.grobid_old.split_05.json +    x  ungrobided_fatcat.2021-12-11.grobid_old.split_06.json +    x  ungrobided_fatcat.2021-12-11.grobid_old.split_07.json +    x  ungrobided_fatcat.2021-12-11.grobid_old.split_08.json (small) + +This finally finished on 2022-04-26. Hooray! +  ## General Counts  How many fatcat files of what mimetype (reported in sandcrawler-db)? @@ -287,3 +365,16 @@ What are the GROBID status codes for fatcat files? Narrowed down:       error          |         200 |        3      (7 rows) +Ran the same query again on 2021-12-15: + +         status     | status_code |  count    +    ----------------+-------------+---------- +     success        |         200 | 45092915 +     error          |         500 |   302373 +                    |             |   250335 +                    |         200 |    53352 +     bad-grobid-xml |         200 |       39 +     error-timeout  |          -4 |       37 +     error          |         200 |       34 +     error          |         503 |        2 +    (8 rows) diff --git a/notes/tasks/2022-01-07_grobid_platform_pdfs.md b/notes/tasks/2022-01-07_grobid_platform_pdfs.md new file mode 100644 index 0000000..b5422c2 --- /dev/null +++ b/notes/tasks/2022-01-07_grobid_platform_pdfs.md @@ -0,0 +1,23 @@ + +Martin crawled more than 10 million new PDFs from various platform domains. We +should get these processed and included in sandcrawler-db. 
+ +## Select CDX Rows + +    COPY ( +        SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) +        FROM cdx +        LEFT JOIN grobid ON grobid.sha1hex = cdx.sha1hex +        WHERE +            grobid.sha1hex IS NULL +            AND cdx.sha1hex IS NOT NULL +            AND cdx.warc_path LIKE 'PLATFORM-CRAWL-2020%' +        -- LIMIT 5; +    ) +    TO '/srv/sandcrawler/tasks/ungrobided_platform_crawl.2022-01-07.cdx.json' +    WITH NULL ''; +    => COPY 8801527 + +    cat /srv/sandcrawler/tasks/ungrobided_platform_crawl.2022-01-07.cdx.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + +    # for pdfextract, would be: sandcrawler-prod.unextracted diff --git a/notes/tasks/2022-03-07_ukraine_firedrill.md b/notes/tasks/2022-03-07_ukraine_firedrill.md new file mode 100644 index 0000000..c727a57 --- /dev/null +++ b/notes/tasks/2022-03-07_ukraine_firedrill.md @@ -0,0 +1,225 @@ + +Want to do priority crawling of Ukrainian web content, plus Russia and Belarus. + + +## What is Missing? + +    (country_code:ua OR lang:uk) +    => 2022-03-08, before ingests: 470,986 total, 170,987 missing, almost all article-journal, peak in 2019, 55k explicitly OA +       later in day, already some 22k missing found! 
wow +    => 2022-04-04, after ingests:  476,174 total, 131,063 missing, 49k OA missing + +## Metadata Prep + +- container metadata update (no code changes) +    x  wikidata SPARQL update +    x  chocula run +    x  journal metadata update (fatcat) +    x  update journal stats (fatcat extra) +- DOAJ article metadata import +    x  prep and upload single JSON file + + +## Journal Homepage URL Crawl + +x dump ukraine-related journal homepages from chocula DB +x create crawl config +x start crawl +x repeat for belarus and russia + + +    python3 -m chocula export_urls > homepage_urls.2022-03-08.tsv +    cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.ua/' | sort -u > homepage_urls.2022-03-08.ua_tld.tsv +    wc -l homepage_urls.2022-03-08.ua_tld.tsv +    1550 homepage_urls.2022-03-08.ua_tld.tsv + +    cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.by/' | sort -u > homepage_urls.2022-03-08.by_tld.tsv +    cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.ru/' | sort -u > homepage_urls.2022-03-08.ru_tld.tsv + +sqlite3: + +    select count(*) from journal where country = 'ua' or lang = 'uk' or name like '%ukrain%' or publi +    1952 + +    SELECT COUNT(*) FROM homepage +    LEFT JOIN journal ON homepage.issnl = journal.issnl +    WHERE +        journal.country = 'ua' +        OR journal.lang = 'uk' +        OR journal.name like '%ukrain%' +        OR journal.publisher like '%ukrain%'; +    => 1970 + +    .mode csv +    .once homepage_urls_ukraine.tsv +    SELECT homepage.url FROM homepage +    LEFT JOIN journal ON homepage.issnl = journal.issnl +    WHERE +        journal.country = 'ua' +        OR journal.lang = 'uk' +        OR journal.name like '%ukrain%' +        OR journal.publisher like '%ukrain%'; + +    .mode csv +    .once homepage_urls_russia.tsv +    SELECT homepage.url FROM homepage +    LEFT JOIN journal ON homepage.issnl = journal.issnl +    WHERE +        journal.country = 'ru' +        OR journal.lang = 'ru' +        OR journal.name like '%russ%' +  
      OR journal.publisher like '%russ%'; + +    .mode csv +    .once homepage_urls_belarus.tsv +    SELECT homepage.url FROM homepage +    LEFT JOIN journal ON homepage.issnl = journal.issnl +    WHERE +        journal.country = 'by' +        OR journal.lang = 'be' +        OR journal.name like '%belarus%' +        OR journal.publisher like '%belarus%'; + +    cat homepage_urls_ukraine.tsv homepage_urls.2022-03-08.ua_tld.tsv | sort -u > homepage_urls_ukraine_combined.2022-03-08.tsv + +    wc -l homepage_urls.2022-03-08.ua_tld.tsv homepage_urls_ukraine.tsv homepage_urls_ukraine_combined.2022-03-08.tsv  +        1550 homepage_urls.2022-03-08.ua_tld.tsv +        1971 homepage_urls_ukraine.tsv +        3482 homepage_urls_ukraine_combined.2022-03-08.tsv + +    cat homepage_urls_russia.tsv homepage_urls.2022-03-08.ru_tld.tsv | sort -u > homepage_urls_russia_combined.2022-03-08.tsv + +    wc -l homepage_urls_russia.tsv homepage_urls.2022-03-08.ru_tld.tsv homepage_urls_russia_combined.2022-03-08.tsv +        3728 homepage_urls_russia.tsv +        2420 homepage_urls.2022-03-08.ru_tld.tsv +        6030 homepage_urls_russia_combined.2022-03-08.tsv + + +    cat homepage_urls_belarus.tsv homepage_urls.2022-03-08.by_tld.tsv | sort -u > homepage_urls_belarus_combined.2022-03-08.tsv + +    wc -l homepage_urls_belarus.tsv homepage_urls.2022-03-08.by_tld.tsv homepage_urls_belarus_combined.2022-03-08.tsv +        138 homepage_urls_belarus.tsv +        85 homepage_urls.2022-03-08.by_tld.tsv +        222 homepage_urls_belarus_combined.2022-03-08.tsv + + +## Landing Page Crawl + +x create crawl config +x fatcat ingest query for related URLs +    => special request code/label? +x finish .by and .ru article URL dump, start crawling +x URL list filtered from new OAI-PMH feed +    => do we need to do full bulk load/dump, or not? +- URL list from partner (google) +- do we need to do alternative thing of iterating over containers, ingesting each? 
+ +    ./fatcat_ingest.py --env prod \ +        --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-bulk \ +        --ingest-type pdf \ +        --allow-non-oa \ +        query "country_code:ua OR lang:uk" + +    # around Tue 08 Mar 2022 01:07:37 PM PST +    # Expecting 185659 release objects in search queries +    # didn't complete successfully? hrm + +    # ok, retry "manually" (with kafkacat) +    ./fatcat_ingest.py --env prod \ +        --ingest-type pdf \ +        --allow-non-oa \ +        query "country_code:ua OR lang:uk" \ +    | pv -l \ +    | gzip \ +    > /srv/fatcat/ingest_ua_pdfs.2022-03-08.requests.json +    # Counter({'elasticsearch_release': 172881, 'estimate': 172881, 'ingest_request': 103318}) +    # 103k 0:25:04 [68.7 /s] + +    zcat /srv/fatcat/ingest_ua_pdfs.2022-03-08.requests.json \ +        | rg -v "\\\\" \ +        | jq . -c \ +        | pv -l \ +        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +    zcat ingest_ua_pdfs.2022-03-08.requests.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_ua_pdfs.2022-03-08.txt.gz +    # 103k 0:00:02 [38.1k/s] + +    ./fatcat_ingest.py --env prod \ +        --ingest-type pdf \ +        --allow-non-oa \ +        query "country_code:by OR lang:be" \ +    | pv -l \ +    | gzip \ +    > /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz +    # Expecting 2266 release objects in search queries +    # 1.29k 0:00:34 [37.5 /s] + +    zcat /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz \ +        | rg -v "\\\\" \ +        | jq . 
-c \ +        | pv -l \ +        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +    zcat ingest_by_pdfs.2022-03-09.requests.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_by_pdfs.2022-03-09.txt.gz + +    ./fatcat_ingest.py --env prod \ +        --ingest-type pdf \ +        --allow-non-oa \ +        query "country_code:ru OR lang:ru" \ +    | pv -l \ +    | gzip \ +    > /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.json.gz +    # Expecting 1515246 release objects in search queries + +    zcat /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.partial.json.gz \ +        | rg -v "\\\\" \ +        | jq . -c \ +        | pv -l \ +        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +    zcat ingest_ru_pdfs.2022-03-09.requests.partial.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_ru_pdfs.2022-03-09.txt.gz + + +    zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.ua/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.ua_tld.txt +    # 309k 0:00:03 [81.0k/s] + +    zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.by/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.by_tld.txt +    # 71.2k 0:00:03 [19.0k/s] + +    zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.ru/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.ru_tld.txt +    # 276k 0:00:03 [72.9k/s] + + +### Landing Page Bulk Ingest + +Running these 2022-03-24, after targeted crawl completed: + +    zcat /srv/fatcat/tasks/ingest_ua_pdfs.2022-03-08.requests.json.gz \ +        | rg -v "\\\\" \ +        | jq . -c \ +        | pv -l \ +        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 +    # 103k 0:00:02 [36.1k/s] + +    zcat /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz \ +        | rg -v "\\\\" \ +        | jq . 
-c \ +        | pv -l \ +        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 +    # 1.29k 0:00:00 [15.8k/s] + +    zcat /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.partial.json.gz \ +        | rg -v "\\\\" \ +        | jq . -c \ +        | pv -l \ +        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 +    # 546k 0:00:13 [40.6k/s] + +It will probably take a week or more for these to complete. + + +## Outreach + +- openalex +- sucho.org +- ceeol.com diff --git a/notes/tasks/2022-04-27_pdf_url_lists.md b/notes/tasks/2022-04-27_pdf_url_lists.md new file mode 100644 index 0000000..273ff32 --- /dev/null +++ b/notes/tasks/2022-04-27_pdf_url_lists.md @@ -0,0 +1,72 @@ + +Another dump of PDF URLs for partners. This time we want to provide TSV with full +wayback download URLs, as well as "access" URLs. + +    export TASKDATE=2022-04-27 + +## "Ingested", AKA, "Targeted" PDF URLs + +These are URLs where we did a successful ingest run. + +    COPY ( +        SELECT +            terminal_sha1hex as pdf_sha1hex, +            ('https://web.archive.org/web/' || terminal_dt || 'id_/' || terminal_url) as crawl_url, +            ('https://web.archive.org/web/' || terminal_dt || '/' || terminal_url) as display_url +        FROM ingest_file_result +        WHERE +            ingest_type = 'pdf' +            AND status = 'success' +            AND hit = true +        ORDER BY terminal_sha1hex ASC +        -- LIMIT 10; +    ) +    TO '/srv/sandcrawler/tasks/ia_wayback_pdf_ingested.2022-04-27.tsv' +    WITH NULL ''; +    => COPY 85712674 + +May contain duplicates by sha1hex, by URL, or both. + +Note that this could be filtered by timestamp, to make it monthly/annual. 
+ + +## All CDX PDFs + +"All web PDFs": CDX query; left join file_meta, but don't require + +    COPY ( +        SELECT +            cdx.sha1hex as pdf_sha1hex, +            ('https://web.archive.org/web/' || cdx.datetime || 'id_/' || cdx.url) as crawl_url, +            ('https://web.archive.org/web/' || cdx.datetime || '/' || cdx.url) as display_url +        FROM cdx +        LEFT JOIN file_meta +        ON +            cdx.sha1hex = file_meta.sha1hex +        WHERE +            file_meta.mimetype = 'application/pdf' +            OR ( +                file_meta.mimetype IS NULL +                AND cdx.mimetype = 'application/pdf' +            ) +        ORDER BY cdx.sha1hex ASC +        -- LIMIT 10; +    ) +    TO '/srv/sandcrawler/tasks/ia_wayback_pdf_speculative.2022-04-27.tsv' +    WITH NULL ''; +    => COPY 161504070 + +Should be unique by wayback URL; may contain near-duplicates or duplicates by  + +## Upload to archive.org + +TODO: next time compress these files first (gzip/pigz) + +ia upload ia_scholarly_urls_$TASKDATE \ +    -m collection:ia_biblio_metadata \ +    -m title:"IA Scholarly URLs ($TASKDATE)" \ +    -m date:$TASKDATE \ +    -m creator:"Internet Archive Web Group" \ +    -m description:"URL lists to PDFs on the web (and preserved in the wayback machine) which are likely to contain research materials." \ +    /srv/sandcrawler/tasks/ia_wayback_pdf_ingested.$TASKDATE.tsv /srv/sandcrawler/tasks/ia_wayback_pdf_speculative.$TASKDATE.tsv + diff --git a/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md b/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md new file mode 100644 index 0000000..74d3857 --- /dev/null +++ b/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md @@ -0,0 +1,132 @@ + +Had a huge number of SPN requests for the andrzejklimczuk.com domain, +presumably from the author. + +Many were duplicates (same file, multiple releases, often things like zenodo +duplication). Many were also GROBID 500s, due to truncated common crawl +captures. 
+ +Needed to clean up! Basically sorted through a few editgroups manually, then +rejected all the rest and manually re-submitted with the below queries and +commands: + +    SELECT COUNT(*) from ingest_request +    LEFT JOIN ingest_file_result ON +        ingest_file_result.ingest_type = ingest_request.ingest_type +        AND ingest_file_result.base_url = ingest_request.base_url +    LEFT JOIN grobid ON +        grobid.sha1hex = ingest_file_result.terminal_sha1hex +    WHERE +        ingest_request.link_source = 'spn' +        AND ingest_request.ingest_type = 'pdf' +        AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'; +    => 589 + +    SELECT ingest_file_result.status, COUNT(*) from ingest_request +    LEFT JOIN ingest_file_result ON +        ingest_file_result.ingest_type = ingest_request.ingest_type +        AND ingest_file_result.base_url = ingest_request.base_url +    LEFT JOIN grobid ON +        grobid.sha1hex = ingest_file_result.terminal_sha1hex +    WHERE +        ingest_request.link_source = 'spn' +        AND ingest_request.ingest_type = 'pdf' +        AND ingest_request.base_url like 'https://andrzejklimczuk.com/%' +    GROUP BY ingest_file_result.status; + +         status     | count  +    ----------------+------- +     cdx-error      |     1 +     success        |   587 +     wrong-mimetype |     1 +    (3 rows) + + +    SELECT grobid.status_code, COUNT(*) from ingest_request +    LEFT JOIN ingest_file_result ON +        ingest_file_result.ingest_type = ingest_request.ingest_type +        AND ingest_file_result.base_url = ingest_request.base_url +    LEFT JOIN grobid ON +        grobid.sha1hex = ingest_file_result.terminal_sha1hex +    WHERE +        ingest_request.link_source = 'spn' +        AND ingest_request.ingest_type = 'pdf' +        AND ingest_request.base_url like 'https://andrzejklimczuk.com/%' +    GROUP BY grobid.status_code; + +     status_code | count  +    -------------+------- +             200 |   385 +           
  500 |   202 +                 |     2 +    (3 rows) + + +    COPY ( +        SELECT row_to_json(ingest_request.*) FROM ingest_request +        LEFT JOIN ingest_file_result ON +            ingest_file_result.ingest_type = ingest_request.ingest_type +            AND ingest_file_result.base_url = ingest_request.base_url +        LEFT JOIN grobid ON +            grobid.sha1hex = ingest_file_result.terminal_sha1hex +        WHERE +            ingest_request.link_source = 'spn' +            AND ingest_request.ingest_type = 'pdf' +            AND ingest_request.base_url like 'https://andrzejklimczuk.com/%' +            AND ingest_file_result.status = 'success' +            AND grobid.status_code = 500 +    ) TO '/srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json'; +    => COPY 202 + +    COPY ( +        SELECT row_to_json(ingest_request.*) FROM ingest_request +        LEFT JOIN ingest_file_result ON +            ingest_file_result.ingest_type = ingest_request.ingest_type +            AND ingest_file_result.base_url = ingest_request.base_url +        LEFT JOIN grobid ON +            grobid.sha1hex = ingest_file_result.terminal_sha1hex +        WHERE +            ingest_request.link_source = 'spn' +            AND ingest_request.ingest_type = 'pdf' +            AND ingest_request.base_url like 'https://andrzejklimczuk.com/%' +            AND ingest_file_result.status = 'success' +            AND grobid.status_code = 200 +    ) TO '/srv/sandcrawler/tasks/andrzejklimczuk_good_spn.rows.json'; +    => COPY 385 + +sudo -u sandcrawler pipenv run \ +    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.rows.json \ +    > /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json + +sudo -u sandcrawler pipenv run \ +    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json \ +    | jq '. 
+ {force_recrawl: true}' -c \ +    > /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.json + +cat /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.json \ +    | shuf \ +    | head -n60000 \ +    | jq . -c \ +    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1 + +cat /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json \ +    | shuf \ +    | head -n100 \ +    | jq . -c \ +    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1 + +cat /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json \ +    | shuf \ +    | head -n10000 \ +    | jq . -c \ +    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1 + +sudo -u sandcrawler pipenv run \ +    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json \ +    > /srv/sandcrawler/tasks/andrzejklimczuk_bad2_spn.json + +cat /srv/sandcrawler/tasks/andrzejklimczuk_bad2_spn.json \ +    | shuf \ +    | head -n60000 \ +    | jq . -c \ +    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1 | 
