Diffstat (limited to 'notes/tasks')

 notes/tasks/2021-09-09_pdf_url_lists.md            |   4
 notes/tasks/2021-12-06_regrobid.md                 |  91
 notes/tasks/2022-01-07_grobid_platform_pdfs.md     |  23
 notes/tasks/2022-03-07_ukraine_firedrill.md        | 225
 notes/tasks/2022-04-27_pdf_url_lists.md            |  72
 notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md  | 132

6 files changed, 547 insertions, 0 deletions
diff --git a/notes/tasks/2021-09-09_pdf_url_lists.md b/notes/tasks/2021-09-09_pdf_url_lists.md
index 52a3264..cd8176e 100644
--- a/notes/tasks/2021-09-09_pdf_url_lists.md
+++ b/notes/tasks/2021-09-09_pdf_url_lists.md
@@ -64,3 +64,7 @@

ingest_file_result table, pdf, success: 66,487,928

"Parsed web PDFs": `file_meta`, left join CDX (didn't do this one)

---

Uploaded all these to <https://archive.org/download/ia_scholarly_urls_2021-09-09>

diff --git a/notes/tasks/2021-12-06_regrobid.md b/notes/tasks/2021-12-06_regrobid.md
index 65e9fe3..5fb69d1 100644
--- a/notes/tasks/2021-12-06_regrobid.md
+++ b/notes/tasks/2021-12-06_regrobid.md
@@ -191,6 +191,84 @@ And some earlier files of interest on `aitio`:

        | pv -l \
        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1

## Ancient Fatcat Files

Files from an era where we didn't record GROBID version or status, even for
success.

    COPY (
        SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
        FROM grobid
        LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
        LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
        WHERE
            grobid.status_code = 200
            AND grobid.status IS NULL
            AND cdx.sha1hex IS NOT NULL
            AND fatcat_file.sha1hex IS NOT NULL
            -- sort of arbitrary "not recently" date filter
            AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15')
        -- LIMIT 5;
    )
    TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_status_null.json'
    WITH NULL '';

    cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_status_null.json \
        | rg -v "\\\\" \
        | jq . -c \
        | pv -l \
        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
    # 107k 0:00:03 [29.9k/s]

## Start Re-Processing Old GROBID Versions

    COPY (
        SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
        FROM grobid
        LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
        LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
        WHERE
            grobid.status = 'success'
            AND grobid.grobid_version NOT LIKE '0.7.%'
            AND cdx.sha1hex IS NOT NULL
            AND fatcat_file.sha1hex IS NOT NULL
            -- sort of arbitrary "not recently" date filter
            AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15')
        -- LIMIT 5;
    )
    TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.json'
    WITH NULL '';

This one is huge, and we want to process it in chunks of ~5 million lines at a
time:

    cd /srv/sandcrawler/tasks/
    cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.json \
        | split --lines 5000000 - ungrobided_fatcat.2021-12-11.grobid_old.split_ -d --additional-suffix .json

Submit individual batches like:

    cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.split_01.json \
        | rg -v "\\\\" \
        | jq . -c \
        | pv -l \
        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1

Overall progress:

    x ungrobided_fatcat.2021-12-11.grobid_old.split_00.json
    x ungrobided_fatcat.2021-12-11.grobid_old.split_01.json
    x ungrobided_fatcat.2021-12-11.grobid_old.split_02.json
    x ungrobided_fatcat.2021-12-11.grobid_old.split_03.json
    x ungrobided_fatcat.2021-12-11.grobid_old.split_04.json
    x ungrobided_fatcat.2021-12-11.grobid_old.split_05.json
    x ungrobided_fatcat.2021-12-11.grobid_old.split_06.json
    x ungrobided_fatcat.2021-12-11.grobid_old.split_07.json
    x ungrobided_fatcat.2021-12-11.grobid_old.split_08.json (small)

This finally finished on 2022-04-26. Hooray!
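The chunks above were submitted by hand, partly to pace how fast the topic
filled up. If pacing weren't a concern, a small loop like this sketch
(hypothetical; not what was actually run) would submit every chunk in order:

    # sketch: submit all split chunks sequentially, same pipeline as above
    for chunk in /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.split_*.json; do
        echo "submitting: $chunk"
        cat "$chunk" \
            | rg -v "\\\\" \
            | jq . -c \
            | pv -l \
            | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
    done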
## General Counts

How many fatcat files of what mimetype (reported in sandcrawler-db)?

@@ -287,3 +365,16 @@ What are the GROBID status codes for fatcat files?

Narrowed down:

     error          |         200 |        3
    (7 rows)

Ran the same query again on 2021-12-15:

         status     | status_code |  count
    ----------------+-------------+----------
     success        |         200 | 45092915
     error          |         500 |   302373
                    |             |   250335
                    |         200 |    53352
     bad-grobid-xml |         200 |       39
     error-timeout  |          -4 |       37
     error          |         200 |       34
     error          |         503 |        2
    (8 rows)
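The query being re-run here lives earlier in the notes file, outside this diff
hunk. It presumably has roughly this shape, assuming the `grobid` and
`fatcat_file` tables used in the other queries in these notes; this is a
reconstruction, not copied from the file:

    -- presumed form of the "GROBID status codes for fatcat files" query
    SELECT grobid.status, grobid.status_code, COUNT(*)
    FROM grobid
    LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
    WHERE fatcat_file.sha1hex IS NOT NULL
    GROUP BY grobid.status, grobid.status_code
    ORDER BY COUNT(*) DESC;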
diff --git a/notes/tasks/2022-01-07_grobid_platform_pdfs.md b/notes/tasks/2022-01-07_grobid_platform_pdfs.md
new file mode 100644
index 0000000..b5422c2
--- /dev/null
+++ b/notes/tasks/2022-01-07_grobid_platform_pdfs.md
@@ -0,0 +1,23 @@

Martin crawled more than 10 million new PDFs from various platform domains. We
should get these processed and included in sandcrawler-db.

## Select CDX Rows

    COPY (
        SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
        FROM cdx
        LEFT JOIN grobid ON grobid.sha1hex = cdx.sha1hex
        WHERE
            grobid.sha1hex IS NULL
            AND cdx.sha1hex IS NOT NULL
            AND cdx.warc_path LIKE 'PLATFORM-CRAWL-2020%'
        -- LIMIT 5;
    )
    TO '/srv/sandcrawler/tasks/ungrobided_platform_crawl.2022-01-07.cdx.json'
    WITH NULL '';
    => COPY 8801527

    cat /srv/sandcrawler/tasks/ungrobided_platform_crawl.2022-01-07.cdx.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1

    # for pdfextract, would be: sandcrawler-prod.unextracted

diff --git a/notes/tasks/2022-03-07_ukraine_firedrill.md b/notes/tasks/2022-03-07_ukraine_firedrill.md
new file mode 100644
index 0000000..c727a57
--- /dev/null
+++ b/notes/tasks/2022-03-07_ukraine_firedrill.md
@@ -0,0 +1,225 @@

Want to do priority crawling of Ukrainian web content, plus Russia and Belarus.

## What is Missing?

    (country_code:ua OR lang:uk)

    => 2022-03-08, before ingests: 470,986 total, 170,987 missing, almost all
       article-journal, peak in 2019, 55k explicitly OA
       later in the day, already some 22k missing found! wow
    => 2022-04-04, after ingests: 476,174 total, 131,063 missing, 49k OA missing

## Metadata Prep

- container metadata update (no code changes)
    x wikidata SPARQL update
    x chocula run
    x journal metadata update (fatcat)
    x update journal stats (fatcat extra)
- DOAJ article metadata import
    x prep and upload single JSON file

## Journal Homepage URL Crawl

x dump ukraine-related journal homepages from chocula DB
x create crawl config
x start crawl
x repeat for belarus and russia

    python3 -m chocula export_urls > homepage_urls.2022-03-08.tsv
    cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.ua/' | sort -u > homepage_urls.2022-03-08.ua_tld.tsv
    wc -l homepage_urls.2022-03-08.ua_tld.tsv
    1550 homepage_urls.2022-03-08.ua_tld.tsv

    cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.by/' | sort -u > homepage_urls.2022-03-08.by_tld.tsv
    cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.ru/' | sort -u > homepage_urls.2022-03-08.ru_tld.tsv

sqlite3:

    SELECT COUNT(*) FROM journal
    WHERE
        country = 'ua'
        OR lang = 'uk'
        OR name LIKE '%ukrain%'
        OR publisher LIKE '%ukrain%';
    => 1952

    SELECT COUNT(*) FROM homepage
    LEFT JOIN journal ON homepage.issnl = journal.issnl
    WHERE
        journal.country = 'ua'
        OR journal.lang = 'uk'
        OR journal.name LIKE '%ukrain%'
        OR journal.publisher LIKE '%ukrain%';
    => 1970

    .mode csv
    .once homepage_urls_ukraine.tsv
    SELECT homepage.url FROM homepage
    LEFT JOIN journal ON homepage.issnl = journal.issnl
    WHERE
        journal.country = 'ua'
        OR journal.lang = 'uk'
        OR journal.name LIKE '%ukrain%'
        OR journal.publisher LIKE '%ukrain%';

    .mode csv
    .once homepage_urls_russia.tsv
    SELECT homepage.url FROM homepage
    LEFT JOIN journal ON homepage.issnl = journal.issnl
    WHERE
        journal.country = 'ru'
        OR journal.lang = 'ru'
        OR journal.name LIKE '%russ%'
        OR journal.publisher LIKE '%russ%';

    .mode csv
    .once homepage_urls_belarus.tsv
    SELECT homepage.url FROM homepage
    LEFT JOIN journal ON homepage.issnl = journal.issnl
    WHERE
        journal.country = 'by'
        OR journal.lang = 'be'
        OR journal.name LIKE '%belarus%'
        OR journal.publisher LIKE '%belarus%';

    cat homepage_urls_ukraine.tsv homepage_urls.2022-03-08.ua_tld.tsv | sort -u > homepage_urls_ukraine_combined.2022-03-08.tsv

    wc -l homepage_urls.2022-03-08.ua_tld.tsv homepage_urls_ukraine.tsv homepage_urls_ukraine_combined.2022-03-08.tsv
     1550 homepage_urls.2022-03-08.ua_tld.tsv
     1971 homepage_urls_ukraine.tsv
     3482 homepage_urls_ukraine_combined.2022-03-08.tsv

    cat homepage_urls_russia.tsv homepage_urls.2022-03-08.ru_tld.tsv | sort -u > homepage_urls_russia_combined.2022-03-08.tsv

    wc -l homepage_urls_russia.tsv homepage_urls.2022-03-08.ru_tld.tsv homepage_urls_russia_combined.2022-03-08.tsv
     3728 homepage_urls_russia.tsv
     2420 homepage_urls.2022-03-08.ru_tld.tsv
     6030 homepage_urls_russia_combined.2022-03-08.tsv

    cat homepage_urls_belarus.tsv homepage_urls.2022-03-08.by_tld.tsv | sort -u > homepage_urls_belarus_combined.2022-03-08.tsv

    wc -l homepage_urls_belarus.tsv homepage_urls.2022-03-08.by_tld.tsv homepage_urls_belarus_combined.2022-03-08.tsv
      138 homepage_urls_belarus.tsv
       85 homepage_urls.2022-03-08.by_tld.tsv
      222 homepage_urls_belarus_combined.2022-03-08.tsv
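The `.mode csv` exports above were run interactively in the sqlite3 shell. The
same export can be done non-interactively, e.g. with this sketch (the
`chocula.sqlite` database filename is an assumption):

    # sketch: non-interactive export; output is a single column, so csv vs
    # tsv makes no practical difference here
    sqlite3 -csv chocula.sqlite "
        SELECT homepage.url FROM homepage
        LEFT JOIN journal ON homepage.issnl = journal.issnl
        WHERE
            journal.country = 'ua'
            OR journal.lang = 'uk'
            OR journal.name LIKE '%ukrain%'
            OR journal.publisher LIKE '%ukrain%';
    " > homepage_urls_ukraine.tsv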
## Landing Page Crawl

x create crawl config
x fatcat ingest query for related URLs
    => special request code/label?
x finish .by and .ru article URL dump, start crawling
x URL list filtered from new OAI-PMH feed
    => do we need to do full bulk load/dump, or not?
    => see the jq sketch after the OAI-PMH commands below
- URL list from partner (google)
- do we need to do alternative thing of iterating over containers, ingesting each?

    ./fatcat_ingest.py --env prod \
        --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-bulk \
        --ingest-type pdf \
        --allow-non-oa \
        query "country_code:ua OR lang:uk"

    # around Tue 08 Mar 2022 01:07:37 PM PST
    # Expecting 185659 release objects in search queries
    # didn't complete successfully? hrm

    # ok, retry "manually" (with kafkacat)
    ./fatcat_ingest.py --env prod \
        --ingest-type pdf \
        --allow-non-oa \
        query "country_code:ua OR lang:uk" \
        | pv -l \
        | gzip \
        > /srv/fatcat/ingest_ua_pdfs.2022-03-08.requests.json
    # Counter({'elasticsearch_release': 172881, 'estimate': 172881, 'ingest_request': 103318})
    # 103k 0:25:04 [68.7 /s]

    zcat /srv/fatcat/ingest_ua_pdfs.2022-03-08.requests.json \
        | rg -v "\\\\" \
        | jq . -c \
        | pv -l \
        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1

    zcat ingest_ua_pdfs.2022-03-08.requests.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_ua_pdfs.2022-03-08.txt.gz
    # 103k 0:00:02 [38.1k/s]

    ./fatcat_ingest.py --env prod \
        --ingest-type pdf \
        --allow-non-oa \
        query "country_code:by OR lang:be" \
        | pv -l \
        | gzip \
        > /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz
    # Expecting 2266 release objects in search queries
    # 1.29k 0:00:34 [37.5 /s]

    zcat /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz \
        | rg -v "\\\\" \
        | jq . -c \
        | pv -l \
        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1

    zcat ingest_by_pdfs.2022-03-09.requests.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_by_pdfs.2022-03-09.txt.gz

    ./fatcat_ingest.py --env prod \
        --ingest-type pdf \
        --allow-non-oa \
        query "country_code:ru OR lang:ru" \
        | pv -l \
        | gzip \
        > /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.json.gz
    # Expecting 1515246 release objects in search queries

    zcat /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.partial.json.gz \
        | rg -v "\\\\" \
        | jq . -c \
        | pv -l \
        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1

    zcat ingest_ru_pdfs.2022-03-09.requests.partial.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_ru_pdfs.2022-03-09.txt.gz

    zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.ua/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.ua_tld.txt
    # 309k 0:00:03 [81.0k/s]

    zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.by/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.by_tld.txt
    # 71.2k 0:00:03 [19.0k/s]

    zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.ru/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.ru_tld.txt
    # 276k 0:00:03 [72.9k/s]
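These TLD-filtered OAI-PMH lists are bare URLs, not ingest requests. If they
get submitted directly (rather than via a full bulk load), something like this
jq sketch could wrap them; `base_url` and `ingest_type` match the requests
seen elsewhere in these notes, but the other field is a guess at the request
schema, not confirmed:

    # sketch: wrap bare URLs as bulk ingest requests (link_source value is
    # an assumption; adjust to the real request schema)
    cat oai_pmh_partial_dump_2022_03_01_urls.ua_tld.txt \
        | jq -R -c '{base_url: ., ingest_type: "pdf", link_source: "oai"}' \
        | pv -l \
        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1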
### Landing Page Bulk Ingest

Running these 2022-03-24, after targeted crawl completed:

    zcat /srv/fatcat/tasks/ingest_ua_pdfs.2022-03-08.requests.json.gz \
        | rg -v "\\\\" \
        | jq . -c \
        | pv -l \
        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
    # 103k 0:00:02 [36.1k/s]

    zcat /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz \
        | rg -v "\\\\" \
        | jq . -c \
        | pv -l \
        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
    # 1.29k 0:00:00 [15.8k/s]

    zcat /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.partial.json.gz \
        | rg -v "\\\\" \
        | jq . -c \
        | pv -l \
        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
    # 546k 0:00:13 [40.6k/s]

It will probably take a week or more for these to complete.

## Outreach

- openalex
- sucho.org
- ceeol.com

diff --git a/notes/tasks/2022-04-27_pdf_url_lists.md b/notes/tasks/2022-04-27_pdf_url_lists.md
new file mode 100644
index 0000000..273ff32
--- /dev/null
+++ b/notes/tasks/2022-04-27_pdf_url_lists.md
@@ -0,0 +1,72 @@

Another dump of PDF URLs for partners. This time we want to provide a TSV with
full wayback download URLs, as well as "access" (display) URLs.

    export TASKDATE=2022-04-27

## "Ingested", AKA "Targeted", PDF URLs

These are URLs where we did a successful ingest run.

    COPY (
        SELECT
            terminal_sha1hex as pdf_sha1hex,
            ('https://web.archive.org/web/' || terminal_dt || 'id_/' || terminal_url) as crawl_url,
            ('https://web.archive.org/web/' || terminal_dt || '/' || terminal_url) as display_url
        FROM ingest_file_result
        WHERE
            ingest_type = 'pdf'
            AND status = 'success'
            AND hit = true
        ORDER BY terminal_sha1hex ASC
        -- LIMIT 10;
    )
    TO '/srv/sandcrawler/tasks/ia_wayback_pdf_ingested.2022-04-27.tsv'
    WITH NULL '';
    => COPY 85712674

May contain duplicates, by sha1hex, by URL, or both.

Note that this could be filtered by timestamp, to make it monthly/annual.
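One way to do that filtering is on the wayback capture timestamp already
selected above; a sketch (this filters by capture time, not ingest time, and
works because `terminal_dt` is a YYYYMMDDhhmmss string, so plain string range
comparison is correct):

    -- sketch: hypothetical monthly (April 2022) variant of the query above
    SELECT
        terminal_sha1hex as pdf_sha1hex,
        ('https://web.archive.org/web/' || terminal_dt || 'id_/' || terminal_url) as crawl_url,
        ('https://web.archive.org/web/' || terminal_dt || '/' || terminal_url) as display_url
    FROM ingest_file_result
    WHERE
        ingest_type = 'pdf'
        AND status = 'success'
        AND hit = true
        AND terminal_dt >= '20220401000000'
        AND terminal_dt <  '20220501000000'
    ORDER BY terminal_sha1hex ASC;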
## All CDX PDFs

"All web PDFs": CDX query; left join `file_meta`, but don't require a match:

    COPY (
        SELECT
            cdx.sha1hex as pdf_sha1hex,
            ('https://web.archive.org/web/' || cdx.datetime || 'id_/' || cdx.url) as crawl_url,
            ('https://web.archive.org/web/' || cdx.datetime || '/' || cdx.url) as display_url
        FROM cdx
        LEFT JOIN file_meta
        ON
            cdx.sha1hex = file_meta.sha1hex
        WHERE
            file_meta.mimetype = 'application/pdf'
            OR (
                file_meta.mimetype IS NULL
                AND cdx.mimetype = 'application/pdf'
            )
        ORDER BY cdx.sha1hex ASC
        -- LIMIT 10;
    )
    TO '/srv/sandcrawler/tasks/ia_wayback_pdf_speculative.2022-04-27.tsv'
    WITH NULL '';
    => COPY 161504070

Should be unique by wayback URL; may contain near-duplicates, or duplicates by
sha1hex.

## Upload to archive.org

TODO: next time compress these files first (gzip/pigz); see the sketch below.

ia upload ia_scholarly_urls_$TASKDATE \
    -m collection:ia_biblio_metadata \
    -m title:"IA Scholarly URLs ($TASKDATE)" \
    -m date:$TASKDATE \
    -m creator:"Internet Archive Web Group" \
    -m description:"URL lists to PDFs on the web (and preserved in the wayback machine) which are likely to contain research materials." \
    /srv/sandcrawler/tasks/ia_wayback_pdf_ingested.$TASKDATE.tsv /srv/sandcrawler/tasks/ia_wayback_pdf_speculative.$TASKDATE.tsv
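A minimal sketch of that compression step for next time (assumes pigz is
installed; plain gzip works the same way, just slower):

    # compress both TSVs in place (add --keep to retain the uncompressed
    # originals)
    pigz -9 \
        /srv/sandcrawler/tasks/ia_wayback_pdf_ingested.$TASKDATE.tsv \
        /srv/sandcrawler/tasks/ia_wayback_pdf_speculative.$TASKDATE.tsv
    # then run the same `ia upload` command as above, with `.tsv.gz` paths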
diff --git a/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md b/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md
new file mode 100644
index 0000000..74d3857
--- /dev/null
+++ b/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md
@@ -0,0 +1,132 @@

Had a huge number of SPN requests for the andrzejklimczuk.com domain,
presumably from the author.

Many were duplicates (same file, multiple releases; often things like zenodo
duplication). Many were also GROBID 500s, due to truncated common crawl
captures.

Needed to clean up! Basically sorted through a few editgroups manually, then
rejected all the rest and manually re-submitted with the below queries and
commands:

    SELECT COUNT(*) from ingest_request
    LEFT JOIN ingest_file_result ON
        ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    LEFT JOIN grobid ON
        grobid.sha1hex = ingest_file_result.terminal_sha1hex
    WHERE
        ingest_request.link_source = 'spn'
        AND ingest_request.ingest_type = 'pdf'
        AND ingest_request.base_url like 'https://andrzejklimczuk.com/%';
    => 589

    SELECT ingest_file_result.status, COUNT(*) from ingest_request
    LEFT JOIN ingest_file_result ON
        ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    LEFT JOIN grobid ON
        grobid.sha1hex = ingest_file_result.terminal_sha1hex
    WHERE
        ingest_request.link_source = 'spn'
        AND ingest_request.ingest_type = 'pdf'
        AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
    GROUP BY ingest_file_result.status;

         status     | count
    ----------------+-------
     cdx-error      |     1
     success        |   587
     wrong-mimetype |     1
    (3 rows)

    SELECT grobid.status_code, COUNT(*) from ingest_request
    LEFT JOIN ingest_file_result ON
        ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    LEFT JOIN grobid ON
        grobid.sha1hex = ingest_file_result.terminal_sha1hex
    WHERE
        ingest_request.link_source = 'spn'
        AND ingest_request.ingest_type = 'pdf'
        AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
    GROUP BY grobid.status_code;

     status_code | count
    -------------+-------
             200 |   385
             500 |   202
                 |     2
    (3 rows)

    COPY (
        SELECT row_to_json(ingest_request.*) FROM ingest_request
        LEFT JOIN ingest_file_result ON
            ingest_file_result.ingest_type = ingest_request.ingest_type
            AND ingest_file_result.base_url = ingest_request.base_url
        LEFT JOIN grobid ON
            grobid.sha1hex = ingest_file_result.terminal_sha1hex
        WHERE
            ingest_request.link_source = 'spn'
            AND ingest_request.ingest_type = 'pdf'
            AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
            AND ingest_file_result.status = 'success'
            AND grobid.status_code = 500
    ) TO '/srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json';
    => COPY 202

    COPY (
        SELECT row_to_json(ingest_request.*) FROM ingest_request
        LEFT JOIN ingest_file_result ON
            ingest_file_result.ingest_type = ingest_request.ingest_type
            AND ingest_file_result.base_url = ingest_request.base_url
        LEFT JOIN grobid ON
            grobid.sha1hex = ingest_file_result.terminal_sha1hex
        WHERE
            ingest_request.link_source = 'spn'
            AND ingest_request.ingest_type = 'pdf'
            AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
            AND ingest_file_result.status = 'success'
            AND grobid.status_code = 200
    ) TO '/srv/sandcrawler/tasks/andrzejklimczuk_good_spn.rows.json';
    => COPY 385

sudo -u sandcrawler pipenv run \
    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.rows.json \
    > /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json

sudo -u sandcrawler pipenv run \
    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json \
    | jq '. + {force_recrawl: true}' -c \
    > /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.json

cat /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.json \
    | shuf \
    | head -n60000 \
    | jq . -c \
    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1

cat /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json \
    | shuf \
    | head -n100 \
    | jq . -c \
    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1

cat /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json \
    | shuf \
    | head -n10000 \
    | jq . -c \
    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1

sudo -u sandcrawler pipenv run \
    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json \
    > /srv/sandcrawler/tasks/andrzejklimczuk_bad2_spn.json

cat /srv/sandcrawler/tasks/andrzejklimczuk_bad2_spn.json \
    | shuf \
    | head -n60000 \
    | jq . -c \
    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1