aboutsummaryrefslogtreecommitdiffstats
path: root/notes
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2022-03-11 12:05:29 -0800
committerBryan Newbold <bnewbold@archive.org>2022-03-11 12:05:29 -0800
commit9d096b26e35802553263d6472a534deb381e65da (patch)
treeed28dfc8bb4e1d00410a716f498a3c717fdebf15 /notes
parent4aa53c730f5fe9e0a5249d10d0b6ac0ff3c0db7c (diff)
downloadsandcrawler-9d096b26e35802553263d6472a534deb381e65da.tar.gz
sandcrawler-9d096b26e35802553263d6472a534deb381e65da.zip
partial notes on .ua urgent crawling
Diffstat (limited to 'notes')
-rw-r--r--notes/tasks/2022-03-07_ukraine_firedrill.md196
1 files changed, 196 insertions, 0 deletions
diff --git a/notes/tasks/2022-03-07_ukraine_firedrill.md b/notes/tasks/2022-03-07_ukraine_firedrill.md
new file mode 100644
index 0000000..18bce5b
--- /dev/null
+++ b/notes/tasks/2022-03-07_ukraine_firedrill.md
@@ -0,0 +1,196 @@
+
+Want to do priority crawling of Ukranian web content, plus Russia and Belarus.
+
+
+## What is Missing?
+
+ (country_code:ua OR lang:uk)
+ => 2022-03-08, before ingests: 470,986 total, 170,987 missing, almost all article-journal, peak in 2019, 55k explicitly OA
+ => later in day, already some 22k missing found! wow
+
+
+## Metadata Prep
+
+- container metadata update (no code changes)
+ x wikidata SPARQL update
+ x chocula run
+ x journal metadata update (fatcat)
+ x update journal stats (fatcat extra)
+- DOAJ article metadata import
+ x prep and upload single JSON file
+
+
+## Journal Homepage URL Crawl
+
+x dump ukraine-related journal homepages from chocula DB
+x create crawl config
+x start crawl
+x repeat for belarus and russia
+
+
+ python3 -m chocula export_urls > homepage_urls.2022-03-08.tsv
+ cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.ua/' | sort -u > homepage_urls.2022-03-08.ua_tld.tsv
+ wc -l homepage_urls.2022-03-08.ua_tld.tsv
+ 1550 homepage_urls.2022-03-08.ua_tld.tsv
+
+ cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.by/' | sort -u > homepage_urls.2022-03-08.by_tld.tsv
+ cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.ru/' | sort -u > homepage_urls.2022-03-08.ru_tld.tsv
+
+sqlite3:
+
+ select count(*) from journal where country = 'ua' or lang = 'uk' or name like '%ukrain%' or publi
+ 1952
+
+ SELECT COUNT(*) FROM homepage
+ LEFT JOIN journal ON homepage.issnl = journal.issnl
+ WHERE
+ journal.country = 'ua'
+ OR journal.lang = 'uk'
+ OR journal.name like '%ukrain%'
+ OR journal.publisher like '%ukrain%';
+ => 1970
+
+ .mode csv
+ .once homepage_urls_ukraine.tsv
+ SELECT homepage.url FROM homepage
+ LEFT JOIN journal ON homepage.issnl = journal.issnl
+ WHERE
+ journal.country = 'ua'
+ OR journal.lang = 'uk'
+ OR journal.name like '%ukrain%'
+ OR journal.publisher like '%ukrain%';
+
+ .mode csv
+ .once homepage_urls_russia.tsv
+ SELECT homepage.url FROM homepage
+ LEFT JOIN journal ON homepage.issnl = journal.issnl
+ WHERE
+ journal.country = 'ru'
+ OR journal.lang = 'ru'
+ OR journal.name like '%russ%'
+ OR journal.publisher like '%russ%';
+
+ .mode csv
+ .once homepage_urls_belarus.tsv
+ SELECT homepage.url FROM homepage
+ LEFT JOIN journal ON homepage.issnl = journal.issnl
+ WHERE
+ journal.country = 'by'
+ OR journal.lang = 'be'
+ OR journal.name like '%belarus%'
+ OR journal.publisher like '%belarus%';
+
+ cat homepage_urls_ukraine.tsv homepage_urls.2022-03-08.ua_tld.tsv | sort -u > homepage_urls_ukraine_combined.2022-03-08.tsv
+
+ wc -l homepage_urls.2022-03-08.ua_tld.tsv homepage_urls_ukraine.tsv homepage_urls_ukraine_combined.2022-03-08.tsv
+ 1550 homepage_urls.2022-03-08.ua_tld.tsv
+ 1971 homepage_urls_ukraine.tsv
+ 3482 homepage_urls_ukraine_combined.2022-03-08.tsv
+
+ cat homepage_urls_russia.tsv homepage_urls.2022-03-08.ru_tld.tsv | sort -u > homepage_urls_russia_combined.2022-03-08.tsv
+
+ wc -l homepage_urls_russia.tsv homepage_urls.2022-03-08.ru_tld.tsv homepage_urls_russia_combined.2022-03-08.tsv
+ 3728 homepage_urls_russia.tsv
+ 2420 homepage_urls.2022-03-08.ru_tld.tsv
+ 6030 homepage_urls_russia_combined.2022-03-08.tsv
+
+
+ cat homepage_urls_belarus.tsv homepage_urls.2022-03-08.by_tld.tsv | sort -u > homepage_urls_belarus_combined.2022-03-08.tsv
+
+ wc -l homepage_urls_belarus.tsv homepage_urls.2022-03-08.by_tld.tsv homepage_urls_belarus_combined.2022-03-08.tsv
+ 138 homepage_urls_belarus.tsv
+ 85 homepage_urls.2022-03-08.by_tld.tsv
+ 222 homepage_urls_belarus_combined.2022-03-08.tsv
+
+
+## Landing Page Crawl
+
+x create crawl config
+x fatcat ingest query for related URLs
+ => special request code/label?
+x finish .by and .ru article URL dump, start crawling
+x URL list filtered from new OAI-PMH feed
+ => do we need to do full bulk load/dump, or not?
+- URL list from partner (google)
+- do we need to do alternative thing of iterating over containers, ingesting each?
+
+ ./fatcat_ingest.py --env prod \
+ --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-bulk \
+ --ingest-type pdf \
+ --allow-non-oa \
+ query "country_code:ua OR lang:uk"
+
+ # around Tue 08 Mar 2022 01:07:37 PM PST
+ # Expecting 185659 release objects in search queries
+ # didn't complete successfully? hrm
+
+ # ok, retry "manually" (with kafkacat)
+ ./fatcat_ingest.py --env prod \
+ --ingest-type pdf \
+ --allow-non-oa \
+ query "country_code:ua OR lang:uk" \
+ | pv -l \
+ | gzip \
+ > /srv/fatcat/ingest_ua_pdfs.2022-03-08.requests.json
+ # Counter({'elasticsearch_release': 172881, 'estimate': 172881, 'ingest_request': 103318})
+ # 103k 0:25:04 [68.7 /s]
+
+ zcat /srv/fatcat/ingest_ua_pdfs.2022-03-08.requests.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+ zcat ingest_ua_pdfs.2022-03-08.requests.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_ua_pdfs.2022-03-08.txt.gz
+ # 103k 0:00:02 [38.1k/s]
+
+ ./fatcat_ingest.py --env prod \
+ --ingest-type pdf \
+ --allow-non-oa \
+ query "country_code:by OR lang:be" \
+ | pv -l \
+ | gzip \
+ > /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz
+ # Expecting 2266 release objects in search queries
+ # 1.29k 0:00:34 [37.5 /s]
+
+ zcat /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+ zcat ingest_by_pdfs.2022-03-09.requests.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_by_pdfs.2022-03-09.txt.gz
+
+ ./fatcat_ingest.py --env prod \
+ --ingest-type pdf \
+ --allow-non-oa \
+ query "country_code:ru OR lang:ru" \
+ | pv -l \
+ | gzip \
+ > /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.json.gz
+ # Expecting 1515246 release objects in search queries
+
+ zcat /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.partial.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+ zcat ingest_ru_pdfs.2022-03-09.requests.partial.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_ru_pdfs.2022-03-09.txt.gz
+
+
+ zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.ua/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.ua_tld.txt
+ # 309k 0:00:03 [81.0k/s]
+
+ zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.by/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.by_tld.txt
+ # 71.2k 0:00:03 [19.0k/s]
+
+ zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.ru/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.ru_tld.txt
+ # 276k 0:00:03 [72.9k/s]
+
+## Outreach
+
+- openalex
+- sucho.org
+- ceeol.com