Diffstat (limited to 'notes/tasks')
-rw-r--r--   notes/tasks/2020-01-06_heuristic_cdx.txt    |  37
-rw-r--r--   notes/tasks/2020-01-27_cleanup_cdx.md       |  34
-rw-r--r--   notes/tasks/2020-01-27_grobid_backfill.md   |  40
-rw-r--r--   notes/tasks/2020-02-14_pdftrio.md           | 162
-rw-r--r--   notes/tasks/2020-07-22_processing_holes.md  | 120
-rw-r--r--   notes/tasks/2020-08-20_file_meta.md         |  66
-rw-r--r--   notes/tasks/2020-10-21_pdfextract_holes.md  |  74
-rw-r--r--   notes/tasks/2021-09-09_pdf_url_lists.md     |  66
8 files changed, 599 insertions, 0 deletions
diff --git a/notes/tasks/2020-01-06_heuristic_cdx.txt b/notes/tasks/2020-01-06_heuristic_cdx.txt
new file mode 100644
index 0000000..209fa4f
--- /dev/null
+++ b/notes/tasks/2020-01-06_heuristic_cdx.txt
@@ -0,0 +1,37 @@

Wanted to include a large number of additional CDX lines based on a regex
pattern. These are primarily .edu domains with paths that look like user
accounts *and* have .pdf file extensions.

## Commands

aitio:/fast/gwb_pdfs

    pdfs/gwb-pdf-20191005172329-url-heuristics-edu
    pdfs/gwb-pdf-20191005172329-url-heuristics

To filter down to unique url/sha1 pairs:

    cat raw.cdx | sort -u -t' ' -k3,6 -S 4G > uniq.cdx

    cat gwb-pdf-20191005172329-url-heuristics-edu/part-r-000* | sort -u -t' ' -k3,6 -S 4G > gwb-pdf-20191005172329-url-heuristics-edu.uniq_url_sha1.cdx
    cat gwb-pdf-20191005172329-url-heuristics/part-r-000* | sort -u -t' ' -k3,6 -S 4G > gwb-pdf-20191005172329-url-heuristics.uniq_url_sha1.cdx

     7241795 gwb-pdf-20191005172329-url-heuristics-edu.uniq_url_sha1.cdx
    41137888 gwb-pdf-20191005172329-url-heuristics.uniq_url_sha1.cdx

    cut -d' ' -f6 gwb-pdf-20191005172329-url-heuristics-edu.uniq_url_sha1.cdx | sort -u -S 4G | wc -l
    7241795

    cut -d' ' -f6 gwb-pdf-20191005172329-url-heuristics.uniq_url_sha1.cdx | sort -u -S 4G | wc -l
    41137888

    ./persist_tool.py cdx /fast/gwb_pdf/gwb-pdf-20191005172329-url-heuristics-edu.uniq_url_sha1.cdx
    Worker: Counter({'total': 7239153, 'insert-cdx': 6845283, 'update-cdx': 0})
    CDX lines pushed: Counter({'total': 7241795, 'pushed': 7239153, 'skip-parse': 2603, 'skip-mimetype': 39})

    ./persist_tool.py cdx /fast/gwb_pdf/gwb-pdf-20191005172329-url-heuristics.uniq_url_sha1.cdx
    Worker: Counter({'total': 41030360, 'insert-cdx': 22430064, 'update-cdx': 0})
    CDX lines pushed: Counter({'total': 41137888, 'pushed': 41030360, 'skip-mimetype': 87341, 'skip-parse': 20187})

diff --git a/notes/tasks/2020-01-27_cleanup_cdx.md b/notes/tasks/2020-01-27_cleanup_cdx.md
new file mode 100644
index 0000000..54db92e
--- /dev/null
+++ b/notes/tasks/2020-01-27_cleanup_cdx.md
@@ -0,0 +1,34 @@

Accidentally seem to have backfilled many CDX lines with non-PDF content.
Should clear these out!

Something like:

    mimetype = 'text/html'
    not in file_meta

Or maybe instead:

    mimetype = 'text/html'
    not in file_meta

SQL:

    SELECT * FROM cdx WHERE mimetype = 'text/html' AND row_created < '2019-10-01' LIMIT 5;
    SELECT COUNT(1) FROM cdx WHERE mimetype = 'text/html' AND row_created < '2019-10-01';
    => 24841846

    SELECT * FROM cdx LEFT JOIN file_meta ON file_meta.sha1hex = cdx.sha1hex WHERE cdx.mimetype = 'text/html' AND file_meta.sha256hex IS NULL LIMIT 5;
    SELECT COUNT(1) FROM cdx LEFT JOIN file_meta ON cdx.sha1hex = file_meta.sha1hex WHERE cdx.mimetype = 'text/html' AND file_meta.sha256hex IS NULL;
    => 24547552

    DELETE FROM cdx
    WHERE sha1hex IN
        (SELECT cdx.sha1hex
         FROM cdx
         LEFT JOIN file_meta ON file_meta.sha1hex = cdx.sha1hex
         WHERE cdx.mimetype = 'text/html' AND file_meta.sha256hex IS NULL);
    => DELETE 24553428

Slightly more rows deleted than counted above; the DELETE probably should have
had an `AND cdx.mimetype = 'text/html'` condition in its outer WHERE clause.

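For the record, a sketch of that more tightly scoped DELETE (not re-run; same
tables as above), with the mimetype condition repeated in the outer WHERE
clause:

    -- only remove the text/html rows themselves, not other captures
    -- that happen to share a sha1hex
    DELETE FROM cdx
    WHERE cdx.mimetype = 'text/html'
      AND sha1hex IN
        (SELECT cdx.sha1hex
         FROM cdx
         LEFT JOIN file_meta ON file_meta.sha1hex = cdx.sha1hex
         WHERE cdx.mimetype = 'text/html' AND file_meta.sha256hex IS NULL);
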
diff --git a/notes/tasks/2020-01-27_grobid_backfill.md b/notes/tasks/2020-01-27_grobid_backfill.md
new file mode 100644
index 0000000..d70e203
--- /dev/null
+++ b/notes/tasks/2020-01-27_grobid_backfill.md
@@ -0,0 +1,40 @@

Recently added a bunch of PDFs to sandcrawler-db. Want to GROBID extract the
~15m which haven't been processed yet. Also want to re-GROBID a batch of
PDFs-in-zipfiles from archive.org; will probably also want to re-GROBID other
petabox files soon.

## pre-1923 zipfile re-extraction

Exact commands (in parallel):

    fd .zip /srv/sandcrawler/tasks/crossref-pre-1909-scholarly-works/ | \
        parallel -j16 --progress --joblog extract_tasks.log --resume-failed \
        './grobid_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --grobid-host http://localhost:8070 extract-zipfile {}'

    fd .zip /srv/sandcrawler/tasks/crossref-pre-1923-scholarly-works/ | \
        parallel -j16 --progress --joblog extract_tasks_1923.log --resume-failed \
        './grobid_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --grobid-host http://localhost:8070 extract-zipfile {}'

## petabox re-extraction

This was run around 2020-02-03. There are a few million remaining PDFs that
have only partial file metadata (`file_meta`), meaning they were processed
with an old version of the sandcrawler code. Want to get them all covered
(maybe even DELETE the missing ones), so re-GROBIDing petabox-only files.

There are about 2,887,834 files in petabox; only 46,232 need re-processing (!).

    psql sandcrawler < dump_regrobid_pdf_petabox.sql
    cat dump_regrobid_pdf_petabox.2020-02-03.json | sort -S 4G | uniq -w 40 | cut -f2 > dump_regrobid_pdf_petabox.2020-02-03.uniq.json

This is pretty few... maybe these would even have been caught by the wayback
backfill?

Small start:

    head /srv/sandcrawler/tasks/dump_regrobid_pdf_petabox.2020-02-03.uniq.json | ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -

Full batch, 25x parallel:

    cat /srv/sandcrawler/tasks/dump_regrobid_pdf_petabox.2020-02-03.uniq.json | pv -l | parallel -j25 --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -

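To spot-check that extraction output is actually landing in Kafka while these
batches run, something like the following (a sketch; the output topic name is
an assumption, following the env/topic naming used elsewhere in these notes):

    # peek at the last few messages per partition, truncated for readability
    kafkacat -C -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.grobid-output-pg -o -5 -e | cut -c1-200
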
diff --git a/notes/tasks/2020-02-14_pdftrio.md b/notes/tasks/2020-02-14_pdftrio.md
new file mode 100644
index 0000000..e6f8d8e
--- /dev/null
+++ b/notes/tasks/2020-02-14_pdftrio.md
@@ -0,0 +1,162 @@

First end-to-end `pdf_trio` results!

## Source

Will use AIT partner #1830 (U Alberta) CDX as input. These are unique by
digest, about 100k.

    ArchiveIt-Collection-1830.download.cdx

## Testing/Prep

Versions/setup:

    sandcrawler: f613f69a40fcc9a445f21cadd35d7c36c8061db8
    => patched to 'auto' mode

    pdf_trio: 03bd3fdc15418462b2b1582e4f967f26ddcb43e2

    pdftrio: 'auto' mode

    uwsgi: 16x processes

    sudo docker run --rm -p 8501:8501 -e TF_XLA_FLAGS=--tf_xla_cpu_global_jit -e KMP_AFFINITY=granularity=fine,compact,1,0 -e KMP_BLOCKTIME=0 -e OMP_NUM_THREADS=24 -e TENSORFLOW_INTER_OP_PARALLELISM=1 -e TENSORFLOW_INTRA_OP_PARALLELISM=24 -v /srv/pdftrio//models/bert_models:/models/bert_model -v /srv/pdftrio//models/pdf_image_classifier_model:/models/image_model -v /srv/pdftrio//config/tfserving_models_docker.config:/models/tfserving_models.config -v /srv/pdftrio/config/tfserving_batch.config:/models/tfserving_batch.config --name pdftrio-tfserving tensorflow/serving --model_config_file=/models/tfserving_models.config --enable_batching=true --batching_parameters_file=/models/tfserving_batch.config

Basic testing:

    head -n100 /srv/sandcrawler/tasks/ArchiveIt-Collection-1830.download.cdx | parallel -j20 --pipe --linebuffer ./pdftrio_tool.py --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx - | jq .

    head -n100 /srv/sandcrawler/tasks/ArchiveIt-Collection-1830.download.cdx | parallel -j20 --pipe --linebuffer ./pdftrio_tool.py --kafka-mode --kafka-env qa --kafka-hosts wbgrp-svc263.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc285.us.archive.org --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx -
    => Running in kafka output mode, publishing to sandcrawler-qa.pdftrio-output

On the persist side:

    kafkacat -C -b wbgrp-svc263.us.archive.org -t sandcrawler-qa.pdftrio-output | head | jq .
    => looks fine

    ./sandcrawler_worker.py --kafka-hosts wbgrp-svc263.us.archive.org --env qa persist-pdftrio
    => Consuming from kafka topic sandcrawler-qa.pdftrio-output, group persist-pdftrio

Ah, don't forget: start the persist worker before writing to the topic!
Otherwise the consumer group offsets would need to be reset back to the start.

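If the persist worker does come up late, a sketch of rewinding its consumer
group with the stock Kafka CLI (an assumption that those tools are available on
a broker host; the worker must be stopped while resetting):

    # rewind the persist-pdftrio group to the earliest offsets on the QA topic
    kafka-consumer-groups.sh \
        --bootstrap-server wbgrp-svc263.us.archive.org:9092 \
        --group persist-pdftrio \
        --topic sandcrawler-qa.pdftrio-output \
        --reset-offsets --to-earliest --execute
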
Seems to be only a single pdftotext instance running? Very low CPU.

    head -n500 /srv/sandcrawler/tasks/ArchiveIt-Collection-1830.download.cdx | parallel -j40 -N1 --pipe --round-robin --linebuffer ./pdftrio_tool.py --kafka-mode --kafka-env qa --kafka-hosts wbgrp-svc263.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc285.us.archive.org --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx -

That is much better! CPU still not pegged, so maybe could do 50x processes?
Lots of I/O wait. Blech.

Zero ("0") values were not getting persisted for any columns (fixed in
sandcrawler/db.py).

`models_date` was not getting set. Added `PDFTRIO_MODELS_DATE="2020-01-01"` to
the env. (TODO: ansible)

## Prod Run

    ./sandcrawler_worker.py --kafka-hosts wbgrp-svc263.us.archive.org --env prod persist-pdftrio

    time cat /srv/sandcrawler/tasks/ArchiveIt-Collection-1830.download.cdx | parallel -j40 -N1 --pipe --round-robin --linebuffer ./pdftrio_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc285.us.archive.org --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx -

Worker CPU is basically blocked on pdftotext, with multiple processes at 100%
CPU. Presumably I/O wait? Though not totally sure.

htop:

      PID USER      PRI  NI  VIRT   RES   SHR S CPU% MEM%   TIME+  Command
    17951 pdftrio    20   0 51756 12868  5856 R 90.1  0.0  0:06.61 pdftotext -nopgbrk -eol unix -enc UTF-8 /tmp/research-p
    17870 pdftrio    20   0 52004 12964  5684 R 87.4  0.0  0:08.61 pdftotext -nopgbrk -eol unix -enc UTF-8 /tmp/research-p
    13735 root       20   0 10.4G 3815M  4144 S 79.6  7.6 48h02:37 tensorflow_model_server --port=8500 --rest_api_port=850
    14522 pdftrio    20   0 2817M 1331M 16896 R 43.1  2.6  0:57.75 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
    18027 pdftrio    20   0 49192 10692  6116 R 39.8  0.0  0:00.61 pdftotext -nopgbrk -eol unix -enc UTF-8 /tmp/research-p
    14518 pdftrio    20   0 2818M 1336M 16836 S 33.3  2.7  0:47.46 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
    14504 pdftrio    20   0 2731M 1310M 13164 D 32.6  2.6  0:34.81 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
    14526 pdftrio    20   0 2816M 1333M 16832 R 28.7  2.7  0:57.22 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
    14500 pdftrio    20   0 2729M 1306M 13160 R 20.9  2.6  0:22.57 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
    14492 pdftrio    20   0 2729M 1307M 13156 S 17.6  2.6  0:17.91 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
    14508 pdftrio    20   0 2734M 1312M 14380 D 14.4  2.6  0:38.75 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
    14496 pdftrio    20   0 2728M 1300M 13160 S 13.7  2.6  0:18.00 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
    17314 sandcrawl  20   0 56668 18228  4304 D 13.7  0.0  0:02.31 perl /usr/bin/parallel -j40 -N1 --pipe --round-robin --
    14472 pdftrio    20   0 2725M 1283M 13136 S 12.4  2.6  0:05.69 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
    14513 pdftrio    20   0 2730M 1309M 14300 S 11.1  2.6  0:40.32 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
    14480 pdftrio    20   0 2725M 1291M 13144 S 10.4  2.6  0:08.77 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
    14488 pdftrio    20   0 2725M 1294M 13152 S  9.8  2.6  0:08.18 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
    14468 pdftrio    20   0 2717M 1271M 13088 S  6.5  2.5  0:02.42 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
    17411 sandcrawl  20   0  556M 53840 14936 S  6.5  0.1  0:01.57 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
    14530 pdftrio    20   0 2524M 1252M  3492 S  4.6  2.5  0:12.72 /usr/bin/uwsgi --ini /srv/pdftrio/src/uwsgi.ini
     7311 bnewbold   20   0 27716  5520  3128 R  3.9  0.0  0:41.59 htop
    17444 sandcrawl  20   0  552M 50456 14892 S  3.9  0.1  0:01.54 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
    18042 pdftrio    20   0 46068  6588  5328 R  3.3  0.0  0:00.05 pdftotext -nopgbrk -eol unix -enc UTF-8 /tmp/research-p
    18043 pdftrio    20   0     4     4     0 R  2.6  0.0  0:00.04
     2203 grobid     20   0 6334M  126M  4188 S  0.7  0.3  3h27:32 /usr/lib/jvm/java-8-openjdk-amd64/bin/java -XX:MaxMetas
    17419 sandcrawl  20   0  619M  116M 15248 S  0.7  0.2  0:02.68 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
    17440 sandcrawl  20   0  578M 76948 15160 S  0.7  0.1  0:01.54 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
    13848 root       20   0     0     0     0 D  0.7  0.0  0:00.69 kworker/u60:1
    17443 sandcrawl  20   0  578M 76500 14912 S  0.7  0.1  0:01.74 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
    17414 sandcrawl  20   0  580M 77720 15036 S  0.0  0.2  0:01.77 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
    17432 sandcrawl  20   0  563M 61460 14976 S  0.0  0.1  0:01.59 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
    17442 sandcrawl  20   0  561M 53096 15240 S  0.0  0.1  0:01.47 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
    17433 sandcrawl  20   0  559M 57160 15176 S  0.0  0.1  0:01.52 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
    17431 sandcrawl  20   0  554M 50960 14892 S  0.0  0.1  0:01.37 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -
    17413 sandcrawl  20   0  554M 52376 14920 S  0.0  0.1  0:01.57 python3 ./pdftrio_tool.py --kafka-mode --kafka-env qa -

dstat:

    ----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system--
    usr sys idl wai hiq siq| read  writ| recv  send|  in   out | int   csw
     32   9  22  37   0   0|   0    37M|  20M   12M|   0     0 |  35k   64k
     20   6  24  50   0   0|   0    20M|  30M 5662k|   0     0 |  27k   48k
     27   7  24  43   0   0|   0    26M|8712k 6289k|   0     0 |  21k  114k
     30   8  23  38   0   0|4096B   61M|  17M   20M|   0     0 |  31k   54k
     33   6  17  44   0   0|   0    32M|  14M 6384k|   0     0 |  27k   46k
     25   6  24  44   0   0|   0    19M|  18M   13M|   0     0 |  27k  179k
     40   6  19  35   0   0|8192B   25M|7855k 6661k|   0     0 |  31k   85k
     59   8  12  20   0   0|   0    39M|4177k   33M|   0     0 |  34k   64k
     34   4  17  44   0   0|   0    16M|7527k   11M|   0     0 |  22k   45k
     44   7  17  32   0   0|   0    30M|  20M  291k|   0     0 |  36k   62k

Create tmpfs:

    sudo mkdir -p /pdftrio-ramdisk
    #sudo mount -t tmpfs -o size=2g tmpfs /pdftrio-ramdisk
    sudo mount -t tmpfs -o size=6g tmpfs /pdftrio-ramdisk

Add to the pdftrio config env and restart:

    TEMP=/run/pdf_trio

Seems to have worked. CPU is pretty much maxed now; may need to back off
parallelism. Doing more than 31/sec.

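A rough way to double-check that rate from the consumer side rather than from
the producer logs (a sketch; the prod output topic name is an assumption):

    # tail the output topic and report lines (messages) per second
    kafkacat -C -u -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.pdftrio-output -o end | pv -l -i 10 -r > /dev/null
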
Errors were not getting encoded correctly:

    File "/fast/sandcrawler/python/sandcrawler/persist.py", line 331, in push_batch
        r['pdf_trio']['key'] = r['key']
    KeyError: 'pdf_trio'

Fixed in the sandcrawler worker, and patched persist to work around this.

    time cat /srv/sandcrawler/tasks/ArchiveIt-Collection-1830.download.cdx | parallel -j30 -N1 --pipe --round-robin --linebuffer ./pdftrio_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc285.us.archive.org --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx -

Wow, 30x parallelism gives waaaay less throughput?

    time cat /srv/sandcrawler/tasks/ArchiveIt-Collection-1830.download.cdx | parallel -j30 -N1 --pipe --round-robin --linebuffer ./pdftrio_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc285.us.archive.org --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx -

What changed? Confused. Load average was around 40.

Via kafka, as much as 69.71/sec! Errors?

Hrm, I am very skeptical of this whole `auto` mode. Should also do a run as
`all`, with -j20.

    Worker: Counter({'total': 1916, 'pushed': 1916})
    CDX lines pushed: Counter({'total': 1934, 'pushed': 1916, 'skip-parse': 18})

Hit some bugs that caused failures, but still seem to have processed a good
chunk.

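To gauge how much of that actually got persisted (as opposed to just produced
to Kafka), a rough check against sandcrawler-db; the `pdftrio` table name here
is an assumption:

    # count persisted classification rows (table name assumed)
    psql sandcrawler -c "SELECT COUNT(*) FROM pdftrio;"
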
Switched to `all`, running a different batch:

    time cat /srv/sandcrawler/tasks/ArchiveIt-Collection-1914.download.cdx | parallel -j20 -N1 --pipe --round-robin --linebuffer ./pdftrio_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc285.us.archive.org --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx -

After the flag change, another batch in `all` mode:

    time cat /srv/sandcrawler/tasks/ArchiveIt-Collection-2566.download.cdx | parallel -j20 -N1 --pipe --round-robin --linebuffer ./pdftrio_tool.py --kafka-mode --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc285.us.archive.org --pdftrio-host http://localhost:3939 -j0 classify-pdf-cdx -

diff --git a/notes/tasks/2020-07-22_processing_holes.md b/notes/tasks/2020-07-22_processing_holes.md
new file mode 100644
index 0000000..70e2b59
--- /dev/null
+++ b/notes/tasks/2020-07-22_processing_holes.md
@@ -0,0 +1,120 @@

Want to clean up missing/partial processing (GROBID, `pdf_meta`, `file_meta`)
in the sandcrawler database.

## `pdf_meta` for petabox rows

Ran the `dump_unextracted_pdf_petabox.sql` SQL, which resulted in a .json file.

    wc -l dump_unextracted_pdf_petabox.2020-07-22.json
    1503086 dump_unextracted_pdf_petabox.2020-07-22.json

Great, 1.5 million, not too many. Start small:

    head -n1000 dump_unextracted_pdf_petabox.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1

Full batch:

    cat dump_unextracted_pdf_petabox.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1

Re-ran on 2020-08-19:

    wc -l dump_unextracted_pdf_petabox.2020-08-19.json
    971194 dump_unextracted_pdf_petabox.2020-08-19.json

## `pdf_meta` missing CDX rows

First, the GROBID-ized rows, but only those which also have a fatcat file.

10,755,365! That is a lot still to process.

    cat dump_unextracted_pdf.fatcat.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1

Re-ran on 2020-08-19:

    wc -l dump_unextracted_pdf.fatcat.2020-08-19.json
    65517 dump_unextracted_pdf.fatcat.2020-08-19.json

Enqueued!

## `GROBID` missing petabox rows

    wc -l /grande/snapshots/dump_ungrobided_pdf_petabox.2020-07-22.json
    972221 /grande/snapshots/dump_ungrobided_pdf_petabox.2020-07-22.json

Start small:

    head -n1000 dump_ungrobided_pdf_petabox.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1

Full batch:

    cat dump_ungrobided_pdf_petabox.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1

Re-ran on 2020-08-19:

    wc -l dump_ungrobided_pdf_petabox.2020-08-19.json
    933 dump_ungrobided_pdf_petabox.2020-08-19.json

Enqueued!

## `GROBID` for missing CDX rows in fatcat

    wc -l dump_ungrobided_pdf.fatcat.2020-07-22.json
    1808580 dump_ungrobided_pdf.fatcat.2020-07-22.json

Full batch:

    cat dump_ungrobided_pdf.fatcat.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1

## `GROBID` for bad status

Eg, wayback errors.

TODO

## `pdf_trio` for OA journal crawls

TODO

## `pdf_trio` for "included by heuristic", not in fatcat

TODO

## Live-ingest missing arxiv papers

    ./fatcat_ingest.py --allow-non-oa --limit 10000 query arxiv_id:* > /srv/fatcat/snapshots/arxiv_10k_ingest_requests.json
    => Expecting 1505184 release objects in search queries

    cat /srv/fatcat/snapshots/arxiv_10k_ingest_requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 22

Repeating this every few days should (?) result in all the backlog of arxiv
papers getting indexed. Could focus on recent years to start, with a query
filter, as sketched below.

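A sketch of that year-scoped variant (assuming the `query` argument is an
elasticsearch query string and that `release_year` is an indexed field; the
output path is illustrative, and this has not been run):

    ./fatcat_ingest.py --allow-non-oa --limit 10000 query 'arxiv_id:* AND release_year:>=2019' > /srv/fatcat/snapshots/arxiv_recent_ingest_requests.json
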
## re-ingest spn2 errors (all time)

Eg:

    spn2-cdx-lookup-failure: 143963
    spn-error: 101773
    spn2-error: 16342

TODO

## re-try CDX errors

Eg, for unpaywall only, bulk ingest all `cdx-error`.

TODO

## live ingest unpaywall `no-capture` URLs

After re-trying the CDX errors for unpaywall URLs (see above), count all the
no-capture URLs, and if reasonable, recrawl them all in live mode ("reasonable"
meaning fewer than 200k or so URLs); a count query is sketched at the end of
this note.

Could also force recrawl (not using CDX lookups) for some publisher platforms
if that made sense.

TODO

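The count query mentioned above, as a sketch (assuming unpaywall requests are
tagged via `ingest_request.link_source` and joined to results on ingest_type +
base_url, as in other sandcrawler queries):

    SELECT COUNT(*)
    FROM ingest_request
    LEFT JOIN ingest_file_result
        ON ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    WHERE ingest_request.link_source = 'unpaywall'
        AND ingest_file_result.status = 'no-capture';
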
diff --git a/notes/tasks/2020-08-20_file_meta.md b/notes/tasks/2020-08-20_file_meta.md
new file mode 100644
index 0000000..39c84dd
--- /dev/null
+++ b/notes/tasks/2020-08-20_file_meta.md
@@ -0,0 +1,66 @@

Want to update fatcat file entities with "full" file metadata for those which
are missing it.

How many `file_meta` rows *still* don't have metadata?

    SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
    => 62962

First, generate a list of sha1hex values from the most recent bulk export which
are missing at least some metadata (based on the missing sha256):

    zcat file_hashes.tsv.gz | rg '\t\t' | cut -f3 | sort -u -S 4G | pv -l > fatcat_file_partial_sha1hex.tsv
    => 18.7M 0:05:46 [53.8k/s]

Then dump the entire sandcrawler `file_meta` table as TSV, with sha1hex in the
first column and JSON with all the file metadata fields in the second:

    COPY (
        SELECT sha1hex, row_to_json(file_meta)
        FROM file_meta
        WHERE sha256hex IS NOT NULL
        ORDER BY sha1hex ASC
    )
    TO '/grande/snapshots/file_meta_dump.tsv'
    WITH NULL '';

Join/cut:

    export LC_ALL=C
    join -t$'\t' fatcat_file_partial_sha1hex.tsv /grande/snapshots/file_meta_dump.tsv | uniq -w 40 | cut -f2 | pv -l > fatcat_file_partial.file_meta.json
    => 18.1M 0:03:37 [83.2k/s]

Check counts:

    cat fatcat_file_partial.file_meta.json | jq .sha1hex -r | sort -u -S 4G | wc -l
    => 18135313

    zcat fatcat_file_partial.file_meta.json.gz | jq .mimetype -r | sort -S 4G | uniq -c | sort -nr
    18103860 application/pdf
       29977 application/octet-stream
         876 text/html
         199 application/postscript
         171 application/gzip
          84 text/plain
          48 application/xml
          38 application/vnd.ms-powerpoint
          16 application/msword
           8 application/vnd.openxmlformats-officedocument.wordprocessingml.document
           6 image/jpeg
           4 message/rfc822
           4 application/zip
           4 application/vnd.openxmlformats-officedocument.presentationml.presentation
           3 text/x-tex
           3 application/x-dosexec
           2 application/x-tar
           2 application/vnd.ms-tnef
           1 video/mpeg
           1 image/tiff
           1 image/svg+xml
           1 image/png
           1 image/gif
           1 audio/x-ape
           1 application/vnd.ms-office
           1 application/CDFV2-unknown

TODO: fatcat importer

diff --git a/notes/tasks/2020-10-21_pdfextract_holes.md b/notes/tasks/2020-10-21_pdfextract_holes.md
new file mode 100644
index 0000000..c0bb65e
--- /dev/null
+++ b/notes/tasks/2020-10-21_pdfextract_holes.md
@@ -0,0 +1,74 @@

Realized I had not enabled persisting of PDF extraction results (thumbnail,
text) in the ingest worker when it was added over the summer. So now need to
run a catch-up. This applies to both "live" and "bulk" ingest.

## `cdx` / `ingest` / `grobid` catch-up

First, re-run extraction for cases where we did an ingest, and GROBID ran
successfully, and we have a CDX row, but no `pdf_meta`:

    -- this is a slow query
    COPY (
        SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
        FROM grobid
        LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
        --LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
        LEFT JOIN ingest_file_result ON grobid.sha1hex = ingest_file_result.terminal_sha1hex
        LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex
        WHERE cdx.sha1hex IS NOT NULL
        --AND fatcat_file.sha1hex IS NOT NULL
        AND ingest_file_result.terminal_sha1hex IS NOT NULL
        AND pdf_meta.sha1hex IS NULL
    )
    TO '/grande/snapshots/dump_unextracted_pdf.ingest.2020-10-21.json'
    WITH NULL '';
    => 19,676,116

Wow, that is a lot. Many from recent OAI-PMH and OA crawls, presumably.

    cat /grande/snapshots/dump_unextracted_pdf.ingest.2020-10-21.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1

And again, after a couple of partitions got hung up:

    COPY (
        SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
        FROM grobid
        LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
        --LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
        LEFT JOIN ingest_file_result ON grobid.sha1hex = ingest_file_result.terminal_sha1hex
        LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex
        WHERE cdx.sha1hex IS NOT NULL
        --AND fatcat_file.sha1hex IS NOT NULL
        AND ingest_file_result.terminal_sha1hex IS NOT NULL
        AND pdf_meta.sha1hex IS NULL
    )
    TO '/grande/snapshots/dump_unextracted_pdf.ingest.2020-11-04.json'
    WITH NULL '';

    cat /grande/snapshots/dump_unextracted_pdf.ingest.2020-11-04.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
    => 562k 0:00:16 [34.6k/s]

## `petabox` / `grobid` catch-up

These didn't all seem to extract correctly before: after 1.5m rows, there were
still 900k unprocessed. Trying again.

    COPY (
        SELECT DISTINCT ON (petabox.sha1hex) row_to_json(petabox)
        FROM grobid
        LEFT JOIN petabox ON grobid.sha1hex = petabox.sha1hex
        LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex
        WHERE petabox.sha1hex IS NOT NULL
        AND pdf_meta.sha1hex IS NULL
    )
    TO '/grande/snapshots/dump_unextracted_pdf_petabox.2020-11-04.json'
    WITH NULL '';

    cat /grande/snapshots/dump_unextracted_pdf_petabox.ingest.2020-11-04.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1

## `cdx` / `grobid` catch-up

Next will be to process PDFs which have GROBID and CDX rows but no ingest; a
candidate query is sketched below.

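A sketch of that dump, following the same pattern as the queries above but
requiring that no ingest result exists (not yet run; output path illustrative):

    COPY (
        SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
        FROM grobid
        LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
        LEFT JOIN ingest_file_result ON grobid.sha1hex = ingest_file_result.terminal_sha1hex
        LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex
        WHERE cdx.sha1hex IS NOT NULL
        AND ingest_file_result.terminal_sha1hex IS NULL
        AND pdf_meta.sha1hex IS NULL
    )
    TO '/grande/snapshots/dump_unextracted_pdf.cdx.2020-11-04.json'
    WITH NULL '';
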
diff --git a/notes/tasks/2021-09-09_pdf_url_lists.md b/notes/tasks/2021-09-09_pdf_url_lists.md
new file mode 100644
index 0000000..52a3264
--- /dev/null
+++ b/notes/tasks/2021-09-09_pdf_url_lists.md
@@ -0,0 +1,66 @@

Want to dump a URL list to share with partners, filtered to content we think is
likely to be scholarly.

Columns to include:

- original URL
- capture timestamp
- SHA1

## Stats Overview

file_meta table, mimetype=application/pdf: 173,816,433

cdx table, mimetype=application/pdf: 131,346,703

ingest_file_result table, pdf, success: 66,487,928

## Ingested PDF URLs

"Ingested" URLs: ingest_file_result table, pdf and hit=true; include the base
URL also?

    COPY (
        SELECT
            base_url as start_url,
            terminal_url as pdf_url,
            terminal_dt as pdf_url_timestamp,
            terminal_sha1hex as pdf_sha1hex
        FROM ingest_file_result
        WHERE
            ingest_type = 'pdf'
            AND status = 'success'
    )
    TO '/srv/sandcrawler/tasks/wayback_pdf_targeted.2021-09-09.tsv'
    WITH NULL '';
    => 77,892,849

## CDX PDFs

"All web PDFs": CDX query; left join `file_meta`, but don't require it:

    COPY (
        SELECT
            cdx.url as pdf_url,
            cdx.datetime as pdf_url_timestamp,
            cdx.sha1hex as pdf_sha1hex
        FROM cdx
        LEFT JOIN file_meta
        ON
            cdx.sha1hex = file_meta.sha1hex
        WHERE
            file_meta.mimetype = 'application/pdf'
            OR (
                file_meta.mimetype IS NULL
                AND cdx.mimetype = 'application/pdf'
            )
    )
    TO '/srv/sandcrawler/tasks/wayback_pdf_speculative.2021-09-09.tsv'
    WITH NULL '';
    => 147,837,935

## Processed web PDFs

"Parsed web PDFs": `file_meta`, left join CDX

(didn't do this one)

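For completeness, a sketch of what that third dump could look like, mirroring
the previous two (not run; the column choice and output path are assumptions):

    COPY (
        SELECT DISTINCT ON (file_meta.sha1hex)
            cdx.url as pdf_url,
            cdx.datetime as pdf_url_timestamp,
            file_meta.sha1hex as pdf_sha1hex
        FROM file_meta
        LEFT JOIN cdx
        ON
            file_meta.sha1hex = cdx.sha1hex
        WHERE
            file_meta.mimetype = 'application/pdf'
    )
    TO '/srv/sandcrawler/tasks/wayback_pdf_processed.2021-09-09.tsv'
    WITH NULL '';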