ingests: normalize file names; commit updates

author: Bryan Newbold <bnewbold@archive.org> 2020-05-26 14:47:17 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-05-26 14:47:17 -0700
commit: 5dd8785d710cf7d067afdc691069bfa74406e06a (patch)
tree: 8ff16b25cee10f38127caf7fdb266d41fea12d83 /notes/ingest/2020-05_oai_pmh.md
parent: 4598ea9242d1001e473e6340342afea854868577 (diff)
download: sandcrawler-5dd8785d710cf7d067afdc691069bfa74406e06a.tar.gz
sandcrawler-5dd8785d710cf7d067afdc691069bfa74406e06a.zip
1 files changed, 125 insertions, 0 deletions
diff --git a/notes/ingest/2020-05_oai_pmh.md b/notes/ingest/2020-05_oai_pmh.md
new file mode 100644
index 0000000..4cfd8d5
--- /dev/null
+++ b/notes/ingest/2020-05_oai_pmh.md
@@ -0,0 +1,125 @@
+
+Primary Goal: start large crawl of OAI landing pages that we haven't seen
+
+Fields of interest for ingest:
+- oai identifer
+- doi
+- formats
+- urls (maybe also "relations")
+- types (type+stage)
+
+## Other Tasks
+
+About 150 million total lines.
+
+Types coverage
+
+    zstdcat oai.ndjson.zst | pv -l | jq "select(.types != null) | .types[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > types_counts.txt
+
+Dump all ISSNs, with counts, quick check how many are in chocula/fatcat
+
+    zstdcat oai.ndjson.zst | pv -l | jq "select(.issn != null) | .issn[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > issn_counts.txt
+
+Language coverage
+
+    zstdcat oai.ndjson.zst | pv -l | jq "select(.languages != null) | .languages[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > languages_counts.txt
+
+Format coverage
+
+    zstdcat oai.ndjson.zst | pv -l | jq "select(.formats != null) | .formats[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > formats_counts.txt
+    => 150M 0:56:14 [44.7k/s]
+
+Have a DOI?
+
+    zstdcat oai.ndjson.zst | pv -l | rg '"doi":' | rg '"10.' | wc -l
+    => 16,013,503
+
+    zstdcat oai.ndjson.zst | pv -l | jq "select(.doi != null) | .doi[]" -r | sort -u -S 5G > doi_raw.txt
+    => 11,940,950
+
+## Transform, Load, Bulk Ingest
+
+    zstdcat oai.ndjson.zst | ./oai2ingestrequest.py - | pv -l | gzip > oai.202002.requests.json.gz
+    => 80M 6:36:55 [3.36k/s]
+
+    time zcat /schnell/oai-pmh/oai.202002.requests.json.gz | pv -l | ./persist_tool.py ingest-request -
+    => 80M 4:00:21 [5.55k/s]
+    => Worker: Counter({'total': 80013963, 'insert-requests': 51169081, 'update-requests': 0})
+    => JSON lines pushed: Counter({'pushed': 80013963, 'total': 80013963})
+
+    => real    240m21.207s
+    => user    85m12.576s
+    => sys     3m29.580s
+
+    select count(*) from ingest_request where ingest_type = 'pdf' and link_source = 'oai';
+    => 51,185,088
+
+Why so many (30 million) skipped? Not unique?
+
+    zcat oai.202002.requests.json.gz | jq '[.link_source_id, .base_url]' -c | sort -u -S 4G | wc -l
+    => 51,185,088
+
+    zcat oai.202002.requests.json.gz | jq .base_url -r | pv -l | sort -u -S 4G > request_url.txt
+    wc -l request_url.txt
+    => 50,002,674 request_url.txt
+
+    zcat oai.202002.requests.json.gz | jq .link_source_id -r | pv -l | sort -u -S 4G > requires_oai.txt
+    wc -l requires_oai.txt
+    => 34,622,083 requires_oai.txt
+
+Yup, tons of duplication. And remember this is exact URL, not SURT or similar.
+
+How many of these are URLs we have seen and ingested already?
+
+    SELECT ingest_file_result.status, COUNT(*)
+    FROM ingest_request
+    LEFT JOIN ingest_file_result
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE 
+        ingest_request.ingest_type = 'pdf'
+        AND ingest_request.link_source = 'oai'
+    GROUP BY status
+    ORDER BY COUNT DESC
+    LIMIT 20;
+
+             status          |  count
+    -------------------------+----------
+                             | 49491452
+     success                 |  1469113
+     no-capture              |   134611
+     redirect-loop           |    59666
+     no-pdf-link             |     8947
+     cdx-error               |     7561
+     terminal-bad-status     |     6704
+     null-body               |     5042
+     wrong-mimetype          |      879
+     wayback-error           |      722
+     petabox-error           |      198
+     gateway-timeout         |       86
+     link-loop               |       51
+     invalid-host-resolution |       24
+     spn2-cdx-lookup-failure |       22
+     spn2-error              |        4
+     bad-gzip-encoding       |        4
+     spn2-error:job-failed   |        2
+    (18 rows)
+
+Dump ingest requests:
+
+    COPY (
+        SELECT row_to_json(ingest_request.*)
+        FROM ingest_request
+        LEFT JOIN ingest_file_result
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE
+            ingest_request.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'oai'
+            AND date(ingest_request.created) > '2020-05-01'
+            AND ingest_file_result.status IS NULL
+    ) TO '/grande/snapshots/oai_noingest_20200506.requests.json';
+    => COPY 49491452
+
+    cat /grande/snapshots/oai_noingest_20200506.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
author	Bryan Newbold <bnewbold@archive.org>	2020-05-26 14:47:17 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2020-05-26 14:47:17 -0700
commit	5dd8785d710cf7d067afdc691069bfa74406e06a (patch)
tree	8ff16b25cee10f38127caf7fdb266d41fea12d83 /notes/ingest/2020-05_oai_pmh.md
parent	4598ea9242d1001e473e6340342afea854868577 (diff)
download	sandcrawler-5dd8785d710cf7d067afdc691069bfa74406e06a.tar.gz sandcrawler-5dd8785d710cf7d067afdc691069bfa74406e06a.zip