aboutsummaryrefslogtreecommitdiffstats
path: root/notes/ingest/2020-05_oai_pmh.md
diff options
context:
space:
mode:
Diffstat (limited to 'notes/ingest/2020-05_oai_pmh.md')
-rw-r--r--notes/ingest/2020-05_oai_pmh.md125
1 files changed, 125 insertions, 0 deletions
diff --git a/notes/ingest/2020-05_oai_pmh.md b/notes/ingest/2020-05_oai_pmh.md
new file mode 100644
index 0000000..4cfd8d5
--- /dev/null
+++ b/notes/ingest/2020-05_oai_pmh.md
@@ -0,0 +1,125 @@
+
+Primary Goal: start large crawl of OAI landing pages that we haven't seen
+
+Fields of interest for ingest:
+- oai identifer
+- doi
+- formats
+- urls (maybe also "relations")
+- types (type+stage)
+
+## Other Tasks
+
+About 150 million total lines.
+
+Types coverage
+
+ zstdcat oai.ndjson.zst | pv -l | jq "select(.types != null) | .types[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > types_counts.txt
+
+Dump all ISSNs, with counts, quick check how many are in chocula/fatcat
+
+ zstdcat oai.ndjson.zst | pv -l | jq "select(.issn != null) | .issn[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > issn_counts.txt
+
+Language coverage
+
+ zstdcat oai.ndjson.zst | pv -l | jq "select(.languages != null) | .languages[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > languages_counts.txt
+
+Format coverage
+
+ zstdcat oai.ndjson.zst | pv -l | jq "select(.formats != null) | .formats[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > formats_counts.txt
+ => 150M 0:56:14 [44.7k/s]
+
+Have a DOI?
+
+ zstdcat oai.ndjson.zst | pv -l | rg '"doi":' | rg '"10.' | wc -l
+ => 16,013,503
+
+ zstdcat oai.ndjson.zst | pv -l | jq "select(.doi != null) | .doi[]" -r | sort -u -S 5G > doi_raw.txt
+ => 11,940,950
+
+## Transform, Load, Bulk Ingest
+
+ zstdcat oai.ndjson.zst | ./oai2ingestrequest.py - | pv -l | gzip > oai.202002.requests.json.gz
+ => 80M 6:36:55 [3.36k/s]
+
+ time zcat /schnell/oai-pmh/oai.202002.requests.json.gz | pv -l | ./persist_tool.py ingest-request -
+ => 80M 4:00:21 [5.55k/s]
+ => Worker: Counter({'total': 80013963, 'insert-requests': 51169081, 'update-requests': 0})
+ => JSON lines pushed: Counter({'pushed': 80013963, 'total': 80013963})
+
+ => real 240m21.207s
+ => user 85m12.576s
+ => sys 3m29.580s
+
+ select count(*) from ingest_request where ingest_type = 'pdf' and link_source = 'oai';
+ => 51,185,088
+
+Why so many (30 million) skipped? Not unique?
+
+ zcat oai.202002.requests.json.gz | jq '[.link_source_id, .base_url]' -c | sort -u -S 4G | wc -l
+ => 51,185,088
+
+ zcat oai.202002.requests.json.gz | jq .base_url -r | pv -l | sort -u -S 4G > request_url.txt
+ wc -l request_url.txt
+ => 50,002,674 request_url.txt
+
+ zcat oai.202002.requests.json.gz | jq .link_source_id -r | pv -l | sort -u -S 4G > requires_oai.txt
+ wc -l requires_oai.txt
+ => 34,622,083 requires_oai.txt
+
+Yup, tons of duplication. And remember this is exact URL, not SURT or similar.
+
+How many of these are URLs we have seen and ingested already?
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+----------
+ | 49491452
+ success | 1469113
+ no-capture | 134611
+ redirect-loop | 59666
+ no-pdf-link | 8947
+ cdx-error | 7561
+ terminal-bad-status | 6704
+ null-body | 5042
+ wrong-mimetype | 879
+ wayback-error | 722
+ petabox-error | 198
+ gateway-timeout | 86
+ link-loop | 51
+ invalid-host-resolution | 24
+ spn2-cdx-lookup-failure | 22
+ spn2-error | 4
+ bad-gzip-encoding | 4
+ spn2-error:job-failed | 2
+ (18 rows)
+
+Dump ingest requests:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2020-05-01'
+ AND ingest_file_result.status IS NULL
+ ) TO '/grande/snapshots/oai_noingest_20200506.requests.json';
+ => COPY 49491452
+
+ cat /grande/snapshots/oai_noingest_20200506.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+