author    Bryan Newbold <bnewbold@archive.org>  2020-05-26 14:47:17 -0700
committer Bryan Newbold <bnewbold@archive.org>  2020-05-26 14:47:17 -0700
commit    5dd8785d710cf7d067afdc691069bfa74406e06a (patch)
tree      8ff16b25cee10f38127caf7fdb266d41fea12d83
parent    4598ea9242d1001e473e6340342afea854868577 (diff)
download  sandcrawler-5dd8785d710cf7d067afdc691069bfa74406e06a.tar.gz
          sandcrawler-5dd8785d710cf7d067afdc691069bfa74406e06a.zip
ingests: normalize file names; commit updates
-rw-r--r--  notes/ingest/2019-10-23_testing.md (renamed from notes/ingest/20191023_testing.md)          0
-rw-r--r--  notes/ingest/2020-01-14_bulk.md (renamed from notes/ingest/20200114_bulk_ingests.md)        0
-rw-r--r--  notes/ingest/2020-02_unpaywall.md (renamed from notes/ingest/2020-02-14_unpaywall_ingest.md) 0
-rw-r--r--  notes/ingest/2020-03-oa_but_not_marked.md                                                   25
-rw-r--r--  notes/ingest/2020-03_mag.md (renamed from notes/ingest/2020-03-04_mag.md)                   0
-rw-r--r--  notes/ingest/2020-03_s2.md (renamed from notes/ingest/2020-03_s2_ingest.md)                 0
-rw-r--r--  notes/ingest/2020-04-07_unpaywall.md                                                        63
-rw-r--r--  notes/ingest/2020-04_datacite.md (renamed from notes/ingest/2020-04-07_datacite.md)         0
-rw-r--r--  notes/ingest/2020-04_unpaywall.md                                                           129
-rw-r--r--  notes/ingest/2020-05_oai_pmh.md                                                             125
10 files changed, 279 insertions, 63 deletions
diff --git a/notes/ingest/20191023_testing.md b/notes/ingest/2019-10-23_testing.md
index 481c4e2..481c4e2 100644
--- a/notes/ingest/20191023_testing.md
+++ b/notes/ingest/2019-10-23_testing.md
diff --git a/notes/ingest/20200114_bulk_ingests.md b/notes/ingest/2020-01-14_bulk.md
index 9d05cda..9d05cda 100644
--- a/notes/ingest/20200114_bulk_ingests.md
+++ b/notes/ingest/2020-01-14_bulk.md
diff --git a/notes/ingest/2020-02-14_unpaywall_ingest.md b/notes/ingest/2020-02_unpaywall.md
index e18a2ff..e18a2ff 100644
--- a/notes/ingest/2020-02-14_unpaywall_ingest.md
+++ b/notes/ingest/2020-02_unpaywall.md
diff --git a/notes/ingest/2020-03-oa_but_not_marked.md b/notes/ingest/2020-03-oa_but_not_marked.md
new file mode 100644
index 0000000..73396bd
--- /dev/null
+++ b/notes/ingest/2020-03-oa_but_not_marked.md
@@ -0,0 +1,25 @@
+
+These are large journals with a high fraction of papers already "in IA", but
+which are not marked as OA, so we are not crawling them regularly.
+
+TODO: add inputs like the unpaywall ISSN / OA status list (sketch below) to
+try to find more "practical" / bronze OA
+
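+As a rough sketch of that TODO (hypothetical: assumes an unpaywall snapshot
+with `journal_issn_l` and `oa_status` fields), something like:
+
+    # ISSN-Ls with at least one article unpaywall marks bronze OA (illustrative)
+    zcat unpaywall_snapshot.jsonl.gz \
+        | jq -r 'select(.oa_status == "bronze" and .journal_issn_l != null) | .journal_issn_l' \
+        | sort -u -S 1G > bronze_issns.txt
+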
+## First Run
+
+https://fatcat.wiki/container/vmv647omwrhzzgeclyrnpc4him
+https://fatcat.wiki/container/waxwzq3cnbet3cmwccpuk4bel4
+https://fatcat.wiki/container/hjoli2j6qffdpaalkszryuidk4
+https://fatcat.wiki/container/fci57bxfsffvzllbssocnfsr3e
+https://fatcat.wiki/container/hd23c57sunhcnar5fbgxsn36lm
+https://fatcat.wiki/container/bliguyxhonfb7ghuykxgtg3oqe
+
+## TODO
+
+https://fatcat.wiki/container/kn6dhptylrb77b5atyiom5ysjm no-pdf-link (but accessible)
+https://fatcat.wiki/container/s7bticdwizdmhll4taefg57jde no-pdf-link (easy?)
+
+https://fatcat.wiki/container/zm56axre7rgihh5sznxp65np5i large; no-pdf-link?
+https://fatcat.wiki/container/eb2lcnpf2zeezkmfckcvxw2pgi huge (20k+), not all OA?
+https://fatcat.wiki/container/adgy773dtra3xmrsynghcednqm broken?
+https://fatcat.wiki/container/w3gj5mynrnbtndalcc5jnhymym not OA? link-loop
diff --git a/notes/ingest/2020-03-04_mag.md b/notes/ingest/2020-03_mag.md
index 428ce05..428ce05 100644
--- a/notes/ingest/2020-03-04_mag.md
+++ b/notes/ingest/2020-03_mag.md
diff --git a/notes/ingest/2020-03_s2_ingest.md b/notes/ingest/2020-03_s2.md
index fedaba0..fedaba0 100644
--- a/notes/ingest/2020-03_s2_ingest.md
+++ b/notes/ingest/2020-03_s2.md
diff --git a/notes/ingest/2020-04-07_unpaywall.md b/notes/ingest/2020-04-07_unpaywall.md
deleted file mode 100644
index e30d482..0000000
--- a/notes/ingest/2020-04-07_unpaywall.md
+++ /dev/null
@@ -1,63 +0,0 @@
-
-A new snapshot was released in April 2020 (the snapshot is dated 2020-02-25,
-but was not released until more than a month later).
-
-Primary goals:
-
-- generate ingest requests for only *new* URLs
-- bulk ingest these new URLs
-- crawl any no-capture URLs from that batch
-- re-bulk-ingest the no-capture batch
-- analytics on failed ingests, e.g. any particular domains that are failing to crawl
-
-This ingest pipeline was started on 2020-04-07 by bnewbold.
-
-## Transform and Load
-
- # in sandcrawler pipenv on aitio
- zcat /schnell/UNPAYWALL-PDF-CRAWL-2020-04/unpaywall_snapshot_2020-02-25T115244.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json
- => 24.7M 5:17:03 [ 1.3k/s]
-
- cat /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
- => 24.7M
- => Worker: Counter({'total': 24712947, 'insert-requests': 4282167, 'update-requests': 0})
-
-## Dump new URLs and Bulk Ingest
-
- COPY (
- SELECT row_to_json(ingest_request.*)
- FROM ingest_request
- LEFT JOIN ingest_file_result
- ON ingest_file_result.ingest_type = ingest_request.ingest_type
- AND ingest_file_result.base_url = ingest_request.base_url
- WHERE
- ingest_request.ingest_type = 'pdf'
- AND ingest_request.link_source = 'unpaywall'
- AND date(ingest_request.created) > '2020-04-01'
- AND ingest_file_result.status IS NULL
- ) TO '/grande/snapshots/unpaywall_noingest_2020-04-08.rows.json';
- => 3696189
-
- cat /grande/snapshots/unpaywall_noingest_2020-04-08.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
-
-## Dump no-capture
-
- COPY (
- SELECT row_to_json(ingest_request.*)
- FROM ingest_request
- LEFT JOIN ingest_file_result
- ON ingest_file_result.ingest_type = ingest_request.ingest_type
- AND ingest_file_result.base_url = ingest_request.base_url
- WHERE
- ingest_request.ingest_type = 'pdf'
- AND ingest_request.link_source = 'unpaywall'
- AND date(ingest_request.created) > '2020-04-01'
- AND ingest_file_result.status = 'no-capture'
- AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
- AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
- AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
- AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
- AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
- AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
- AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
- ) TO '/grande/snapshots/unpaywall_nocapture_2020-04-XX.rows.json';
diff --git a/notes/ingest/2020-04-07_datacite.md b/notes/ingest/2020-04_datacite.md
index 0fc7e67..0fc7e67 100644
--- a/notes/ingest/2020-04-07_datacite.md
+++ b/notes/ingest/2020-04_datacite.md
diff --git a/notes/ingest/2020-04_unpaywall.md b/notes/ingest/2020-04_unpaywall.md
new file mode 100644
index 0000000..bce757b
--- /dev/null
+++ b/notes/ingest/2020-04_unpaywall.md
@@ -0,0 +1,129 @@
+
+A new snapshot was released in April 2020 (the snapshot is dated 2020-02-25,
+but was not released until more than a month later).
+
+Primary goals:
+
+- generate ingest requests for only *new* URLs
+- bulk ingest these new URLs
+- crawl any no-capture URLs from that batch
+- re-bulk-ingest the no-capture batch
+- analytics on failed ingests, e.g. any particular domains that are failing to crawl
+
+This ingest pipeline was started on 2020-04-07 by bnewbold.
+
+Ran through the first two steps again on 2020-05-03 after unpaywall had
+released another dump (dated 2020-04-27).
+
+## Transform and Load
+
+ # in sandcrawler pipenv on aitio
+ zcat /schnell/UNPAYWALL-PDF-CRAWL-2020-04/unpaywall_snapshot_2020-02-25T115244.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json
+ => 24.7M 5:17:03 [ 1.3k/s]
+
+ cat /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+ => 24.7M
+ => Worker: Counter({'total': 24712947, 'insert-requests': 4282167, 'update-requests': 0})
+
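+Each transformed line is an ingest request JSON object, roughly of this shape
+(values here are hypothetical; the field names are the ones referenced
+elsewhere in these notes):
+
+    {"ingest_type": "pdf", "link_source": "unpaywall",
+     "link_source_id": "10.1234/example-doi",
+     "base_url": "https://example.com/article.pdf"}
+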
+Second time:
+
+ # in sandcrawler pipenv on aitio
+ zcat /schnell/UNPAYWALL-PDF-CRAWL-2020-04/unpaywall_snapshot_2020-04-27T153236.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-04-27.ingest_request.json
+ => 25.2M 3:16:28 [2.14k/s]
+
+ cat /grande/snapshots/unpaywall_snapshot_2020-04-27.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 25189390, 'insert-requests': 1408915, 'update-requests': 0})
+ => JSON lines pushed: Counter({'pushed': 25189390, 'total': 25189390})
+
+
+## Dump new URLs and Bulk Ingest
+
+The `LEFT JOIN` plus `status IS NULL` below is an anti-join: it selects only
+requests which have no ingest result row at all, i.e. URLs we haven't
+attempted yet.
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2020-04-01'
+ AND ingest_file_result.status IS NULL
+ ) TO '/grande/snapshots/unpaywall_noingest_2020-04-08.rows.json';
+ => 3696189
+
+Note the `rg -v "\\\\"` filter: it crudely drops any rows containing
+backslashes, presumably to avoid escape sequences that could trip up JSON
+parsing downstream.
+
+    cat /grande/snapshots/unpaywall_noingest_2020-04-08.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Second time:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2020-05-01'
+ AND ingest_file_result.status IS NULL
+ ) TO '/grande/snapshots/unpaywall_noingest_2020-05-03.rows.json';
+ => 1799760
+
+ cat /grande/snapshots/unpaywall_noingest_2020-05-03.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Dump no-capture, Run Crawl
+
+Make two ingest request dumps: one with "all" URLs, which we will have heritrix
+attempt to crawl, and one with certain domains filtered out, which we may or
+may not bother trying to ingest (because we expect those domains to fail).
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2020-04-01'
+ AND ingest_file_result.status = 'no-capture'
+ ) TO '/grande/snapshots/unpaywall_nocapture_all_2020-05-04.rows.json';
+ => 2734145
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2020-04-01'
+ AND ingest_file_result.status = 'no-capture'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ ) TO '/grande/snapshots/unpaywall_nocapture_2020-05-04.rows.json';
+ => 2602408
+
+Not actually a very significant size difference after all: 2,734,145 vs
+2,602,408 rows, so only about 131k (under 5%) hit the domain filters.
+
+See `journal-crawls` repo for details on seedlist generation and crawling.
+
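+Seedlist generation is roughly just pulling the URLs out of the "all" dump
+(illustrative sketch only; the real process lives in `journal-crawls`):
+
+    zcat /grande/snapshots/unpaywall_nocapture_all_2020-05-04.rows.json.gz | jq -r .base_url | sort -u > unpaywall_nocapture.seeds.txt
+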
+## Re-Ingest Post-Crawl
+
+Test small batch:
+
+ zcat /grande/snapshots/unpaywall_nocapture_all_2020-05-04.rows.json.gz | head -n200 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Run the whole batch:
+
+ zcat /grande/snapshots/unpaywall_nocapture_all_2020-05-04.rows.json.gz | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
diff --git a/notes/ingest/2020-05_oai_pmh.md b/notes/ingest/2020-05_oai_pmh.md
new file mode 100644
index 0000000..4cfd8d5
--- /dev/null
+++ b/notes/ingest/2020-05_oai_pmh.md
@@ -0,0 +1,125 @@
+
+Primary Goal: start a large crawl of OAI landing pages that we haven't seen
+
+Fields of interest for ingest (quick peek sketch below):
+- oai identifier
+- doi
+- formats
+- urls (maybe also "relations")
+- types (type+stage)
+
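+A quick way to eyeball those fields in the raw dump (sketch; the exact JSON
+key for the OAI identifier is a guess):
+
+    zstdcat oai.ndjson.zst | head -n5 | jq '{oai: .oai, doi: .doi, formats: .formats, urls: .urls, types: .types}' -c
+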
+## Other Tasks
+
+About 150 million total lines.
+
+Types coverage
+
+ zstdcat oai.ndjson.zst | pv -l | jq "select(.types != null) | .types[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > types_counts.txt
+
+Dump all ISSNs, with counts, for a quick check of how many are in chocula/fatcat
+
+ zstdcat oai.ndjson.zst | pv -l | jq "select(.issn != null) | .issn[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > issn_counts.txt
+
+Language coverage
+
+ zstdcat oai.ndjson.zst | pv -l | jq "select(.languages != null) | .languages[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > languages_counts.txt
+
+Format coverage
+
+ zstdcat oai.ndjson.zst | pv -l | jq "select(.formats != null) | .formats[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > formats_counts.txt
+ => 150M 0:56:14 [44.7k/s]
+
+Have a DOI?
+
+ zstdcat oai.ndjson.zst | pv -l | rg '"doi":' | rg '"10.' | wc -l
+ => 16,013,503
+
+    zstdcat oai.ndjson.zst | pv -l | jq "select(.doi != null) | .doi[]" -r | sort -u -S 5G > doi_raw.txt
+    => 11,940,950
+
+So roughly 16M records mention a DOI, but there are only ~11.9M distinct DOI
+values: many DOIs evidently appear in more than one OAI record.
+
+## Transform, Load, Bulk Ingest
+
+ zstdcat oai.ndjson.zst | ./oai2ingestrequest.py - | pv -l | gzip > oai.202002.requests.json.gz
+ => 80M 6:36:55 [3.36k/s]
+
+ time zcat /schnell/oai-pmh/oai.202002.requests.json.gz | pv -l | ./persist_tool.py ingest-request -
+ => 80M 4:00:21 [5.55k/s]
+ => Worker: Counter({'total': 80013963, 'insert-requests': 51169081, 'update-requests': 0})
+ => JSON lines pushed: Counter({'pushed': 80013963, 'total': 80013963})
+
+ => real 240m21.207s
+ => user 85m12.576s
+ => sys 3m29.580s
+
+ select count(*) from ingest_request where ingest_type = 'pdf' and link_source = 'oai';
+ => 51,185,088
+
+Why so many (30 million) skipped? Not unique?
+
+ zcat oai.202002.requests.json.gz | jq '[.link_source_id, .base_url]' -c | sort -u -S 4G | wc -l
+ => 51,185,088
+
+ zcat oai.202002.requests.json.gz | jq .base_url -r | pv -l | sort -u -S 4G > request_url.txt
+ wc -l request_url.txt
+ => 50,002,674 request_url.txt
+
+ zcat oai.202002.requests.json.gz | jq .link_source_id -r | pv -l | sort -u -S 4G > requires_oai.txt
+ wc -l requires_oai.txt
+ => 34,622,083 requires_oai.txt
+
+Yup, tons of duplication. And remember this is exact URL, not SURT or similar.
+
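+As a sketch, SURT-normalizing before deduplication would merge more of these;
+e.g. with the `surt` python package (not actually run here):
+
+    zcat oai.202002.requests.json.gz | jq .base_url -r | python3 -c 'import sys; from surt import surt; [print(surt(l.strip())) for l in sys.stdin]' | sort -u -S 4G | wc -l
+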
+How many of these are URLs we have seen and ingested already?
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+----------
+ | 49491452
+ success | 1469113
+ no-capture | 134611
+ redirect-loop | 59666
+ no-pdf-link | 8947
+ cdx-error | 7561
+ terminal-bad-status | 6704
+ null-body | 5042
+ wrong-mimetype | 879
+ wayback-error | 722
+ petabox-error | 198
+ gateway-timeout | 86
+ link-loop | 51
+ invalid-host-resolution | 24
+ spn2-cdx-lookup-failure | 22
+ spn2-error | 4
+ bad-gzip-encoding | 4
+ spn2-error:job-failed | 2
+ (18 rows)
+
+The empty status (49,491,452 rows) means those requests have never been
+attempted; those are what we dump for bulk ingest below.
+
+Dump ingest requests:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2020-05-01'
+ AND ingest_file_result.status IS NULL
+ ) TO '/grande/snapshots/oai_noingest_20200506.requests.json';
+ => COPY 49491452
+
+ cat /grande/snapshots/oai_noingest_20200506.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+