| author | Bryan Newbold <bnewbold@archive.org> | 2020-05-26 14:47:17 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-05-26 14:47:17 -0700 | 
| commit | 5dd8785d710cf7d067afdc691069bfa74406e06a (patch) | |
| tree | 8ff16b25cee10f38127caf7fdb266d41fea12d83 | |
| parent | 4598ea9242d1001e473e6340342afea854868577 (diff) | |
| download | sandcrawler-5dd8785d710cf7d067afdc691069bfa74406e06a.tar.gz sandcrawler-5dd8785d710cf7d067afdc691069bfa74406e06a.zip | |
ingests: normalize file names; commit updates
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | notes/ingest/2019-10-23_testing.md (renamed from notes/ingest/20191023_testing.md) | 0 |
| -rw-r--r-- | notes/ingest/2020-01-14_bulk.md (renamed from notes/ingest/20200114_bulk_ingests.md) | 0 |
| -rw-r--r-- | notes/ingest/2020-02_unpaywall.md (renamed from notes/ingest/2020-02-14_unpaywall_ingest.md) | 0 |
| -rw-r--r-- | notes/ingest/2020-03-oa_but_not_marked.md | 25 |
| -rw-r--r-- | notes/ingest/2020-03_mag.md (renamed from notes/ingest/2020-03-04_mag.md) | 0 |
| -rw-r--r-- | notes/ingest/2020-03_s2.md (renamed from notes/ingest/2020-03_s2_ingest.md) | 0 |
| -rw-r--r-- | notes/ingest/2020-04-07_unpaywall.md | 63 |
| -rw-r--r-- | notes/ingest/2020-04_datacite.md (renamed from notes/ingest/2020-04-07_datacite.md) | 0 |
| -rw-r--r-- | notes/ingest/2020-04_unpaywall.md | 129 |
| -rw-r--r-- | notes/ingest/2020-05_oai_pmh.md | 125 |

10 files changed, 279 insertions, 63 deletions
diff --git a/notes/ingest/20191023_testing.md b/notes/ingest/2019-10-23_testing.md
index 481c4e2..481c4e2 100644
--- a/notes/ingest/20191023_testing.md
+++ b/notes/ingest/2019-10-23_testing.md
diff --git a/notes/ingest/20200114_bulk_ingests.md b/notes/ingest/2020-01-14_bulk.md
index 9d05cda..9d05cda 100644
--- a/notes/ingest/20200114_bulk_ingests.md
+++ b/notes/ingest/2020-01-14_bulk.md
diff --git a/notes/ingest/2020-02-14_unpaywall_ingest.md b/notes/ingest/2020-02_unpaywall.md
index e18a2ff..e18a2ff 100644
--- a/notes/ingest/2020-02-14_unpaywall_ingest.md
+++ b/notes/ingest/2020-02_unpaywall.md
diff --git a/notes/ingest/2020-03-oa_but_not_marked.md b/notes/ingest/2020-03-oa_but_not_marked.md
new file mode 100644
index 0000000..73396bd
--- /dev/null
+++ b/notes/ingest/2020-03-oa_but_not_marked.md
@@ -0,0 +1,25 @@
+
+These are large journals with a high fraction of "in IA", but not marked as OA
+so not crawling regularly.
+
+TODO: add things like list of unpaywall ISSN / OA status to try and find more
+"practical" / bronze OA
+
+## First Run
+
+https://fatcat.wiki/container/vmv647omwrhzzgeclyrnpc4him
+https://fatcat.wiki/container/waxwzq3cnbet3cmwccpuk4bel4
+https://fatcat.wiki/container/hjoli2j6qffdpaalkszryuidk4
+https://fatcat.wiki/container/fci57bxfsffvzllbssocnfsr3e
+https://fatcat.wiki/container/hd23c57sunhcnar5fbgxsn36lm
+https://fatcat.wiki/container/bliguyxhonfb7ghuykxgtg3oqe
+
+## TODO
+
+https://fatcat.wiki/container/kn6dhptylrb77b5atyiom5ysjm no-pdf-link (but accessible)
+https://fatcat.wiki/container/s7bticdwizdmhll4taefg57jde no-pdf-link (easy?)
+
+https://fatcat.wiki/container/zm56axre7rgihh5sznxp65np5i large; no-pdf-link?
+https://fatcat.wiki/container/eb2lcnpf2zeezkmfckcvxw2pgi huge (20k+), not all OA?
+https://fatcat.wiki/container/adgy773dtra3xmrsynghcednqm broken?
+https://fatcat.wiki/container/w3gj5mynrnbtndalcc5jnhymym not OA? link-loop
diff --git a/notes/ingest/2020-03-04_mag.md b/notes/ingest/2020-03_mag.md
index 428ce05..428ce05 100644
--- a/notes/ingest/2020-03-04_mag.md
+++ b/notes/ingest/2020-03_mag.md
diff --git a/notes/ingest/2020-03_s2_ingest.md b/notes/ingest/2020-03_s2.md
index fedaba0..fedaba0 100644
--- a/notes/ingest/2020-03_s2_ingest.md
+++ b/notes/ingest/2020-03_s2.md
diff --git a/notes/ingest/2020-04-07_unpaywall.md b/notes/ingest/2020-04-07_unpaywall.md
deleted file mode 100644
index e30d482..0000000
--- a/notes/ingest/2020-04-07_unpaywall.md
+++ /dev/null
@@ -1,63 +0,0 @@
-
-A new snapshot was released in April 2020 (the snapshot is from 2020-02-25, but
-not released for more than a month).
-
-Primary goal is:
-
-- generate ingest requests for only *new* URLs
-- bulk ingest these new URLs
-- crawl any no-capture URLs from that batch
-- re-bulk-ingest the no-capture batch
-- analytics on failed ingests. eg, any particular domains that are failing to crawl
-
-This ingest pipeline was started on 2020-04-07 by bnewbold.
-
-## Transform and Load
-
-    # in sandcrawler pipenv on aitio
-    zcat /schnell/UNPAYWALL-PDF-CRAWL-2020-04/unpaywall_snapshot_2020-02-25T115244.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json
-    => 24.7M 5:17:03 [ 1.3k/s]
-
-    cat /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
-    => 24.7M
-    => Worker: Counter({'total': 24712947, 'insert-requests': 4282167, 'update-requests': 0})
-
-## Dump new URLs and Bulk Ingest
-
-    COPY (
-        SELECT row_to_json(ingest_request.*)
-        FROM ingest_request
-        LEFT JOIN ingest_file_result
-            ON ingest_file_result.ingest_type = ingest_request.ingest_type
-            AND ingest_file_result.base_url = ingest_request.base_url
-        WHERE
-            ingest_request.ingest_type = 'pdf'
-            AND ingest_request.link_source = 'unpaywall'
-            AND date(ingest_request.created) > '2020-04-01'
-            AND ingest_file_result.status IS NULL
-    ) TO '/grande/snapshots/unpaywall_noingest_2020-04-08.rows.json';
-    => 3696189
-
-    cat /grande/snapshots/unpaywall_noingest_2020-04-08.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
-
-## Dump no-capture
-
-    COPY (
-        SELECT row_to_json(ingest_request.*)
-        FROM ingest_request
-        LEFT JOIN ingest_file_result
-            ON ingest_file_result.ingest_type = ingest_request.ingest_type
-            AND ingest_file_result.base_url = ingest_request.base_url
-        WHERE
-            ingest_request.ingest_type = 'pdf'
-            AND ingest_request.link_source = 'unpaywall'
-            AND date(ingest_request.created) > '2020-04-01'
-            AND ingest_file_result.status = 'no-capture'
-            AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
-            AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
-            AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
-            AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
-            AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
-            AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
-            AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
-    ) TO '/grande/snapshots/unpaywall_nocapture_2020-04-XX.rows.json';
diff --git a/notes/ingest/2020-04-07_datacite.md b/notes/ingest/2020-04_datacite.md
index 0fc7e67..0fc7e67 100644
--- a/notes/ingest/2020-04-07_datacite.md
+++ b/notes/ingest/2020-04_datacite.md
diff --git a/notes/ingest/2020-04_unpaywall.md b/notes/ingest/2020-04_unpaywall.md
new file mode 100644
index 0000000..bce757b
--- /dev/null
+++ b/notes/ingest/2020-04_unpaywall.md
@@ -0,0 +1,129 @@
+
+A new snapshot was released in April 2020 (the snapshot is from 2020-02-25, but
+not released for more than a month).
+
+Primary goal is:
+
+- generate ingest requests for only *new* URLs
+- bulk ingest these new URLs
+- crawl any no-capture URLs from that batch
+- re-bulk-ingest the no-capture batch
+- analytics on failed ingests. eg, any particular domains that are failing to crawl
+
+This ingest pipeline was started on 2020-04-07 by bnewbold.
+
+Ran through the first two steps again on 2020-05-03 after unpaywall had
+released another dump (dated 2020-04-27).
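The transform step in the notes below pipes each unpaywall snapshot line through `scripts/unpaywall2ingestrequest.py` to produce ingest request rows. A minimal sketch of that kind of transform, assuming the request fields used by the SQL queries in these notes (`ingest_type`, `link_source`, `link_source_id`, `base_url`) and guessing at the unpaywall snapshot fields (`doi`, `oa_locations`, `url_for_pdf`); this is not taken from the actual script:

    # Hypothetical sketch of an unpaywall-snapshot -> ingest-request transform,
    # approximating scripts/unpaywall2ingestrequest.py. Request field names match
    # the SQL in these notes; the snapshot field names are assumptions.
    import json
    import sys

    def transform(record):
        doi = record.get('doi')
        if not doi:
            return
        for loc in record.get('oa_locations') or []:
            url = loc.get('url_for_pdf') or loc.get('url')
            if not url:
                continue
            yield {
                'ingest_type': 'pdf',
                'link_source': 'unpaywall',
                'link_source_id': doi,
                'base_url': url,
            }

    def main():
        for line in sys.stdin:
            line = line.strip()
            if not line:
                continue
            for request in transform(json.loads(line)):
                print(json.dumps(request, sort_keys=True))

    if __name__ == '__main__':
        main()

One JSON object per line on stdout keeps the output compatible with the `pv -l` and `persist_tool.py ingest-request -` steps below.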
+
+## Transform and Load
+
+    # in sandcrawler pipenv on aitio
+    zcat /schnell/UNPAYWALL-PDF-CRAWL-2020-04/unpaywall_snapshot_2020-02-25T115244.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json
+    => 24.7M 5:17:03 [ 1.3k/s]
+
+    cat /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+    => 24.7M
+    => Worker: Counter({'total': 24712947, 'insert-requests': 4282167, 'update-requests': 0})
+
+Second time:
+
+    # in sandcrawler pipenv on aitio
+    zcat /schnell/UNPAYWALL-PDF-CRAWL-2020-04/unpaywall_snapshot_2020-04-27T153236.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-04-27.ingest_request.json
+    => 25.2M 3:16:28 [2.14k/s]
+
+    cat /grande/snapshots/unpaywall_snapshot_2020-04-27.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+    => Worker: Counter({'total': 25189390, 'insert-requests': 1408915, 'update-requests': 0})
+    => JSON lines pushed: Counter({'pushed': 25189390, 'total': 25189390})
+
+
+## Dump new URLs and Bulk Ingest
+
+    COPY (
+        SELECT row_to_json(ingest_request.*)
+        FROM ingest_request
+        LEFT JOIN ingest_file_result
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE
+            ingest_request.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'unpaywall'
+            AND date(ingest_request.created) > '2020-04-01'
+            AND ingest_file_result.status IS NULL
+    ) TO '/grande/snapshots/unpaywall_noingest_2020-04-08.rows.json';
+    => 3696189
+
+    cat /grande/snapshots/unpaywall_noingest_2020-04-08.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Second time:
+
+    COPY (
+        SELECT row_to_json(ingest_request.*)
+        FROM ingest_request
+        LEFT JOIN ingest_file_result
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE
+            ingest_request.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'unpaywall'
+            AND date(ingest_request.created) > '2020-05-01'
+            AND ingest_file_result.status IS NULL
+    ) TO '/grande/snapshots/unpaywall_noingest_2020-05-03.rows.json';
+    => 1799760
+
+    cat /grande/snapshots/unpaywall_noingest_2020-05-03.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Dump no-capture, Run Crawl
+
+Make two ingest request dumps: one with "all" URLs, which we will have heritrix
+attempt to crawl, and then one with certain domains filtered out, which we may
+or may not bother trying to ingest (due to expectation of failure).
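The two SQL dumps follow below; as a sketch, the filtered variant could also be derived from the "all" dump with a small post-filter. The domain list is copied from the second query, and the file names from the `COPY ... TO` targets below:

    # Sketch: derive the domain-filtered no-capture dump from the "all" dump,
    # mirroring the NOT LIKE clauses in the second COPY query below.
    import json

    SKIP_DOMAINS = (
        'journals.sagepub.com',
        'pubs.acs.org',
        'ahajournals.org',
        'www.journal.csj.jp',
        'aip.scitation.org',
        'academic.oup.com',
        'tandfonline.com',
    )

    with open('unpaywall_nocapture_all_2020-05-04.rows.json') as all_rows, \
         open('unpaywall_nocapture_2020-05-04.rows.json', 'w') as filtered_rows:
        for line in all_rows:
            row = json.loads(line)
            if any(domain in row['base_url'] for domain in SKIP_DOMAINS):
                continue
            filtered_rows.write(line)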
+
+    COPY (
+        SELECT row_to_json(ingest_request.*)
+        FROM ingest_request
+        LEFT JOIN ingest_file_result
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE
+            ingest_request.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'unpaywall'
+            AND date(ingest_request.created) > '2020-04-01'
+            AND ingest_file_result.status = 'no-capture'
+    ) TO '/grande/snapshots/unpaywall_nocapture_all_2020-05-04.rows.json';
+    => 2734145
+
+    COPY (
+        SELECT row_to_json(ingest_request.*)
+        FROM ingest_request
+        LEFT JOIN ingest_file_result
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE
+            ingest_request.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'unpaywall'
+            AND date(ingest_request.created) > '2020-04-01'
+            AND ingest_file_result.status = 'no-capture'
+            AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+            AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+            AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+            AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+            AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+            AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+            AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+    ) TO '/grande/snapshots/unpaywall_nocapture_2020-05-04.rows.json';
+    => 2602408
+
+Not actually a very significant size difference after all.
+
+See `journal-crawls` repo for details on seedlist generation and crawling.
+
+## Re-Ingest Post-Crawl
+
+Test small batch:
+
+    zcat /grande/snapshots/unpaywall_nocapture_all_2020-05-04.rows.json.gz | head -n200 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Run the whole batch:
+
+    zcat /grande/snapshots/unpaywall_nocapture_all_2020-05-04.rows.json.gz | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
diff --git a/notes/ingest/2020-05_oai_pmh.md b/notes/ingest/2020-05_oai_pmh.md
new file mode 100644
index 0000000..4cfd8d5
--- /dev/null
+++ b/notes/ingest/2020-05_oai_pmh.md
@@ -0,0 +1,125 @@
+
+Primary Goal: start large crawl of OAI landing pages that we haven't seen
+
+Fields of interest for ingest:
+- oai identifier
+- doi
+- formats
+- urls (maybe also "relations")
+- types (type+stage)
+
+## Other Tasks
+
+About 150 million total lines.
+
+Types coverage
+
+    zstdcat oai.ndjson.zst | pv -l | jq "select(.types != null) | .types[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > types_counts.txt
+
+Dump all ISSNs, with counts, quick check how many are in chocula/fatcat
+
+    zstdcat oai.ndjson.zst | pv -l | jq "select(.issn != null) | .issn[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > issn_counts.txt
+
+Language coverage
+
+    zstdcat oai.ndjson.zst | pv -l | jq "select(.languages != null) | .languages[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > languages_counts.txt
+
+Format coverage
+
+    zstdcat oai.ndjson.zst | pv -l | jq "select(.formats != null) | .formats[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > formats_counts.txt
+    => 150M 0:56:14 [44.7k/s]
+
+Have a DOI?
+
+    zstdcat oai.ndjson.zst | pv -l | rg '"doi":' | rg '"10.' | wc -l
+    => 16,013,503
+
+    zstdcat oai.ndjson.zst | pv -l | jq "select(.doi != null) | .doi[]" -r | sort -u -S 5G > doi_raw.txt
+    => 11,940,950
+
+## Transform, Load, Bulk Ingest
+
+    zstdcat oai.ndjson.zst | ./oai2ingestrequest.py - | pv -l | gzip > oai.202002.requests.json.gz
+    => 80M 6:36:55 [3.36k/s]
+
+    time zcat /schnell/oai-pmh/oai.202002.requests.json.gz | pv -l | ./persist_tool.py ingest-request -
+    => 80M 4:00:21 [5.55k/s]
+    => Worker: Counter({'total': 80013963, 'insert-requests': 51169081, 'update-requests': 0})
+    => JSON lines pushed: Counter({'pushed': 80013963, 'total': 80013963})
+
+    => real    240m21.207s
+    => user    85m12.576s
+    => sys     3m29.580s
+
+    select count(*) from ingest_request where ingest_type = 'pdf' and link_source = 'oai';
+    => 51,185,088
+
+Why so many (30 million) skipped? Not unique?
+
+    zcat oai.202002.requests.json.gz | jq '[.link_source_id, .base_url]' -c | sort -u -S 4G | wc -l
+    => 51,185,088
+
+    zcat oai.202002.requests.json.gz | jq .base_url -r | pv -l | sort -u -S 4G > request_url.txt
+    wc -l request_url.txt
+    => 50,002,674 request_url.txt
+
+    zcat oai.202002.requests.json.gz | jq .link_source_id -r | pv -l | sort -u -S 4G > requires_oai.txt
+    wc -l requires_oai.txt
+    => 34,622,083 requires_oai.txt
+
+Yup, tons of duplication. And remember this is exact URL, not SURT or similar.
+
+How many of these are URLs we have seen and ingested already?
+
+    SELECT ingest_file_result.status, COUNT(*)
+    FROM ingest_request
+    LEFT JOIN ingest_file_result
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE
+        ingest_request.ingest_type = 'pdf'
+        AND ingest_request.link_source = 'oai'
+    GROUP BY status
+    ORDER BY COUNT DESC
+    LIMIT 20;
+
+             status          |  count
+    -------------------------+----------
+                             | 49491452
+     success                 |  1469113
+     no-capture              |   134611
+     redirect-loop           |    59666
+     no-pdf-link             |     8947
+     cdx-error               |     7561
+     terminal-bad-status     |     6704
+     null-body               |     5042
+     wrong-mimetype          |      879
+     wayback-error           |      722
+     petabox-error           |      198
+     gateway-timeout         |       86
+     link-loop               |       51
+     invalid-host-resolution |       24
+     spn2-cdx-lookup-failure |       22
+     spn2-error              |        4
+     bad-gzip-encoding       |        4
+     spn2-error:job-failed   |        2
+    (18 rows)
+
+Dump ingest requests:
+
+    COPY (
+        SELECT row_to_json(ingest_request.*)
+        FROM ingest_request
+        LEFT JOIN ingest_file_result
+            ON ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        WHERE
+            ingest_request.ingest_type = 'pdf'
+            AND ingest_request.link_source = 'oai'
+            AND date(ingest_request.created) > '2020-05-01'
+            AND ingest_file_result.status IS NULL
+    ) TO '/grande/snapshots/oai_noingest_20200506.requests.json';
+    => COPY 49491452
+
+    cat /grande/snapshots/oai_noingest_20200506.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
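The repeated `kafkacat` one-liners in these notes push JSON request rows to the `sandcrawler-prod.ingest-file-requests-bulk` topic. A minimal sketch of an equivalent producer in Python, assuming the `confluent-kafka` client and the broker/topic names from the kafkacat invocations above; this is an illustration, not the tooling actually used here:

    # Sketch: push JSON ingest request rows to the bulk-ingest Kafka topic,
    # equivalent to: rg -v "\\\\" | jq . -c | kafkacat -P -b <broker> -t <topic> -p -1
    import json
    import sys

    from confluent_kafka import Producer

    producer = Producer({'bootstrap.servers': 'wbgrp-svc263.us.archive.org'})

    for line in sys.stdin:
        line = line.strip()
        # skip blank rows and rows containing backslashes, like the `rg -v "\\\\"` filter
        if not line or '\\' in line:
            continue
        # parse and re-serialize compactly, like `jq . -c`
        record = json.dumps(json.loads(line))
        producer.produce('sandcrawler-prod.ingest-file-requests-bulk', record.encode('utf-8'))
        producer.poll(0)

    producer.flush()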
