22 files changed, 1391 insertions, 85 deletions
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1eb0209..3fe5c5c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -6,7 +6,7 @@ before_script: - apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823 - add-apt-repository -y ppa:deadsnakes/ppa - apt update -qy - - apt install -y python3-dev python3-pip python3-wheel libjpeg-dev openjdk-8-jdk-headless sbt libpq-dev python-dev python3.7 python3.7-dev python3.7-venv python3.7-distutils pkg-config python3-pytest git + - apt install -y python3-dev python3-pip python3-wheel libjpeg-dev openjdk-8-jdk-headless sbt libpq-dev python-dev python3.7 python3.7-dev python3.7-venv python3.7-distutils pkg-config python3-pytest git libsnappy-dev libsodium-dev - pip3 install pipenv - pipenv --version diff --git a/kafka/howto_rebalance.md b/kafka/howto_rebalance.md new file mode 100644 index 0000000..d68b205 --- /dev/null +++ b/kafka/howto_rebalance.md @@ -0,0 +1,42 @@ + +## Rebalance Storage Between Brokers (kafka-manager web) + +For each topic you want to rebalance (eg, the large or high-throughput ones), +go to the topic page and do the blue "reassign partitions" button (or +potentially "generate" or "manual"). + +Monitor progress with the "Reassign Partitions" link at top of the page. + +Finally, run a preferred replica election after partition movement is complete. + +## Rebalance Storage Between Brokers (CLI) + +For example, after adding or removing brokers from the cluster. + +Create a list of topics to move, and put it in `/tmp/topics_to_move.json`: + + { + "version": 1, + "topics": [ + {"topic": "sandcrawler-shadow.grobid-output"}, + {"topic": "fatcat-prod.api-crossref"} + ] + } + +On a kafka broker, go to `/srv/kafka-broker/kafka-*/bin`, generate a plan, then +inspect the output: + + ./kafka-reassign-partitions.sh --zookeeper localhost:2181 --broker-list "280,281,284,285,263" --topics-to-move-json-file /tmp/topics_to_move.json --generate > /tmp/reassignment-plan.json + cat /tmp/reassignment-plan.json | rg '^\{' | tail -n1 > /tmp/new-plan.json + cat /tmp/reassignment-plan.json | rg '^\{' | jq . + +If that looks good, start the rebalance: + + ./kafka-reassign-partitions.sh --zookeeper localhost:2181 --reassignment-json-file /tmp/new-plan.json --execute + +Then monitor progress: + + ./kafka-reassign-partitions.sh --zookeeper localhost:2181 --reassignment-json-file /tmp/new-plan.json --verify + +Finally, run a preferred replica election after partition movement is complete. +Currently do this through the web interface (linked above). diff --git a/notes/ingest/20191023_testing.md b/notes/ingest/2019-10-23_testing.md index 481c4e2..481c4e2 100644 --- a/notes/ingest/20191023_testing.md +++ b/notes/ingest/2019-10-23_testing.md diff --git a/notes/ingest/20200114_bulk_ingests.md b/notes/ingest/2020-01-14_bulk.md index 9d05cda..9d05cda 100644 --- a/notes/ingest/20200114_bulk_ingests.md +++ b/notes/ingest/2020-01-14_bulk.md diff --git a/notes/ingest/2020-02-14_unpaywall_ingest.md b/notes/ingest/2020-02_unpaywall.md index e18a2ff..e18a2ff 100644 --- a/notes/ingest/2020-02-14_unpaywall_ingest.md +++ b/notes/ingest/2020-02_unpaywall.md diff --git a/notes/ingest/2020-03-oa_but_not_marked.md b/notes/ingest/2020-03-oa_but_not_marked.md new file mode 100644 index 0000000..73396bd --- /dev/null +++ b/notes/ingest/2020-03-oa_but_not_marked.md @@ -0,0 +1,25 @@ + +These are large journals with a high fraction of "in IA", but not marked as OA +so not crawling regularly. 
+ +TODO: add things like list of unpaywall ISSN / OA status to try and find more +"practical" / bronze OA + +## First Run + +https://fatcat.wiki/container/vmv647omwrhzzgeclyrnpc4him +https://fatcat.wiki/container/waxwzq3cnbet3cmwccpuk4bel4 +https://fatcat.wiki/container/hjoli2j6qffdpaalkszryuidk4 +https://fatcat.wiki/container/fci57bxfsffvzllbssocnfsr3e +https://fatcat.wiki/container/hd23c57sunhcnar5fbgxsn36lm +https://fatcat.wiki/container/bliguyxhonfb7ghuykxgtg3oqe + +## TODO + +https://fatcat.wiki/container/kn6dhptylrb77b5atyiom5ysjm no-pdf-link (but accessible) +https://fatcat.wiki/container/s7bticdwizdmhll4taefg57jde no-pdf-link (easy?) + +https://fatcat.wiki/container/zm56axre7rgihh5sznxp65np5i large; no-pdf-link? +https://fatcat.wiki/container/eb2lcnpf2zeezkmfckcvxw2pgi huge (20k+), not all OA? +https://fatcat.wiki/container/adgy773dtra3xmrsynghcednqm broken? +https://fatcat.wiki/container/w3gj5mynrnbtndalcc5jnhymym not OA? link-loop diff --git a/notes/ingest/2020-03-04_mag.md b/notes/ingest/2020-03_mag.md index 97594c8..428ce05 100644 --- a/notes/ingest/2020-03-04_mag.md +++ b/notes/ingest/2020-03_mag.md @@ -406,3 +406,171 @@ Full run: 2020-04-07 12:19 (pacific): 11,703,871 +## Post-bulk-ingest + +Around 2020-04-28, seems like main wave of bulk ingest is complete. Will need +to re-try things like cdx-error. + +Current status: + + status | count + -------------------------------+---------- + success | 18491799 + redirect-loop | 1968530 + no-capture | 1373657 + no-pdf-link | 1311842 + link-loop | 1296439 + terminal-bad-status | 627577 + cdx-error | 418278 + wrong-mimetype | 50141 + wayback-error | 37159 + petabox-error | 11249 + null-body | 6295 + gateway-timeout | 3051 + spn2-cdx-lookup-failure | 328 + spn2-error:invalid-url-syntax | 93 + bad-redirect | 75 + | 47 + invalid-host-resolution | 28 + spn2-error | 10 + bad-gzip-encoding | 7 + redirects-exceeded | 2 + (20 rows) + +Lots of cdx-error to retry. + +The no-capture links are probably a mix of domain-blocklist and things that +failed in bulk mode. Will dump and re-attempt them: + + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_file_result.status = 'no-capture' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + ) TO '/grande/snapshots/mag_nocapture_20200420.rows.json'; + => 859849 + +What domains are these? + + cat mag_nocapture_20200420.rows.json | jq .base_url -r | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n30 + +Let's filter down more: + + cat mag_nocapture_20200420.rows.json | rg -v 'www.researchgate.net' | rg -v 'muse.jhu.edu' | rg -v 'www.omicsonline.org' | rg -v 'link.springer.com' | rg -v 'iopscience.iop.org' | rg -v 'ieeexplore.ieee.org' | shuf > mag_nocapture_20200420.rows.filtered.json + + wc -l mag_nocapture_20200420.rows.filtered.json + 423085 mag_nocapture_20200420.rows.filtered.json + +Ok, enqueue! + + cat mag_nocapture_20200420.rows.filtered.json | shuf | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + +## Final Stats + +... for this round of ingest: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + + status | count + -------------------------------------+---------- + success | 18712849 + redirect-loop | 2008110 + no-pdf-link | 1337012 + link-loop | 1326761 + no-capture | 1030693 + terminal-bad-status | 637143 + gateway-timeout | 193194 + cdx-error | 125907 + spn2-cdx-lookup-failure | 77842 + wrong-mimetype | 50882 + wayback-error | 40278 + invalid-host-resolution | 35201 + petabox-error | 11254 + null-body | 6485 + spn2-error | 1643 + spn2-error:job-failed | 747 + spn2-error:invalid-url-syntax | 325 + spn2-error:soft-time-limit-exceeded | 190 + bad-redirect | 77 + | 47 + (20 rows) + +Failures by domain: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + + domain | status | count + ---------------------------------+---------------------+-------- + ieeexplore.ieee.org | redirect-loop | 677712 + cyberleninka.ru | link-loop | 308390 + papers.ssrn.com | link-loop | 281804 + ieeexplore.ieee.org | link-loop | 273559 + dialnet.unirioja.es | redirect-loop | 240504 + dialnet.unirioja.es | terminal-bad-status | 232481 + onlinelibrary.wiley.com | no-pdf-link | 220932 + iopscience.iop.org | terminal-bad-status | 172480 + validate.perfdrive.com | no-pdf-link | 172312 + link.springer.com | redirect-loop | 130398 + agupubs.onlinelibrary.wiley.com | no-pdf-link | 113382 + iopscience.iop.org | redirect-loop | 105234 + www.bmj.com | link-loop | 100354 + www.researchgate.net | redirect-loop | 84366 + www.cambridge.org | link-loop | 83171 + jamanetwork.com | no-pdf-link | 75053 + febs.onlinelibrary.wiley.com | no-pdf-link | 74872 + www.jstor.org | redirect-loop | 72059 + journals.sagepub.com | no-pdf-link | 63028 + science.sciencemag.org | redirect-loop | 62927 + profile.thieme.de | no-pdf-link | 62406 + cyberleninka.ru | redirect-loop | 56733 + link.springer.com | link-loop | 47608 + physoc.onlinelibrary.wiley.com | no-pdf-link | 30180 + science.sciencemag.org | link-loop | 29908 + papers.ssrn.com | redirect-loop | 27255 + obgyn.onlinelibrary.wiley.com | no-pdf-link | 26789 + www.computer.org | no-pdf-link | 26444 + watermark.silverchair.com | terminal-bad-status | 25934 + www.nature.com | redirect-loop | 25306 + (30 rows) diff --git a/notes/ingest/2020-03_s2_ingest.md b/notes/ingest/2020-03_s2.md index fedaba0..fedaba0 100644 --- a/notes/ingest/2020-03_s2_ingest.md +++ b/notes/ingest/2020-03_s2.md diff --git a/notes/ingest/2020-04-07_datacite.md b/notes/ingest/2020-04-07_datacite.md deleted file mode 100644 index b0217f0..0000000 --- 
a/notes/ingest/2020-04-07_datacite.md +++ /dev/null @@ -1,18 +0,0 @@ - -After the broad datacite crawl, want to ingest paper PDFs into fatcat. But many -of the DOIs are for, eg, datasets, and don't want to waste time on those. - -Instead of using full ingest request file from the crawl, will generate a new -ingest request file using `fatcat_ingest.py` and set that up for bulk crawling. - -## Generate Requests - - ./fatcat_ingest.py --allow-non-oa --release-types article-journal,paper-conference,article,report,thesis,book,chapter query "doi_registrar:datacite" | pv -l > /srv/fatcat/snapshots/datacite_papers_20200407.ingest_request.json - => Expecting 8905453 release objects in search queries - => 8.91M 11:49:50 [ 209 /s] - => Counter({'elasticsearch_release': 8905453, 'ingest_request': 8905453, 'estimate': 8905453}) - -## Bulk Ingest - - cat /srv/fatcat/snapshots/datacite_papers_20200407.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - diff --git a/notes/ingest/2020-04-07_unpaywall.md b/notes/ingest/2020-04-07_unpaywall.md deleted file mode 100644 index e30d482..0000000 --- a/notes/ingest/2020-04-07_unpaywall.md +++ /dev/null @@ -1,63 +0,0 @@ - -A new snapshot was released in April 2020 (the snapshot is from 2020-02-25, but -not released for more than a month). - -Primary goal is: - -- generate ingest requests for only *new* URLs -- bulk ingest these new URLs -- crawl any no-capture URLs from that batch -- re-bulk-ingest the no-capture batch -- analytics on failed ingests. eg, any particular domains that are failing to crawl - -This ingest pipeline was started on 2020-04-07 by bnewbold. - -## Transform and Load - - # in sandcrawler pipenv on aitio - zcat /schnell/UNPAYWALL-PDF-CRAWL-2020-04/unpaywall_snapshot_2020-02-25T115244.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json - => 24.7M 5:17:03 [ 1.3k/s] - - cat /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json | pv -l | ./persist_tool.py ingest-request - - => 24.7M - => Worker: Counter({'total': 24712947, 'insert-requests': 4282167, 'update-requests': 0}) - -## Dump new URLs and Bulk Ingest - - COPY ( - SELECT row_to_json(ingest_request.*) - FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND date(ingest_request.created) > '2020-04-01' - AND ingest_file_result.status IS NULL - ) TO '/grande/snapshots/unpaywall_noingest_2020-04-08.rows.json'; - => 3696189 - - cat /grande/snapshots/unpaywall_noingest_2020-04-08.rows.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - -## Dump no-capture - - COPY ( - SELECT row_to_json(ingest_request.*) - FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND date(ingest_request.created) > '2020-04-01' - AND ingest_file_result.status = 'no-capture' - AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' - AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' - AND ingest_request.base_url NOT LIKE '%ahajournals.org%' - AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' - AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' - AND ingest_request.base_url NOT LIKE '%academic.oup.com%' - AND ingest_request.base_url NOT LIKE '%tandfonline.com%' - ) TO '/grande/snapshots/unpaywall_nocapture_2020-04-XX.rows.json'; diff --git a/notes/ingest/2020-04_datacite.md b/notes/ingest/2020-04_datacite.md new file mode 100644 index 0000000..0fc7e67 --- /dev/null +++ b/notes/ingest/2020-04_datacite.md @@ -0,0 +1,121 @@ + +After the broad datacite crawl, want to ingest paper PDFs into fatcat. But many +of the DOIs are for, eg, datasets, and don't want to waste time on those. + +Instead of using full ingest request file from the crawl, will generate a new +ingest request file using `fatcat_ingest.py` and set that up for bulk crawling. + +## Generate Requests + + ./fatcat_ingest.py --allow-non-oa --release-types article-journal,paper-conference,article,report,thesis,book,chapter query "doi_registrar:datacite" | pv -l > /srv/fatcat/snapshots/datacite_papers_20200407.ingest_request.json + => Expecting 8905453 release objects in search queries + => 8.91M 11:49:50 [ 209 /s] + => Counter({'elasticsearch_release': 8905453, 'ingest_request': 8905453, 'estimate': 8905453}) + +## Bulk Ingest + + cat /srv/fatcat/snapshots/datacite_papers_20200407.ingest_request.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +## Ingest Stats + +Note that this will have a small fraction of non-datacite results mixed in (eg, +from COVID-19 targeted crawls): + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-ingest' + AND created >= '2020-04-07' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------------+--------- + no-pdf-link | 4646767 + redirect-loop | 1447229 + no-capture | 860235 + success | 849501 + terminal-bad-status | 174869 + cdx-error | 159805 + wayback-error | 18076 + wrong-mimetype | 11169 + link-loop | 8410 + gateway-timeout | 4034 + spn2-cdx-lookup-failure | 510 + petabox-error | 339 + null-body | 251 + spn2-error | 19 + spn2-error:job-failed | 14 + bad-gzip-encoding | 13 + timeout | 5 + spn2-error:soft-time-limit-exceeded | 4 + invalid-host-resolution | 2 + spn2-error:pending | 1 + (20 rows) + +Top domains/statuses (including success): + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-ingest' + AND created >= '2020-04-07' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + ---------------------------------------+---------------------+-------- + ssl.fao.org | no-pdf-link | 862277 + www.e-periodica.ch | no-pdf-link | 746781 + www.researchgate.net | redirect-loop | 664524 + dlc.library.columbia.edu | no-pdf-link | 493111 + www.die-bonn.de | redirect-loop | 352903 + figshare.com | no-pdf-link | 319709 + statisticaldatasets.data-planet.com | no-pdf-link | 309584 + catalog.paradisec.org.au | redirect-loop | 225396 + zenodo.org | no-capture | 193201 + digi.ub.uni-heidelberg.de | no-pdf-link | 184974 + open.library.ubc.ca | no-pdf-link | 167841 + zenodo.org | no-pdf-link | 130617 + www.google.com | no-pdf-link | 111312 + www.e-manuscripta.ch | no-pdf-link | 79192 + ds.iris.edu | no-pdf-link | 77649 + data.inra.fr | no-pdf-link | 69440 + www.tib.eu | no-pdf-link | 63872 + www.egms.de | redirect-loop | 53877 + archaeologydataservice.ac.uk | redirect-loop | 52838 + d.lib.msu.edu | no-pdf-link | 45297 + www.e-rara.ch | no-pdf-link | 45163 + springernature.figshare.com | no-pdf-link | 42527 + boris.unibe.ch | no-pdf-link | 40816 + www.research-collection.ethz.ch | no-capture | 40350 + spectradspace.lib.imperial.ac.uk:8443 | no-pdf-link | 33059 + repository.dri.ie | terminal-bad-status | 32760 + othes.univie.ac.at | no-pdf-link | 32558 + repositories.lib.utexas.edu | no-capture | 31526 + posterng.netkey.at | no-pdf-link | 30315 + zenodo.org | terminal-bad-status | 29614 + (30 rows) + diff --git 
a/notes/ingest/2020-04_unpaywall.md b/notes/ingest/2020-04_unpaywall.md new file mode 100644 index 0000000..bce757b --- /dev/null +++ b/notes/ingest/2020-04_unpaywall.md @@ -0,0 +1,129 @@ + +A new snapshot was released in April 2020 (the snapshot is from 2020-02-25, but +not released for more than a month). + +Primary goal is: + +- generate ingest requests for only *new* URLs +- bulk ingest these new URLs +- crawl any no-capture URLs from that batch +- re-bulk-ingest the no-capture batch +- analytics on failed ingests. eg, any particular domains that are failing to crawl + +This ingest pipeline was started on 2020-04-07 by bnewbold. + +Ran through the first two steps again on 2020-05-03 after unpaywall had +released another dump (dated 2020-04-27). + +## Transform and Load + + # in sandcrawler pipenv on aitio + zcat /schnell/UNPAYWALL-PDF-CRAWL-2020-04/unpaywall_snapshot_2020-02-25T115244.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json + => 24.7M 5:17:03 [ 1.3k/s] + + cat /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json | pv -l | ./persist_tool.py ingest-request - + => 24.7M + => Worker: Counter({'total': 24712947, 'insert-requests': 4282167, 'update-requests': 0}) + +Second time: + + # in sandcrawler pipenv on aitio + zcat /schnell/UNPAYWALL-PDF-CRAWL-2020-04/unpaywall_snapshot_2020-04-27T153236.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-04-27.ingest_request.json + => 25.2M 3:16:28 [2.14k/s] + + cat /grande/snapshots/unpaywall_snapshot_2020-04-27.ingest_request.json | pv -l | ./persist_tool.py ingest-request - + => Worker: Counter({'total': 25189390, 'insert-requests': 1408915, 'update-requests': 0}) + => JSON lines pushed: Counter({'pushed': 25189390, 'total': 25189390}) + + +## Dump new URLs and Bulk Ingest + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2020-04-01' + AND ingest_file_result.status IS NULL + ) TO '/grande/snapshots/unpaywall_noingest_2020-04-08.rows.json'; + => 3696189 + + cat /grande/snapshots/unpaywall_noingest_2020-04-08.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Second time: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2020-05-01' + AND ingest_file_result.status IS NULL + ) TO '/grande/snapshots/unpaywall_noingest_2020-05-03.rows.json'; + => 1799760 + + cat /grande/snapshots/unpaywall_noingest_2020-05-03.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +## Dump no-capture, Run Crawl + +Make two ingest request dumps: one with "all" URLs, which we will have heritrix +attempt to crawl, and then one with certain domains filtered out, which we may +or may not bother trying to ingest (due to expectation of failure). 
+ + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2020-04-01' + AND ingest_file_result.status = 'no-capture' + ) TO '/grande/snapshots/unpaywall_nocapture_all_2020-05-04.rows.json'; + => 2734145 + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2020-04-01' + AND ingest_file_result.status = 'no-capture' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + ) TO '/grande/snapshots/unpaywall_nocapture_2020-05-04.rows.json'; + => 2602408 + +Not actually a very significant size difference after all. + +See `journal-crawls` repo for details on seedlist generation and crawling. + +## Re-Ingest Post-Crawl + +Test small batch: + + zcat /grande/snapshots/unpaywall_nocapture_all_2020-05-04.rows.json.gz | head -n200 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Run the whole batch: + + zcat /grande/snapshots/unpaywall_nocapture_all_2020-05-04.rows.json.gz | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + diff --git a/notes/ingest/2020-05_oai_pmh.md b/notes/ingest/2020-05_oai_pmh.md new file mode 100644 index 0000000..4cfd8d5 --- /dev/null +++ b/notes/ingest/2020-05_oai_pmh.md @@ -0,0 +1,125 @@ + +Primary Goal: start large crawl of OAI landing pages that we haven't seen + +Fields of interest for ingest: +- oai identifer +- doi +- formats +- urls (maybe also "relations") +- types (type+stage) + +## Other Tasks + +About 150 million total lines. + +Types coverage + + zstdcat oai.ndjson.zst | pv -l | jq "select(.types != null) | .types[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > types_counts.txt + +Dump all ISSNs, with counts, quick check how many are in chocula/fatcat + + zstdcat oai.ndjson.zst | pv -l | jq "select(.issn != null) | .issn[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > issn_counts.txt + +Language coverage + + zstdcat oai.ndjson.zst | pv -l | jq "select(.languages != null) | .languages[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > languages_counts.txt + +Format coverage + + zstdcat oai.ndjson.zst | pv -l | jq "select(.formats != null) | .formats[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > formats_counts.txt + => 150M 0:56:14 [44.7k/s] + +Have a DOI? + + zstdcat oai.ndjson.zst | pv -l | rg '"doi":' | rg '"10.' 
| wc -l + => 16,013,503 + + zstdcat oai.ndjson.zst | pv -l | jq "select(.doi != null) | .doi[]" -r | sort -u -S 5G > doi_raw.txt + => 11,940,950 + +## Transform, Load, Bulk Ingest + + zstdcat oai.ndjson.zst | ./oai2ingestrequest.py - | pv -l | gzip > oai.202002.requests.json.gz + => 80M 6:36:55 [3.36k/s] + + time zcat /schnell/oai-pmh/oai.202002.requests.json.gz | pv -l | ./persist_tool.py ingest-request - + => 80M 4:00:21 [5.55k/s] + => Worker: Counter({'total': 80013963, 'insert-requests': 51169081, 'update-requests': 0}) + => JSON lines pushed: Counter({'pushed': 80013963, 'total': 80013963}) + + => real 240m21.207s + => user 85m12.576s + => sys 3m29.580s + + select count(*) from ingest_request where ingest_type = 'pdf' and link_source = 'oai'; + => 51,185,088 + +Why so many (30 million) skipped? Not unique? + + zcat oai.202002.requests.json.gz | jq '[.link_source_id, .base_url]' -c | sort -u -S 4G | wc -l + => 51,185,088 + + zcat oai.202002.requests.json.gz | jq .base_url -r | pv -l | sort -u -S 4G > request_url.txt + wc -l request_url.txt + => 50,002,674 request_url.txt + + zcat oai.202002.requests.json.gz | jq .link_source_id -r | pv -l | sort -u -S 4G > requires_oai.txt + wc -l requires_oai.txt + => 34,622,083 requires_oai.txt + +Yup, tons of duplication. And remember this is exact URL, not SURT or similar. + +How many of these are URLs we have seen and ingested already? + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+---------- + | 49491452 + success | 1469113 + no-capture | 134611 + redirect-loop | 59666 + no-pdf-link | 8947 + cdx-error | 7561 + terminal-bad-status | 6704 + null-body | 5042 + wrong-mimetype | 879 + wayback-error | 722 + petabox-error | 198 + gateway-timeout | 86 + link-loop | 51 + invalid-host-resolution | 24 + spn2-cdx-lookup-failure | 22 + spn2-error | 4 + bad-gzip-encoding | 4 + spn2-error:job-failed | 2 + (18 rows) + +Dump ingest requests: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2020-05-01' + AND ingest_file_result.status IS NULL + ) TO '/grande/snapshots/oai_noingest_20200506.requests.json'; + => COPY 49491452 + + cat /grande/snapshots/oai_noingest_20200506.requests.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + diff --git a/notes/ingest/NEXT.md b/notes/ingest/NEXT.md new file mode 100644 index 0000000..8cdd6df --- /dev/null +++ b/notes/ingest/NEXT.md @@ -0,0 +1,52 @@ + +biorxiv +medrxiv + doi:10.1101\/20* + +persee.fr 147k + publisher:persee in_ia:false is_oa:true + https://www.persee.fr/doc/pumus_1164-5385_1992_num_2_1_1013 + +cairn.info: 161k + doi_prefix:10.3917 in_ia:false is_oa:true + https://www.cairn.info/revue-afrique-contemporaine-2011-3-page-161.htm + https://www.cairn.info/revue-cahiers-de-psychologie-clinique-2014-1-page-209.htm + +IOP OA: 169k + doi_prefix:10.1088 is_oa:true in_ia:false + +indian journals platform? 124k + doi_prefix:10.4103 in_ia:false is_oa:true + http://www.urologyannals.com/article.asp?issn=0974-7796;year=2011;volume=3;issue=3;spage=138;epage=140;aulast=Ahmad + http://www.neurologyindia.com/article.asp?issn=0028-3886;year=2011;volume=59;issue=4;spage=612;epage=615;aulast=Utsuki + +openedition? 48k + doi_prefix:10.4000 is_oa:true in_ia:false + +german medical science (GMS) 28k + doi_prefix:10.3205 in_ia:false is_oa:true + https://www.egms.de/static/en/journals/zma/2015-32/zma000965.shtml + +siberian chemistry 28k + doi_prefix:10.2298 in_ia:false is_oa:true + http://www.doiserbia.nb.rs/Article.aspx?ID=0352-51391000105H + +jalc oa doi: 82k + doi_registrar:jalc in_ia:false is_oa:true + +sage OA papers + https://journals.sagepub.com/doi/10.1177/034003529802400510 + +Scientific Reports: 25k + in_ia:false container_id:"tnqhc2x2aneavcd3gx5h7mswhm" + +U Toronto press: 23k + publisher:"Toronto Press" in_ia:false is_oa:true + has an annoying bounce page + +ASHA (speech-language-hearing association): 7k + publisher:Speech-Language-Hearing in_ia:false is_oa:true + +MIT press journals + + diff --git a/proposals/20200211_nsq.md b/proposals/20200211_nsq.md new file mode 100644 index 0000000..6aa885b --- /dev/null +++ b/proposals/20200211_nsq.md @@ -0,0 +1,79 @@ + +status: planned + +In short, Kafka is not working well as a job task scheduler, and I want to try +NSQ as a medium-term solution to that problem. + + +## Motivation + +Thinking of setting up NSQ to use for scheduling distributed work, to replace +kafka for some topics. for example, "regrobid" requests where we enqueue +millions of, basically, CDX lines, and want to process on dozens of cores or +multiple machines. or file ingest backfill. results would still go to kafka (to +persist), and pipelines like DOI harvest -> import -> elasticsearch would still +be kafka + +The pain point with kafka is having dozens of workers on tasks that take more +than a couple seconds per task. we could keep tweaking kafka and writing weird +consumer group things to handle this, but I think it will never work very well. +NSQ supports re-queues with delay (eg, on failure, defer to re-process later), +allows many workers to connect and leave with no disruption, messages don't +have to be processed in order, and has a very simple enqueue API (HTTP POST). + +The slowish tasks we have now are file ingest (wayback and/or SPNv2 + +GROBID) and re-GROBID. In the near future will also have ML backlog to go +through. + +Throughput isn't much of a concern as tasks take 10+ seconds each. + + +## Specific Plan + +Continue publishing ingest requests to Kafka topic. Have a new persist worker +consume from this topic and push to request table (but not result table) using +`ON CONFLICT DO NOTHING`. 
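A minimal sketch of that persist step, assuming psycopg2 and an illustrative subset of the `ingest_request` columns (the real schema and persist worker may differ):

    import psycopg2

    def persist_requests(conn, batch):
        # Rows that already exist are silently skipped, so re-enqueueing the
        # same requests is idempotent at the database layer.
        sql = """
            INSERT INTO ingest_request (link_source, link_source_id, ingest_type, base_url)
            VALUES (%s, %s, %s, %s)
            ON CONFLICT DO NOTHING
        """
        with conn.cursor() as cur:
            for req in batch:
                cur.execute(sql, (req['link_source'], req['link_source_id'],
                                  req['ingest_type'], req['base_url']))
        conn.commit()

    # usage (connection string is a placeholder):
    # conn = psycopg2.connect("dbname=sandcrawler")
    # persist_requests(conn, batch_of_request_dicts)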
Have a new single-process kafka consumer pull from +the topic and push to NSQ. This consumer monitors NSQ and doesn't push too many +requests (eg, 1k maximum). NSQ could potentially even run as in-memory mode. +New worker/pusher class that acts as an NSQ client, possibly with parallelism. + +*Clean* NSQ shutdown/restart always persists data locally to disk. + +Unclean shutdown (eg, power failure) would mean NSQ might have lost state. +Because we are persisting requests to sandcrawler-db, cleanup is simple: +re-enqueue all requests from the past N days with null result or result older +than M days. + +Still need multiple kafka and NSQ topics to have priority queues (eg, bulk, +platform-specific). + +To start, have a single static NSQ host; don't need nsqlookupd. Could use +wbgrp-svc506 (datanode VM with SSD, lots of CPU and RAM). + +To move hosts, simply restart the kafka pusher pointing at the new NSQ host. +When the old host's queue is empty, restart the workers to consume from the new +host, and destroy the old NSQ host. + + +## Alternatives + +Work arounds i've done to date have been using the `grobid_tool.py` or +`ingest_tool.py` JSON input modes to pipe JSON task files (millions of lines) +through GNU/parallel. I guess GNU/parallel's distributed mode is also an option +here. + +Other things that could be used: + +**celery**: popular, many features. need to run separate redis, no disk persistence (?) + +**disque**: need to run redis, no disk persistence (?) <https://github.com/antirez/disque> + +**gearman**: <http://gearman.org/> no disk persistence (?) + + +## Old Notes + +TBD if would want to switch ingest requests from fatcat -> sandcrawler over, +and have the continuous ingests run out of NSQ, or keep using kafka for that. +currently can only do up to 10x parallelism or so with SPNv2, so that isn't a +scaling pain point diff --git a/python/Makefile b/python/Makefile new file mode 100644 index 0000000..1525900 --- /dev/null +++ b/python/Makefile @@ -0,0 +1,15 @@ + +SHELL = /bin/bash +.SHELLFLAGS = -o pipefail -c + +.PHONY: help +help: ## Print info about all commands + @echo "Commands:" + @echo + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[01;32m%-20s\033[0m %s\n", $$1, $$2}' + +.PHONY: test +test: ## Run all tests and lints + pipenv run pytest + #pipenv run mypy *.py sandcrawler/*.py tests/ --ignore-missing-imports + diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py index 08e3a96..f329a73 100644 --- a/python/sandcrawler/grobid.py +++ b/python/sandcrawler/grobid.py @@ -87,6 +87,15 @@ class GrobidWorker(SandcrawlerWorker): self.sink = sink self.consolidate_mode = 2 + def timeout_response(self, task): + default_key = task['sha1hex'] + return dict( + status="error-timeout", + error_msg="internal GROBID worker timeout", + source=task, + key=default_key, + ) + def process(self, record): default_key = record['sha1hex'] if record.get('warc_path') and record.get('warc_offset'): diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py index 8fbb0ba..88ea41b 100644 --- a/python/sandcrawler/html.py +++ b/python/sandcrawler/html.py @@ -42,7 +42,10 @@ def extract_fulltext_url(html_url, html_body): try: soup = BeautifulSoup(html_body, 'html.parser') except TypeError as te: - print("{} (url={})".format(te, html_url, file=sys.stderr)) + print(f"{te} (url={html_url})", file=sys.stderr) + return dict() + except UnboundLocalError as ule: + print(f"{ule} (url={html_url})", file=sys.stderr) return dict() ### General 
Tricks ### @@ -54,6 +57,9 @@ def extract_fulltext_url(html_url, html_body): if not meta: # researchgate does this; maybe others also? meta = soup.find('meta', attrs={"property":"citation_pdf_url"}) + # if tag is only partially populated + if meta and not meta.get('content'): + meta = None # wiley has a weird almost-blank page we don't want to loop on if meta and not "://onlinelibrary.wiley.com/doi/pdf/" in html_url: url = meta['content'].strip() diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 5cb3ef8..82b43fe 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -229,6 +229,20 @@ class IngestFileWorker(SandcrawlerWorker): result.pop('key', None) return result + def timeout_response(self, task): + print("[TIMEOUT]", file=sys.stderr) + return dict( + request=task, + hit=False, + status="timeout", + error_message="ingest worker internal timeout", + ) + + def want(self, request): + if not request.get('ingest_type') in ('file', 'pdf'): + return False + return True + def process(self, request): # backwards compatibility diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py index d5db7a5..6425e99 100644 --- a/python/sandcrawler/workers.py +++ b/python/sandcrawler/workers.py @@ -2,6 +2,7 @@ import sys import json import time +import signal import zipfile import multiprocessing.pool from collections import Counter @@ -26,6 +27,9 @@ class SandcrawlerWorker(object): def push_record(self, task): self.counts['total'] += 1 + if not self.want(task): + self.counts['skip'] += 1 + return result = self.process(task) if not result: self.counts['failed'] += 1 @@ -40,6 +44,43 @@ class SandcrawlerWorker(object): print(json.dumps(result)) return result + def timeout_response(self, task): + """ + This should be overridden by workers that want to return something + meaningful when there is a processing timeout. Eg, JSON vs some other + error message. + """ + return None + + def push_record_timeout(self, task, timeout=300): + """ + A wrapper around self.push_record which sets a timeout. + + Note that this uses signals and *will behave wrong/weirdly* with + multithreading or if signal-based timeouts are used elsewhere in the + same process. + """ + + def timeout_handler(signum, frame): + raise TimeoutError("timeout processing record") + signal.signal(signal.SIGALRM, timeout_handler) + resp = None + signal.alarm(int(timeout)) + try: + resp = self.push_record(task) + except TimeoutError: + self.counts['timeout'] += 1 + resp = self.timeout_response(task) # pylint: disable=assignment-from-none + # TODO: what if it is this push_record() itself that is timing out? + if resp and self.sink: + self.sink.push_record(resp) + self.counts['pushed'] += 1 + elif resp: + print(json.dumps(resp)) + finally: + signal.alarm(0) + return resp + def push_batch(self, tasks): results = [] for task in tasks: @@ -52,6 +93,12 @@ class SandcrawlerWorker(object): print("Worker: {}".format(self.counts), file=sys.stderr) return self.counts + def want(self, task): + """ + Optionally override this as a filter in implementations. + """ + return True + def process(self, task): """ Derived workers need to implement business logic here. 
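# Illustration only, not part of the patch above: a hypothetical subclass showing
# how the new want() / timeout_response() hooks combine with push_record_timeout().
class ToyWorker(SandcrawlerWorker):

    def want(self, task):
        # filter: tasks we don't want are counted as 'skip' and dropped
        return task.get('ingest_type') == 'pdf'

    def timeout_response(self, task):
        # returned (and pushed to the sink, if one is configured) when
        # processing exceeds the alarm
        return dict(status="error-timeout", source=task)

    def process(self, task):
        return dict(status="success", key=task.get('key'))

# worker = ToyWorker()
# worker.push_record_timeout({'ingest_type': 'pdf', 'key': 'abc'}, timeout=300)
#
# Note: push_record_timeout() relies on SIGALRM, so it only behaves correctly in
# the main thread of a single-threaded process.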
@@ -338,7 +385,6 @@ class ZipfilePusher(RecordPusher): print("ZIP PDFs pushed: {}".format(self.counts), file=sys.stderr) return self.counts - class KafkaJsonPusher(RecordPusher): def __init__(self, worker, kafka_hosts, consume_topic, group, **kwargs): @@ -398,7 +444,8 @@ class KafkaJsonPusher(RecordPusher): done = False while not done: try: - self.worker.push_record(record) + # use timeouts; don't want kafka itself to timeout + self.worker.push_record_timeout(record, timeout=300) break except SandcrawlerBackoffError as be: print("Backing off for 200 seconds: {}".format(be)) diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py new file mode 100755 index 0000000..916f41c --- /dev/null +++ b/python/scripts/oai2ingestrequest.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 + +""" +Transform an OAI-PMH bulk dump (JSON) into ingest requests. + +Eg: https://archive.org/details/oai_harvest_20200215 +""" + +import sys +import json +import argparse +import urlcanon + +DOMAIN_BLOCKLIST = [ + # large OA publishers (we get via DOI) + + # large repos and aggregators (we crawl directly) + "://arxiv.org/", + "://europepmc.org/", + "ncbi.nlm.nih.gov/", + "semanticscholar.org/", + "://doi.org/", + "://dx.doi.org/", + "zenodo.org/", + "figshare.com/", + "://archive.org/", + ".archive.org/", + "://127.0.0.1/", + + # OAI specific additions + "://hdl.handle.net/", +] + +RELEASE_STAGE_MAP = { + 'info:eu-repo/semantics/draftVersion': 'draft', + 'info:eu-repo/semantics/submittedVersion': 'submitted', + 'info:eu-repo/semantics/acceptedVersion': 'accepted', + 'info:eu-repo/semantics/publishedVersion': 'published', + 'info:eu-repo/semantics/updatedVersion': 'updated', +} + +def canon(s): + parsed = urlcanon.parse_url(s) + return str(urlcanon.whatwg(parsed)) + +def transform(obj): + """ + Transforms from a single OAI-PMH object to zero or more ingest requests. + Returns a list of dicts. + """ + + requests = [] + if not obj.get('oai') or not obj['oai'].startswith('oai:'): + return [] + if not obj.get('urls'): + return [] + + # look in obj['formats'] for PDF? + if obj.get('formats'): + # if there is a list of formats, and it does not contain PDF, then + # skip. Note that we will continue if there is no formats list. + has_pdf = False + for f in obj['formats']: + if 'pdf' in f.lower(): + has_pdf = True + if not has_pdf: + return [] + + doi = None + if obj.get('doi'): + doi = obj['doi'][0].lower().strip() + if not doi.startswith('10.'): + doi = None + + # infer release stage and/or type from obj['types'] + release_stage = None + for t in obj.get('types', []): + if t in RELEASE_STAGE_MAP: + release_stage = RELEASE_STAGE_MAP[t] + + # TODO: infer rel somehow? Eg, repository vs. 
OJS publisher + rel = None + + for url in obj['urls']: + skip = False + for domain in DOMAIN_BLOCKLIST: + if domain in url: + skip = True + if skip: + continue + try: + base_url = canon(url) + except UnicodeEncodeError: + continue + + request = { + 'base_url': base_url, + 'ingest_type': 'pdf', + 'link_source': 'oai', + 'link_source_id': obj['oai'].lower(), + 'ingest_request_source': 'metha-bulk', + 'release_stage': release_stage, + 'rel': rel, + 'ext_ids': { + 'doi': doi, + 'oai': obj['oai'].lower(), + }, + 'edit_extra': {}, + } + requests.append(request) + + return requests + +def run(args): + for l in args.json_file: + if not l.strip(): + continue + row = json.loads(l) + + requests = transform(row) or [] + for r in requests: + print("{}".format(json.dumps(r, sort_keys=True))) + +def main(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('json_file', + help="OAI-PMH dump file to use (usually stdin)", + type=argparse.FileType('r')) + subparsers = parser.add_subparsers() + + args = parser.parse_args() + + run(args) + +if __name__ == '__main__': + main() diff --git a/sql/stats/2020-05-03_stats.txt b/sql/stats/2020-05-03_stats.txt new file mode 100644 index 0000000..55f0c1e --- /dev/null +++ b/sql/stats/2020-05-03_stats.txt @@ -0,0 +1,418 @@ + +## SQL Table Sizes + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 42 GB | 41 GB | 82 GB + "public"."grobid_shadow" | 64 GB | 6902 MB | 71 GB + "public"."grobid" | 59 GB | 7604 MB | 66 GB + "public"."file_meta" | 31 GB | 28 GB | 59 GB + "public"."ingest_request" | 19 GB | 20 GB | 39 GB + "public"."ingest_file_result" | 15 GB | 23 GB | 39 GB + "public"."shadow" | 9111 MB | 10204 MB | 19 GB + "public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB + "public"."pdftrio" | 618 MB | 432 MB | 1051 MB + "public"."petabox" | 403 MB | 594 MB | 997 MB + (10 rows) + + Size: 383.93G + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 158059828 | 197346217653010 + (1 row) + + => 158 million, 197 terabytes + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10; + + mimetype | count + -------------------------------+----------- + application/pdf | 157805029 + application/octet-stream | 154348 + application/xml | 42170 + text/html | 18703 + text/plain | 15989 + application/gzip | 6484 + | 6040 + application/postscript | 4912 + application/vnd.ms-powerpoint | 1672 + application/msword | 921 + (10 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + --------- + 1027125 + (1 row) + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + + unique_sha1 
| total + -------------+----------- + 92936564 | 111022039 + (1 row) + + => 110 million rows, 92.9 million files + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 25; + + mimetype | count + ---------------------------------------------------------------------------------------------------------+----------- + application/pdf | 104178718 + warc/revisit | 5274410 + text/xml | 519042 + text/html | 295523 + application/octet-stream | 259681 + unk | 138930 + application/postscript | 81065 + application/save | 80765 + binary/octet-stream | 59804 + application/x-download | 27083 + text/plain | 26938 + application/download | 25125 + image/pdf | 16095 + application/force-download | 9004 + application/x-msdownload | 3711 + application | 2934 + application/x-octetstream | 2926 + multipart/form-data | 2741 + application/x-pdf | 2444 + .pdf | 2368 + application/binary | 1268 + application/pdf' | 1192 + pdf | 1113 + file/unknown | 1086 + application/unknown | 761 + file | 753 + application/blob | 670 + application/octetstream | 657 + text/pdf | 549 + 0 | 417 + ('application/pdf', | 349 + application/http;msgtype=response | 251 + application/doc | 180 + [...] (wasn't LIMIT 25) + +Processed or not: + + # TODO: + +## GROBID + +Counts: + + SELECT COUNT(DISTINCT fatcat_release) AS unique_releases, COUNT(*) AS total FROM grobid; + + + unique_releases | total + -----------------+---------- + 17455441 | 92707544 + (1 row) + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10; + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10; + + grobid_version | count + ----------------+---------- + 0.5.5-fatcat | 71057023 + | 14638425 + (2 rows) + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status = 'success' GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10; + + grobid_version | count + ----------------+---------- + 0.5.5-fatcat | 71057074 + | 3 + (2 rows) + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | unpaywall | 26244088 + pdf | mag | 25596658 + pdf | doi | 15652966 + pdf | pmc | 2043646 + pdf | arxiv | 721902 + pdf | cnki_covid19 | 2034 + pdf | wanfang_covid19 | 975 + pdf | spn | 103 + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | unpaywall | unpaywall | 26244088 + pdf | mag | mag-corpus | 25596658 + pdf | doi | fatcat-ingest | 8267308 + pdf | doi | fatcat-changelog | 3869772 + pdf | doi | fatcat-ingest-container | 3515873 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 630719 + pdf | arxiv | fatcat-changelog | 91157 + pdf | pmc | fatcat-ingest | 10195 + pdf | pmc | fatcat-changelog | 4626 + pdf | cnki_covid19 | scrape-covid19 | 2034 + pdf | wanfang_covid19 | scrape-covid19 | 975 + pdf | spn | savepapernow-web | 103 + pdf | arxiv | 
fatcat-ingest-container | 26 + pdf | doi | savepapernow-web | 15 + (15 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-------------+------- + pdf | mag | 47 + pdf | unpaywall | 1 + (2 rows) + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 25; + + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | unpaywall | 26244088 | 19968092 | 0.761 + pdf | mag | 25596658 | 18712912 | 0.731 + pdf | doi | 15653166 | 2878833 | 0.184 + pdf | pmc | 2043646 | 1279529 | 0.626 + pdf | arxiv | 721902 | 592394 | 0.821 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + pdf | spn | 103 | 82 | 0.796 + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25; + + ingest_type | status | count + -------------+-------------------------------------+---------- + pdf | success | 37449502 + pdf | no-pdf-link | 10908442 + pdf | no-capture | 5643670 + pdf | redirect-loop | 4823502 + pdf | terminal-bad-status | 1715056 + pdf | link-loop | 1425072 + pdf | cdx-error | 535365 + pdf | gateway-timeout | 267654 + pdf | skip-url-blocklist | 220433 + pdf | wrong-mimetype | 189804 + pdf | spn2-cdx-lookup-failure | 103926 + pdf | spn-error | 101777 + pdf | wayback-error | 93517 + pdf | null-body | 87279 + pdf | invalid-host-resolution | 35305 + pdf | spn-remote-error | 28888 + pdf | petabox-error | 12406 + pdf | spn2-error | 2905 + pdf | spn2-error:job-failed | 2307 + pdf | other-mimetype | 2305 + pdf | redirects-exceeded | 745 + pdf | spn2-error:proxy-error | 438 + pdf | spn2-error:invalid-url-syntax | 406 + pdf | spn2-error:soft-time-limit-exceeded | 405 + pdf | spn2-error:browser-running-error | 274 + (25 rows) + +Failures by domain: + + SELECT ingest_type, domain, status, COUNT((ingest_type, domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type as ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY ingest_type, domain, status + ORDER BY COUNT DESC + LIMIT 30; + + + ingest_type | domain | status | count + -------------+---------------------------------------+---------------------+-------- + pdf | ssl.fao.org | 
no-pdf-link | 862277 + pdf | www.researchgate.net | redirect-loop | 749094 + pdf | www.e-periodica.ch | no-pdf-link | 747370 + pdf | ieeexplore.ieee.org | redirect-loop | 707482 + pdf | plutof.ut.ee | no-pdf-link | 685341 + pdf | www.gbif.org | no-pdf-link | 670905 + pdf | dlc.library.columbia.edu | no-pdf-link | 508281 + pdf | figshare.com | no-pdf-link | 400501 + pdf | onlinelibrary.wiley.com | no-pdf-link | 399187 + pdf | watermark.silverchair.com | terminal-bad-status | 357188 + pdf | www.die-bonn.de | redirect-loop | 352903 + pdf | academic.oup.com | no-pdf-link | 346828 + pdf | iopscience.iop.org | terminal-bad-status | 345147 + pdf | linkinghub.elsevier.com | no-capture | 328434 + pdf | statisticaldatasets.data-planet.com | no-pdf-link | 312206 + pdf | cyberleninka.ru | link-loop | 309525 + pdf | www.tandfonline.com | no-pdf-link | 309146 + pdf | dialnet.unirioja.es | terminal-bad-status | 307572 + pdf | doi.pangaea.de | no-pdf-link | 304924 + pdf | journals.sagepub.com | no-pdf-link | 285774 + pdf | papers.ssrn.com | link-loop | 282415 + pdf | dialnet.unirioja.es | redirect-loop | 274476 + pdf | ieeexplore.ieee.org | link-loop | 273607 + pdf | catalog.paradisec.org.au | redirect-loop | 234653 + pdf | www.plate-archive.org | no-pdf-link | 209217 + pdf | zenodo.org | no-pdf-link | 200078 + pdf | zenodo.org | no-capture | 199025 + pdf | spectradspace.lib.imperial.ac.uk:8443 | no-pdf-link | 187084 + pdf | digi.ub.uni-heidelberg.de | no-pdf-link | 187039 + pdf | validate.perfdrive.com | no-pdf-link | 180191 + (30 rows) + +Success by domain: + + SELECT ingest_type, domain, status, COUNT((ingest_type, domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type as ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + ) t1 + WHERE t1.domain != '' + AND t1.status = 'success' + GROUP BY ingest_type, domain, status + ORDER BY COUNT DESC + LIMIT 30; + + ingest_type | domain | status | count + -------------+----------------------------+---------+--------- + pdf | www.jstage.jst.go.jp | success | 2244620 + pdf | europepmc.org | success | 1284770 + pdf | link.springer.com | success | 1017998 + pdf | www.scielo.br | success | 799577 + pdf | arxiv.org | success | 592622 + pdf | downloads.hindawi.com | success | 527278 + pdf | res.mdpi.com | success | 501093 + pdf | hal.archives-ouvertes.fr | success | 447877 + pdf | digital.library.unt.edu | success | 404460 + pdf | www.cambridge.org | success | 394666 + pdf | dergipark.org.tr | success | 373706 + pdf | journals.plos.org | success | 296994 + pdf | watermark.silverchair.com | success | 275562 + pdf | www.nature.com | success | 263836 + pdf | cds.cern.ch | success | 223057 + pdf | www.pnas.org | success | 220488 + pdf | s3-eu-west-1.amazonaws.com | success | 214558 + pdf | www.jbc.org | success | 205277 + pdf | www.redalyc.org | success | 193591 + pdf | iopscience.iop.org | success | 175796 + pdf | apps.dtic.mil | success | 170589 + pdf | zenodo.org | success | 167812 + pdf | peerj.com | success | 155620 + pdf | www.biorxiv.org | success | 149337 + pdf | 210.101.116.28 | success | 145706 + pdf | www.teses.usp.br | success | 145438 + pdf | absimage.aps.org | success | 144400 + pdf | hrcak.srce.hr | success | 134669 + pdf | www.erudit.org | success | 131771 + pdf | babel.hathitrust.org | success | 130645 
+    (30 rows)
+
+
+## Fatcat Files
+
+Count of PDF files that GROBID processed and matched to a release (via
+glutton), but no PDF in `fatcat_file`:
+
+    SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count
+    FROM grobid
+    LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+    WHERE fatcat_file.sha1hex IS NULL
+    AND grobid.fatcat_release IS NOT NULL;
+
+    => NOT RUN, fatcat_file table is way out of date