diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2020-02-24 21:01:25 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-02-24 21:01:25 -0800 | 
| commit | 1b74e8a4dee21bd260040dad8072e4fb48456b3c (patch) | |
| tree | 36a5c8f263ff7a1637754816c0d76a226e77532d /sql | |
| parent | be8f1d134681caaa15485246b65551a67e5bd5a5 (diff) | |
| download | sandcrawler-1b74e8a4dee21bd260040dad8072e4fb48456b3c.tar.gz sandcrawler-1b74e8a4dee21bd260040dad8072e4fb48456b3c.zip  | |
recent sandcrawler-db / ingest stats (interesting)
Diffstat (limited to 'sql')
| -rw-r--r-- | sql/stats/2020-02-24_stats.txt | 482 | ||||
| -rw-r--r-- | sql/stats/README.md | 6 | 
2 files changed, 488 insertions, 0 deletions
diff --git a/sql/stats/2020-02-24_stats.txt b/sql/stats/2020-02-24_stats.txt new file mode 100644 index 0000000..e7a00e8 --- /dev/null +++ b/sql/stats/2020-02-24_stats.txt @@ -0,0 +1,482 @@ + +## SQL Table Sizes + +    SELECT +        table_name, +        pg_size_pretty(table_size) AS table_size, +        pg_size_pretty(indexes_size) AS indexes_size, +        pg_size_pretty(total_size) AS total_size +      FROM ( +          SELECT +              table_name, +              pg_table_size(table_name) AS table_size, +              pg_indexes_size(table_name) AS indexes_size, +              pg_total_relation_size(table_name) AS total_size +          FROM ( +              SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name +              FROM information_schema.tables +              WHERE table_schema = 'public' +          ) AS all_tables +          ORDER BY total_size DESC +      ) AS pretty_sizes; + + +    Size:  271.83G + +              table_name           | table_size | indexes_size | total_size  +    -------------------------------+------------+--------------+------------ +     "public"."cdx"                | 42 GB      | 36 GB        | 78 GB +     "public"."grobid_shadow"      | 61 GB      | 6553 MB      | 68 GB +     "public"."grobid"             | 47 GB      | 7213 MB      | 54 GB +     "public"."file_meta"          | 26 GB      | 12 GB        | 38 GB +     "public"."shadow"             | 8303 MB    | 9216 MB      | 17 GB +     "public"."fatcat_file"        | 5206 MB    | 2094 MB      | 7300 MB +     "public"."ingest_file_result" | 1831 MB    | 2454 MB      | 4285 MB +     "public"."ingest_request"     | 2006 MB    | 2122 MB      | 4128 MB +     "public"."petabox"            | 403 MB     | 594 MB       | 997 MB +     "public"."pdftrio"            | 78 MB      | 64 MB        | 142 MB +    (10 rows) + + +## File Metadata + +(skipping, no update) + + +## CDX + +Total and unique-by-sha1 counts: + +    SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + +Processed or not: + +    # TODO: + +## GROBID + +Counts: + +    SELECT COUNT(DISTINCT fatcat_release) AS unique_releases, COUNT(*) AS total  FROM grobid; + +     unique_releases |  total +    -----------------+---------- +          15,632,810 | 76,555,791 +    (1 row) + +Status? + +    SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10; + +     status_code |  count    +    -------------+---------- +             200 | 70656028 +             500 |  5896836 +              -4 |     2295 +             503 |      111 +    (4 rows) +  + What version used? +  +    SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10; + +     grobid_version |  count +    ----------------+---------- +     0.5.5-fatcat   | 56001631 +                    | 14654496 +    (2 rows) + +## Petabox + +Counts: + +    SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + +     unique_sha1 |  total +    -------------+--------- +       2,868,825 | 2,887,834 +    (1 row) + +## Ingests + +Requests by source: + +    SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + +     ingest_type | link_source |  count   +    -------------+-------------+--------- +     pdf         | doi         | 6591633 +     pdf         | pmc         | 2030279 +     pdf         | arxiv       |  630743 +     pdf         | unpaywall   |    1400 +     pdf         | spn         |      82 +     pdf         | pubmed      |       2 +    (6 rows) + +    SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25; + +     ingest_type | link_source |  ingest_request_source  |  count +    -------------+-------------+-------------------------+--------- +     pdf         | doi         | fatcat-ingest-container | 3515873 +     pdf         | doi         |                         | 2943896 +     pdf         | pmc         | fatcat-ingest-container | 2028825 +     pdf         | arxiv       |                         |  629719 +     pdf         | doi         | fatcat-changelog        |  129932 +     pdf         | doi         | fatcat-ingest           |    1935 +     pdf         | pmc         |                         |    1454 +     pdf         | unpaywall   | unpaywall               |    1400 +     pdf         | arxiv       | fatcat-ingest           |     998 +     pdf         | spn         |                         |      64 +     pdf         | arxiv       | fatcat-ingest-container |      26 +     pdf         | spn         | savepapernow-web        |      18 +     pdf         | pubmed      |                         |       2 +     pdf         | doi         | savepapernow-web        |       1 +    (14 rows) + +    SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'doi' AND ingest_request_source IS NULL; +    UPDATE ingest_request SET ingest_request_source = 'fatcat-changelog' WHERE ingest_type = 'pdf' AND link_source = 'doi' AND ingest_request_source IS NULL; +    => UPDATE 2943896 + +    SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'spn' AND ingest_request_source IS NULL; +    UPDATE ingest_request SET ingest_request_source = 'savepapernow-web' WHERE ingest_type = 'pdf' AND link_source = 'spn' AND ingest_request_source IS NULL; +    => UPDATE 64 + +    SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'arxiv' AND ingest_request_source IS NULL; +    UPDATE ingest_request SET ingest_request_source = 'fatcat-ingest' WHERE ingest_type = 'pdf' AND link_source = 'arxiv' AND ingest_request_source IS NULL; +    => UPDATE 629719 + +    SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'pmc' AND ingest_request_source IS NULL; +    UPDATE ingest_request SET ingest_request_source = 'fatcat-ingest' WHERE ingest_type = 'pdf' AND link_source = 'pmc' AND ingest_request_source IS NULL; +    => UPDATE 1454 + +    SELECT count(*) FROM ingest_request WHERE link_source = 'pubmed'; +    DELETE FROM ingest_request WHERE link_source = 'pubmed'; +    => DELETE 2 + +    SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + +         ingest_type | link_source |  count   +    -------------+-------------+--------- +     pdf         | doi         | 6591637 +     pdf         | pmc         | 2030279 +     pdf         | arxiv       |  630743 +     pdf         | unpaywall   |    1400 +     pdf         | spn         |      82 +    (5 rows) + +    SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25; + +     ingest_type | link_source |  ingest_request_source  |  count   +    -------------+-------------+-------------------------+--------- +     pdf         | doi         | fatcat-ingest-container | 3515873 +     pdf         | doi         | fatcat-changelog        | 3073828 +     pdf         | pmc         | fatcat-ingest-container | 2028825 +     pdf         | arxiv       | fatcat-ingest           |  630717 +     pdf         | doi         | fatcat-ingest           |    1935 +     pdf         | pmc         | fatcat-ingest           |    1454 +     pdf         | unpaywall   | unpaywall               |    1400 +     pdf         | spn         | savepapernow-web        |      82 +     pdf         | arxiv       | fatcat-ingest-container |      26 +     pdf         | doi         | savepapernow-web        |       1 +    (10 rows) + +Uncrawled requests by source: + +    # TODO: verify this? +    SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) +    FROM ingest_request +    LEFT JOIN ingest_file_result +        ON ingest_request.base_url = ingest_file_result.base_url +        AND ingest_request.ingest_type = ingest_file_result.ingest_type +    WHERE ingest_file_result.base_url IS NULL +    GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25; + +    none? + +Results by source: + +    SELECT +        ingest_request.ingest_type, +        ingest_request.link_source, +        COUNT(*) as attempts, +        COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,  +        ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction +    FROM ingest_request +    LEFT JOIN ingest_file_result +        ON ingest_request.base_url = ingest_file_result.base_url +        AND ingest_request.ingest_type = ingest_file_result.ingest_type +        AND ingest_file_result.ingest_type IS NOT NULL +    GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 25; + + +     ingest_type | link_source | attempts |  hits   | fraction  +    -------------+-------------+----------+---------+---------- +     pdf         | doi         |  6591637 | 1622702 |    0.246 +     pdf         | pmc         |  2030279 | 1241836 |    0.612 +     pdf         | arxiv       |   630743 |  500620 |    0.794 +     pdf         | unpaywall   |     1400 |     851 |    0.608 +     pdf         | spn         |       82 |      62 |    0.756 +    (5 rows) + +Ingest result by status: + +    SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; + +     ingest_type |               status                |  count   +    -------------+-------------------------------------+--------- +     pdf         | success                             | 3366189 +     pdf         | no-pdf-link                         | 2902620 +     pdf         | no-capture                          | 1672025 +     pdf         | redirect-loop                       |  388844 +     pdf         | cdx-error                           |  272780 +     pdf         | terminal-bad-status                 |  171878 +     pdf         | spn-remote-error                    |  163843 +     pdf         | spn-error                           |  108070 +     pdf         | null-body                           |   66778 +     pdf         | link-loop                           |   43403 +     pdf         | skip-url-blocklist                  |   34705 +     pdf         | wrong-mimetype                      |   31343 +     pdf         | wayback-error                       |   13012 +     pdf         | spn2-cdx-lookup-failure             |    6100 +     pdf         | gateway-timeout                     |    5633 +     pdf         | other-mimetype                      |    5114 +     pdf         | spn2-error:proxy-error              |     538 +     pdf         | spn2-error:job-failed               |     470 +     pdf         | petabox-error                       |     415 +     pdf         | spn2-error:browser-running-error    |     136 +     pdf         | spn2-error                          |     127 +     pdf         | spn2-error:soft-time-limit-exceeded |      71 +     pdf         | bad-redirect                        |      39 +     pdf         | spn2-error:unknown                  |      30 +     pdf         | spn2-error:browsing-timeout         |      25 +     pdf         | pending                             |       3 +     pdf         | invalid-host-resolution             |       1 +    (27 rows) + + +## Fatcat Files + +(skipping, no update) + +## Recent Success/Failure of Ingest by Domain + +NOTE: just finished a bunch of "backfill" ingest from OA-DOI crawl; only a +small fraction of this is from changelog. + +    # "problem domains" and statuses +    SELECT domain, status, COUNT((domain, status))      +    FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 +    WHERE t1.domain != '' +        AND t1.status != 'success' +        AND t1.updated >= NOW() - '1 day'::INTERVAL     +    GROUP BY domain, status +    ORDER BY COUNT DESC +    LIMIT 10; + +             domain          |     status     | count +    -------------------------+----------------+------- +     linkinghub.elsevier.com | no-capture     |  2579 +     www.mdpi.com            | wrong-mimetype |  1313 +     onlinelibrary.wiley.com | no-pdf-link    |   785 +     americanarchivist.org   | no-pdf-link    |   756 +     journals.sagepub.com    | redirect-loop  |   503 +     link.springer.com       | redirect-loop  |   432 +     iopscience.iop.org      | no-capture     |   392 +     www.tandfonline.com     | no-pdf-link    |   389 +     pubs.rsc.org            | no-capture     |   361 +     www.persee.fr           | no-capture     |   344 +    (10 rows) + + +    # "what type of errors" +    SELECT ingest_type, status, COUNT(*) +    FROM ingest_file_result +    WHERE updated >= NOW() - '1 day'::INTERVAL +    GROUP BY ingest_type, status +    ORDER BY COUNT DESC +    LIMIT 25;     + +     ingest_type |               status                | count  +    -------------+-------------------------------------+------- +     pdf         | success                             | 40578 +     pdf         | cdx-error                           | 14982 +     pdf         | no-capture                          |  7747 +     pdf         | no-pdf-link                         |  7111 +     pdf         | redirect-loop                       |  3265 +     pdf         | wrong-mimetype                      |  1629 +     pdf         | spn2-cdx-lookup-failure             |   657 +     pdf         | link-loop                           |   538 +     pdf         | null-body                           |   517 +     pdf         | terminal-bad-status                 |   400 +     pdf         | wayback-error                       |    79 +     pdf         | spn2-error:job-failed               |    53 +     pdf         | gateway-timeout                     |    38 +     pdf         | spn2-error:soft-time-limit-exceeded |     7 +     pdf         | spn2-error                          |     6 +     pdf         | petabox-error                       |     5 +     pdf         | spn2-error:browsing-timeout         |     4 +     pdf         | spn2-error:unknown                  |     2 +     pdf         | bad-redirect                        |     1 +     pdf         | pending                             |     1 +    (20 rows) + +    # "throughput per day for last N days" +    SELECT ingest_type, +           date(updated), +           COUNT(*) as total, +           COUNT(CASE status WHEN 'success' THEN 1 ELSE null END) as success +    FROM ingest_file_result +    WHERE updated >= NOW() - '1 month'::INTERVAL +    GROUP BY ingest_type, date(updated) +    ORDER BY date(updated) DESC; + +     ingest_type |    date    |  total  | success  +    -------------+------------+---------+--------- +     pdf         | 2020-02-25 |   32660 |   14322 +     pdf         | 2020-02-24 |   44967 |   26263 +     pdf         | 2020-02-23 |   58795 |   18874 +     pdf         | 2020-02-22 |  844249 |  272606 +     pdf         | 2020-02-21 | 1287378 |  433487 +     pdf         | 2020-02-20 | 1455943 |  492408 +     pdf         | 2020-02-19 |   21453 |    7529 +     pdf         | 2020-02-18 |    5863 |    2926 +     pdf         | 2020-02-17 |    3737 |     970 +     pdf         | 2020-02-16 |   13779 |    4862 +     pdf         | 2020-02-15 | 1021020 |  623020 +     pdf         | 2020-02-14 | 1036036 |  632830 +     pdf         | 2020-02-13 |   13503 |    5824 +     pdf         | 2020-02-12 |   20078 |   11422 +     pdf         | 2020-02-11 |   13499 |    6781 +     pdf         | 2020-02-10 |    2275 |     961 +     pdf         | 2020-02-09 |    3231 |    1494 +     pdf         | 2020-02-08 |    8967 |    4400 +     pdf         | 2020-02-07 |    7022 |    2430 +     pdf         | 2020-02-06 |    1291 |     516 +     pdf         | 2020-02-05 |    8586 |    6596 +     pdf         | 2020-02-04 |    3681 |    3593 +     pdf         | 2020-02-03 |     284 |     284 +     pdf         | 2020-02-02 |     480 |     480 +     pdf         | 2020-02-01 |     489 |     336 +     pdf         | 2020-01-31 |    1187 |    1130 +     pdf         | 2020-01-30 |    1613 |    1288 +     pdf         | 2020-01-29 |     947 |     279 +     pdf         | 2020-01-28 |     667 |     323 +    (29 rows) + +Top "no-capture" domains (will need to re-ingest using live tool): + +    SELECT domain, status, COUNT((domain, status))      +    FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 +    WHERE t1.domain != '' +        AND t1.status = 'no-capture' +    GROUP BY domain, status +    ORDER BY COUNT DESC +    LIMIT 25; + +              domain          |   status   | count   +    --------------------------+------------+-------- +     linkinghub.elsevier.com  | no-capture | 320065 +     iopscience.iop.org       | no-capture |  46858 +     pubs.rsc.org             | no-capture |  43331 +     www.persee.fr            | no-capture |  38971 +     www.doiserbia.nb.rs      | no-capture |  27112 +     academic.oup.com         | no-capture |  18877 +     www.osapublishing.org    | no-capture |  17113 +     osf.io                   | no-capture |  16978 +     scripts.iucr.org         | no-capture |  14844 +     www.degruyter.com        | no-capture |   8093 +     mab-online.nl            | no-capture |   6603 +     insights.ovid.com        | no-capture |   6457 +     ir.lib.uth.gr            | no-capture |   3625 +     www.sciencedirect.com    | no-capture |   3244 +     www.tandfonline.com      | no-capture |   3201 +     www.ccsenet.org          | no-capture |   2849 +     www.intechopen.com       | no-capture |   2813 +     primary-hospital-care.ch | no-capture |   2774 +     www.nature.com           | no-capture |   2484 +     www.indianjournals.com   | no-capture |   2432 +     journals.aps.org         | no-capture |   2197 +     journals.sagepub.com     | no-capture |   2064 +     www.episodes.org         | no-capture |   1805 +     periodicos.uninove.br    | no-capture |   1692 +     escholarship.org         | no-capture |   1666 +    (25 rows) + +Top "no-pdf-link" domains: + +    SELECT domain, status, COUNT((domain, status))      +    FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 +    WHERE t1.domain != '' +        AND t1.status = 'no-pdf-link' +    GROUP BY domain, status +    ORDER BY COUNT DESC +    LIMIT 25; + +               domain            |   status    | count   +    -----------------------------+-------------+-------- +     plutof.ut.ee                | no-pdf-link | 685315 +     www.gbif.org                | no-pdf-link | 670647 +     doi.pangaea.de              | no-pdf-link | 301984 +     www.plate-archive.org       | no-pdf-link | 209218 +     onlinelibrary.wiley.com     | no-pdf-link |  84890 +     figshare.com                | no-pdf-link |  72892 +     zenodo.org                  | no-pdf-link |  45768 +     www.tandfonline.com         | no-pdf-link |  43848 +     data.mendeley.com           | no-pdf-link |  42367 +     springernature.figshare.com | no-pdf-link |  35941 +     dhz.uni-passau.de           | no-pdf-link |  29187 +     www.frontiersin.org         | no-pdf-link |  17925 +     digital.ucd.ie              | no-pdf-link |  16769 +     mr.crossref.org             | no-pdf-link |  14999 +     journals.lww.com            | no-pdf-link |  12122 +     musewide.aip.de             | no-pdf-link |  10854 +     datadryad.org               | no-pdf-link |  10686 +     www.jstor.org               | no-pdf-link |   9159 +     koreascience.or.kr          | no-pdf-link |   9067 +     easy.dans.knaw.nl           | no-pdf-link |   8264 +     scielo.conicyt.cl           | no-pdf-link |   8069 +     www.degruyter.com           | no-pdf-link |   7989 +     www.kci.go.kr               | no-pdf-link |   6990 +     www.m-hikari.com            | no-pdf-link |   6941 +     cshprotocols.cshlp.org      | no-pdf-link |   6553 +    (25 rows) + +Top block-ish domains: + +    SELECT domain, status, COUNT((domain, status))      +    FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 +    WHERE t1.domain != '' +        AND (t1.status = 'redirect-loop' OR t1.status = 'link-loop' OR t1.status = 'terminal-bad-status') +    GROUP BY domain, status +    ORDER BY COUNT DESC +    LIMIT 25; + +                 domain              |       status        | count  +    ---------------------------------+---------------------+------- +     journals.openedition.org        | redirect-loop       | 30395 +     ieeexplore.ieee.org             | redirect-loop       | 28926 +     www.degruyter.com               | redirect-loop       | 18891 +     www.cairn.info                  | link-loop           |  8919 +     www.frontiersin.org             | terminal-bad-status |  6786 +     projecteuclid.org               | link-loop           |  6098 +     www.mdpi.com                    | terminal-bad-status |  5189 +     medicalforum.ch                 | terminal-bad-status |  4596 +     jrnl.nau.edu.ua                 | link-loop           |  4238 +     www.revistas.unam.mx            | link-loop           |  3926 +     journals.aps.org                | redirect-loop       |  3696 +     www.ijcseonline.org             | redirect-loop       |  3567 +     www.researchsquare.com          | terminal-bad-status |  3453 +     www.persee.fr                   | terminal-bad-status |  3221 +     www.baltistica.lt               | link-loop           |  2098 +     osf.io                          | redirect-loop       |  2004 +     seer.ufrgs.br                   | terminal-bad-status |  2002 +     jtd.amegroups.com               | link-loop           |  1738 +     www.hindawi.com                 | terminal-bad-status |  1613 +     linkinghub.elsevier.com         | redirect-loop       |  1612 +     www.scienceopen.com             | terminal-bad-status |  1580 +     atm.amegroups.com               | link-loop           |  1571 +     scielo.conicyt.cl               | terminal-bad-status |  1491 +     repozytorium.ur.edu.pl          | redirect-loop       |  1279 +     agupubs.onlinelibrary.wiley.com | link-loop           |  1182 +    (25 rows) + diff --git a/sql/stats/README.md b/sql/stats/README.md index ea61fa0..52642f6 100644 --- a/sql/stats/README.md +++ b/sql/stats/README.md @@ -41,6 +41,10 @@ Total and unique-by-sha1 counts:      SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; +mimetype counts: + +    SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC; +  Processed or not:      # TODO: @@ -71,6 +75,8 @@ Requests by source:      SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; +    SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25; +  Uncrawled requests by source:      # TODO: verify this?  | 
