diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2020-09-15 00:20:57 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-09-15 00:20:57 -0700 | 
| commit | c1f60b1e31c8ff3b25c7cdcd442ccd91512d1db8 (patch) | |
| tree | a5c89b810c74e65a67c66b412daffbb555792016 /sql | |
| parent | c714ecdcd8aa8bb39b1b46860944b6cace7f5077 (diff) | |
| download | sandcrawler-c1f60b1e31c8ff3b25c7cdcd442ccd91512d1db8.tar.gz sandcrawler-c1f60b1e31c8ff3b25c7cdcd442ccd91512d1db8.zip  | |
updated sandcrawler-db stats
Diffstat (limited to 'sql')
| -rw-r--r-- | sql/stats/2020-09-14_stats.txt | 340 | ||||
| -rw-r--r-- | sql/stats/README.md | 12 | 
2 files changed, 346 insertions, 6 deletions
diff --git a/sql/stats/2020-09-14_stats.txt b/sql/stats/2020-09-14_stats.txt new file mode 100644 index 0000000..3bc27b0 --- /dev/null +++ b/sql/stats/2020-09-14_stats.txt @@ -0,0 +1,340 @@ + +## SQL Table Sizes + +    SELECT +        table_name, +        pg_size_pretty(table_size) AS table_size, +        pg_size_pretty(indexes_size) AS indexes_size, +        pg_size_pretty(total_size) AS total_size +      FROM ( +          SELECT +              table_name, +              pg_table_size(table_name) AS table_size, +              pg_indexes_size(table_name) AS indexes_size, +              pg_total_relation_size(table_name) AS total_size +          FROM ( +              SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name +              FROM information_schema.tables +              WHERE table_schema = 'public' +          ) AS all_tables +          ORDER BY total_size DESC +      ) AS pretty_sizes; + + +              table_name           | table_size | indexes_size | total_size  +    -------------------------------+------------+--------------+------------ +     "public"."cdx"                | 44 GB      | 45 GB        | 89 GB +     "public"."grobid"             | 66 GB      | 8127 MB      | 74 GB +     "public"."ingest_request"     | 34 GB      | 40 GB        | 73 GB +     "public"."ingest_file_result" | 28 GB      | 44 GB        | 72 GB +     "public"."grobid_shadow"      | 64 GB      | 6902 MB      | 71 GB +     "public"."file_meta"          | 33 GB      | 30 GB        | 63 GB +     "public"."shadow"             | 9111 MB    | 10204 MB     | 19 GB +     "public"."fatcat_file"        | 12 GB      | 6656 MB      | 18 GB +     "public"."pdf_meta"           | 12 GB      | 2924 MB      | 15 GB +     "public"."pdftrio"            | 618 MB     | 432 MB       | 1051 MB +     "public"."petabox"            | 403 MB     | 594 MB       | 997 MB +    (11 rows) + + +## File Metadata + +Counts and total file size: + +    SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + +     total_count |   total_size +    -------------+----------------- +       167021210 | 221982345333674 +    (1 row) + +Top mimetypes: + +    SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10; + +               mimetype            |   count +    -------------------------------+----------- +     application/pdf               | 166765214 +     application/octet-stream      |    155517 +     application/xml               |     42170 +     text/html                     |     18708 +     text/plain                    |     15990 +     application/gzip              |      6491 +                                   |      6036 +     application/postscript        |      4912 +     application/vnd.ms-powerpoint |      1672 +     application/msword            |       921 +    (10 rows) + +Missing full metadata: + +    SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + +     count +    ------- +    62960 +    (1 row) + + +## CDX + +Total and unique-by-sha1 counts: + +    SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + +     unique_sha1 |   total    +    -------------+----------- +       102123051 | 126550160 +    (1 row) + +mimetype counts: + +    SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 25; + +              mimetype          |   count    +    ----------------------------+----------- +     application/pdf            | 116885565 +     warc/revisit               |   7951816 +     text/xml                   |    519042 +     application/octet-stream   |    327639 +     text/html                  |    295725 +     unk                        |    172491 +     application/postscript     |     81095 +     application/save           |     80900 +     binary/octet-stream        |     61783 +     text/plain                 |     33684 +     image/pdf                  |     32856 +     application/x-download     |     32418 +     application/download       |     27672 +     application/force-download |     10892 +     multipart/form-data        |      5750 +     application/x-msdownload   |      3832 +     application/x-octetstream  |      3516 +     application                |      3499 +     .pdf                       |      3038 +     application/x-pdf          |      2701 +     application/binary         |      1322 +     pdf                        |      1232 +     file/unknown               |      1199 +     application/pdf'           |      1192 +     file                       |       979 +    (25 rows) + +## GROBID + +Counts: + +    SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid; + +     total_files | unique_releases +    -------------+----------------- +       101494314 |        18919012 +    (1 row) + +Status? + +    SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10; + +     status_code |  count +    -------------+---------- +             200 | 93730358 +             500 |  7759103 +              -4 |     4683 +             503 |      150 +    (4 rows) + +What version used? + +    SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10; + +     grobid_version |  count +    ----------------+---------- +     0.5.5-fatcat   | 80838234 +                    | 12892145 +    (2 rows) + +## Petabox + +Counts: + +    SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; +unique_sha1 |  total +    -------------+--------- +         2868825 | 2887834 +    (1 row) + +## Ingests + +Requests by source: + +    SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + +     ingest_type |   link_source   |  count +    -------------+-----------------+---------- +     pdf         | oai             | 51185088 +     pdf         | mag             | 35015357 +     pdf         | unpaywall       | 27653003 +     pdf         | doi             | 17362763 +     pdf         | pmc             |  2248854 +     pdf         | arxiv           |   835400 +     pdf         | cnki_covid19    |     2034 +     pdf         | wanfang_covid19 |      975 +     pdf         | spn             |      197 +    (9 rows) + +    SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25; + +     ingest_type |   link_source   |  ingest_request_source  |  count    +    -------------+-----------------+-------------------------+---------- +     pdf         | oai             | metha-bulk              | 51185088 +     pdf         | mag             | mag-corpus              | 35015357 +     pdf         | unpaywall       | unpaywall               | 27653003 +     pdf         | doi             | fatcat-ingest           |  8399261 +     pdf         | doi             | fatcat-changelog        |  5449349 +     pdf         | doi             | fatcat-ingest-container |  3515873 +     pdf         | pmc             | fatcat-ingest-container |  2028825 +     pdf         | arxiv           | fatcat-ingest           |   634665 +     pdf         | pmc             | fatcat-ingest           |   210453 +     pdf         | arxiv           | fatcat-changelog        |   200707 +     pdf         | pmc             | fatcat-changelog        |     9582 +     pdf         | cnki_covid19    | scrape-covid19          |     2034 +     pdf         | wanfang_covid19 | scrape-covid19          |      975 +     pdf         | spn             | savepapernow-web        |      197 +     pdf         | arxiv           | fatcat-ingest-container |       26 +     pdf         | doi             | savepapernow-web        |       21 +     pdf         | arxiv           | savepapernow-web        |        2 +    (17 rows) + +Uncrawled requests by source: + +    # TODO: verify this? +    SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) +    FROM ingest_request +    LEFT JOIN ingest_file_result +        ON ingest_request.base_url = ingest_file_result.base_url +        AND ingest_request.ingest_type = ingest_file_result.ingest_type +    WHERE ingest_file_result.base_url IS NULL +    GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25; + +     ingest_type | link_source | count +    -------------+-------------+-------- +     pdf         | mag         | 170304 +     pdf         | oai         |  15287 +     pdf         | unpaywall   |      1 +    (3 rows) + +Results by source: + +    SELECT +        ingest_request.ingest_type, +        ingest_request.link_source, +        COUNT(*) as attempts, +        COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,  +        ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction +    FROM ingest_request +    LEFT JOIN ingest_file_result +        ON ingest_request.base_url = ingest_file_result.base_url +        AND ingest_request.ingest_type = ingest_file_result.ingest_type +        AND ingest_file_result.ingest_type IS NOT NULL +    GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + +     ingest_type |   link_source   | attempts |   hits   | fraction +    -------------+-----------------+----------+----------+---------- +     pdf         | oai             | 51185088 | 14144314 |    0.276 +     pdf         | mag             | 35015357 | 24811947 |    0.709 +     pdf         | unpaywall       | 27653003 | 22302629 |    0.807 +     pdf         | doi             | 17363369 |  3533568 |    0.204 +     pdf         | pmc             |  2248860 |  1713197 |    0.762 +     pdf         | arxiv           |   835400 |   685219 |    0.820 +     pdf         | cnki_covid19    |     2034 |        0 |    0.000 +     pdf         | wanfang_covid19 |      975 |      764 |    0.784 +     pdf         | spn             |      197 |      138 |    0.701 +    (9 rows) + +Ingest result by status: + +    SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25; + +     ingest_type |               status                |  count +    -------------+-------------------------------------+---------- +     pdf         | success                             | 58265365 +     pdf         | no-pdf-link                         | 27216435 +     pdf         | no-capture                          | 21982611 +     pdf         | redirect-loop                       |  8457469 +     pdf         | terminal-bad-status                 |  2695023 +     pdf         | link-loop                           |  2209672 +     pdf         | wrong-mimetype                      |   767508 +     pdf         | gateway-timeout                     |   548870 +     pdf         | cdx-error                           |   391611 +     pdf         | skip-url-blocklist                  |   220661 +     pdf         | null-body                           |   182215 +     pdf         | wayback-error                       |   146869 +     pdf         | spn2-cdx-lookup-failure             |   107229 +     pdf         | spn-error                           |    85128 +     pdf         | invalid-host-resolution             |    37352 +     pdf         | petabox-error                       |    32490 +     pdf         | spn2-error                          |    29212 +     pdf         | spn-remote-error                    |    27927 +     pdf         | other-mimetype                      |     2305 +     pdf         | bad-redirect                        |     1524 +     pdf         | spn2-error:job-failed               |     1521 +     pdf         | timeout                             |      842 +     pdf         | spn2-error:soft-time-limit-exceeded |      793 +     pdf         | redirects-exceeded                  |      748 +     pdf         | spn2-error:invalid-url-syntax       |      417 +    (25 rows) + +Failed ingest by terminal status code: + +    SELECT ingest_type, terminal_status_code, COUNT(*) +    FROM ingest_file_result +    WHERE hit = false +    GROUP BY ingest_type, terminal_status_code +    ORDER BY COUNT DESC +    LIMIT 25; + +     ingest_type | terminal_status_code |  count +    -------------+----------------------+---------- +     pdf         |                  200 | 34064937 +     pdf         |                      | 20514531 +     pdf         |                  301 |  7271700 +     pdf         |                  302 |   720632 +     pdf         |                  503 |   712697 +     pdf         |                  400 |   444209 +     pdf         |                  404 |   331495 +     pdf         |                  403 |   323030 +     pdf         |                  401 |   259327 +     pdf         |                  500 |   236122 +     pdf         |                  303 |   101609 +     pdf         |                  429 |    47738 +     pdf         |                  502 |    36183 +     pdf         |                  420 |    26603 +     pdf         |                  509 |    15113 +     pdf         |                  409 |    14790 +     pdf         |                  999 |     8996 +     pdf         |                  307 |     3769 +     pdf         |                  308 |     3422 +     pdf         |                  202 |     3228 +     pdf         |                  520 |     2058 +     pdf         |                  410 |     1734 +     pdf         |                  521 |     1033 +     pdf         |                  504 |      868 +     pdf         |                  505 |      424 +    (25 rows) + +## Fatcat Files + +Count of PDF files that GROBID processed and matched to a release (via +glutton), but no PDF in `fatcat_file`: + +    SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count +    FROM grobid +    LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex +    WHERE fatcat_file.sha1hex IS NULL +      AND grobid.fatcat_release IS NOT NULL; + +    total_count | release_count +    -------------+--------------- +        6600758 |       5213294 +    (1 row) + diff --git a/sql/stats/README.md b/sql/stats/README.md index 89deec2..2e9eae5 100644 --- a/sql/stats/README.md +++ b/sql/stats/README.md @@ -29,7 +29,7 @@ Counts and total file size:  Top mimetypes: -    SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10; +    SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 20;  Missing full metadata: @@ -43,11 +43,7 @@ Total and unique-by-sha1 counts:  mimetype counts: -    SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC; - -Processed or not: - -    # TODO: +    SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 25;  ## GROBID @@ -107,6 +103,10 @@ Ingest result by status:      SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25; +Failed ingest by terminal status code: + +    SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 25; +  ## Fatcat Files  Count of PDF files that GROBID processed and matched to a release (via  | 
