1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
|
## SQL Table Sizes
table_name | table_size | indexes_size | total_size
-------------------------------+------------+--------------+------------
"public"."cdx" | 42 GB | 36 GB | 78 GB
"public"."grobid" | 38 GB | 7076 MB | 45 GB
"public"."file_meta" | 23 GB | 11 GB | 34 GB
"public"."shadow" | 8303 MB | 9216 MB | 17 GB
"public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB
"public"."ingest_file_result" | 566 MB | 749 MB | 1314 MB
"public"."petabox" | 403 MB | 594 MB | 997 MB
"public"."ingest_request" | 363 MB | 625 MB | 988 MB
## File Metadata
Counts and total file size:
SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
total_count | total_size
-------------+-----------------
118823340 | 140917467253923
(1 row)
# 118,823,340 => 118 million
# 140,917,467,253,923 => ~141 TByte
Top mimetypes:
SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10;
mimetype | count
-------------------------------+-----------
application/pdf | 117185567
| 1509149
application/octet-stream | 87783
text/html | 9901
application/postscript | 3781
application/vnd.ms-powerpoint | 1421
text/plain | 1151
application/xml | 427
application/gzip | 414
application/msword | 314
(10 rows)
## CDX
Total and unique-by-sha1 counts:
SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
unique_sha1 | total
-------------+-----------
96141851 | 110030179
(1 row)
# 96,141,851
# 110,030,179
Top mimetypes (not unique by sha1):
mimetype | count
------------------------+----------
application/pdf | 84582642
text/html | 24841846
text/xml | 524682
application/postscript | 81009
(4 rows)
## GROBID
Counts:
SELECT COUNT(DISTINCT fatcat_release) AS unique_releases, COUNT(*) AS total FROM grobid;
unique_releases | total
-----------------+----------
13675190 | 59919772
# 13,675,190
# 59,919,772
Status?
SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10;
status_code | count
-------------+----------
200 | 57382904
500 | 2536862
503 | 6
(3 rows)
What version used?
SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10;
grobid_version | count
----------------+----------
0.5.5-fatcat | 41699385
| 15683279
(2 rows)
## Petabox
Counts:
SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
unique_sha1 | total
-------------+---------
2868825 | 2887834
(1 row)
# 2,868,825
# 2,887,834
## Ingests
Requests by source:
SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
ingest_type | link_source | count
-------------+-------------+---------
pdf | doi | 2816171
pdf | arxiv | 154448
pdf | spn | 55
pdf | pubmed | 2
(4 rows)
Uncrawled requests by source:
# TODO: verify this?
SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
FROM ingest_request
LEFT JOIN ingest_file_result
ON ingest_request.base_url = ingest_file_result.base_url
AND ingest_request.ingest_type = ingest_file_result.ingest_type
WHERE ingest_file_result.base_url IS NULL
GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25;
ingest_type | link_source | count
-------------+-------------+-------
(0 rows)
Results by source:
SELECT
ingest_request.ingest_type,
ingest_request.link_source,
COUNT(*) as attempts,
COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
FROM ingest_request
LEFT JOIN ingest_file_result
ON ingest_request.base_url = ingest_file_result.base_url
AND ingest_request.ingest_type = ingest_file_result.ingest_type
AND ingest_file_result.ingest_type IS NOT NULL
GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 25;
ingest_type | link_source | attempts | hits | fraction
-------------+-------------+----------+--------+----------
pdf | doi | 2816171 | 289199 | 0.103
pdf | arxiv | 154448 | 41105 | 0.266
pdf | spn | 55 | 46 | 0.836
pdf | pubmed | 2 | 0 | 0.000
(4 rows)
Ingest result by status:
SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25;
ingest_type | status | count
-------------+---------------------+---------
pdf | no-pdf-link | 2213720
pdf | success | 330492
pdf | spn-remote-error | 182157
pdf | spn-error | 141222
pdf | cdx-error | 83131
pdf | link-loop | 11350
pdf | other-mimetype | 6089
pdf | null-body | 1980
pdf | terminal-bad-status | 583
pdf | wayback-error | 381
(10 rows)
|