aboutsummaryrefslogtreecommitdiffstats
path: root/notes/bootstrap/import_timing_20190116.txt
blob: 96723ca83b62088ecd1ca923a33a3c6541868a67 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492

## master / eea40c6c713a35e19eb005a322b2075018a32e3e

    sudo service fatcat-api stop
    # as postgres user: diesel database reset
    sudo service elasticsearch stop
    sudo service postgresql restart
    sudo service fatcat-api start
    # reset postgres stats

    time ./fatcat_import.py issn /srv/fatcat/datasets/journal_extra_metadata.csv

    Processed 53300 lines, inserted 53283, updated 0.
    real    1m10.618s
    user    0m8.812s
    sys     0m0.316s


    time parallel --bar --pipepart -j8 -a /srv/fatcat/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py orcid -

    Processed 48900 lines, inserted 48731, updated 0. (times 80x)
    real    17m16.242s
    user    26m45.464s
    sys     1m37.052s

    ~300 TPS
    35% fatcatd CPU
    bunch of python3 around 25-30% CPU. overall only ~12% CPU.
    all disk write. autovacuum in progress (why? is this a naive importer?)

    time zcat /srv/fatcat/datasets/crossref-works.2018-09-05.1mil.json.gz | time parallel -j20 --round-robin --pipe ./fatcat_import.py crossref - /srv/fatcat/datasets/20180216.ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3

    ~200% fatcatd CPU; 75% parallel; then postgres; then python3
    overall 40% user, 6% system
    10808 TPS (!)
    autovacuum on release_rev
    TODO: logs are going to rsyslog and disk (causing disk contention)

    Processed 50450 lines, inserted 46166, updated 0. (unsure how many chunks)
    905112inputs+2125192outputs (9major+2694248minor)pagefaults 0swaps
    real    5m54.368s
    user    33m1.724s
    sys     1m52.404s

    
    # did not run:
    #time zcat /srv/fatcat/datasets/2018-09-23-0405.30-dumpgrobidmetainsertable.longtail_join.filtered.tsv.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py grobid-metadata -


Results:

							  table_name                          | table_size | indexes_size | total_size 
	--------------------------------------------------------------+------------+--------------+------------
	 "public"."creator_rev"                                       | 371 MB     | 457 MB       | 827 MB
	 "public"."creator_edit"                                      | 377 MB     | 420 MB       | 797 MB
	 "public"."creator_ident"                                     | 255 MB     | 412 MB       | 667 MB
	 "public"."release_rev"                                       | 382 MB     | 261 MB       | 643 MB
	 "public"."release_ref"                                       | 437 MB     | 152 MB       | 589 MB
	 "public"."release_contrib"                                   | 154 MB     | 141 MB       | 295 MB
	 "public"."release_edit"                                      | 89 MB      | 99 MB        | 188 MB
	 "public"."work_edit"                                         | 89 MB      | 98 MB        | 187 MB
	 "public"."release_ident"                                     | 60 MB      | 97 MB        | 157 MB
	 "public"."work_ident"                                        | 60 MB      | 97 MB        | 157 MB
	 "public"."work_rev"                                          | 39 MB      | 36 MB        | 75 MB
	 "public"."container_rev"                                     | 16 MB      | 6040 kB      | 22 MB
	 "public"."editgroup"                                         | 8520 kB    | 14 MB        | 22 MB
	 "public"."abstracts"                                         | 19 MB      | 1648 kB      | 21 MB
	 "public"."changelog"                                         | 6632 kB    | 6600 kB      | 13 MB
	 "public"."container_edit"                                    | 5824 kB    | 6208 kB      | 12 MB
	 "public"."container_ident"                                   | 4160 kB    | 6376 kB      | 10 MB
	 "public"."release_rev_abstract"                              | 2448 kB    | 3000 kB      | 5448 kB

		   relname        | too_much_seq |      case      | rel_size  | seq_scan | idx_scan 
	----------------------+--------------+----------------+-----------+----------+----------
	 release_ref          |            6 | Missing Index? | 457768960 |        6 |        0
	 release_rev_abstract |            3 | Missing Index? |   2473984 |        3 |        0
	 release_contrib      |            3 | Missing Index? | 161882112 |        3 |        0
	 release_edit         |       -14934 | OK             |  93069312 |        2 |    14936
	 work_edit            |       -14936 | OK             |  93003776 |        2 |    14938
	 container_edit       |       -14977 | OK             |   5931008 |        2 |    14979
	 creator_edit         |       -14977 | OK             | 395403264 |        2 |    14979
	 abstracts            |       -39370 | OK             |  19357696 |        1 |    39371
	 changelog            |      -108251 | OK             |   6766592 |      627 |   108878
	 container_rev        |      -179447 | OK             |  17104896 |        3 |   179450
	 container_ident      |      -590029 | OK             |   4235264 |        3 |   590032
	 release_ident        |      -929184 | OK             |  62881792 |     5331 |   934515
	 work_rev             |     -1837772 | OK             |  40828928 |        1 |  1837773
	 work_ident           |     -1845253 | OK             |  62873600 |     6980 |  1852233
	 creator_ident        |     -3930575 | OK             | 267001856 |        3 |  3930578
	 editgroup            |     -5848807 | OK             |   8691712 |    31878 |  5880685
	 release_rev          |     -6081392 | OK             | 400916480 |        9 |  6081401
	 creator_rev          |     -7818340 | OK             | 388743168 |        3 |  7818343

	select count(*) from release_ref;
	1701210

	Size:    4.61G


## citation-efficiency / 8a0d963beb2fa6766a7141df39dc322abea1b9a0

    sudo service fatcat-api stop
    # as postgres user: diesel database reset
    sudo service elasticsearch stop
    sudo service postgresql restart
    sudo service fatcat-api start


    time ./fatcat_import.py issn /srv/fatcat/datasets/journal_extra_metadata.csv

	Processed 53300 lines, inserted 53283, updated 0.

	real    1m9.867s
	user    0m8.580s
	sys     0m0.292s


    time parallel --bar --pipepart -j8 -a /srv/fatcat/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py orcid -

	real    17m20.110s
	user    26m56.660s
	sys     1m37.280s


    time zcat /srv/fatcat/datasets/crossref-works.2018-09-05.1mil.json.gz | time parallel -j20 --round-robin --pipe ./fatcat_import.py crossref - /srv/fatcat/datasets/20180216.ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3

	12043 TPS
	225% fatcatd, 33% total usr CPU, 5% sys
	autovacuum in progress

	1916.81user 110.41system 5:23.08elapsed 627%CPU (0avgtext+0avgdata 411056maxresident)k
	47920inputs+2021736outputs (4major+2734186minor)pagefaults 0swaps
	real    5m23.095s
	user    32m10.964s
	sys     1m51.800s

							  table_name                          | table_size | indexes_size | total_size 
	--------------------------------------------------------------+------------+--------------+------------
	 "public"."creator_rev"                                       | 371 MB     | 456 MB       | 827 MB
	 "public"."creator_edit"                                      | 377 MB     | 421 MB       | 798 MB
	 "public"."creator_ident"                                     | 255 MB     | 412 MB       | 667 MB
	 "public"."release_rev"                                       | 385 MB     | 260 MB       | 645 MB
	 "public"."release_contrib"                                   | 154 MB     | 141 MB       | 295 MB
	 "public"."refs_blob"                                         | 197 MB     | 6064 kB      | 203 MB
	 "public"."release_edit"                                      | 89 MB      | 98 MB        | 187 MB
	 "public"."work_edit"                                         | 89 MB      | 98 MB        | 187 MB
	 "public"."work_ident"                                        | 60 MB      | 97 MB        | 157 MB
	 "public"."release_ident"                                     | 60 MB      | 97 MB        | 156 MB
	 "public"."work_rev"                                          | 39 MB      | 36 MB        | 75 MB
	 "public"."container_rev"                                     | 16 MB      | 6056 kB      | 22 MB
	 "public"."editgroup"                                         | 8472 kB    | 13 MB        | 22 MB
	 "public"."abstracts"                                         | 19 MB      | 1672 kB      | 21 MB
	 "public"."changelog"                                         | 6544 kB    | 6592 kB      | 13 MB
	 "public"."container_edit"                                    | 5832 kB    | 6096 kB      | 12 MB
	 "public"."container_ident"                                   | 4120 kB    | 6352 kB      | 10 MB
	 "public"."release_rev_abstract"                              | 2432 kB    | 3040 kB      | 5472 kB

		   relname        | too_much_seq |      case      | rel_size  | seq_scan | idx_scan 
	----------------------+--------------+----------------+-----------+----------+----------
	 release_rev_abstract |            3 | Missing Index? |   2457600 |        3 |        0
	 release_contrib      |            3 | Missing Index? | 161800192 |        3 |        0
	 release_edit         |       -14348 | OK             |  93044736 |        2 |    14350
	 work_edit            |       -14419 | OK             |  93044736 |        2 |    14421
	 creator_edit         |       -14977 | OK             | 395321344 |        2 |    14979
	 container_edit       |       -14977 | OK             |   5939200 |        2 |    14979
	 abstracts            |       -39364 | OK             |  19365888 |        1 |    39365
	 changelog            |      -109498 | OK             |   6676480 |        2 |   109500
	 refs_blob            |      -141113 | OK             |  50651136 |        1 |   141114
	 container_rev        |      -179441 | OK             |  17096704 |        3 |   179444
	 container_ident      |      -589642 | OK             |   4194304 |        3 |   589645
	 release_ident        |      -929175 | OK             |  62840832 |     3548 |   932723
	 work_rev             |     -1836990 | OK             |  40787968 |        1 |  1836991
	 work_ident           |     -1844214 | OK             |  62849024 |     4071 |  1848285
	 creator_ident        |     -3930572 | OK             | 267010048 |        3 |  3930575
	 release_rev          |     -4378297 | OK             | 403906560 |        9 |  4378306
	 editgroup            |     -5911871 | OK             |   8642560 |        3 |  5911874
	 creator_rev          |     -7818337 | OK             | 388710400 |        3 |  7818340

	select count(*) from release_ref;
	7

	select count(*) from refs_blob;
	70169

	Size:    4.23G

Reference storage is more than halved (release_ref table 437 MB -> refs_blob
table 197 MB), and the index shrinks by roughly 25x (152 MB -> 6064 kB), with
about the same (or even faster) insert time. Success!

Raw release JSON (crossref-works.2018-09-05.1mil.json) was the first million
lines (not randomized): 217 MB gzipped, about 2.1 GB uncompressed.

Continuing with a full import...

	time xzcat /srv/fatcat/datasets/crossref-works.2018-09-05.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py crossref - /srv/fatcat/datasets/20180216.ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3

    217758.13user 10299.22system 19:56:34elapsed 317%CPU (0avgtext+0avgdata 437384maxresident)k
    88774752inputs+356480424outputs (573520major+35809039minor)pagefaults 0swaps

    real    1196m34.828s => 20 hours
    user    3713m5.376s
    sys     177m48.364s

       relname        | too_much_seq | case |  rel_size   | seq_scan | idx_scan  
----------------------+--------------+------+-------------+----------+-----------
 release_edit         |       -89909 | OK   |  8641216512 |        2 |     89911
 work_edit            |       -89980 | OK   |  8641134592 |        2 |     89982
 creator_edit         |       -90540 | OK   |   395321344 |        8 |     90548
 container_edit       |       -90544 | OK   |     9494528 |        4 |     90548
 release_contrib      |      -918913 | OK   | 22936829952 |        3 |    918916
 release_rev_abstract |      -919036 | OK   |   271998976 |        3 |    919039
 container_rev        |     -1029775 | OK   |    21168128 |        3 |   1029778
 changelog            |     -1941085 | OK   |   117219328 |        2 |   1941087
 abstracts            |     -4633441 | OK   |  2443132928 |        1 |   4633442
 creator_ident        |     -7165562 | OK   |   267010048 |        3 |   7165565
 creator_rev          |     -9432011 | OK   |   388710400 |        3 |   9432014
 refs_blob            |    -34911929 | OK   | 12710191104 |        2 |  34911931
 container_ident      |    -66613383 | OK   |     7626752 |        3 |  66613386
 release_ident        |    -86429880 | OK   |  5833670656 |     3548 |  86433428
 work_rev             |   -170840559 | OK   |  3784466432 |        1 | 170840560
 work_ident           |   -170923520 | OK   |  5833400320 |     4071 | 170927591
 editgroup            |   -176784137 | OK   |   149798912 |        3 | 176784140
 release_rev          |   -478829274 | OK   | 43008122880 |        9 | 478829283

                          table_name                          | table_size | indexes_size | total_size 
--------------------------------------------------------------+------------+--------------+------------
 "public"."release_rev"                                       | 40 GB      | 23 GB        | 63 GB
 "public"."refs_blob"                                         | 54 GB      | 1479 MB      | 56 GB
 "public"."release_contrib"                                   | 21 GB      | 19 GB        | 40 GB
 "public"."release_edit"                                      | 8243 MB    | 9154 MB      | 17 GB
 "public"."work_edit"                                         | 8243 MB    | 9150 MB      | 17 GB
 "public"."release_ident"                                     | 5565 MB    | 9017 MB      | 14 GB
 "public"."work_ident"                                        | 5565 MB    | 9006 MB      | 14 GB
 "public"."work_rev"                                          | 3610 MB    | 3343 MB      | 6953 MB
 "public"."abstracts"                                         | 2382 MB    | 191 MB       | 2573 MB
 "public"."creator_rev"                                       | 371 MB     | 456 MB       | 827 MB
 "public"."creator_edit"                                      | 377 MB     | 421 MB       | 798 MB
 "public"."creator_ident"                                     | 255 MB     | 412 MB       | 667 MB
 "public"."release_rev_abstract"                              | 259 MB     | 336 MB       | 596 MB
 "public"."editgroup"                                         | 143 MB     | 237 MB       | 380 MB
 "public"."changelog"                                         | 112 MB     | 116 MB       | 228 MB
 "public"."container_rev"                                     | 20 MB      | 9248 kB      | 29 MB
 "public"."container_edit"                                    | 9304 kB    | 8872 kB      | 18 MB
 "public"."container_ident"                                   | 7472 kB    | 9112 kB      | 16 MB

	Size:  234.86G


    time zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched --no-file-updates -

    Processed 531750 lines, inserted 455063, updated 0.
    Command exited with non-zero status 1
    12904.53user 485.22system 2:10:40elapsed 170%CPU (0avgtext+0avgdata 63300maxresident)k
    98696inputs+3452984outputs (154major+287476minor)pagefaults 0swaps

    real    130m40.181s
    user    215m34.456s
    sys     8m10.064s


    time zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched -

    Processed 486350 lines, inserted 247482, updated 171906.
    20591.62user 844.63system 3:21:40elapsed 177%CPU (0avgtext+0avgdata 41488maxresident)k
    80768inputs+2522808outputs (110major+251307minor)pagefaults 0swaps

    real    201m40.885s
    user    343m34.616s
    sys     14m8.364s


    time zcat /srv/fatcat/datasets/2018-09-23-0405.30-dumpgrobidmetainsertable.longtail_join.filtered.tsv.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py grobid-metadata -

    Processed 133303 lines, inserted 123905, updated 0.
    10636.02user 424.26system 2:20:14elapsed 131%CPU (0avgtext+0avgdata 65232maxresident)k
    23984inputs+20285752outputs (119major+1344072minor)pagefaults 0swaps

    real    140m14.397s
    user    180m0.592s
    sys     7m30.664s

Re-summarize:

    select count(*) from file_rev_release;
    12510846

    select count(*) from container_ident where is_live='f';
    0
    select count(*) from release_ident where is_live='f';
    0
    select count(*) from work_ident where is_live='f';
    0
    select count(*) from creator_ident where is_live='f';
    1
    select count(*) from file_ident where is_live='f';
    0

    Size:  261.59G
	=> releases and works about 242 GB of this

                          table_name                          | table_size | indexes_size | total_size 
--------------------------------------------------------------+------------+--------------+------------
 "public"."release_rev"                                       | 40 GB      | 24 GB        | 64 GB
 "public"."refs_blob"                                         | 58 GB      | 1576 MB      | 60 GB
 "public"."release_contrib"                                   | 22 GB      | 19 GB        | 41 GB
 "public"."release_edit"                                      | 8386 MB    | 9300 MB      | 17 GB
 "public"."work_edit"                                         | 8386 MB    | 9297 MB      | 17 GB
 "public"."release_ident"                                     | 5690 MB    | 9199 MB      | 15 GB
 "public"."work_ident"                                        | 5690 MB    | 9191 MB      | 15 GB
 "public"."file_rev_url"                                      | 6151 MB    | 2346 MB      | 8496 MB
 "public"."work_rev"                                          | 3673 MB    | 3402 MB      | 7075 MB
 "public"."abstracts"                                         | 3807 MB    | 277 MB       | 4085 MB
 "public"."file_rev"                                          | 1403 MB    | 2309 MB      | 3712 MB
 "public"."file_edit"                                         | 1181 MB    | 1236 MB      | 2417 MB
 "public"."file_rev_release"                                  | 721 MB     | 1266 MB      | 1987 MB
 "public"."file_ident"                                        | 691 MB     | 1163 MB      | 1854 MB
 "public"."release_rev_abstract"                              | 374 MB     | 495 MB       | 869 MB
 "public"."creator_rev"                                       | 371 MB     | 456 MB       | 827 MB
 "public"."creator_edit"                                      | 377 MB     | 421 MB       | 798 MB
 "public"."creator_ident"                                     | 255 MB     | 412 MB       | 667 MB
 "public"."editgroup"                                         | 194 MB     | 299 MB       | 493 MB
 "public"."changelog"                                         | 134 MB     | 138 MB       | 272 MB
 "public"."container_rev"                                     | 20 MB      | 9248 kB      | 29 MB
 "public"."container_edit"                                    | 9304 kB    | 8872 kB      | 18 MB
 "public"."container_ident"                                   | 7472 kB    | 9112 kB      | 16 MB

       relname        | too_much_seq | case |  rel_size   | seq_scan | idx_scan  
----------------------+--------------+------+-------------+----------+-----------
 release_edit         |      -486322 | OK   |  8791547904 |       14 |    486336
 work_edit            |      -486391 | OK   |  8791498752 |        8 |    486399
 file_edit            |      -486945 | OK   |  1237671936 |        4 |    486949
 creator_edit         |      -486949 | OK   |   395321344 |        8 |    486957
 container_edit       |      -486953 | OK   |     9494528 |        4 |    486957
 container_rev        |     -1029946 | OK   |    21168128 |        3 |   1029949
 file_rev_url         |     -2166783 | OK   |  6448095232 |        2 |   2166785
 file_rev_release     |     -2166811 | OK   |   756015104 |        7 |   2166818
 changelog            |     -7336464 | OK   |   140369920 |        2 |   7336466
 abstracts            |     -7447647 | OK   |  3716759552 |        1 |   7447648
 creator_ident        |     -7561970 | OK   |   267010048 |        6 |   7561976
 creator_rev          |     -9432017 | OK   |   388710400 |        3 |   9432020
 release_contrib      |    -11915853 | OK   | 23410876416 |        3 |  11915856
 release_rev_abstract |    -11917411 | OK   |   392249344 |        3 |  11917414
 file_ident           |    -23530866 | OK   |   724213760 |    60366 |  23591232
 refs_blob            |    -40651974 | OK   | 13605445632 |        2 |  40651976
 container_ident      |    -67010119 | OK   |     7626752 |        5 |  67010124
 file_rev             |    -84478325 | OK   |  1470947328 |       10 |  84478335
 release_ident        |   -114803381 | OK   |  5964980224 |     3551 | 114806932
 work_rev             |   -173810916 | OK   |  3850354688 |        1 | 173810917
 work_ident           |   -177260615 | OK   |  5964554240 |     4074 | 177264689
 editgroup            |   -192178637 | OK   |   203137024 |        3 | 192178640
 release_rev          |   -501596237 | OK   | 43460804608 |        9 | 501596246
(23 rows)


	bnewbold@wbgrp-svc500$ df -h /
	Filesystem      Size  Used Avail Use% Mounted on
	/dev/vda1       858G  409G  407G  51% /

	bnewbold@wbgrp-svc500$ sudo du -sh /var/lib/postgresql/11/main/
	263G    /var/lib/postgresql/11/main/

The counts below are slightly contaminated: the re-import ran for about a minute before being cancelled.

	select count(*) from refs_blob;
	18,517,091

	select count(*) from release_ident;
	86,905,233

Re-importing crossref dump to simulate double size database:

	time xzcat /srv/fatcat/datasets/crossref-works.2018-09-05.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py crossref - /srv/fatcat/datasets/20180216.ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3

	Processed 4989650 lines, inserted 538846, updated 0. (etc)
    208873.41user 8805.46system 12:40:59elapsed 476%CPU (0avgtext+0avgdata 419868maxresident)k
    43945800inputs+268799736outputs (210727major+21027251minor)pagefaults 0swaps

    real    760m59.324s
    user    3567m3.524s
    sys     152m56.692s

                          table_name                          | table_size | indexes_size | total_size 
--------------------------------------------------------------+------------+--------------+------------
 "public"."release_rev"                                       | 47 GB      | 27 GB        | 74 GB
 "public"."refs_blob"                                         | 72 GB      | 1898 MB      | 73 GB
 "public"."release_contrib"                                   | 26 GB      | 23 GB        | 49 GB
 "public"."release_edit"                                      | 9418 MB    | 10 GB        | 19 GB
 "public"."work_edit"                                         | 9418 MB    | 10 GB        | 19 GB
 "public"."work_ident"                                        | 6386 MB    | 10 GB        | 16 GB
 "public"."release_ident"                                     | 6387 MB    | 10 GB        | 16 GB
 "public"."file_rev_url"                                      | 6151 MB    | 2346 MB      | 8496 MB
 "public"."work_rev"                                          | 4125 MB    | 3828 MB      | 7952 MB
 "public"."abstracts"                                         | 4134 MB    | 303 MB       | 4437 MB
 "public"."file_rev"                                          | 1403 MB    | 2309 MB      | 3712 MB
 "public"."file_edit"                                         | 1181 MB    | 1236 MB      | 2417 MB
 "public"."file_rev_release"                                  | 721 MB     | 1266 MB      | 1987 MB
 "public"."file_ident"                                        | 691 MB     | 1163 MB      | 1854 MB
 "public"."release_rev_abstract"                              | 409 MB     | 539 MB       | 948 MB
 "public"."editgroup"                                         | 340 MB     | 543 MB       | 883 MB
 "public"."creator_rev"                                       | 371 MB     | 456 MB       | 827 MB
 "public"."creator_edit"                                      | 377 MB     | 421 MB       | 798 MB
 "public"."creator_ident"                                     | 255 MB     | 412 MB       | 667 MB
 "public"."changelog"                                         | 250 MB     | 259 MB       | 508 MB
 "public"."container_rev"                                     | 20 MB      | 9272 kB      | 29 MB
 "public"."container_edit"                                    | 9472 kB    | 8880 kB      | 18 MB
 "public"."container_ident"                                   | 7592 kB    | 9136 kB      | 16 MB

	Size:  301.96G

	select count(*) from refs_blob;
	22,322,741

	select count(*) from release_ident;
	97,597,519

	changelog: 4,286,296

Huh. Expected this to basically double the size... what happened? Was it doing fetches?

So... it was doing fetches (the `--no-release-updates` flag was not passed), but
it still inserted 5 million? Also not good.

    time xzcat /srv/fatcat/datasets/crossref-works.2018-09-05.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py crossref - /srv/fatcat/datasets/20180216.ISSN-to-ISSN-L.txt --extid-map-file /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3 --no-release-updates

    Processed 5001450 lines, inserted 4811233, updated 0.
    138166.58user 5605.61system 43:47:54elapsed 91%CPU (0avgtext+0avgdata 426964maxresident)k
    208379288inputs+488675440outputs (267864major+49077696minor)pagefaults 0swaps

    real    2627m55.003s
    user    2386m11.476s
    sys     99m0.408s

							  table_name                          | table_size | indexes_size | total_size 
	--------------------------------------------------------------+------------+--------------+------------
	 "public"."release_rev"                                       | 93 GB      | 53 GB        | 146 GB
	 "public"."release_contrib"                                   | 52 GB      | 45 GB        | 97 GB
	 "public"."refs_blob"                                         | 72 GB      | 1898 MB      | 73 GB
	 "public"."release_edit"                                      | 18 GB      | 20 GB        | 39 GB
	 "public"."work_edit"                                         | 18 GB      | 20 GB        | 39 GB
	 "public"."work_ident"                                        | 12 GB      | 20 GB        | 32 GB
	 "public"."release_ident"                                     | 12 GB      | 20 GB        | 32 GB
	 "public"."work_rev"                                          | 8185 MB    | 7596 MB      | 15 GB
	 "public"."file_rev_url"                                      | 6151 MB    | 2346 MB      | 8496 MB
	 "public"."abstracts"                                         | 4134 MB    | 303 MB       | 4437 MB
	 "public"."file_rev"                                          | 1403 MB    | 2309 MB      | 3712 MB
	 "public"."file_edit"                                         | 1181 MB    | 1236 MB      | 2417 MB
	 "public"."file_rev_release"                                  | 721 MB     | 1266 MB      | 1987 MB
	 "public"."file_ident"                                        | 691 MB     | 1163 MB      | 1854 MB
	 "public"."release_rev_abstract"                              | 700 MB     | 919 MB       | 1619 MB
	 "public"."editgroup"                                         | 486 MB     | 788 MB       | 1275 MB
	 "public"."creator_rev"                                       | 371 MB     | 456 MB       | 827 MB
	 "public"."creator_edit"                                      | 377 MB     | 421 MB       | 798 MB
	 "public"."changelog"                                         | 365 MB     | 381 MB       | 746 MB
	 "public"."creator_ident"                                     | 255 MB     | 412 MB       | 667 MB
	 "public"."container_rev"                                     | 20 MB      | 9272 kB      | 29 MB
	 "public"."container_edit"                                    | 9472 kB    | 8880 kB      | 18 MB
	 "public"."container_ident"                                   | 7592 kB    | 9136 kB      | 16 MB

		   relname        | too_much_seq | case |   rel_size   | seq_scan |  idx_scan  
	----------------------+--------------+------+--------------+----------+------------
	 release_edit         |      -487544 | OK   |  19594010624 |       26 |     487570
	 work_edit            |      -487615 | OK   |  19594043392 |       26 |     487641
	 file_edit            |      -488168 | OK   |   1237671936 |       19 |     488187
	 creator_edit         |      -488173 | OK   |    395321344 |       26 |     488199
	 container_edit       |      -488306 | OK   |      9666560 |       49 |     488355
	 file_rev_url         |     -2166808 | OK   |   6448095232 |        2 |    2166810
	 file_rev_release     |     -2166881 | OK   |    756015104 |        7 |    2166888
	 container_rev        |     -2264841 | OK   |     21364736 |        3 |    2264844
	 changelog            |    -11338986 | OK   |    382525440 |        2 |   11338988
	 creator_rev          |    -12726261 | OK   |    388710400 |        3 |   12726264
	 creator_ident        |    -14563891 | OK   |    267010048 |        6 |   14563897
	 abstracts            |    -15594992 | OK   |   4052975616 |        1 |   15594993
	 file_ident           |    -23532116 | OK   |    724213760 |    60366 |   23592482
	 file_rev             |    -84478438 | OK   |   1470947328 |       10 |   84478448
	 release_contrib      |    -97501069 | OK   |  55310950400 |        3 |   97501072
	 release_rev_abstract |    -97505413 | OK   |    734248960 |        3 |   97505416
	 refs_blob            |   -108179066 | OK   |  15747162112 |       11 |  108179077
	 container_ident      |   -152392399 | OK   |      7749632 |        5 |  152392404
	 release_ident        |   -307197678 | OK   |  13256884224 |     3557 |  307201235
	 work_rev             |   -387420683 | OK   |   8580505600 |        1 |  387420684
	 work_ident           |   -390871805 | OK   |  13256515584 |     4074 |  390875879
	 editgroup            |   -409831715 | OK   |    509853696 |        3 |  409831718
	 release_rev          |  -1112440989 | OK   | 100107378688 |        9 | 1112440998

	Size:  501.37G

	select count(*) from refs_blob;
	22,322,742

	select count(*) from release_ident;
	193,709,943