aboutsummaryrefslogtreecommitdiffstats
path: root/notes/bootstrap/import_timing_20180924.txt
diff options
context:
space:
mode:
Diffstat (limited to 'notes/bootstrap/import_timing_20180924.txt')
-rw-r--r--notes/bootstrap/import_timing_20180924.txt135
1 files changed, 135 insertions, 0 deletions
diff --git a/notes/bootstrap/import_timing_20180924.txt b/notes/bootstrap/import_timing_20180924.txt
new file mode 100644
index 00000000..d9c49fc2
--- /dev/null
+++ b/notes/bootstrap/import_timing_20180924.txt
@@ -0,0 +1,135 @@
+
+Nearly all of the imports below were run at git hash c1d0fea.
+
+ time xzcat /srv/fatcat/datasets/crossref-works.2018-09-05.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py import-crossref - /srv/fatcat/datasets/20180216.ISSN-to-ISSN-L.txt /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3
+
+ Processed 4990450 lines, inserted 4005034, updated 0. (etc)
+
+ 133387.36user 5255.64system 24:19:01elapsed 158%CPU (0avgtext+0avgdata 448196maxresident)k
+ 177480808inputs+432403768outputs (204major+48533880minor)pagefaults 0swaps
+
+ real 1459m1.518s
+ user 2308m24.300s
+ sys 93m17.132s
+
+ Longer runtime, bigger size, etc. than previous runs!
+
+ Size: 377.49G
+
+ select count(id) from release_ident; => 79,880,900
+
+
+ zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched --no-file-update -
+ Processed 531700 lines, inserted 511751, updated 0.
+ Command exited with non-zero status 1
+ 17087.60user 717.77system 3:07:11elapsed 158%CPU (0avgtext+0avgdata 67420maxresident)k
+ 60128inputs+3401960outputs (141major+403282minor)pagefaults 0swaps
+
+
+Sample of "not found" DOIs:
+
+ DOI not found: 10.1109/mic.2005.100
+ DOI not found: 10.3386/w9732
+ DOI not found: 10.1090/s0002-9939-97-04114-2
+ DOI not found: 10.1186/1475-2867-5-29
+ DOI not found: 10.2172/143964
+ DOI not found: 10.2172/10170724
+ DOI not found: 10.2172/383051
+ DOI not found: 10.1017/s0033291700051370
+ DOI not found: 10.12980/jclm.3.2015j5-154
+ DOI not found: 10.2172/801341
+ DOI not found: 10.2172/899508
+
+ DOI not found: 10.1136/bmj.2.4570.302
+ DOI not found: 10.1136/bmj.2.4687.1049
+ DOI not found: 10.1163/221125903x00429
+ DOI not found: 10.1177/004947557800800102
+ DOI not found: 10.1177/107755874800500313
+ DOI not found: 10.1177/107755874800500415
+ DOI not found: 10.1177/107755874800500713
+
+ DOI not found: 10.5990/jwpa.29.72
+ DOI not found: 10.2307/1107183
+ DOI not found: 10.1101/147165
+ DOI not found: 10.17848/wp04-108
+ DOI not found: 10.2172/542039
+ DOI not found: 10.2172/542040
+ DOI not found: 10.1002/9781444308747.ch6
+
+
+ zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched -
+ Processed 485400 lines, inserted 283498, updated 197825.
+ 25649.33user 1152.84system 4:42:24elapsed 158%CPU (0avgtext+0avgdata 38984maxresident)k
+ 38584inputs+2371576outputs (136major+357478minor)pagefaults 0swaps
+
+ Size: 395.13G
+
+ table_name | table_size | indexes_size | total_size
+--------------------------------------------------------------+------------+--------------+------------
+ "public"."release_ref" | 154 GB | 54 GB | 208 GB
+ "public"."release_rev" | 39 GB | 22 GB | 61 GB
+ "public"."release_contrib" | 25 GB | 22 GB | 47 GB
+ "public"."release_edit" | 7095 MB | 6956 MB | 14 GB
+ "public"."work_edit" | 7095 MB | 6956 MB | 14 GB
+ "public"."release_ident" | 5203 MB | 6254 MB | 11 GB
+ "public"."work_ident" | 5203 MB | 6254 MB | 11 GB
+ "public"."file_rev_url" | 6535 MB | 2478 MB | 9013 MB
+ "public"."work_rev" | 3376 MB | 3127 MB | 6503 MB
+ "public"."file_rev" | 1404 MB | 2115 MB | 3519 MB
+ "public"."abstracts" | 2611 MB | 208 MB | 2820 MB
+ "public"."file_edit" | 1089 MB | 1066 MB | 2155 MB
+ "public"."file_release" | 713 MB | 1250 MB | 1962 MB
+ "public"."file_ident" | 618 MB | 740 MB | 1358 MB
+ "public"."creator_rev" | 371 MB | 457 MB | 828 MB
+ "public"."creator_edit" | 347 MB | 352 MB | 699 MB
+ "public"."release_rev_abstract" | 284 MB | 369 MB | 653 MB
+ "public"."creator_ident" | 255 MB | 305 MB | 560 MB
+ "public"."changelog" | 138 MB | 142 MB | 279 MB
+ "public"."editgroup" | 155 MB | 92 MB | 247 MB
+ "public"."container_rev" | 20 MB | 9272 kB | 29 MB
+ "public"."container_edit" | 8312 kB | 7360 kB | 15 MB
+ "public"."container_ident" | 7272 kB | 6832 kB | 14 MB
+
+Exports!
+
+ time cat /tmp/fatcat_ident_releases.tsv | ./target/release/fatcat-export release --expand files,container -j8 | gzip > release_export_expanded.json.gz
+ INFO 2018-09-27T22:54:30Z: fatcat_export: Done reading (79880900 lines), waiting for workers to exit...
+
+ real 384m29.435s
+ user 740m1.060s
+ sys 229m11.632s
+
+
+ time zcat /srv/fatcat/snapshots/2018-09-24/release_export_expanded.json.gz | ./transform_release.py | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat -type release
+ 2018/09/28 02:56:36 79880900 docs in 2h56m48.425914042s at 7529.948 docs/s with 8 workers
+ 2018/09/28 02:56:36 applied setting: {"index": {"refresh_interval": "1s"}} with status 200 OK
+ 2018/09/28 02:56:36 applied setting: {"index": {"number_of_replicas": "1"}} with status 200 OK
+ 2018/09/28 02:56:40 index flushed: 200 OK
+
+ real 176m53.138s
+ user 318m17.004s
+ sys 29m48.944s
+
+ webcrawl@wbgrp-svc503:/srv/fatcat/src/extra/elasticsearch$ du -sh /srv/elasticsearch/data/
+ 52G /srv/elasticsearch/data/
+
+TODO:
+x abstracts
+x file_hashes
+x ext idents
+x upload to an item
+x download and re-build elastic
+- insert new mellon matches
+
+
+ time zcat /srv/fatcat/datasets/2018-09-23-0405.30-dumpgrobidmetainsertable.longtail_join.filtered.tsv.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-grobid-metadata -
+ [...]
+ Processed 132994 lines, inserted 123052, updated 0.
+ Processed 132984 lines, inserted 122979, updated 0.
+ 10930.34user 475.87system 2:40:03elapsed 118%CPU (0avgtext+0avgdata 68180maxresident)k
+ 8912inputs+20157832outputs (59major+1104467minor)pagefaults 0swaps
+
+ real 160m3.573s
+ user 184m54.176s
+ sys 8m23.388s
+