From 186c03c2b9e1b08e4298383fc6593d1b2c3a5dd8 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 25 Sep 2018 15:44:39 -0700 Subject: start organizing notes into subdirectories --- notes/alignments.csv | 57 ------ notes/alignments.txt | 22 -- notes/bootstrap/import_timing_20180815.txt | 292 ++++++++++++++++++++++++++ notes/bootstrap/import_timing_20180908.txt | 249 ++++++++++++++++++++++ notes/bootstrap/import_timing_20180910.txt | 319 +++++++++++++++++++++++++++++ notes/bootstrap/import_timing_20180920.txt | 96 +++++++++ notes/bootstrap/import_timing_20180923.txt | 91 ++++++++ notes/bootstrap/initial_sources.txt | 26 +++ notes/bootstrap/old_imports.txt | 20 ++ notes/contrib_types.txt | 14 -- notes/import_timing_20180815.txt | 292 -------------------------- notes/import_timing_20180908.txt | 249 ---------------------- notes/import_timing_20180910.txt | 319 ----------------------------- notes/import_timing_20180920.txt | 96 --------- notes/import_timing_20180923.txt | 39 ---- notes/initial_sources.txt | 26 --- notes/old_imports.txt | 20 -- notes/performance/postgres_performance.txt | 203 ++++++++++++++++++ notes/performance/speed.txt | 82 ++++++++ notes/postgres_performance.txt | 203 ------------------ notes/schema/alignments.csv | 57 ++++++ notes/schema/alignments.txt | 22 ++ notes/schema/contrib_types.txt | 14 ++ notes/schema/work_release_types.txt | 28 +++ notes/speed.txt | 82 -------- notes/work_release_types.txt | 28 --- 26 files changed, 1499 insertions(+), 1447 deletions(-) delete mode 100644 notes/alignments.csv delete mode 100644 notes/alignments.txt create mode 100644 notes/bootstrap/import_timing_20180815.txt create mode 100644 notes/bootstrap/import_timing_20180908.txt create mode 100644 notes/bootstrap/import_timing_20180910.txt create mode 100644 notes/bootstrap/import_timing_20180920.txt create mode 100644 notes/bootstrap/import_timing_20180923.txt create mode 100644 notes/bootstrap/initial_sources.txt create mode 100644 notes/bootstrap/old_imports.txt delete mode 100644 notes/contrib_types.txt delete mode 100644 notes/import_timing_20180815.txt delete mode 100644 notes/import_timing_20180908.txt delete mode 100644 notes/import_timing_20180910.txt delete mode 100644 notes/import_timing_20180920.txt delete mode 100644 notes/import_timing_20180923.txt delete mode 100644 notes/initial_sources.txt delete mode 100644 notes/old_imports.txt create mode 100644 notes/performance/postgres_performance.txt create mode 100644 notes/performance/speed.txt delete mode 100644 notes/postgres_performance.txt create mode 100644 notes/schema/alignments.csv create mode 100644 notes/schema/alignments.txt create mode 100644 notes/schema/contrib_types.txt create mode 100644 notes/schema/work_release_types.txt delete mode 100644 notes/speed.txt delete mode 100644 notes/work_release_types.txt diff --git a/notes/alignments.csv b/notes/alignments.csv deleted file mode 100644 index b8619ddc..00000000 --- a/notes/alignments.csv +++ /dev/null @@ -1,57 +0,0 @@ -fatcat entity,fatcat field,description,Crossref,ORCID,Bibtex,CSL,BIBFRAME,Dublin Core (DCMES),Dublin Core (DCMI),oai_dc (OAI-PMH),resourceSync,Highwire Press (google scholar),MEDLINE (?) 
-WORK,,,,,,,Work,,,,,, -,work_type,,,,,,,Type,,,,, -RELEASE,,,,,,,Instance,,,,,, -,title,,title,,title,,,Title,,title,,citation_title, -,release_type,,type (???),,object type; howpublished,,,Type,,,,, -,release_status,"NEW; eg, preprint, final",,,,,,,,,,, -,container,,ISSN (via lookup),,ISSN (via lookup),,,Relation (?),,,,citation_issn (normalized), -,release_date,RENAME,published-print or published-online,,"year, month",issued,,Date,Issued (?),,,citation_publication_date, -,volume,,volume,,volume,,,,,,,citation_volume, -,pages,,page,,pages,,,,,,,"citation_firstpage, citation_lastpage", -,issue,,issue,,,,,,,,,citation_issue, -,doi,always lower-case,DOI,,doi,DOI,,Identifier,,,,citation_doi, -,isbn13,,ISBN (converted),,,ISBN,,Identifier,,,,citation_isbn, -,publisher,,publisher,,publisher,,,Publisher,,,,"citation_dissertation_institution, citation_technical_report_institution", -,language,NEW; RFC1766 (ISO 639-1 superset); may transition,,,,,,Language,,,,, -,ref:index,,reference:[index],,,Citation-number,,,,,,, -,ref:key,NEW,reference:key,,,citation-label,,,,,,, -,ref:raw,RENAME,reference:unstructured,,,,,,,,,, -,ref:container_title,NEW,reference:journal-title,,,,,,,,,, -,ref:year,NEW,reference:year,,,,,,,,,, -,ref:title,NEW,,,,,,,,,,, -,ref:locator,"NEW; (aka, page number)",,,,,,,,,,, -,contrib:role,,,,,,Role,,,,,, -,contrib:index,,author[index],,,,,,,,,, -,contrib:raw,RENAME,Author:{given+family},,author,,,,,,,, -,extra:number,tech reports,,,,,,,,,,citation_technical_report_number, -,extra:institution,for tech reports and dissertations,,,,,,,,,,, -,extra:abstract,,,,,,,Description,,,,citation_abstract, -,extra:pmcid,,,,,,,,,,,, -,extra:pmid,,,,,,,,,,,, -,extra:version,"eg, for software",,,,,,,,,,, -,,,,,,,,,,,,, -CREATOR,,,,,,,Agent,Creator / Contributor,,,,, -,display_name,NEW; usually western/latinized ,,.name.credit-name.value (?),,,,,,,,citation_author, -,orcid,,,.path,,,,,,,,, -,given_name,NEW,,.name.given-names.value,,,,,,,,, -,surname,NEW,,.name.family-names.value,,,,,,,,, -,extra:viaf,,,,,,,,,,,, -,extra:alt_names[],,,,,,,,,,,, -,,,,,,,,,,,,, -CONTAINER,,,,,,,N/A,relation.isPartOf,,,,, -,name,,,,,,,,,,,"citation_journal_title, citation_conference_title", -,publisher,,,,,,,,,,,, -,issnl,,,,,,,Identifier (?),,,,citation_issn (normalized), -,abbrev,,,,,,,,,,,citation_journal_abbrev, -,coden,,,,,,,,,,,, -,extra:issn-e,,,,,,,,,,,, -,extra:issn-p,,,,,,,,,,,, -FILE,,,,,,,Item,,,,,, -,release,,,,,,,,,,,, -,size,,,,,,,,,,length,, -,sha256,NEW,,,,,,,,,hash,, -,sha1,,,,,,,,,,hash,, -,md5,,,,,,,,,,hash,, -,url,,,,www_pdf_url,,,,,,href,citation_pdf_url, -,mimetype,NEW,,,,,,Format,,,type,, diff --git a/notes/alignments.txt b/notes/alignments.txt deleted file mode 100644 index e2736268..00000000 --- a/notes/alignments.txt +++ /dev/null @@ -1,22 +0,0 @@ -bibtex -BIBFRAME -schema.org: http://schema.org/CreativeWork -dublin core - -entity specific: -- crossref / release -- orcid / creator -- issn / container - - -## Dublin Core (original/simple) - -Usage guide: - -Quick descriptions of the "original 15" fields: - -## Citation Style Language - -Human-readable specification: - -Specifically, the "variables" and type definitions: diff --git a/notes/bootstrap/import_timing_20180815.txt b/notes/bootstrap/import_timing_20180815.txt new file mode 100644 index 00000000..1206cc41 --- /dev/null +++ b/notes/bootstrap/import_timing_20180815.txt @@ -0,0 +1,292 @@ + +Schema changes since previous imports: +- more fields (identifiers+indexes) +- timestamps +- UUIDs more places +- fixed some crossref import bugs? 
+- abstracts +- file_urls as table (not single value) +- timestamps +- TEXT -> CHAR in a few places +- removed many work fields + +## Containers + +(python)webcrawl@wbgrp-svc500:/srv/fatcat/src/python$ time ./fatcat_import.py import-issn /srv/datasets/journal_extra_metadata.csv + +real 1m25.292s +user 0m12.640s +sys 0m0.412s + +## Creators + +time parallel --bar --pipepart -j8 -a /srv/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid - + +(times very invalid due to hangs; got 3537837 creators, which is most of the way, so *shrug*) +real 22m2.465s +user 26m41.924s +sys 1m33.844s + +## Releases + +xzcat /srv/datasets/crossref-works.2018-01-21.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py import-crossref - /srv/datasets/20180216.ISSN-to-ISSN-L.txt + + 128516.30 user + 3905.14 system + 44:17:05 elapsed + 83% CPU + +Almost 44 hours... I think I remember more like 36 hours last time? Things +slowed down a lot towards the end, many more ORCID cross-references? + +looking in htop, postgres seems to be primary bottleneck. At something like 12 +hours in, had 44 million release_ident rows, which is 1000/second. + +Note: seems like the more frequently `count(*)` is run, the more performant. +Because in-memory? + + 2018-08-16 16:54:16.977 UTC [17996] postgres@fatcat_prod LOG: duration: 42949.549 ms statement: select count(id) from release_ident; + + fatcat_prod=# select count(*) from release_ident; + count + ---------- + 44185608 + (1 row) + + Time: 2753.916 ms (00:02.754) + fatcat_prod=# select count(*) from release_ident; + count + ---------- + 44187937 + (1 row) + + Time: 2711.670 ms (00:02.712) + +As expected, autovacuum very busy. Only ~150 TPS; but that includes batch +writes? 75061172 rows. + +## Files + + time ./fatcat_import.py import-manifest /srv/datasets/idents_files_urls.sqlite + + Done! Inserted 6607075 + + real 2152m28.822s => 36 hours (!) + user 401m46.464s + sys 21m45.724s + + +Going pretty slow, < 100 transactions/sec. Lots of SELECTs, which seem slow, on the abstract table? + + SELECT "release_rev_abstract"."id", "release_rev_abstract"."release_rev", "release_rev_abstract"."abstract_sha1", "release_rev_abstract"."mimetype", "release_rev_abstract"."lang", "abstracts"."sha1", "abstracts"."content" FROM ("release_rev_abstract" INNER JOIN "abstracts" ON "release_rev_abstract"."abstract_sha1" = "abstracts"."sha1") WHERE "release_rev_abstract"."release_rev" = 'ffffffc0-4dd2-47ce-a51d-44051f3699ce'; + +Created index: + + CREATE INDEX release_rev_abstract_rev_idx ON release_rev_abstract(release_rev); + +... and things sped way up. Re-ran some crossref imports to EXPLAIN and didn't +see non-indexed queries. Maybe an ANALYZE does need to happen? + +This being single-threaded is going to be a problem in the future. ~50 million +files would be ~2 weeks. + +## Post-Import Status + + Size: 358.89G (postgres self-reported) + Mem.: 57.10% - 16.85G/49.14G + +Was 184G last time in late June; doubled in size (!). 
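+
+A quick cross-check of the self-reported size, straight from postgres (a minimal
+sketch; the per-table breakdown further below is the more detailed version):
+
+    -- whole-database size as postgres reports it
+    SELECT pg_size_pretty(pg_database_size('fatcat_prod'));
+
+    -- ten largest relations, counting indexes and TOAST
+    SELECT relname, pg_size_pretty(pg_total_relation_size(oid)) AS total_size
+    FROM pg_class
+    WHERE relkind = 'r'
+    ORDER BY pg_total_relation_size(oid) DESC
+    LIMIT 10;
+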
+ + bnewbold@wbgrp-svc500$ df -h / + Filesystem Size Used Avail Use% Mounted on + /dev/vda1 858G 529G 286G 65% / + + bnewbold@wbgrp-svc500$ sudo du -sh /var/lib/postgresql/ /srv/datasets/ /srv/elastic-blah/ + 361G /var/lib/postgresql/ + 83G /srv/datasets/ + 77G /srv/elastic-blah/ + + fatcat_prod=# select count(*) from changelog; => 2,085,067 + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + table_name | table_size | indexes_size | total_size +--------------------------------------------------------------+------------+--------------+------------ + "public"."release_ref" | 159 GB | 47 GB | 206 GB + "public"."release_rev" | 40 GB | 10 GB | 51 GB + "public"."release_contrib" | 19 GB | 20 GB | 39 GB + "public"."release_ident" | 5797 MB | 6597 MB | 12 GB + "public"."work_ident" | 5787 MB | 6394 MB | 12 GB + "public"."release_edit" | 6674 MB | 4646 MB | 11 GB + "public"."work_edit" | 6674 MB | 4646 MB | 11 GB + "public"."work_rev" | 3175 MB | 2939 MB | 6114 MB + "public"."file_rev_url" | 1995 MB | 275 MB | 2270 MB + "public"."abstracts" | 1665 MB | 135 MB | 1800 MB + "public"."file_rev" | 829 MB | 954 MB | 1783 MB + "public"."file_ident" | 498 MB | 532 MB | 1030 MB + "public"."file_release" | 369 MB | 642 MB | 1011 MB + "public"."file_edit" | 591 MB | 410 MB | 1002 MB + "public"."creator_rev" | 337 MB | 318 MB | 655 MB + "public"."creator_ident" | 280 MB | 297 MB | 577 MB + "public"."creator_edit" | 316 MB | 220 MB | 536 MB + "public"."release_rev_abstract" | 183 MB | 84 MB | 267 MB + "public"."changelog" | 123 MB | 125 MB | 249 MB + "public"."editgroup" | 139 MB | 81 MB | 220 MB + "public"."container_rev" | 19 MB | 6912 kB | 26 MB + "public"."container_ident" | 6896 kB | 7016 kB | 14 MB + "public"."container_edit" | 8056 kB | 5240 kB | 13 MB + +In context, the full uncompressed crossref 2018-01-21 dump is about 285 GB. + +For many of these indexes, and the _ident tables, switching from UUID to +BIGSERIAL would half the size. + +## Exports + + time ./fatcat_export.py changelog - | pv -l | wc + + As of: + + 159k 1:17:35 [34.3 /s] + 159,740 lines + 2,427,277,881 chars (bytes; 2.4GB) + + real 77m35.183s + user 15m36.208s + sys 0m31.484s + +Running at about 100/sec; estimate 6 hours for completion. Could shard using +start/end flags, but am not here. + +Running `quick_dump.sql` (identifier tables, in a transaction): + + 251M Aug 19 23:08 fatcat_ident_creators.tsv + 5.9M Aug 19 23:08 fatcat_ident_containers.tsv + 467M Aug 19 23:08 fatcat_ident_files.tsv + 5.2G Aug 19 23:10 fatcat_ident_releases.tsv + 5.2G Aug 19 23:11 fatcat_ident_works.tsv + 12K Aug 19 23:11 . + 1.8G Aug 19 23:12 fatcat_abstracts.json + +Work and Release tables in under 2 minutes each; say 5 minutes total. + + time ./fatcat_export.py releases /tmp/fatcat_ident_releases.tsv - | pv -l | wc + + 172k 1:07:08 [42.7 /s] + 172181 lines + 1,118,166,293 chars (bytes; 1.1 GB) + + real 67m8.340s + user 10m21.988s + sys 0m34.612s + +Running at only 10/sec or so, this would take forever even if sharded. :( + +Both exports/dumps are running in parallel. 
"Expand" queries might help with speed? + +## Postgres Analysis + +SELECT * +FROM + pg_stat_statements +ORDER BY + total_time DESC LIMIT 5; + +Summary: + + SELECT "creator_ident" by ORCID + 1,295,864 calls + 930,305,208 total time + 717.9 mean time <= this should be less than a ms! + + INSERT INTO release_rev + 75144055 calls + 111470961 total time + 1.483 mean time + + INSERT INTO work_rev + 75,144,055 calls + 82693994 total time + 1.1 mean time + + INSERT INTO release_contrib (creator_ident_id = DEFAULT) RETURNING * + 26,008,280 calls <= why so few? different query depending on number + of rows inserted + 18955782 total time + 0.728 mean time + + SELECT container_ident + 78,4143 calls + 17683156 total time + 22.55 mean time <= why so slow? + + INSERT INTO release_contrib + 15,072,820 calls + + INSERT INTO "release_contrib + + + relname | too_much_seq | case | rel_size | seq_scan | idx_scan +----------------------+--------------+----------------+--------------+----------+----------- + file_rev_url | 2391 | Missing Index? | 2091147264 | 2391 | 0 + file_release | -30670 | OK | 386899968 | 2 | 30672 + container_rev | -979948 | OK | 20242432 | 784146 | 1764094 + file_edit | -2206807 | OK | 619896832 | 6 | 2206813 + creator_edit | -2206810 | OK | 331079680 | 11 | 2206821 + work_edit | -2206811 | OK | 6996566016 | 14 | 2206825 + release_edit | -2206811 | OK | 6996582400 | 14 | 2206825 + container_edit | -2206816 | OK | 8216576 | 5 | 2206821 + changelog | -2209659 | OK | 129286144 | 10 | 2209669 + abstracts | -3486466 | OK | 1706237952 | 8 | 3486474 + release_rev_abstract | -4975493 | OK | 191602688 | 42919 | 5018412 + release_ref | -5032717 | OK | 170494861312 | 3 | 5032720 + release_contrib | -5032744 | OK | 20370251776 | 3 | 5032747 + creator_rev | -8400410 | OK | 353583104 | 1296507 | 9696917 + file_ident | -13483224 | OK | 522190848 | 7 | 13483231 + creator_ident | -16686744 | OK | 293625856 | 3 | 16686747 + file_rev | -32405557 | OK | 868515840 | 4 | 32405561 + container_ident | -69162337 | OK | 7028736 | 3 | 69162340 + work_rev | -150288161 | OK | 3328589824 | 1 | 150288162 + editgroup | -162783807 | OK | 146112512 | 9 | 162783816 + release_ident | -165676917 | OK | 6076841984 | 52 | 165676969 + work_ident | -229439828 | OK | 6066814976 | 3 | 229439831 + release_rev | -930140217 | OK | 43360542720 | 9 | 930140226 + +TODO changes: +- don't return all as often; in particular, inserting release_contrib, release_ref +x missing an index somewhere on file_rev_url, release_rev_abstract +x why so many seq_scan on container_rev, creator_rev + => running/EXPLAIN same query on psql hits index, not seq_scan + => seemed to be an issue with VALUE params getting sent separately; query + planner only looked at query and wasn't using index on ORCID/ISSN-L because + it didn't know those values were not-NULL? + => adding NOT NULL to query seems to have sped up case of there being a + "hit", but no hit still slow. 
might need to change indices or something for + the (perhaps common in future) case of DOI lookups with invalid DOIs (eg, + CORE import) + +random DEBUG queries: + + EXPLAIN ANALYSE SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_ident"."is_live" = true AND "creator_ident"."redirect_id" IS NULL LIMIT 1; + + EXPLAIN VERBOSE SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_ident"."is_live" = true AND "creator_ident"."redirect_id" IS NULL VALUES ('0000-0002-8867-1669') LIMIT 1; + + EXPLAIN SELECT "container_ident"."id", "container_ident"."is_live", "container_ident"."rev_id", "container_ident"."redirect_id", "container_rev"."id", "container_rev"."extra_json", "container_rev"."name", "container_rev"."publisher", "container_rev"."issnl", "container_rev"."wikidata_qid", "container_rev"."abbrev", "container_rev"."coden" FROM ("container_ident" INNER JOIN "container_rev" ON "container_ident"."rev_id" = "container_rev"."id") WHERE "container_rev"."issnl" = '0001-0782' AND "container_ident"."is_live" = true AND "container_ident"."redirect_id" IS NULL LIMIT 1; + + SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; diff --git a/notes/bootstrap/import_timing_20180908.txt b/notes/bootstrap/import_timing_20180908.txt new file mode 100644 index 00000000..3091e4fa --- /dev/null +++ b/notes/bootstrap/import_timing_20180908.txt @@ -0,0 +1,249 @@ + +Changes since last time: +- auto-accept flag (potentially no need to UPDATE/vacuum database) +- large CRUD code refactor (but not complete removal of api_server CRUD yet) +- removed "single query insert per entity", but added row-based inserts + +## Laptop Rough Timing + +HTTP-VERB branch 7354899493f6448bed5698ad6ade1dbebcf39379: +=> note: in this branch, importer is sitll creating editgroups in separate request + + time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv + real 0m28.779s + user 0m5.716s + sys 0m0.208s + + cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt + + 89.83user 4.58system 1:46.67elapsed 88%CPU (0avgtext+0avgdata 386184maxresident)k + 78632inputs+385408outputs (1major+579693minor)pagefaults 
0swaps + # 1:46.67 elapsed + + # running again + time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv + real 0m29.714s + user 0m6.048s + sys 0m0.212s + + cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt + 89.07user 4.52system 1:48.11elapsed 86%CPU (0avgtext+0avgdata 385940maxresident)k + 0inputs+377456outputs (0major+583532minor)pagefaults 0swaps + # 1:48.11 elapsed + +MASTER branch 0053d133f8ff96aa4dedc1ff7e2754812ddfc79a + + time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv + real 1m8.061s + user 0m7.384s + sys 0m0.280s + + cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt + 96.68user 5.25system 3:23.05elapsed 50%CPU (0avgtext+0avgdata 385516maxresident)k + 0inputs+389168outputs (0major+586810minor)pagefaults 0swaps + # 3:23.05 elapsed + +AUTO-ACCEPT branch 8cccbcdef11e7ddc761ec494cb894a8d49a0d510 + + time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv + real 1m7.757s + user 0m6.240s + sys 0m0.228s + + cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt + 91.73user 4.65system 3:26.61elapsed 46%CPU (0avgtext+0avgdata 385512maxresident)k + 0inputs+384976outputs (0major+580544minor)pagefaults 0swaps + +So, annecdotally, seems like autoaccept didn't do much here (not +vacuum-limited?), but the row inserts were more than a factor of 2x performance +improvement. Great! Could try experimenting with even larger batch sizes... + + +## Production Import + +http-verb branch 7354899493f6448bed5698ad6ade1dbebcf39379 + + time ./fatcat_import.py import-issn /srv/datasets/journal_extra_metadata.csv + + real 0m30.845s + user 0m8.884s + sys 0m0.312s + + + time parallel --bar --pipepart -j8 -a /srv/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid - + # TPS: 1181 + real 6m54.019s => down from 22m! + user 25m33.980s + sys 1m30.480s + + xzcat /srv/datasets/crossref-works.2018-01-21.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py import-crossref - /srv/datasets/20180216.ISSN-to-ISSN-L.txt + # fatcatd at 200% CPU (2 full cores); many PIDs/workers, but only one so busy (must be diesel/db driver?) + # parallel at ~80% + # postgres pretty busy; looks like doing PARSE on ever request? some idle in transaction + # postgres still does a vacuum to analyze; good! + # ~600-1000 TPS near start (large variance) + # left to run overnight... 
+ # slowed down to ~50-80 TPS about 10 hours later + # lots of IOWAIT + # only at 40309165 rows in the morning; this could take a long time + + # PostgreSQL 10.4 - wbgrp-svc500.us.archive.org - postgres@localhost:5432/postgres - Ref.: 2s + # Size: 153.71G - 1.90M/s | TPS: 77 + # Mem.: 68.70% - 22.60G/49.14G | IO Max: 20448/s + # Swap: 3.80% - 1.88G/50.00G | Read : 10.17M/s - 2603/s + # Load: 11.71 11.80 12.01 | Write: 4.00M/s - 1024/s + + 92530.17user 2891.76system 35:45:15elapsed 74%CPU (0avgtext+0avgdata 463520maxresident)k + 1093736inputs+302588448outputs (52568major+36405225minor)pagefaults 0swaps + # 35:45:15 elapsed + + time ./fatcat_import.py import-manifest /srv/datasets/idents_files_urls.sqlite + +## Perf Tweaks + + + + SELECT + relname, + seq_scan - idx_scan AS too_much_seq, + CASE + WHEN + seq_scan - coalesce(idx_scan, 0) > 0 + THEN + 'Missing Index?' + ELSE + 'OK' + END, + pg_relation_size(relname::regclass) AS rel_size, seq_scan, idx_scan + FROM + pg_stat_all_tables + WHERE + schemaname = 'public' + AND pg_relation_size(relname::regclass) > 80000 + ORDER BY + too_much_seq DESC; + + relname | too_much_seq | case | rel_size | seq_scan | idx_scan +----------------------+--------------+------+--------------+----------+----------- + file_rev_url | -1 | OK | 163323904 | 2 | 3 + file_release | -3 | OK | 24461312 | 2 | 5 + release_edit | -10265 | OK | 6080495616 | 2 | 10267 + container_edit | -10265 | OK | 31170560 | 2 | 10267 + work_edit | -10265 | OK | 6080364544 | 2 | 10267 + file_edit | -10265 | OK | 49111040 | 2 | 10267 + creator_edit | -10265 | OK | 330924032 | 2 | 10267 + changelog | -11692 | OK | 106668032 | 2 | 11694 + release_ref | -374197 | OK | 125437673472 | 3 | 374200 + release_contrib | -374202 | OK | 16835354624 | 3 | 374205 + release_rev_abstract | -374919 | OK | 162250752 | 3 | 374922 + file_ident | -1047172 | OK | 41926656 | 2 | 1047174 + container_rev | -1378943 | OK | 50356224 | 724612 | 2103555 <= + file_rev | -2407647 | OK | 68493312 | 4 | 2407651 + abstracts | -2765610 | OK | 1450901504 | 1 | 2765611 + creator_ident | -7127467 | OK | 242688000 | 2 | 7127469 + creator_rev | -7943007 | OK | 353370112 | 839155 | 8782162 <= + container_ident | -57488245 | OK | 23060480 | 2 | 57488247 + release_ident | -66085389 | OK | 4459159552 | 14 | 66085403 + work_rev | -130613391 | OK | 2892775424 | 1 | 130613392 + work_ident | -130633923 | OK | 4459192320 | 2 | 130633925 + editgroup | -136775970 | OK | 120561664 | 1 | 136775971 + release_rev | -718337215 | OK | 36850507776 | 8 | 718337223 + + +Slowest queries: + + SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3 + + SELECT "container_ident"."id", "container_ident"."is_live", "container_ident"."rev_id", "container_ident"."redirect_id", "container_rev"."id", "container_rev"."extra_json", "container_rev"."name", "container_rev"."publisher", "container_rev"."issnl", "container_rev"."wikidata_qid", "container_rev"."abbrev", "container_rev"."coden" FROM ("container_ident" INNER JOIN "container_rev" ON "container_ident"."rev_id" = 
"container_rev"."id") WHERE "container_rev"."issnl" = $1 AND "container_rev"."issnl" IS NOT NULL AND "container_ident"."is_live" = $2 AND "container_ident"."redirect_id" IS NULL LIMIT $3 + + SELECT $2 FROM ONLY "public"."release_rev" x WHERE "id" OPERATOR(pg_catalog.=) $1 FOR KEY SHARE OF x:...skipping... + + SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3 + + +DEBUG: (while a file import is running) + +Creator lookup where row exists: + + time curl localhost:9411/v0/creator/lookup?orcid=0000-0002-8867-1663 + real 0m0.020s + user 0m0.004s + sys 0m0.004s + + SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; + => (1 row) + Time: 0.988 ms + +Creator lookup where row doesn't exist: + + bnewbold@wbgrp-svc500$ time curl localhost:9411/v0/creator/lookup?orcid=0000-0002-8867-166X + {"message":"Not found: 0000-0002-8867-166X"} + real 0m1.282s + user 0m0.008s + sys 0m0.000s + Sep 10 21:50:49 wbgrp-svc500.us.archive.org fatcat-api[25327]: Sep 10 21:50:49.231 INFO GET http://localhost:9411/v0/creator/lookup?orcid=0000-0002-8867-166X 404 Not Found (1282 ms) + + SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-166X' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; + => (0 rows) + Time: 0.810 ms + +------- + + SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; + + SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM 
("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; + SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-166X' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; + +from logs: + + execute __diesel_stmt_5: SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3 + +NOTE: have been doing *full* postgres logs this whole time! probably a ton of disk churn. :( + + + exact logs: + SET TIME ZONE 'UTC' + SET CLIENT_ENCODING TO 'UTF8' + SELECT 1 + SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3 + parameters: $1 = '0000-0002-8867-166X', $2 = 't', $3 = '1' + + SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-166X' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT '1'; + +CHANGES: +- restart fatcat-api +- restart postgresql +- select pg_stat_reset(); + +seems like the fetch involves a index scans on creator_rev and creator_ident, *and* a scan on creator_rev. 
+ + + table_name | table_size | indexes_size | total_size +--------------------------------------------------------------+------------+--------------+------------ + "public"."release_ref" | 117 GB | 34 GB | 151 GB + "public"."release_rev" | 34 GB | 9202 MB | 43 GB + "public"."release_contrib" | 16 GB | 17 GB | 33 GB + "public"."work_edit" | 5800 MB | 4033 MB | 9833 MB + "public"."release_edit" | 5800 MB | 4032 MB | 9833 MB + "public"."work_ident" | 4254 MB | 5102 MB | 9356 MB + "public"."release_ident" | 4254 MB | 5095 MB | 9349 MB + "public"."work_rev" | 2759 MB | 2545 MB | 5304 MB + "public"."abstracts" | 1417 MB | 114 MB | 1530 MB + "public"."creator_rev" | 337 MB | 277 MB | 614 MB + "public"."creator_edit" | 316 MB | 221 MB | 536 MB + "public"."creator_ident" | 232 MB | 279 MB | 511 MB + "public"."file_rev_url" | 260 MB | 101 MB | 361 MB + "public"."release_rev_abstract" | 155 MB | 200 MB | 355 MB + "public"."file_rev" | 109 MB | 124 MB | 233 MB + "public"."changelog" | 102 MB | 106 MB | 208 MB + "public"."editgroup" | 116 MB | 69 MB | 184 MB + "public"."file_ident" | 66 MB | 70 MB | 136 MB + "public"."file_edit" | 78 MB | 53 MB | 131 MB + "public"."file_release" | 38 MB | 64 MB | 103 MB + "public"."container_rev" | 48 MB | 26 MB | 74 MB + "public"."container_edit" | 30 MB | 20 MB | 50 MB + "public"."container_ident" | 22 MB | 26 MB | 48 MB + +CHANGES: +- remove "IS NOT NULL" from creator_rev, that seemed to be a significant speedup for the "row not found" case. + diff --git a/notes/bootstrap/import_timing_20180910.txt b/notes/bootstrap/import_timing_20180910.txt new file mode 100644 index 00000000..43c76e43 --- /dev/null +++ b/notes/bootstrap/import_timing_20180910.txt @@ -0,0 +1,319 @@ + +Changes since last time: +- removed "WHERE IS NOT NULL" from lookup indexes +- some cleanups +- JSONB + +## QA Timing + +master branch b15fa552e288bec5bbc2b07a3e11bab9235a1e7c + + time ./fatcat_import.py import-issn /srv/datasets/journal_extra_metadata.csv # --batch-size 100 + real 0m30.249s + user 0m9.364s + sys 0m0.268s + + time parallel --bar --pipepart -j8 -a /srv/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid - # --batch-size 100 + => something went sort of wrong? seemed to hang, with no activity. + => may have sent a signal or something accidentally? + + xzcat /srv/datasets/crossref-works.2018-01-21.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py import-crossref - /srv/datasets/20180216.ISSN-to-ISSN-L.txt --batch-size 100 + => ran for a few seconds before restarting with increased batch size + => increased batch size seems to be too much for releases? stick with 50 + => still lots of long autovacuum (ANALYZE) + + 67017.73user 2054.96system 25:59:38elapsed 73%CPU (0avgtext+0avgdata 424692maxresident)k + 2085872inputs+219678920outputs (100457major+30837042minor)pagefaults 0swaps + # 25:59:38 elapsed + + time ./fatcat_import.py import-manifest /srv/datasets/idents_files_urls.sqlite # --batch-size 100 + => taking longer to start than I remembered... + + Done! 
Inserted 6607075 + real 1521m37.856s + user 305m38.444s + sys 18m13.916s + => 25.5 hours + +Disk/table usage: + + Size: 233.95G + + table_name | table_size | indexes_size | total_size + --------------------------------------------------------------+------------+--------------+------------ + "public"."release_ref" | 92 GB | 26 GB | 118 GB + "public"."release_rev" | 30 GB | 14 GB | 45 GB + "public"."release_contrib" | 12 GB | 13 GB | 26 GB + "public"."release_edit" | 4710 MB | 3353 MB | 8062 MB + "public"."work_edit" | 4710 MB | 3352 MB | 8062 MB + "public"."work_ident" | 3454 MB | 4153 MB | 7608 MB + "public"."release_ident" | 3454 MB | 4152 MB | 7606 MB + "public"."work_rev" | 2241 MB | 2076 MB | 4317 MB + "public"."file_rev_url" | 1994 MB | 778 MB | 2772 MB + "public"."file_rev" | 827 MB | 1215 MB | 2042 MB + "public"."file_ident" | 501 MB | 533 MB | 1034 MB + "public"."file_edit" | 590 MB | 411 MB | 1001 MB + "public"."creator_rev" | 371 MB | 455 MB | 826 MB + "public"."abstracts" | 649 MB | 52 MB | 701 MB + "public"."file_release" | 251 MB | 435 MB | 686 MB + "public"."creator_edit" | 347 MB | 242 MB | 590 MB + "public"."creator_ident" | 255 MB | 305 MB | 560 MB + "public"."release_rev_abstract" | 70 MB | 90 MB | 160 MB + "public"."changelog" | 52 MB | 53 MB | 106 MB + "public"."editgroup" | 60 MB | 35 MB | 94 MB + "public"."container_rev" | 40 MB | 27 MB | 67 MB + "public"."container_edit" | 24 MB | 16 MB | 39 MB + "public"."container_ident" | 17 MB | 20 MB | 37 MB + +I think this is before the JSONB schema change, so won't reflect actual/final. + +Perf: + + relname | too_much_seq | case | rel_size | seq_scan | idx_scan + ----------------------+--------------+------+-------------+----------+----------- + file_rev_url | -18 | OK | 2089836544 | 2 | 20 + file_release | -4001 | OK | 263249920 | 2 | 4003 + container_edit | -132140 | OK | 24608768 | 2 | 132142 + release_edit | -132140 | OK | 4937408512 | 2 | 132142 + work_edit | -132140 | OK | 4937351168 | 2 | 132142 + file_edit | -132140 | OK | 618545152 | 2 | 132142 + creator_edit | -132140 | OK | 363954176 | 2 | 132142 + changelog | -134456 | OK | 54919168 | 5 | 134461 + container_rev | -1276670 | OK | 41680896 | 592993 | 1869663 + abstracts | -1353093 | OK | 669425664 | 3 | 1353096 + release_ref | -3429019 | OK | 99223470080 | 3 | 3429022 + release_contrib | -3429026 | OK | 13406437376 | 3 | 3429029 + release_rev_abstract | -3435251 | OK | 73359360 | 3 | 3435254 + creator_ident | -5360858 | OK | 266977280 | 11 | 5360869 + creator_rev | -8071846 | OK | 388734976 | 257781 | 8329627 + file_ident | -13478348 | OK | 524746752 | 7 | 13478355 + file_rev | -30368193 | OK | 867401728 | 4 | 30368197 + container_ident | -47140767 | OK | 18219008 | 9 | 47140776 + release_ident | -61021070 | OK | 3620864000 | 24 | 61021094 + work_rev | -106056502 | OK | 2349146112 | 1 | 106056503 + work_ident | -106320780 | OK | 3620954112 | 6 | 106320786 + editgroup | -117742455 | OK | 62398464 | 2 | 117742457 + release_rev | -563415274 | OK | 32455860224 | 8 | 563415282 + +Thought I had resoled those scans... hrm. Pretty sure (but not 100%) that I +cleared the counters. + +Dumps! 
+ + time psql fatcat_prod < ../extra/quick_dump.sql + real 4m5.729s + user 0m0.056s + sys 0m0.016s + => while other stuff running + + time ./ident_table_snapshot.sh + => seems very dominated by tar/xz compression time; should switch back to gzip + + DATABASE_URL=$DATABASE_URL time ./ident_table_snapshot.sh /tmp + 614.80user 31.74system 13:23.51elapsed 80%CPU (0avgtext+0avgdata 10124maxresident)k + 8311456inputs+9761200outputs (2major+3792minor)pagefaults 0swaps + => 13.5 minutes + => fatcat_idents.2018-09-13.215147.r874951.tar.gz: 4.7GB + => uncompressed, 8.3GB total + + cat ../extra/sql_dumps/fatcat_ident_releases.tsv | time ./target/release/fatcat-export release -j8 -q | pv -l | gzip > fatcat_release_dump.json.gz + => can not run with more than -j8, it seems (postgres client limit) + => TPS: 12522 (!) + => 101k 0:00:34 [2.99k/s] + => estimate 6.5 hours + + # NOTE AFTER THE FACT: had "contaner" typo in the below, so wasn't expanding containers + cat ../extra/sql_dumps/fatcat_ident_releases.tsv | time ./target/release/fatcat-export release --expand files,contaner -j8 -q | pv -l | gzip > fatcat_release_dump_expanded.json.gz + => TPS: 29605 (!) + => 183k 0:00:35 [ 5.8k/s] (how is this faster than the above? other disk stuff finished?) + => 341k 0:01:07 [3.48k/s] (ah, must have been in disk cache) + => 4.12M 0:16:11 (averaging 4.2k/s; estimate 4 hours total) + => fatcat_release_dump_expanded.json.gz: 3.4G with 6.76M lines, so expecting 35GB or so + + ----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system-- + usr sys idl wai hiq siq| read writ| recv send| in out | int csw + 14 3 73 8 0 2| 43M 21M| 143k 326B| 0 0 | 64k 113k + 18 4 72 5 0 2| 55M 4096k| 246k 326B| 0 0 | 84k 157k + 17 4 72 6 0 2| 53M 9964k| 428k 908B| 0 0 | 84k 155k + 14 3 73 8 0 1| 46M 20M| 143k 87k| 0 0 | 71k 132k + 18 3 72 5 0 2| 52M 8880k| 318k 95k| 0 0 | 82k 149k + + => fatcat-export using about 200% CPU in a single thread; other 8 threads at about 25% each + => might be interesting to profile... too many copies? switch to tsv library? + => seems like postgres only about 50% utilized + +Benchmarks! + + # stub entity + $ wrk -t8 -c100 -d30s http://localhost:9411/v0/work/aaaaaaaaaaaaavkvaaaaaaaaai + Running 30s test @ http://localhost:9411/v0/work/aaaaaaaaaaaaavkvaaaaaaaaai + 8 threads and 100 connections + Thread Stats Avg Stdev Max +/- Stdev + Latency 7.44ms 5.26ms 67.53ms 60.07% + Req/Sec 1.71k 113.55 2.07k 68.79% + 408986 requests in 30.01s, 117.01MB read + Requests/sec: 13627.62 + Transfer/sec: 3.90MB + => TPS: 39958 + => lots of "BEGIN" and "SELECT 1", implying new connections? hrm. + => should run longer for more stable results + + wrk -t12 -c400 -d2m --latency http://localhost:9411/v0/work/aaaaaaaaaaaaavkvaaaaaaaaai + + Running 2m test @ http://localhost:9411/v0/work/aaaaaaaaaaaaavkvaaaaaaaaai + 12 threads and 400 connections + Thread Stats Avg Stdev Max +/- Stdev + Latency 25.75ms 17.02ms 263.74ms 65.11% + Req/Sec 1.05k 377.38 1.82k 74.81% + Latency Distribution + 50% 25.40ms + 75% 26.56ms + 90% 49.57ms + 99% 76.47ms + 1128253 requests in 2.00m, 322.80MB read + Socket errors: connect 0, read 10, write 0, timeout 0 + Requests/sec: 9396.03 + Transfer/sec: 2.69MB + + => single 470% fatcatd thread; diesel? 
+ => 50% systemd-journald + + # rich entity + $ wrk -t8 -c100 -d30s https://api.qa.fatcat.wiki/v0/release/aaaaaaaaaaaaarceaaaaaaaaam?expand=files,container + Running 30s test @ https://api.qa.fatcat.wiki/v0/release/aaaaaaaaaaaaarceaaaaaaaaam?expand=files,container + 8 threads and 100 connections + Thread Stats Avg Stdev Max +/- Stdev + Latency 33.35ms 14.01ms 152.05ms 79.98% + Req/Sec 363.14 30.36 464.00 76.48% + 86719 requests in 30.04s, 234.46MB read + Requests/sec: 2886.55 + Transfer/sec: 7.80MB + + wrk -t12 -c400 -d2m --latency https://api.qa.fatcat.wiki/v0/release/aaaaaaaaaaaaarceaaaaaaaaam?expand=files,container + + Running 2m test @ https://api.qa.fatcat.wiki/v0/release/aaaaaaaaaaaaarceaaaaaaaaam?expand=files,container + 12 threads and 400 connections + Thread Stats Avg Stdev Max +/- Stdev + Latency 149.19ms 125.30ms 1.93s 95.06% + Req/Sec 237.89 34.60 650.00 73.54% + Latency Distribution + 50% 129.86ms + 75% 134.14ms + 90% 210.42ms + 99% 904.26ms + 340625 requests in 2.00m, 0.90GB read + Socket errors: connect 0, read 0, write 0, timeout 303 + Requests/sec: 2836.17 + Transfer/sec: 7.67MB + + => TPS: 9090 + ----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system-- + usr sys idl wai hiq siq| read writ| recv send| in out | int csw + 4 0 93 3 0 0|2401k 13M| 0 0 |2317B 4812B|4274 3824 + 22 6 68 0 0 3|4096B 0 | 57k 956B|4096B 0 | 135k 255k + 21 7 68 0 0 3|4096B 0 | 61k 66B|4096B 0 | 137k 259k + 21 7 68 0 0 3| 16k 8192B| 179k 636B| 16k 0 | 134k 252k + 21 7 68 0 0 3| 0 0 | 53k 66B| 0 0 | 135k 255k + 22 6 68 0 0 3| 0 17M| 51k 326B| 0 0 | 137k 261k + 22 6 68 0 0 3| 0 0 | 82k 903B| 0 0 | 136k 257k + + # static (404) + wrk -t12 -c400 -d2m --latency http://localhost:9411/v0/asdf + => full 2600+% CPU in a single fatcat process (?) + => lots of load in wrk also + => system at a whole basically 100% CPU on all cores, though mostly kernel + + ----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system-- + usr sys idl wai hiq siq| read writ| recv send| in out | int csw + 4 0 93 3 0 0|2399k 13M| 0 0 |2315B 4808B|4359 3976 + 26 66 2 0 0 6| 0 0 |4191k 956B| 0 0 | 465k 536k + 26 67 1 0 0 6| 0 0 | 511k 66B| 0 0 | 466k 543k + 27 64 3 0 0 6| 0 0 |2648k 318B| 0 0 | 481k 555k + 25 67 2 0 0 6| 0 0 |1909k 768B| 0 0 | 481k 550k + + Running 2m test @ http://localhost:9411/v0/asdf + 12 threads and 400 connections + Thread Stats Avg Stdev Max +/- Stdev + Latency 1.40ms 1.65ms 219.48ms 93.33% + Req/Sec 17.71k 8.97k 65.69k 63.84% + Latency Distribution + 50% 1.13ms + 75% 1.89ms + 90% 2.71ms + 99% 4.76ms + 21145472 requests in 2.00m, 1.61GB read + Socket errors: connect 0, read 12, write 0, timeout 0 + Non-2xx or 3xx responses: 21145472 + Requests/sec: 176071.79 + Transfer/sec: 13.77MB + +Noting pretty non-trival traffic to 207.241.229.230 and +es-worker-02.us.archive.org (207.241.229.246)... probably because this is a +datanode? + +## Misc + +Postgres really wanted to log: + + ALTER DATABASE fatcat_prod SET log_statement = 'ddl'; + ALTER USER fatcat SET log_statement = 'ddl'; + + +## Later Imports + + zcat /srv/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched - + => only ~40 TPS + => looks like almost all selects... probably skipping? huge postgres CPU churn, huh. missing index? 
+ + relname | too_much_seq | case | rel_size | seq_scan | idx_scan + ----------------------+--------------+----------------+-------------+----------+---------- + file_rev | -5343173 | OK | 867483648 | 693 | 5343866 + + EXPLAIN SELECT "file_ident"."id", "file_ident"."is_live", "file_ident"."rev_id", "file_ident"."redirect_id", "file_rev"."id", "file_rev"."extra_json", "file_rev"."size", "file_rev"."sha1", "file_rev"."sha256", "file_rev"."md5", "file_rev"."mimetype" FROM ("file_ident" INNER JOIN "file_rev" ON "file_ident"."rev_id" = "file_rev"."id") WHERE "file_rev"."sha1" = '7d97e98f8af710c7e7fe703abc8f639e0ee507c4' AND "file_rev"."sha1" IS NOT NULL AND "file_ident"."is_live" = 't' AND "file_ident"."redirect_id" IS NULL LIMIT 1; + +Ugh, this whole cycle again. Query only takes 1ms for a no-hit, or 5ms for success. + + http get http://localhost:9411/v0/file/lookup?sha1=d77cb2a8b207507e0df27ba1393e8118eec9217f => not found + http get http://localhost:9411/v0/file/lookup?sha1=d6c3f6ca785ee63293da02615c8495c5cd25fa25 => found + => but no releases? + +SET auto_explain.log_min_duration = 0; +SET auto_explain.log_analyze = true; +ALTER USER fatcat SET log_statement = 'all'; +ALTER USER fatcat SET auto_explain.log_min_duration = 0; +ALTER USER fatcat SET auto_explain.log_analyze = true;; +sudo service fatcat-api restart + + +2018-09-16 00:42:29.883 UTC [13868] fatcat@fatcat_prod LOG: duration: 3028.143 ms plan: + Query Text: SELECT "file_ident"."id", "file_ident"."is_live", "file_ident"."rev_id", "file_ident"."redirect_id", "file_rev"."id", "file_rev"."extra_json", "file_rev"."size", "file_rev"."sha1", "file_rev"."sha256", "file_rev"."md5", "file_rev"."mimetype" FROM ("file_ident" INNER JOIN "file_rev" ON "file_ident"."rev_id" = "file_rev"."id") WHERE "file_rev"."sha1" = $1 AND "file_rev"."sha1" IS NOT NULL AND "file_ident"."is_live" = $2 AND "file_ident"."redirect_id" IS NULL LIMIT $3 + Limit (cost=0.43..8.56 rows=1 width=454) (actual time=3028.139..3028.139 rows=0 loops=1) + -> Nested Loop (cost=0.43..268756.48 rows=33056 width=454) (actual time=3028.138..3028.138 rows=0 loops=1) + -> Seq Scan on file_rev (cost=0.00..205023.16 rows=33042 width=405) (actual time=3028.137..3028.137 rows=0 loops=1) + Filter: ((sha1 IS NOT NULL) AND ((sha1)::text = 'd68d4e2cc420647109cdfe410efed86cd64465e5'::text)) + Rows Removed by Filter: 6607786 + -> Index Scan using file_ident_rev_idx on file_ident (cost=0.43..1.92 rows=1 width=49) (never executed) + Index Cond: (rev_id = file_rev.id) + Filter: (is_live AND (redirect_id IS NULL)) + + +fatcat_prod=# EXPLAIN SELECT "file_ident"."id", "file_ident"."is_live", "file_ident"."rev_id", "file_ident"."redirect_id", "file_rev"."id", "file_rev"."extra_json", "file_rev"."size", "file_rev"."sha1", "file_rev"."sha256", "file_rev"."md5", "file_rev"."mimetype" FROM ("file_ident" INNER JOIN "file_rev" ON "file_ident"."rev_id" = "file_rev"."id") WHERE "file_rev"."sha1" = '3f242a192acc258bdfdb15194341943222222222222440c313' AND "file_rev"."sha1" IS NOT NULL AND "file_ident"."is_live" = 't' AND "file_ident"."redirect_id" IS NULL LIMIT 1; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------- + Limit (cost=0.99..5.04 rows=1 width=454) + -> Nested Loop (cost=0.99..5.04 rows=1 width=454) + -> Index Scan using file_rev_sha1_idx on file_rev (cost=0.56..2.58 rows=1 width=405) + Index Cond: ((sha1 IS NOT NULL) AND (sha1 = '3f242a192acc258bdfdb15194341943222222222222440c313'::bpchar)) + -> Index Scan using 
file_ident_rev_idx on file_ident (cost=0.43..2.45 rows=1 width=49) + Index Cond: (rev_id = file_rev.id) + Filter: (is_live AND (redirect_id IS NULL)) + +After all this debugging, asked on IRC and learned that I really shouldn't ever +use CHAR() (or even VARCHAR()). Will update the schema. + +## Investigating Missing Rows + + fatcat_prod=# select count(*) from release_ident; => 53028167 + +Thought I had more than that previously? Table sizes were also bigger. Hrm. + diff --git a/notes/bootstrap/import_timing_20180920.txt b/notes/bootstrap/import_timing_20180920.txt new file mode 100644 index 00000000..a57ffd77 --- /dev/null +++ b/notes/bootstrap/import_timing_20180920.txt @@ -0,0 +1,96 @@ + +This was on fatcat-prod-vm (2TB disk). + + time ./fatcat_import.py import-issn /srv/fatcat/datasets/journal_extra_metadata.csv + + Processed 53300 lines, inserted 53283, updated 0. + real 0m32.463s + user 0m8.716s + sys 0m0.284s + + time parallel --bar --pipepart -j8 -a /srv/fatcat/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid - + + Processed 48900 lines, inserted 48731, updated 0. <= these numbers times 80x + 100% 80:0=0s + + real 10m20.598s + user 26m16.544s + sys 1m40.284s + + time xzcat /srv/fatcat/datasets/crossref-works.2018-01-21.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py import-crossref - /srv/fatcat/datasets/20180216.ISSN-to-ISSN-L.txt /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3 + + Processed 4679900 lines, inserted 3755867, updated 0. + 107730.08user 4110.22system 16:31:25elapsed 188%CPU (0avgtext+0avgdata 447496maxresident)k + 77644160inputs+361948352outputs (105major+49094767minor)pagefaults 0swaps + + => 16.5 hours, faster! + + select count(id) from release_ident; => 75106713 + + kernel/system crashed after first file import (!), so don't have numbers from that. 
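+
+One way to gauge what the interrupted file import left behind (a sketch; per the
+later notes, edits that were never accepted show up as is_live = false on the
+ident rows):
+
+    SELECT count(*) FROM file_ident;
+    SELECT count(*) FROM file_ident WHERE is_live = 'f';
+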
+ +Table sizes at this point: + + select count(id) from file_ident; => 6334606 + + Size: 389.25G + + table_name | table_size | indexes_size | total_size +--------------------------------------------------------------+------------+--------------+------------ + "public"."release_ref" | 170 GB | 47 GB | 217 GB + "public"."release_rev" | 44 GB | 21 GB | 65 GB + "public"."release_contrib" | 19 GB | 20 GB | 39 GB + "public"."release_edit" | 6671 MB | 6505 MB | 13 GB + "public"."work_edit" | 6671 MB | 6505 MB | 13 GB + "public"."release_ident" | 4892 MB | 5875 MB | 11 GB + "public"."work_ident" | 4892 MB | 5874 MB | 11 GB + "public"."work_rev" | 3174 MB | 2936 MB | 6109 MB + "public"."file_rev_url" | 3634 MB | 1456 MB | 5090 MB + "public"."file_rev" | 792 MB | 1281 MB | 2073 MB + "public"."abstracts" | 1665 MB | 135 MB | 1800 MB + "public"."file_edit" | 565 MB | 561 MB | 1126 MB + "public"."file_release" | 380 MB | 666 MB | 1045 MB + "public"."file_ident" | 415 MB | 496 MB | 911 MB + "public"."creator_rev" | 371 MB | 457 MB | 828 MB + "public"."creator_edit" | 347 MB | 353 MB | 700 MB + "public"."creator_ident" | 255 MB | 305 MB | 559 MB + "public"."release_rev_abstract" | 183 MB | 237 MB | 421 MB + "public"."changelog" | 122 MB | 126 MB | 247 MB + "public"."editgroup" | 138 MB | 81 MB | 219 MB + "public"."container_rev" | 52 MB | 38 MB | 89 MB + "public"."container_edit" | 32 MB | 30 MB | 62 MB + "public"."container_ident" | 24 MB | 28 MB | 52 MB + + + relname | too_much_seq | case | rel_size | seq_scan | idx_scan +----------------------+--------------+------+--------------+----------+---------- + release_edit | 0 | OK | 6993084416 | 0 | 0 + container_rev | 0 | OK | 54124544 | 0 | 0 + creator_ident | 0 | OK | 266919936 | 0 | 0 + creator_edit | 0 | OK | 363921408 | 0 | 0 + work_rev | 0 | OK | 3327262720 | 0 | 0 + creator_rev | 0 | OK | 388726784 | 0 | 0 + work_ident | 0 | OK | 5128560640 | 0 | 0 + work_edit | 0 | OK | 6993108992 | 0 | 0 + container_ident | 0 | OK | 25092096 | 0 | 0 + file_edit | 0 | OK | 598278144 | 0 | 0 + container_edit | 0 | OK | 33857536 | 0 | 0 + changelog | 0 | OK | 127549440 | 0 | 0 + abstracts | -4714 | OK | 1706713088 | 0 | 4714 + file_release | -13583 | OK | 401752064 | 0 | 13583 + file_rev_url | -13583 | OK | 3832389632 | 0 | 13583 + editgroup | -74109 | OK | 144277504 | 0 | 74109 + release_contrib | -76699 | OK | 20357849088 | 0 | 76699 + release_ref | -76700 | OK | 183009157120 | 3 | 76703 + release_rev_abstract | -76939 | OK | 192102400 | 0 | 76939 + release_rev | -77965 | OK | 47602647040 | 0 | 77965 + file_ident | -100089 | OK | 438255616 | 3 | 100092 + release_ident | -152809 | OK | 5128617984 | 0 | 152809 + file_rev | -440780 | OK | 837705728 | 0 | 440780 +(23 rows) + +Continuing imports: + + zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched - + => HTTP response body: {"message":"duplicate key value violates unique constraint \"file_edit_editgroup_id_ident_id_key\""} + diff --git a/notes/bootstrap/import_timing_20180923.txt b/notes/bootstrap/import_timing_20180923.txt new file mode 100644 index 00000000..c7161842 --- /dev/null +++ b/notes/bootstrap/import_timing_20180923.txt @@ -0,0 +1,91 @@ + + 105595.18user 3903.65system 15:59:39elapsed 190%CPU (0avgtext+0avgdata 458836maxresident)k + 71022792inputs+327828472outputs (176major+31149593minor)pagefaults 0swaps + + real 959m39.521s + user 1845m10.392s + sys 70m33.780s + +Did I get the same error again? 
I'm confused: + + HTTP response body: {"message":"number of parameters must be between 0 and 65535\n"} + (but not in all threads) + +Yes, ugh, because 50*2500 can be over (it's not just individual large releases, +they come in big batches). + +But: + + select count(id) from release_ident; => 70006121 + +A lot, though not 72 million like last time, hrm. I'm... going to move ahead I +guess. + +"Processed 4440850 lines, inserted 3509600, updated 0." + => implies 79029915 records + + time zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched --no-file-update - + Processed 530750 lines, inserted 435239, updated 0. (etc) + Command exited with non-zero status 1 + 15121.47user 676.49system 2:23:52elapsed 183%CPU (0avgtext+0avgdata 70076maxresident)k + 127760inputs+3477184outputs (116major+475489minor)pagefaults 0swaps + + real 143m52.681s + user 252m31.620s + sys 11m21.608s + + zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched - + + Processed 485200 lines, inserted 244101, updated 168344. (etc) + 22671.44user 1069.84system 3:27:47elapsed 190%CPU (0avgtext+0avgdata 39348maxresident)k + 99672inputs+2497848outputs (109major+422150minor)pagefaults 0swaps + +fatcat-export dump: + + INFO 2018-09-25T10:01:06Z: fatcat_export: Done reading (70006121 lines), waiting for workers to exit... + 197GiB 4:56:17 [11.4MiB/s] [ <=> ] + +How big is everything? + + select count(*) from file_release; => 10,485,964 + select count (distinct target_release_ident_id) from file_release; => 6,486,934 + select count(id) from release_ident; => 70,006,121 + select count(*) from container_ident; => 354,793 + select count(*) from creator_ident; => 3,906,990 + Size: 324.24G + /dev/vda1 1.8T 511G 1.2T 31% / + + table_name | table_size | indexes_size | total_size +--------------------------------------------------------------+------------+--------------+------------ + "public"."release_ref" | 121 GB | 42 GB | 163 GB + "public"."release_rev" | 33 GB | 19 GB | 52 GB + "public"."release_contrib" | 21 GB | 18 GB | 39 GB + "public"."release_edit" | 6218 MB | 6084 MB | 12 GB + "public"."work_edit" | 6218 MB | 6084 MB | 12 GB + "public"."release_ident" | 4560 MB | 5470 MB | 10030 MB + "public"."work_ident" | 4560 MB | 5466 MB | 10027 MB + "public"."file_rev_url" | 5543 MB | 2112 MB | 7655 MB + "public"."work_rev" | 2958 MB | 2733 MB | 5691 MB + "public"."file_rev" | 1201 MB | 1811 MB | 3012 MB + "public"."abstracts" | 2294 MB | 184 MB | 2478 MB + "public"."file_edit" | 931 MB | 864 MB | 1795 MB + "public"."file_release" | 605 MB | 1058 MB | 1663 MB + "public"."file_ident" | 529 MB | 633 MB | 1162 MB + "public"."creator_rev" | 371 MB | 456 MB | 826 MB + "public"."creator_edit" | 347 MB | 352 MB | 699 MB + "public"."release_rev_abstract" | 250 MB | 325 MB | 575 MB + "public"."creator_ident" | 255 MB | 304 MB | 559 MB + "public"."changelog" | 122 MB | 127 MB | 250 MB + "public"."editgroup" | 138 MB | 82 MB | 220 MB + "public"."container_rev" | 52 MB | 38 MB | 89 MB + "public"."container_edit" | 32 MB | 30 MB | 62 MB + "public"."container_ident" | 24 MB | 28 MB | 52 MB + +Hrm, bunch of not-accepted containers: + + select count(*) from container_ident where is_live='f'; => 301507 + select count(*) from release_ident where is_live='f'; => 0 + select count(*) from work_ident where is_live='f'; => 0 + select count(*) from creator_ident 
where is_live='f'; => 1 (there was a hang earlier) + select count(*) from file_ident where is_live='f'; => 0 + diff --git a/notes/bootstrap/initial_sources.txt b/notes/bootstrap/initial_sources.txt new file mode 100644 index 00000000..cc22019d --- /dev/null +++ b/notes/bootstrap/initial_sources.txt @@ -0,0 +1,26 @@ + +Probably start with: + + crossref (including citations) + arxiv + medline + +then merge in: + + dblp + CORE + MSAG dump + VIAF + archive.org paper/url manifest + semantic scholar + oaDOI + +and later: + + wikidata + opencitations + openlibrary + +national libraries: + + http://www.dnb.de/EN/Service/DigitaleDienste/LinkedData/linkeddata_node.html diff --git a/notes/bootstrap/old_imports.txt b/notes/bootstrap/old_imports.txt new file mode 100644 index 00000000..1233d4a8 --- /dev/null +++ b/notes/bootstrap/old_imports.txt @@ -0,0 +1,20 @@ + +## ORCID + +Directly from compressed tarball; takes about 2 hours in production: + + tar xf /srv/datasets/public_profiles_API-2.0_2017_10_json.tar.gz -O | jq -c . | grep '"person":' | time parallel -j12 --pipe --round-robin ./fatcat_import.py import-orcid - + +After tuning database, `jq` CPU seems to be bottleneck, so, from pre-extracted +tarball: + + tar xf /srv/datasets/public_profiles_API-2.0_2017_10_json.tar.gz -O | jq -c . | rg '"person":' > /srv/datasets/public_profiles_1_2_json.all.json + time parallel --bar --pipepart -j8 -a /srv/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid - + +Does not work: + + ./fatcat_import.py import-orcid /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3/0000-0001-5115-8623.json + +Instead: + + cat /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3/0000-0001-5115-8623.json | jq -c . | ./fatcat_import.py import-orcid - diff --git a/notes/contrib_types.txt b/notes/contrib_types.txt deleted file mode 100644 index 01024b40..00000000 --- a/notes/contrib_types.txt +++ /dev/null @@ -1,14 +0,0 @@ - -See also: - -author -editor -translator -illustrator -interviewer -reviewer - - -container-author -container-editor -reviewed-author diff --git a/notes/import_timing_20180815.txt b/notes/import_timing_20180815.txt deleted file mode 100644 index 1206cc41..00000000 --- a/notes/import_timing_20180815.txt +++ /dev/null @@ -1,292 +0,0 @@ - -Schema changes since previous imports: -- more fields (identifiers+indexes) -- timestamps -- UUIDs more places -- fixed some crossref import bugs? -- abstracts -- file_urls as table (not single value) -- timestamps -- TEXT -> CHAR in a few places -- removed many work fields - -## Containers - -(python)webcrawl@wbgrp-svc500:/srv/fatcat/src/python$ time ./fatcat_import.py import-issn /srv/datasets/journal_extra_metadata.csv - -real 1m25.292s -user 0m12.640s -sys 0m0.412s - -## Creators - -time parallel --bar --pipepart -j8 -a /srv/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid - - -(times very invalid due to hangs; got 3537837 creators, which is most of the way, so *shrug*) -real 22m2.465s -user 26m41.924s -sys 1m33.844s - -## Releases - -xzcat /srv/datasets/crossref-works.2018-01-21.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py import-crossref - /srv/datasets/20180216.ISSN-to-ISSN-L.txt - - 128516.30 user - 3905.14 system - 44:17:05 elapsed - 83% CPU - -Almost 44 hours... I think I remember more like 36 hours last time? Things -slowed down a lot towards the end, many more ORCID cross-references? - -looking in htop, postgres seems to be primary bottleneck. 
At something like 12 -hours in, had 44 million release_ident rows, which is 1000/second. - -Note: seems like the more frequently `count(*)` is run, the more performant. -Because in-memory? - - 2018-08-16 16:54:16.977 UTC [17996] postgres@fatcat_prod LOG: duration: 42949.549 ms statement: select count(id) from release_ident; - - fatcat_prod=# select count(*) from release_ident; - count - ---------- - 44185608 - (1 row) - - Time: 2753.916 ms (00:02.754) - fatcat_prod=# select count(*) from release_ident; - count - ---------- - 44187937 - (1 row) - - Time: 2711.670 ms (00:02.712) - -As expected, autovacuum very busy. Only ~150 TPS; but that includes batch -writes? 75061172 rows. - -## Files - - time ./fatcat_import.py import-manifest /srv/datasets/idents_files_urls.sqlite - - Done! Inserted 6607075 - - real 2152m28.822s => 36 hours (!) - user 401m46.464s - sys 21m45.724s - - -Going pretty slow, < 100 transactions/sec. Lots of SELECTs, which seem slow, on the abstract table? - - SELECT "release_rev_abstract"."id", "release_rev_abstract"."release_rev", "release_rev_abstract"."abstract_sha1", "release_rev_abstract"."mimetype", "release_rev_abstract"."lang", "abstracts"."sha1", "abstracts"."content" FROM ("release_rev_abstract" INNER JOIN "abstracts" ON "release_rev_abstract"."abstract_sha1" = "abstracts"."sha1") WHERE "release_rev_abstract"."release_rev" = 'ffffffc0-4dd2-47ce-a51d-44051f3699ce'; - -Created index: - - CREATE INDEX release_rev_abstract_rev_idx ON release_rev_abstract(release_rev); - -... and things sped way up. Re-ran some crossref imports to EXPLAIN and didn't -see non-indexed queries. Maybe an ANALYZE does need to happen? - -This being single-threaded is going to be a problem in the future. ~50 million -files would be ~2 weeks. - -## Post-Import Status - - Size: 358.89G (postgres self-reported) - Mem.: 57.10% - 16.85G/49.14G - -Was 184G last time in late June; doubled in size (!). 
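
One way to get a single "self-reported" figure like the one above (a minimal
sketch, assuming the fatcat_prod database name used elsewhere in these notes):

    SELECT pg_size_pretty(pg_database_size('fatcat_prod'));

The filesystem-level numbers below also count WAL and anything else in the
cluster, so they read a bit higher.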
- - bnewbold@wbgrp-svc500$ df -h / - Filesystem Size Used Avail Use% Mounted on - /dev/vda1 858G 529G 286G 65% / - - bnewbold@wbgrp-svc500$ sudo du -sh /var/lib/postgresql/ /srv/datasets/ /srv/elastic-blah/ - 361G /var/lib/postgresql/ - 83G /srv/datasets/ - 77G /srv/elastic-blah/ - - fatcat_prod=# select count(*) from changelog; => 2,085,067 - - SELECT - table_name, - pg_size_pretty(table_size) AS table_size, - pg_size_pretty(indexes_size) AS indexes_size, - pg_size_pretty(total_size) AS total_size - FROM ( - SELECT - table_name, - pg_table_size(table_name) AS table_size, - pg_indexes_size(table_name) AS indexes_size, - pg_total_relation_size(table_name) AS total_size - FROM ( - SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name - FROM information_schema.tables - ) AS all_tables - ORDER BY total_size DESC - ) AS pretty_sizes; - - table_name | table_size | indexes_size | total_size ---------------------------------------------------------------+------------+--------------+------------ - "public"."release_ref" | 159 GB | 47 GB | 206 GB - "public"."release_rev" | 40 GB | 10 GB | 51 GB - "public"."release_contrib" | 19 GB | 20 GB | 39 GB - "public"."release_ident" | 5797 MB | 6597 MB | 12 GB - "public"."work_ident" | 5787 MB | 6394 MB | 12 GB - "public"."release_edit" | 6674 MB | 4646 MB | 11 GB - "public"."work_edit" | 6674 MB | 4646 MB | 11 GB - "public"."work_rev" | 3175 MB | 2939 MB | 6114 MB - "public"."file_rev_url" | 1995 MB | 275 MB | 2270 MB - "public"."abstracts" | 1665 MB | 135 MB | 1800 MB - "public"."file_rev" | 829 MB | 954 MB | 1783 MB - "public"."file_ident" | 498 MB | 532 MB | 1030 MB - "public"."file_release" | 369 MB | 642 MB | 1011 MB - "public"."file_edit" | 591 MB | 410 MB | 1002 MB - "public"."creator_rev" | 337 MB | 318 MB | 655 MB - "public"."creator_ident" | 280 MB | 297 MB | 577 MB - "public"."creator_edit" | 316 MB | 220 MB | 536 MB - "public"."release_rev_abstract" | 183 MB | 84 MB | 267 MB - "public"."changelog" | 123 MB | 125 MB | 249 MB - "public"."editgroup" | 139 MB | 81 MB | 220 MB - "public"."container_rev" | 19 MB | 6912 kB | 26 MB - "public"."container_ident" | 6896 kB | 7016 kB | 14 MB - "public"."container_edit" | 8056 kB | 5240 kB | 13 MB - -In context, the full uncompressed crossref 2018-01-21 dump is about 285 GB. - -For many of these indexes, and the _ident tables, switching from UUID to -BIGSERIAL would half the size. - -## Exports - - time ./fatcat_export.py changelog - | pv -l | wc - - As of: - - 159k 1:17:35 [34.3 /s] - 159,740 lines - 2,427,277,881 chars (bytes; 2.4GB) - - real 77m35.183s - user 15m36.208s - sys 0m31.484s - -Running at about 100/sec; estimate 6 hours for completion. Could shard using -start/end flags, but am not here. - -Running `quick_dump.sql` (identifier tables, in a transaction): - - 251M Aug 19 23:08 fatcat_ident_creators.tsv - 5.9M Aug 19 23:08 fatcat_ident_containers.tsv - 467M Aug 19 23:08 fatcat_ident_files.tsv - 5.2G Aug 19 23:10 fatcat_ident_releases.tsv - 5.2G Aug 19 23:11 fatcat_ident_works.tsv - 12K Aug 19 23:11 . - 1.8G Aug 19 23:12 fatcat_abstracts.json - -Work and Release tables in under 2 minutes each; say 5 minutes total. - - time ./fatcat_export.py releases /tmp/fatcat_ident_releases.tsv - | pv -l | wc - - 172k 1:07:08 [42.7 /s] - 172181 lines - 1,118,166,293 chars (bytes; 1.1 GB) - - real 67m8.340s - user 10m21.988s - sys 0m34.612s - -Running at only 10/sec or so, this would take forever even if sharded. :( - -Both exports/dumps are running in parallel. 
"Expand" queries might help with speed? - -## Postgres Analysis - -SELECT * -FROM - pg_stat_statements -ORDER BY - total_time DESC LIMIT 5; - -Summary: - - SELECT "creator_ident" by ORCID - 1,295,864 calls - 930,305,208 total time - 717.9 mean time <= this should be less than a ms! - - INSERT INTO release_rev - 75144055 calls - 111470961 total time - 1.483 mean time - - INSERT INTO work_rev - 75,144,055 calls - 82693994 total time - 1.1 mean time - - INSERT INTO release_contrib (creator_ident_id = DEFAULT) RETURNING * - 26,008,280 calls <= why so few? different query depending on number - of rows inserted - 18955782 total time - 0.728 mean time - - SELECT container_ident - 78,4143 calls - 17683156 total time - 22.55 mean time <= why so slow? - - INSERT INTO release_contrib - 15,072,820 calls - - INSERT INTO "release_contrib - - - relname | too_much_seq | case | rel_size | seq_scan | idx_scan -----------------------+--------------+----------------+--------------+----------+----------- - file_rev_url | 2391 | Missing Index? | 2091147264 | 2391 | 0 - file_release | -30670 | OK | 386899968 | 2 | 30672 - container_rev | -979948 | OK | 20242432 | 784146 | 1764094 - file_edit | -2206807 | OK | 619896832 | 6 | 2206813 - creator_edit | -2206810 | OK | 331079680 | 11 | 2206821 - work_edit | -2206811 | OK | 6996566016 | 14 | 2206825 - release_edit | -2206811 | OK | 6996582400 | 14 | 2206825 - container_edit | -2206816 | OK | 8216576 | 5 | 2206821 - changelog | -2209659 | OK | 129286144 | 10 | 2209669 - abstracts | -3486466 | OK | 1706237952 | 8 | 3486474 - release_rev_abstract | -4975493 | OK | 191602688 | 42919 | 5018412 - release_ref | -5032717 | OK | 170494861312 | 3 | 5032720 - release_contrib | -5032744 | OK | 20370251776 | 3 | 5032747 - creator_rev | -8400410 | OK | 353583104 | 1296507 | 9696917 - file_ident | -13483224 | OK | 522190848 | 7 | 13483231 - creator_ident | -16686744 | OK | 293625856 | 3 | 16686747 - file_rev | -32405557 | OK | 868515840 | 4 | 32405561 - container_ident | -69162337 | OK | 7028736 | 3 | 69162340 - work_rev | -150288161 | OK | 3328589824 | 1 | 150288162 - editgroup | -162783807 | OK | 146112512 | 9 | 162783816 - release_ident | -165676917 | OK | 6076841984 | 52 | 165676969 - work_ident | -229439828 | OK | 6066814976 | 3 | 229439831 - release_rev | -930140217 | OK | 43360542720 | 9 | 930140226 - -TODO changes: -- don't return all as often; in particular, inserting release_contrib, release_ref -x missing an index somewhere on file_rev_url, release_rev_abstract -x why so many seq_scan on container_rev, creator_rev - => running/EXPLAIN same query on psql hits index, not seq_scan - => seemed to be an issue with VALUE params getting sent separately; query - planner only looked at query and wasn't using index on ORCID/ISSN-L because - it didn't know those values were not-NULL? - => adding NOT NULL to query seems to have sped up case of there being a - "hit", but no hit still slow. 
might need to change indices or something for - the (perhaps common in future) case of DOI lookups with invalid DOIs (eg, - CORE import) - -random DEBUG queries: - - EXPLAIN ANALYSE SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_ident"."is_live" = true AND "creator_ident"."redirect_id" IS NULL LIMIT 1; - - EXPLAIN VERBOSE SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_ident"."is_live" = true AND "creator_ident"."redirect_id" IS NULL VALUES ('0000-0002-8867-1669') LIMIT 1; - - EXPLAIN SELECT "container_ident"."id", "container_ident"."is_live", "container_ident"."rev_id", "container_ident"."redirect_id", "container_rev"."id", "container_rev"."extra_json", "container_rev"."name", "container_rev"."publisher", "container_rev"."issnl", "container_rev"."wikidata_qid", "container_rev"."abbrev", "container_rev"."coden" FROM ("container_ident" INNER JOIN "container_rev" ON "container_ident"."rev_id" = "container_rev"."id") WHERE "container_rev"."issnl" = '0001-0782' AND "container_ident"."is_live" = true AND "container_ident"."redirect_id" IS NULL LIMIT 1; - - SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; diff --git a/notes/import_timing_20180908.txt b/notes/import_timing_20180908.txt deleted file mode 100644 index 3091e4fa..00000000 --- a/notes/import_timing_20180908.txt +++ /dev/null @@ -1,249 +0,0 @@ - -Changes since last time: -- auto-accept flag (potentially no need to UPDATE/vacuum database) -- large CRUD code refactor (but not complete removal of api_server CRUD yet) -- removed "single query insert per entity", but added row-based inserts - -## Laptop Rough Timing - -HTTP-VERB branch 7354899493f6448bed5698ad6ade1dbebcf39379: -=> note: in this branch, importer is sitll creating editgroups in separate request - - time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv - real 0m28.779s - user 0m5.716s - sys 0m0.208s - - cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt - - 89.83user 4.58system 1:46.67elapsed 88%CPU (0avgtext+0avgdata 386184maxresident)k - 78632inputs+385408outputs (1major+579693minor)pagefaults 0swaps - # 1:46.67 elapsed 
- - # running again - time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv - real 0m29.714s - user 0m6.048s - sys 0m0.212s - - cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt - 89.07user 4.52system 1:48.11elapsed 86%CPU (0avgtext+0avgdata 385940maxresident)k - 0inputs+377456outputs (0major+583532minor)pagefaults 0swaps - # 1:48.11 elapsed - -MASTER branch 0053d133f8ff96aa4dedc1ff7e2754812ddfc79a - - time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv - real 1m8.061s - user 0m7.384s - sys 0m0.280s - - cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt - 96.68user 5.25system 3:23.05elapsed 50%CPU (0avgtext+0avgdata 385516maxresident)k - 0inputs+389168outputs (0major+586810minor)pagefaults 0swaps - # 3:23.05 elapsed - -AUTO-ACCEPT branch 8cccbcdef11e7ddc761ec494cb894a8d49a0d510 - - time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv - real 1m7.757s - user 0m6.240s - sys 0m0.228s - - cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt - 91.73user 4.65system 3:26.61elapsed 46%CPU (0avgtext+0avgdata 385512maxresident)k - 0inputs+384976outputs (0major+580544minor)pagefaults 0swaps - -So, annecdotally, seems like autoaccept didn't do much here (not -vacuum-limited?), but the row inserts were more than a factor of 2x performance -improvement. Great! Could try experimenting with even larger batch sizes... - - -## Production Import - -http-verb branch 7354899493f6448bed5698ad6ade1dbebcf39379 - - time ./fatcat_import.py import-issn /srv/datasets/journal_extra_metadata.csv - - real 0m30.845s - user 0m8.884s - sys 0m0.312s - - - time parallel --bar --pipepart -j8 -a /srv/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid - - # TPS: 1181 - real 6m54.019s => down from 22m! - user 25m33.980s - sys 1m30.480s - - xzcat /srv/datasets/crossref-works.2018-01-21.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py import-crossref - /srv/datasets/20180216.ISSN-to-ISSN-L.txt - # fatcatd at 200% CPU (2 full cores); many PIDs/workers, but only one so busy (must be diesel/db driver?) - # parallel at ~80% - # postgres pretty busy; looks like doing PARSE on ever request? some idle in transaction - # postgres still does a vacuum to analyze; good! - # ~600-1000 TPS near start (large variance) - # left to run overnight... 
- # slowed down to ~50-80 TPS about 10 hours later - # lots of IOWAIT - # only at 40309165 rows in the morning; this could take a long time - - # PostgreSQL 10.4 - wbgrp-svc500.us.archive.org - postgres@localhost:5432/postgres - Ref.: 2s - # Size: 153.71G - 1.90M/s | TPS: 77 - # Mem.: 68.70% - 22.60G/49.14G | IO Max: 20448/s - # Swap: 3.80% - 1.88G/50.00G | Read : 10.17M/s - 2603/s - # Load: 11.71 11.80 12.01 | Write: 4.00M/s - 1024/s - - 92530.17user 2891.76system 35:45:15elapsed 74%CPU (0avgtext+0avgdata 463520maxresident)k - 1093736inputs+302588448outputs (52568major+36405225minor)pagefaults 0swaps - # 35:45:15 elapsed - - time ./fatcat_import.py import-manifest /srv/datasets/idents_files_urls.sqlite - -## Perf Tweaks - - - - SELECT - relname, - seq_scan - idx_scan AS too_much_seq, - CASE - WHEN - seq_scan - coalesce(idx_scan, 0) > 0 - THEN - 'Missing Index?' - ELSE - 'OK' - END, - pg_relation_size(relname::regclass) AS rel_size, seq_scan, idx_scan - FROM - pg_stat_all_tables - WHERE - schemaname = 'public' - AND pg_relation_size(relname::regclass) > 80000 - ORDER BY - too_much_seq DESC; - - relname | too_much_seq | case | rel_size | seq_scan | idx_scan -----------------------+--------------+------+--------------+----------+----------- - file_rev_url | -1 | OK | 163323904 | 2 | 3 - file_release | -3 | OK | 24461312 | 2 | 5 - release_edit | -10265 | OK | 6080495616 | 2 | 10267 - container_edit | -10265 | OK | 31170560 | 2 | 10267 - work_edit | -10265 | OK | 6080364544 | 2 | 10267 - file_edit | -10265 | OK | 49111040 | 2 | 10267 - creator_edit | -10265 | OK | 330924032 | 2 | 10267 - changelog | -11692 | OK | 106668032 | 2 | 11694 - release_ref | -374197 | OK | 125437673472 | 3 | 374200 - release_contrib | -374202 | OK | 16835354624 | 3 | 374205 - release_rev_abstract | -374919 | OK | 162250752 | 3 | 374922 - file_ident | -1047172 | OK | 41926656 | 2 | 1047174 - container_rev | -1378943 | OK | 50356224 | 724612 | 2103555 <= - file_rev | -2407647 | OK | 68493312 | 4 | 2407651 - abstracts | -2765610 | OK | 1450901504 | 1 | 2765611 - creator_ident | -7127467 | OK | 242688000 | 2 | 7127469 - creator_rev | -7943007 | OK | 353370112 | 839155 | 8782162 <= - container_ident | -57488245 | OK | 23060480 | 2 | 57488247 - release_ident | -66085389 | OK | 4459159552 | 14 | 66085403 - work_rev | -130613391 | OK | 2892775424 | 1 | 130613392 - work_ident | -130633923 | OK | 4459192320 | 2 | 130633925 - editgroup | -136775970 | OK | 120561664 | 1 | 136775971 - release_rev | -718337215 | OK | 36850507776 | 8 | 718337223 - - -Slowest queries: - - SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3 - - SELECT "container_ident"."id", "container_ident"."is_live", "container_ident"."rev_id", "container_ident"."redirect_id", "container_rev"."id", "container_rev"."extra_json", "container_rev"."name", "container_rev"."publisher", "container_rev"."issnl", "container_rev"."wikidata_qid", "container_rev"."abbrev", "container_rev"."coden" FROM ("container_ident" INNER JOIN "container_rev" ON "container_ident"."rev_id" = 
"container_rev"."id") WHERE "container_rev"."issnl" = $1 AND "container_rev"."issnl" IS NOT NULL AND "container_ident"."is_live" = $2 AND "container_ident"."redirect_id" IS NULL LIMIT $3 - - SELECT $2 FROM ONLY "public"."release_rev" x WHERE "id" OPERATOR(pg_catalog.=) $1 FOR KEY SHARE OF x:...skipping... - - SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3 - - -DEBUG: (while a file import is running) - -Creator lookup where row exists: - - time curl localhost:9411/v0/creator/lookup?orcid=0000-0002-8867-1663 - real 0m0.020s - user 0m0.004s - sys 0m0.004s - - SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; - => (1 row) - Time: 0.988 ms - -Creator lookup where row doesn't exist: - - bnewbold@wbgrp-svc500$ time curl localhost:9411/v0/creator/lookup?orcid=0000-0002-8867-166X - {"message":"Not found: 0000-0002-8867-166X"} - real 0m1.282s - user 0m0.008s - sys 0m0.000s - Sep 10 21:50:49 wbgrp-svc500.us.archive.org fatcat-api[25327]: Sep 10 21:50:49.231 INFO GET http://localhost:9411/v0/creator/lookup?orcid=0000-0002-8867-166X 404 Not Found (1282 ms) - - SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-166X' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; - => (0 rows) - Time: 0.810 ms - -------- - - SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; - - SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM 
("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; - SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-166X' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; - -from logs: - - execute __diesel_stmt_5: SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3 - -NOTE: have been doing *full* postgres logs this whole time! probably a ton of disk churn. :( - - - exact logs: - SET TIME ZONE 'UTC' - SET CLIENT_ENCODING TO 'UTF8' - SELECT 1 - SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3 - parameters: $1 = '0000-0002-8867-166X', $2 = 't', $3 = '1' - - SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-166X' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT '1'; - -CHANGES: -- restart fatcat-api -- restart postgresql -- select pg_stat_reset(); - -seems like the fetch involves a index scans on creator_rev and creator_ident, *and* a scan on creator_rev. 
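
After the pg_stat_reset(), per-index scan counts should make this concrete; a
sketch against the standard statistics view:

    SELECT relname, indexrelname, idx_scan
    FROM pg_stat_user_indexes
    WHERE relname IN ('creator_rev', 'creator_ident')
    ORDER BY idx_scan DESC;

If the orcid index shows essentially zero idx_scan while the monitoring query
keeps reporting seq_scan on creator_rev, that points at the planner rather
than a missing index.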
- - - table_name | table_size | indexes_size | total_size ---------------------------------------------------------------+------------+--------------+------------ - "public"."release_ref" | 117 GB | 34 GB | 151 GB - "public"."release_rev" | 34 GB | 9202 MB | 43 GB - "public"."release_contrib" | 16 GB | 17 GB | 33 GB - "public"."work_edit" | 5800 MB | 4033 MB | 9833 MB - "public"."release_edit" | 5800 MB | 4032 MB | 9833 MB - "public"."work_ident" | 4254 MB | 5102 MB | 9356 MB - "public"."release_ident" | 4254 MB | 5095 MB | 9349 MB - "public"."work_rev" | 2759 MB | 2545 MB | 5304 MB - "public"."abstracts" | 1417 MB | 114 MB | 1530 MB - "public"."creator_rev" | 337 MB | 277 MB | 614 MB - "public"."creator_edit" | 316 MB | 221 MB | 536 MB - "public"."creator_ident" | 232 MB | 279 MB | 511 MB - "public"."file_rev_url" | 260 MB | 101 MB | 361 MB - "public"."release_rev_abstract" | 155 MB | 200 MB | 355 MB - "public"."file_rev" | 109 MB | 124 MB | 233 MB - "public"."changelog" | 102 MB | 106 MB | 208 MB - "public"."editgroup" | 116 MB | 69 MB | 184 MB - "public"."file_ident" | 66 MB | 70 MB | 136 MB - "public"."file_edit" | 78 MB | 53 MB | 131 MB - "public"."file_release" | 38 MB | 64 MB | 103 MB - "public"."container_rev" | 48 MB | 26 MB | 74 MB - "public"."container_edit" | 30 MB | 20 MB | 50 MB - "public"."container_ident" | 22 MB | 26 MB | 48 MB - -CHANGES: -- remove "IS NOT NULL" from creator_rev, that seemed to be a significant speedup for the "row not found" case. - diff --git a/notes/import_timing_20180910.txt b/notes/import_timing_20180910.txt deleted file mode 100644 index 43c76e43..00000000 --- a/notes/import_timing_20180910.txt +++ /dev/null @@ -1,319 +0,0 @@ - -Changes since last time: -- removed "WHERE IS NOT NULL" from lookup indexes -- some cleanups -- JSONB - -## QA Timing - -master branch b15fa552e288bec5bbc2b07a3e11bab9235a1e7c - - time ./fatcat_import.py import-issn /srv/datasets/journal_extra_metadata.csv # --batch-size 100 - real 0m30.249s - user 0m9.364s - sys 0m0.268s - - time parallel --bar --pipepart -j8 -a /srv/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid - # --batch-size 100 - => something went sort of wrong? seemed to hang, with no activity. - => may have sent a signal or something accidentally? - - xzcat /srv/datasets/crossref-works.2018-01-21.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py import-crossref - /srv/datasets/20180216.ISSN-to-ISSN-L.txt --batch-size 100 - => ran for a few seconds before restarting with increased batch size - => increased batch size seems to be too much for releases? stick with 50 - => still lots of long autovacuum (ANALYZE) - - 67017.73user 2054.96system 25:59:38elapsed 73%CPU (0avgtext+0avgdata 424692maxresident)k - 2085872inputs+219678920outputs (100457major+30837042minor)pagefaults 0swaps - # 25:59:38 elapsed - - time ./fatcat_import.py import-manifest /srv/datasets/idents_files_urls.sqlite # --batch-size 100 - => taking longer to start than I remembered... - - Done! 
Inserted 6607075 - real 1521m37.856s - user 305m38.444s - sys 18m13.916s - => 25.5 hours - -Disk/table usage: - - Size: 233.95G - - table_name | table_size | indexes_size | total_size - --------------------------------------------------------------+------------+--------------+------------ - "public"."release_ref" | 92 GB | 26 GB | 118 GB - "public"."release_rev" | 30 GB | 14 GB | 45 GB - "public"."release_contrib" | 12 GB | 13 GB | 26 GB - "public"."release_edit" | 4710 MB | 3353 MB | 8062 MB - "public"."work_edit" | 4710 MB | 3352 MB | 8062 MB - "public"."work_ident" | 3454 MB | 4153 MB | 7608 MB - "public"."release_ident" | 3454 MB | 4152 MB | 7606 MB - "public"."work_rev" | 2241 MB | 2076 MB | 4317 MB - "public"."file_rev_url" | 1994 MB | 778 MB | 2772 MB - "public"."file_rev" | 827 MB | 1215 MB | 2042 MB - "public"."file_ident" | 501 MB | 533 MB | 1034 MB - "public"."file_edit" | 590 MB | 411 MB | 1001 MB - "public"."creator_rev" | 371 MB | 455 MB | 826 MB - "public"."abstracts" | 649 MB | 52 MB | 701 MB - "public"."file_release" | 251 MB | 435 MB | 686 MB - "public"."creator_edit" | 347 MB | 242 MB | 590 MB - "public"."creator_ident" | 255 MB | 305 MB | 560 MB - "public"."release_rev_abstract" | 70 MB | 90 MB | 160 MB - "public"."changelog" | 52 MB | 53 MB | 106 MB - "public"."editgroup" | 60 MB | 35 MB | 94 MB - "public"."container_rev" | 40 MB | 27 MB | 67 MB - "public"."container_edit" | 24 MB | 16 MB | 39 MB - "public"."container_ident" | 17 MB | 20 MB | 37 MB - -I think this is before the JSONB schema change, so won't reflect actual/final. - -Perf: - - relname | too_much_seq | case | rel_size | seq_scan | idx_scan - ----------------------+--------------+------+-------------+----------+----------- - file_rev_url | -18 | OK | 2089836544 | 2 | 20 - file_release | -4001 | OK | 263249920 | 2 | 4003 - container_edit | -132140 | OK | 24608768 | 2 | 132142 - release_edit | -132140 | OK | 4937408512 | 2 | 132142 - work_edit | -132140 | OK | 4937351168 | 2 | 132142 - file_edit | -132140 | OK | 618545152 | 2 | 132142 - creator_edit | -132140 | OK | 363954176 | 2 | 132142 - changelog | -134456 | OK | 54919168 | 5 | 134461 - container_rev | -1276670 | OK | 41680896 | 592993 | 1869663 - abstracts | -1353093 | OK | 669425664 | 3 | 1353096 - release_ref | -3429019 | OK | 99223470080 | 3 | 3429022 - release_contrib | -3429026 | OK | 13406437376 | 3 | 3429029 - release_rev_abstract | -3435251 | OK | 73359360 | 3 | 3435254 - creator_ident | -5360858 | OK | 266977280 | 11 | 5360869 - creator_rev | -8071846 | OK | 388734976 | 257781 | 8329627 - file_ident | -13478348 | OK | 524746752 | 7 | 13478355 - file_rev | -30368193 | OK | 867401728 | 4 | 30368197 - container_ident | -47140767 | OK | 18219008 | 9 | 47140776 - release_ident | -61021070 | OK | 3620864000 | 24 | 61021094 - work_rev | -106056502 | OK | 2349146112 | 1 | 106056503 - work_ident | -106320780 | OK | 3620954112 | 6 | 106320786 - editgroup | -117742455 | OK | 62398464 | 2 | 117742457 - release_rev | -563415274 | OK | 32455860224 | 8 | 563415282 - -Thought I had resoled those scans... hrm. Pretty sure (but not 100%) that I -cleared the counters. - -Dumps! 
- - time psql fatcat_prod < ../extra/quick_dump.sql - real 4m5.729s - user 0m0.056s - sys 0m0.016s - => while other stuff running - - time ./ident_table_snapshot.sh - => seems very dominated by tar/xz compression time; should switch back to gzip - - DATABASE_URL=$DATABASE_URL time ./ident_table_snapshot.sh /tmp - 614.80user 31.74system 13:23.51elapsed 80%CPU (0avgtext+0avgdata 10124maxresident)k - 8311456inputs+9761200outputs (2major+3792minor)pagefaults 0swaps - => 13.5 minutes - => fatcat_idents.2018-09-13.215147.r874951.tar.gz: 4.7GB - => uncompressed, 8.3GB total - - cat ../extra/sql_dumps/fatcat_ident_releases.tsv | time ./target/release/fatcat-export release -j8 -q | pv -l | gzip > fatcat_release_dump.json.gz - => can not run with more than -j8, it seems (postgres client limit) - => TPS: 12522 (!) - => 101k 0:00:34 [2.99k/s] - => estimate 6.5 hours - - # NOTE AFTER THE FACT: had "contaner" typo in the below, so wasn't expanding containers - cat ../extra/sql_dumps/fatcat_ident_releases.tsv | time ./target/release/fatcat-export release --expand files,contaner -j8 -q | pv -l | gzip > fatcat_release_dump_expanded.json.gz - => TPS: 29605 (!) - => 183k 0:00:35 [ 5.8k/s] (how is this faster than the above? other disk stuff finished?) - => 341k 0:01:07 [3.48k/s] (ah, must have been in disk cache) - => 4.12M 0:16:11 (averaging 4.2k/s; estimate 4 hours total) - => fatcat_release_dump_expanded.json.gz: 3.4G with 6.76M lines, so expecting 35GB or so - - ----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system-- - usr sys idl wai hiq siq| read writ| recv send| in out | int csw - 14 3 73 8 0 2| 43M 21M| 143k 326B| 0 0 | 64k 113k - 18 4 72 5 0 2| 55M 4096k| 246k 326B| 0 0 | 84k 157k - 17 4 72 6 0 2| 53M 9964k| 428k 908B| 0 0 | 84k 155k - 14 3 73 8 0 1| 46M 20M| 143k 87k| 0 0 | 71k 132k - 18 3 72 5 0 2| 52M 8880k| 318k 95k| 0 0 | 82k 149k - - => fatcat-export using about 200% CPU in a single thread; other 8 threads at about 25% each - => might be interesting to profile... too many copies? switch to tsv library? - => seems like postgres only about 50% utilized - -Benchmarks! - - # stub entity - $ wrk -t8 -c100 -d30s http://localhost:9411/v0/work/aaaaaaaaaaaaavkvaaaaaaaaai - Running 30s test @ http://localhost:9411/v0/work/aaaaaaaaaaaaavkvaaaaaaaaai - 8 threads and 100 connections - Thread Stats Avg Stdev Max +/- Stdev - Latency 7.44ms 5.26ms 67.53ms 60.07% - Req/Sec 1.71k 113.55 2.07k 68.79% - 408986 requests in 30.01s, 117.01MB read - Requests/sec: 13627.62 - Transfer/sec: 3.90MB - => TPS: 39958 - => lots of "BEGIN" and "SELECT 1", implying new connections? hrm. - => should run longer for more stable results - - wrk -t12 -c400 -d2m --latency http://localhost:9411/v0/work/aaaaaaaaaaaaavkvaaaaaaaaai - - Running 2m test @ http://localhost:9411/v0/work/aaaaaaaaaaaaavkvaaaaaaaaai - 12 threads and 400 connections - Thread Stats Avg Stdev Max +/- Stdev - Latency 25.75ms 17.02ms 263.74ms 65.11% - Req/Sec 1.05k 377.38 1.82k 74.81% - Latency Distribution - 50% 25.40ms - 75% 26.56ms - 90% 49.57ms - 99% 76.47ms - 1128253 requests in 2.00m, 322.80MB read - Socket errors: connect 0, read 10, write 0, timeout 0 - Requests/sec: 9396.03 - Transfer/sec: 2.69MB - - => single 470% fatcatd thread; diesel? 
- => 50% systemd-journald - - # rich entity - $ wrk -t8 -c100 -d30s https://api.qa.fatcat.wiki/v0/release/aaaaaaaaaaaaarceaaaaaaaaam?expand=files,container - Running 30s test @ https://api.qa.fatcat.wiki/v0/release/aaaaaaaaaaaaarceaaaaaaaaam?expand=files,container - 8 threads and 100 connections - Thread Stats Avg Stdev Max +/- Stdev - Latency 33.35ms 14.01ms 152.05ms 79.98% - Req/Sec 363.14 30.36 464.00 76.48% - 86719 requests in 30.04s, 234.46MB read - Requests/sec: 2886.55 - Transfer/sec: 7.80MB - - wrk -t12 -c400 -d2m --latency https://api.qa.fatcat.wiki/v0/release/aaaaaaaaaaaaarceaaaaaaaaam?expand=files,container - - Running 2m test @ https://api.qa.fatcat.wiki/v0/release/aaaaaaaaaaaaarceaaaaaaaaam?expand=files,container - 12 threads and 400 connections - Thread Stats Avg Stdev Max +/- Stdev - Latency 149.19ms 125.30ms 1.93s 95.06% - Req/Sec 237.89 34.60 650.00 73.54% - Latency Distribution - 50% 129.86ms - 75% 134.14ms - 90% 210.42ms - 99% 904.26ms - 340625 requests in 2.00m, 0.90GB read - Socket errors: connect 0, read 0, write 0, timeout 303 - Requests/sec: 2836.17 - Transfer/sec: 7.67MB - - => TPS: 9090 - ----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system-- - usr sys idl wai hiq siq| read writ| recv send| in out | int csw - 4 0 93 3 0 0|2401k 13M| 0 0 |2317B 4812B|4274 3824 - 22 6 68 0 0 3|4096B 0 | 57k 956B|4096B 0 | 135k 255k - 21 7 68 0 0 3|4096B 0 | 61k 66B|4096B 0 | 137k 259k - 21 7 68 0 0 3| 16k 8192B| 179k 636B| 16k 0 | 134k 252k - 21 7 68 0 0 3| 0 0 | 53k 66B| 0 0 | 135k 255k - 22 6 68 0 0 3| 0 17M| 51k 326B| 0 0 | 137k 261k - 22 6 68 0 0 3| 0 0 | 82k 903B| 0 0 | 136k 257k - - # static (404) - wrk -t12 -c400 -d2m --latency http://localhost:9411/v0/asdf - => full 2600+% CPU in a single fatcat process (?) - => lots of load in wrk also - => system at a whole basically 100% CPU on all cores, though mostly kernel - - ----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system-- - usr sys idl wai hiq siq| read writ| recv send| in out | int csw - 4 0 93 3 0 0|2399k 13M| 0 0 |2315B 4808B|4359 3976 - 26 66 2 0 0 6| 0 0 |4191k 956B| 0 0 | 465k 536k - 26 67 1 0 0 6| 0 0 | 511k 66B| 0 0 | 466k 543k - 27 64 3 0 0 6| 0 0 |2648k 318B| 0 0 | 481k 555k - 25 67 2 0 0 6| 0 0 |1909k 768B| 0 0 | 481k 550k - - Running 2m test @ http://localhost:9411/v0/asdf - 12 threads and 400 connections - Thread Stats Avg Stdev Max +/- Stdev - Latency 1.40ms 1.65ms 219.48ms 93.33% - Req/Sec 17.71k 8.97k 65.69k 63.84% - Latency Distribution - 50% 1.13ms - 75% 1.89ms - 90% 2.71ms - 99% 4.76ms - 21145472 requests in 2.00m, 1.61GB read - Socket errors: connect 0, read 12, write 0, timeout 0 - Non-2xx or 3xx responses: 21145472 - Requests/sec: 176071.79 - Transfer/sec: 13.77MB - -Noting pretty non-trival traffic to 207.241.229.230 and -es-worker-02.us.archive.org (207.241.229.246)... probably because this is a -datanode? - -## Misc - -Postgres really wanted to log: - - ALTER DATABASE fatcat_prod SET log_statement = 'ddl'; - ALTER USER fatcat SET log_statement = 'ddl'; - - -## Later Imports - - zcat /srv/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched - - => only ~40 TPS - => looks like almost all selects... probably skipping? huge postgres CPU churn, huh. missing index? 
- - relname | too_much_seq | case | rel_size | seq_scan | idx_scan - ----------------------+--------------+----------------+-------------+----------+---------- - file_rev | -5343173 | OK | 867483648 | 693 | 5343866 - - EXPLAIN SELECT "file_ident"."id", "file_ident"."is_live", "file_ident"."rev_id", "file_ident"."redirect_id", "file_rev"."id", "file_rev"."extra_json", "file_rev"."size", "file_rev"."sha1", "file_rev"."sha256", "file_rev"."md5", "file_rev"."mimetype" FROM ("file_ident" INNER JOIN "file_rev" ON "file_ident"."rev_id" = "file_rev"."id") WHERE "file_rev"."sha1" = '7d97e98f8af710c7e7fe703abc8f639e0ee507c4' AND "file_rev"."sha1" IS NOT NULL AND "file_ident"."is_live" = 't' AND "file_ident"."redirect_id" IS NULL LIMIT 1; - -Ugh, this whole cycle again. Query only takes 1ms for a no-hit, or 5ms for success. - - http get http://localhost:9411/v0/file/lookup?sha1=d77cb2a8b207507e0df27ba1393e8118eec9217f => not found - http get http://localhost:9411/v0/file/lookup?sha1=d6c3f6ca785ee63293da02615c8495c5cd25fa25 => found - => but no releases? - -SET auto_explain.log_min_duration = 0; -SET auto_explain.log_analyze = true; -ALTER USER fatcat SET log_statement = 'all'; -ALTER USER fatcat SET auto_explain.log_min_duration = 0; -ALTER USER fatcat SET auto_explain.log_analyze = true;; -sudo service fatcat-api restart - - -2018-09-16 00:42:29.883 UTC [13868] fatcat@fatcat_prod LOG: duration: 3028.143 ms plan: - Query Text: SELECT "file_ident"."id", "file_ident"."is_live", "file_ident"."rev_id", "file_ident"."redirect_id", "file_rev"."id", "file_rev"."extra_json", "file_rev"."size", "file_rev"."sha1", "file_rev"."sha256", "file_rev"."md5", "file_rev"."mimetype" FROM ("file_ident" INNER JOIN "file_rev" ON "file_ident"."rev_id" = "file_rev"."id") WHERE "file_rev"."sha1" = $1 AND "file_rev"."sha1" IS NOT NULL AND "file_ident"."is_live" = $2 AND "file_ident"."redirect_id" IS NULL LIMIT $3 - Limit (cost=0.43..8.56 rows=1 width=454) (actual time=3028.139..3028.139 rows=0 loops=1) - -> Nested Loop (cost=0.43..268756.48 rows=33056 width=454) (actual time=3028.138..3028.138 rows=0 loops=1) - -> Seq Scan on file_rev (cost=0.00..205023.16 rows=33042 width=405) (actual time=3028.137..3028.137 rows=0 loops=1) - Filter: ((sha1 IS NOT NULL) AND ((sha1)::text = 'd68d4e2cc420647109cdfe410efed86cd64465e5'::text)) - Rows Removed by Filter: 6607786 - -> Index Scan using file_ident_rev_idx on file_ident (cost=0.43..1.92 rows=1 width=49) (never executed) - Index Cond: (rev_id = file_rev.id) - Filter: (is_live AND (redirect_id IS NULL)) - - -fatcat_prod=# EXPLAIN SELECT "file_ident"."id", "file_ident"."is_live", "file_ident"."rev_id", "file_ident"."redirect_id", "file_rev"."id", "file_rev"."extra_json", "file_rev"."size", "file_rev"."sha1", "file_rev"."sha256", "file_rev"."md5", "file_rev"."mimetype" FROM ("file_ident" INNER JOIN "file_rev" ON "file_ident"."rev_id" = "file_rev"."id") WHERE "file_rev"."sha1" = '3f242a192acc258bdfdb15194341943222222222222440c313' AND "file_rev"."sha1" IS NOT NULL AND "file_ident"."is_live" = 't' AND "file_ident"."redirect_id" IS NULL LIMIT 1; - QUERY PLAN --------------------------------------------------------------------------------------------------------------------------- - Limit (cost=0.99..5.04 rows=1 width=454) - -> Nested Loop (cost=0.99..5.04 rows=1 width=454) - -> Index Scan using file_rev_sha1_idx on file_rev (cost=0.56..2.58 rows=1 width=405) - Index Cond: ((sha1 IS NOT NULL) AND (sha1 = '3f242a192acc258bdfdb15194341943222222222222440c313'::bpchar)) - -> Index Scan using 
file_ident_rev_idx on file_ident (cost=0.43..2.45 rows=1 width=49) - Index Cond: (rev_id = file_rev.id) - Filter: (is_live AND (redirect_id IS NULL)) - -After all this debugging, asked on IRC and learned that I really shouldn't ever -use CHAR() (or even VARCHAR()). Will update the schema. - -## Investigating Missing Rows - - fatcat_prod=# select count(*) from release_ident; => 53028167 - -Thought I had more than that previously? Table sizes were also bigger. Hrm. - diff --git a/notes/import_timing_20180920.txt b/notes/import_timing_20180920.txt deleted file mode 100644 index a57ffd77..00000000 --- a/notes/import_timing_20180920.txt +++ /dev/null @@ -1,96 +0,0 @@ - -This was on fatcat-prod-vm (2TB disk). - - time ./fatcat_import.py import-issn /srv/fatcat/datasets/journal_extra_metadata.csv - - Processed 53300 lines, inserted 53283, updated 0. - real 0m32.463s - user 0m8.716s - sys 0m0.284s - - time parallel --bar --pipepart -j8 -a /srv/fatcat/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid - - - Processed 48900 lines, inserted 48731, updated 0. <= these numbers times 80x - 100% 80:0=0s - - real 10m20.598s - user 26m16.544s - sys 1m40.284s - - time xzcat /srv/fatcat/datasets/crossref-works.2018-01-21.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py import-crossref - /srv/fatcat/datasets/20180216.ISSN-to-ISSN-L.txt /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3 - - Processed 4679900 lines, inserted 3755867, updated 0. - 107730.08user 4110.22system 16:31:25elapsed 188%CPU (0avgtext+0avgdata 447496maxresident)k - 77644160inputs+361948352outputs (105major+49094767minor)pagefaults 0swaps - - => 16.5 hours, faster! - - select count(id) from release_ident; => 75106713 - - kernel/system crashed after first file import (!), so don't have numbers from that. 
- -Table sizes at this point: - - select count(id) from file_ident; => 6334606 - - Size: 389.25G - - table_name | table_size | indexes_size | total_size ---------------------------------------------------------------+------------+--------------+------------ - "public"."release_ref" | 170 GB | 47 GB | 217 GB - "public"."release_rev" | 44 GB | 21 GB | 65 GB - "public"."release_contrib" | 19 GB | 20 GB | 39 GB - "public"."release_edit" | 6671 MB | 6505 MB | 13 GB - "public"."work_edit" | 6671 MB | 6505 MB | 13 GB - "public"."release_ident" | 4892 MB | 5875 MB | 11 GB - "public"."work_ident" | 4892 MB | 5874 MB | 11 GB - "public"."work_rev" | 3174 MB | 2936 MB | 6109 MB - "public"."file_rev_url" | 3634 MB | 1456 MB | 5090 MB - "public"."file_rev" | 792 MB | 1281 MB | 2073 MB - "public"."abstracts" | 1665 MB | 135 MB | 1800 MB - "public"."file_edit" | 565 MB | 561 MB | 1126 MB - "public"."file_release" | 380 MB | 666 MB | 1045 MB - "public"."file_ident" | 415 MB | 496 MB | 911 MB - "public"."creator_rev" | 371 MB | 457 MB | 828 MB - "public"."creator_edit" | 347 MB | 353 MB | 700 MB - "public"."creator_ident" | 255 MB | 305 MB | 559 MB - "public"."release_rev_abstract" | 183 MB | 237 MB | 421 MB - "public"."changelog" | 122 MB | 126 MB | 247 MB - "public"."editgroup" | 138 MB | 81 MB | 219 MB - "public"."container_rev" | 52 MB | 38 MB | 89 MB - "public"."container_edit" | 32 MB | 30 MB | 62 MB - "public"."container_ident" | 24 MB | 28 MB | 52 MB - - - relname | too_much_seq | case | rel_size | seq_scan | idx_scan -----------------------+--------------+------+--------------+----------+---------- - release_edit | 0 | OK | 6993084416 | 0 | 0 - container_rev | 0 | OK | 54124544 | 0 | 0 - creator_ident | 0 | OK | 266919936 | 0 | 0 - creator_edit | 0 | OK | 363921408 | 0 | 0 - work_rev | 0 | OK | 3327262720 | 0 | 0 - creator_rev | 0 | OK | 388726784 | 0 | 0 - work_ident | 0 | OK | 5128560640 | 0 | 0 - work_edit | 0 | OK | 6993108992 | 0 | 0 - container_ident | 0 | OK | 25092096 | 0 | 0 - file_edit | 0 | OK | 598278144 | 0 | 0 - container_edit | 0 | OK | 33857536 | 0 | 0 - changelog | 0 | OK | 127549440 | 0 | 0 - abstracts | -4714 | OK | 1706713088 | 0 | 4714 - file_release | -13583 | OK | 401752064 | 0 | 13583 - file_rev_url | -13583 | OK | 3832389632 | 0 | 13583 - editgroup | -74109 | OK | 144277504 | 0 | 74109 - release_contrib | -76699 | OK | 20357849088 | 0 | 76699 - release_ref | -76700 | OK | 183009157120 | 3 | 76703 - release_rev_abstract | -76939 | OK | 192102400 | 0 | 76939 - release_rev | -77965 | OK | 47602647040 | 0 | 77965 - file_ident | -100089 | OK | 438255616 | 3 | 100092 - release_ident | -152809 | OK | 5128617984 | 0 | 152809 - file_rev | -440780 | OK | 837705728 | 0 | 440780 -(23 rows) - -Continuing imports: - - zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched - - => HTTP response body: {"message":"duplicate key value violates unique constraint \"file_edit_editgroup_id_ident_id_key\""} - diff --git a/notes/import_timing_20180923.txt b/notes/import_timing_20180923.txt deleted file mode 100644 index f8814f3d..00000000 --- a/notes/import_timing_20180923.txt +++ /dev/null @@ -1,39 +0,0 @@ - - 105595.18user 3903.65system 15:59:39elapsed 190%CPU (0avgtext+0avgdata 458836maxresident)k - 71022792inputs+327828472outputs (176major+31149593minor)pagefaults 0swaps - - real 959m39.521s - user 1845m10.392s - sys 70m33.780s - -Did I get the same error again? 
I'm confused: - - HTTP response body: {"message":"number of parameters must be between 0 and 65535\n"} - (but not in all threads) - -Yes, ugh, because 50*2500 can be over (it's not just individual large releases, -they come in big batches). - -But: - - select count(id) from release_ident; => 70006121 - -A lot, though not 72 million like last time, hrm. I'm... going to move ahead I -guess. - -"Processed 4440850 lines, inserted 3509600, updated 0." - => implies 79029915 records - - time zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched --no-file-update - - Processed 530750 lines, inserted 435239, updated 0. (etc) - Command exited with non-zero status 1 - 15121.47user 676.49system 2:23:52elapsed 183%CPU (0avgtext+0avgdata 70076maxresident)k - 127760inputs+3477184outputs (116major+475489minor)pagefaults 0swaps - - real 143m52.681s - user 252m31.620s - sys 11m21.608s - - zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched - - - (running...) diff --git a/notes/initial_sources.txt b/notes/initial_sources.txt deleted file mode 100644 index cc22019d..00000000 --- a/notes/initial_sources.txt +++ /dev/null @@ -1,26 +0,0 @@ - -Probably start with: - - crossref (including citations) - arxiv - medline - -then merge in: - - dblp - CORE - MSAG dump - VIAF - archive.org paper/url manifest - semantic scholar - oaDOI - -and later: - - wikidata - opencitations - openlibrary - -national libraries: - - http://www.dnb.de/EN/Service/DigitaleDienste/LinkedData/linkeddata_node.html diff --git a/notes/old_imports.txt b/notes/old_imports.txt deleted file mode 100644 index 1233d4a8..00000000 --- a/notes/old_imports.txt +++ /dev/null @@ -1,20 +0,0 @@ - -## ORCID - -Directly from compressed tarball; takes about 2 hours in production: - - tar xf /srv/datasets/public_profiles_API-2.0_2017_10_json.tar.gz -O | jq -c . | grep '"person":' | time parallel -j12 --pipe --round-robin ./fatcat_import.py import-orcid - - -After tuning database, `jq` CPU seems to be bottleneck, so, from pre-extracted -tarball: - - tar xf /srv/datasets/public_profiles_API-2.0_2017_10_json.tar.gz -O | jq -c . | rg '"person":' > /srv/datasets/public_profiles_1_2_json.all.json - time parallel --bar --pipepart -j8 -a /srv/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid - - -Does not work: - - ./fatcat_import.py import-orcid /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3/0000-0001-5115-8623.json - -Instead: - - cat /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3/0000-0001-5115-8623.json | jq -c . | ./fatcat_import.py import-orcid - diff --git a/notes/performance/postgres_performance.txt b/notes/performance/postgres_performance.txt new file mode 100644 index 00000000..cd2a5162 --- /dev/null +++ b/notes/performance/postgres_performance.txt @@ -0,0 +1,203 @@ + +## Setup + +Add to postgres.conf: + + shared_preload_libraries = 'auto_explain,pg_stat_statements' + + # Increase the max size of the query strings Postgres records + track_activity_query_size = 2048 + + # Track statements generated by stored procedures as well + pg_stat_statements.track = all + +Also: + + track_counts (already default) + autovacuum (already default?) 
+ log_min_error = warning + log_min_duration_statement = 5000 + +Then from shell: + + create extension pg_stat_statements; + +Regularly want to run: + + VACUUM ANALYZE + +## Tuning Values + +postgres config: + + max_connections = 100 (default) + shared_buffers = 128MB -> 10GB (while elastic on same machine; later 16 or more) + effective_cache_size = 4GB -> 24GB (while elastic on same machine) + work_mem = 4MB -> 128MB # relatively few connections/box + fsync = on + commit_delay = ??? (and siblings) + random_page_cost = 1 (for SSD) + default_statistics_target = 100 -> 200 + maintenance_work_mem = 64MB -> 8GB + synchronous_commit = off (during dev only! switch to on for production!) + wal_sync_method (keep default) + max_wal_size = 64 -> 128 (based on above HINT message) + # didn't mess with commit_delay/commit_siblings + +system: + + sysctl -w vm.overcommit_memory=2 + TODO: ulimit -n 65536 + TODO: ulimit -p 800 + LimitNOFILE + /lib/systemd/system/postgresql.service + +## Resources + +https://www.geekytidbits.com/performance-tuning-postgres/ + +Could try pgbadger to handle auto_explain type output. + +https://www.postgresql.org/docs/10/static/runtime-config-wal.html + +IA-specific resources: + https://git.archive.org/ia/mint/blob/master/postgres/postgres_9_2.yml + https://git.archive.org/ia/mint/blob/master/postgres/put_datadir_on_ssd.sh + https://git.archive.org/ia/mint/blob/master/postgres/templates/postgresql.conf.j2 + +For bulk inserts: +- make write-ahead-log larger (eg, 16MB. done.) +- transactions of ~1000+ inserts +- https://www.postgresql.org/docs/current/static/populate.html +- https://www.depesz.com/2007/07/05/how-to-insert-data-to-database-as-fast-as-possible/ +- https://stackoverflow.com/questions/12206600/how-to-speed-up-insertion-performance-in-postgresql + +## 2018-06-27 Measurements (pre-tuning) + +fatcat_prod=# select count(*) from release_ident; 20983019 +fatcat_prod=# select count(*) from work_ident; 20988140 +fatcat_prod=# select count(*) from file_ident; 1482335 +fatcat_prod=# select count(*) from creator_ident; 4167419 +fatcat_prod=# select count(*) from container_ident; 61793 + +select count(*) from release_contrib; 59798133 + +bnewbold@wbgrp-svc500$ sudo du -sh /var/lib/postgresql/ +43G + +running import-crossref with 20 threads, and manifest importer with one (at 33% +complete). had already imported ~7million works+releases previously. 
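
The entity counts above were taken one query at a time while imports were
running; a single snapshot query over the same tables (a sketch) keeps the
numbers mutually consistent:

    SELECT 'release' AS entity, count(*) FROM release_ident
    UNION ALL SELECT 'work', count(*) FROM work_ident
    UNION ALL SELECT 'file', count(*) FROM file_ident
    UNION ALL SELECT 'creator', count(*) FROM creator_ident
    UNION ALL SELECT 'container', count(*) FROM container_ident;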
+ + + PostgreSQL 10.4 - wbgrp-svc500.us.archive.org - postgres@localhost:5432/postgre + Size: 41.38G - 323.40K/s | TPS: 885 + Mem.: 50.80% - 23.86G/49.14G | IO Max: 79539/s + Swap: 0.80% - 408.89M/50.00G | Read : 67.04K/s - 16/s + Load: 6.69 7.41 7.69 | Write: 1.93M/s - 493/s + + +----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system-- +usr sys idl wai hiq siq| read writ| recv send| in out | int csw + 32 6 62 0 0 0| 296k 3880k| 334k 3144B| 0 0 | 21k 65k + 31 6 62 0 0 0| 0 3072k| 391k 318B| 0 0 | 51k 141k + 31 6 63 0 0 0| 16k 1080k| 344k 1988B| 0 0 | 35k 104k + 29 6 65 0 0 0| 136k 2608k| 175k 332B| 0 0 |9835 15k + 28 5 67 0 0 0| 408k 4368k| 285k 832B| 0 0 | 14k 17k + 33 5 62 0 0 0| 56k 3256k| 219k 99B| 0 0 | 22k 49k + 31 6 63 0 0 0| 188k 5120k| 158k 318B| 0 0 | 17k 29k + 30 6 64 0 0 0| 200k 6984k| 239k 988B| 0 0 | 16k 24k + 30 6 64 0 0 0| 168k 5504k| 159k 152B| 0 0 | 14k 20k + 28 7 65 0 0 0| 440k 12M| 236k 420B| 0 0 | 15k 18k + 29 6 65 0 0 0| 428k 6968k| 352k 310B| 0 0 | 19k 31k + 32 6 62 0 0 0| 64k 3480k| 288k 318B| 0 0 | 18k 55k + 32 6 62 0 0 0| 32k 2080k| 155k 318B| 0 0 | 20k 52k + + +bnewbold@wbgrp-svc500$ uptime + 22:00:42 up 28 days, 22:31, 6 users, load average: 7.94, 7.56, 7.72 + + +2018-06-27 21:57:36.102 UTC [401] LOG: checkpoints are occurring too frequently (13 seconds apart) +2018-06-27 21:57:36.102 UTC [401] HINT: Consider increasing the configuration parameter "max_wal_size". + + + relname | too_much_seq | case | rel_size | seq_scan | idx_scan +-----------------+--------------+----------------+-------------+----------+----------- + changelog | 1274670 | Missing Index? | 39411712 | 1274670 | 0 + file_edit | 612386 | Missing Index? | 108298240 | 612386 | 0 + creator_edit | 612386 | Missing Index? | 285540352 | 612386 | 0 + container_edit | 612386 | Missing Index? | 4784128 | 612386 | 0 + release_edit | 612386 | Missing Index? | 1454489600 | 612386 | 0 + work_edit | 612386 | Missing Index? | 1454415872 | 612386 | 0 + release_contrib | 296675 | Missing Index? | 4725645312 | 296675 | 0 + release_ref | 296663 | Missing Index? 
| 8999837696 | 296663 | 0
+ file_release | -113 | OK | 13918208 | 110 | 223
+ container_rev | -979326 | OK | 16285696 | 63 | 979389
+ file_ident | -3671516 | OK | 109002752 | 362 | 3671878
+ file_rev | -3944155 | OK | 302940160 | 95 | 3944250
+ creator_rev | -8420205 | OK | 318283776 | 1226 | 8421431
+ creator_ident | -9525338 | OK | 309141504 | 52330 | 9577668
+ container_ident | -20581876 | OK | 4833280 | 272457 | 20854333
+ release_ident | -40548858 | OK | 1440948224 | 4160919 | 44709777
+ work_rev | -42534913 | OK | 1124671488 | 1161 | 42536074
+ editgroup | -48864662 | OK | 34136064 | 1 | 48864663
+ work_ident | -65008911 | OK | 1503313920 | 1239 | 65010150
+ release_rev | -185735794 | OK | 13649428480 | 128 | 185735922
+
+## 2018-06-28 (after basic tuning + indexes)
+
+Early loading (manifest and 20x release):
+
+ PostgreSQL 10.4 - wbgrp-svc500.us.archive.org - postgres@localhost:5432/postgres - Ref.: 2s
+ Size: 4.57G - 6.45M/s | TPS: 18812
+ Mem.: 59.70% - 23.62G/49.14G | IO Max: 3601/s
+ Swap: 1.30% - 675.05M/50.00G | Read : 0.00B/s - 0/s
+ Load: 12.98 10.58 5.25 | Write: 2.65M/s - 677/s
+
+
+ PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
+ 24045 webcrawl 20 0 908872 204948 11756 S 153.3 0.4 16:13.03 fatcatd
+ 24328 webcrawl 20 0 78148 45220 4324 R 87.1 0.1 8:44.16 perl
+ 24056 postgres 20 0 10.441g 3.906g 3.886g R 69.9 7.9 6:57.47 postgres
+ 24063 postgres 20 0 10.447g 3.899g 3.873g S 67.9 7.9 6:55.89 postgres
+ 24059 postgres 20 0 10.426g 3.888g 3.883g R 67.5 7.9 6:59.15 postgres
+ 24057 postgres 20 0 10.430g 3.883g 3.874g S 67.2 7.9 6:58.68 postgres
+ 24061 postgres 20 0 10.448g 3.909g 3.881g R 66.2 8.0 6:54.30 postgres
+ 24058 postgres 20 0 10.428g 3.883g 3.876g R 65.9 7.9 6:59.35 postgres
+ 24062 postgres 20 0 10.426g 5.516g 5.511g R 64.9 11.2 6:58.29 postgres
+ 24055 postgres 20 0 10.426g 3.878g 3.873g R 64.2 7.9 6:59.38 postgres
+ 24054 postgres 20 0 10.430g 5.499g 5.491g R 63.6 11.2 6:57.27 postgres
+ 24060 postgres 20 0 10.448g 3.900g 3.873g R 61.9 7.9 6:55.45 postgres
+ 21711 postgres 20 0 10.419g 5.762g 5.760g D 16.6 11.7 3:00.67 postgres
+ 21713 postgres 20 0 10.419g 21432 19512 S 11.3 0.0 3:25.11 postgres
+ 24392 webcrawl 20 0 5309636 400912 8696 S 7.9 0.8 0:53.18 python3
+ 24383 webcrawl 20 0 5309436 400628 8648 S 7.6 0.8 0:52.29 python3
+ 24387 webcrawl 20 0 5309776 402968 8620 S 7.3 0.8 0:52.81 python3
+ 24394 webcrawl 20 0 5309624 400732 8644 S 7.3 0.8 0:53.30 python3
+ 24384 webcrawl 20 0 5309916 400948 8600 S 7.0 0.8 0:53.18 python3
+
+Still get a *lot* of:
+
+ 2018-06-29 00:14:05.948 UTC [21711] LOG: checkpoints are occurring too frequently (1 second apart)
+ 2018-06-29 00:14:05.948 UTC [21711] HINT: Consider increasing the configuration parameter "max_wal_size".
+
+VACUUM is running basically continuously; should prevent that? It takes 6
+hours or longer on the release_rev and release_ref tables. An auto-approve
+batch method would resolve this, I think (no update after insert).
+
+max_wal_size wasn't getting set correctly.
+
+The statements taking the most time are the complex inserts (multi-table
+inserts); they take a fraction of a second though (mean less than a
+millisecond).
+
+Manifest import runs really slowly if release import is concurrent; it is
+much faster to wait until the release import finishes first (a factor of 10x
+or more).
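+
+To confirm which statements dominate, pg_stat_statements (enabled in the Setup
+section above) can be ranked by cumulative time; a sketch, assuming the
+PostgreSQL 10 column names (total_time/mean_time are in milliseconds; newer
+versions rename these to total_exec_time/mean_exec_time):
+
+    -- top statements by total time spent; query_start is just a truncated
+    -- label for readability
+    SELECT substring(query, 1, 60) AS query_start,
+           calls,
+           round(total_time::numeric, 1) AS total_ms,
+           round(mean_time::numeric, 3) AS mean_ms,
+           rows
+    FROM pg_stat_statements
+    ORDER BY total_time DESC
+    LIMIT 10;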
+
+With some 60 million releases:
+
+ bnewbold@wbgrp-svc500$ sudo du -sh /var/lib/postgresql/
+ 184G /var/lib/postgresql/
+
+TODO: slow query log doesn't seem to be working (let alone auto_explain)
+
diff --git a/notes/performance/speed.txt b/notes/performance/speed.txt
new file mode 100644
index 00000000..f885aea7
--- /dev/null
+++ b/notes/performance/speed.txt
@@ -0,0 +1,82 @@
+
+## Early Prototyping
+
+### 2018-04-23
+
+- fatcat as marshmallow+sqlalchemy+flask, with API client
+- no refs, contribs, files, release contribs, containers, etc
+- no extra_json
+- sqlite
+- laptop
+- editgroup every 250 edits
+
+
+ /data/crossref/crossref-works.2018-01-21.badsample_5k.json
+
+ real 3m42.912s
+ user 0m20.448s
+ sys 0m2.852s
+
+ ~22 lines per second
+ 12.5 hours per million
+ ~52 days for crossref (100 million)
+
+target:
+ crossref (100 million) loaded in 48 hours
+ 579 lines per second
+ this test in under 10 seconds
+ ... but could be in parallel
+
+same except postgres, via:
+
+ docker run -p 5432:5432 postgres:latest
+ ./run.py --init-db --database-uri postgres://postgres@localhost:5432
+ ./run.py --database-uri postgres://postgres@localhost:5432
+
+ API processing using 60-100% of a core. postgres 12% of a core;
+ docker-proxy similar (!). overall ~70% of system CPU idle.
+
+ real 2m27.771s
+ user 0m22.860s
+ sys 0m2.852s
+
+no profiling yet; need to look at database ops. probably don't even have any
+indices!
+
+## Rust Updates (2018-05-23)
+
+Re-running with tweaked python code, 5k sample file, postgres 9.6 running locally (not in docker):
+
+ real 2m27.598s
+ user 0m24.892s
+ sys 0m2.836s
+
+Using postgres and fatcat rust:
+
+ real 0m44.443s
+ user 0m25.288s
+ sys 0m0.880s
+
+api_client about half a core; fatcatd 3x processes, about 10% each; postgres
+very small.
+
+a bit faster, basically maxing out CPU:
+
+ time cat /data/crossref/crossref-works.2018-01-21.badsample_5k.json | parallel -j4 --pipe ./fatcat_client.py --host-url http://localhost:9411 ic -
+
+ real 0m28.998s
+ user 1m5.304s
+ sys 0m3.420s
+
+ 200 lines per second; within a factor of 3; can perhaps hit target with
+ non-python client?
+
+python processes (clients) seem to be the CPU limit in this case; all 4 cores
+effectively maxed out.
+
+running python again in parallel mode:
+
+ real 2m29.532s
+ user 0m47.692s
+ sys 0m4.840s
+
diff --git a/notes/postgres_performance.txt b/notes/postgres_performance.txt
deleted file mode 100644
index cd2a5162..00000000
--- a/notes/postgres_performance.txt
+++ /dev/null
@@ -1,203 +0,0 @@
-
-## Setup
-
-Add to postgres.conf:
-
- shared_preload_libraries = 'auto_explain,pg_stat_statements'
-
- # Increase the max size of the query strings Postgres records
- track_activity_query_size = 2048
-
- # Track statements generated by stored procedures as well
- pg_stat_statements.track = all
-
-Also:
-
- track_counts (already default)
- autovacuum (already default?)
- log_min_error = warning
- log_min_duration_statement = 5000
-
-Then from shell:
-
- create extension pg_stat_statements;
-
-Regularly want to run:
-
- VACUUM ANALYZE
-
-## Tuning Values
-
-postgres config:
-
- max_connections = 100 (default)
- shared_buffers = 128MB -> 10GB (while elastic on same machine; later 16 or more)
- effective_cache_size = 4GB -> 24GB (while elastic on same machine)
- work_mem = 4MB -> 128MB # relatively few connections/box
- fsync = on
- commit_delay = ???
(and siblings) - random_page_cost = 1 (for SSD) - default_statistics_target = 100 -> 200 - maintenance_work_mem = 64MB -> 8GB - synchronous_commit = off (during dev only! switch to on for production!) - wal_sync_method (keep default) - max_wal_size = 64 -> 128 (based on above HINT message) - # didn't mess with commit_delay/commit_siblings - -system: - - sysctl -w vm.overcommit_memory=2 - TODO: ulimit -n 65536 - TODO: ulimit -p 800 - LimitNOFILE - /lib/systemd/system/postgresql.service - -## Resources - -https://www.geekytidbits.com/performance-tuning-postgres/ - -Could try pgbadger to handle auto_explain type output. - -https://www.postgresql.org/docs/10/static/runtime-config-wal.html - -IA-specific resources: - https://git.archive.org/ia/mint/blob/master/postgres/postgres_9_2.yml - https://git.archive.org/ia/mint/blob/master/postgres/put_datadir_on_ssd.sh - https://git.archive.org/ia/mint/blob/master/postgres/templates/postgresql.conf.j2 - -For bulk inserts: -- make write-ahead-log larger (eg, 16MB. done.) -- transactions of ~1000+ inserts -- https://www.postgresql.org/docs/current/static/populate.html -- https://www.depesz.com/2007/07/05/how-to-insert-data-to-database-as-fast-as-possible/ -- https://stackoverflow.com/questions/12206600/how-to-speed-up-insertion-performance-in-postgresql - -## 2018-06-27 Measurements (pre-tuning) - -fatcat_prod=# select count(*) from release_ident; 20983019 -fatcat_prod=# select count(*) from work_ident; 20988140 -fatcat_prod=# select count(*) from file_ident; 1482335 -fatcat_prod=# select count(*) from creator_ident; 4167419 -fatcat_prod=# select count(*) from container_ident; 61793 - -select count(*) from release_contrib; 59798133 - -bnewbold@wbgrp-svc500$ sudo du -sh /var/lib/postgresql/ -43G - -running import-crossref with 20 threads, and manifest importer with one (at 33% -complete). had already imported ~7million works+releases previously. - - - PostgreSQL 10.4 - wbgrp-svc500.us.archive.org - postgres@localhost:5432/postgre - Size: 41.38G - 323.40K/s | TPS: 885 - Mem.: 50.80% - 23.86G/49.14G | IO Max: 79539/s - Swap: 0.80% - 408.89M/50.00G | Read : 67.04K/s - 16/s - Load: 6.69 7.41 7.69 | Write: 1.93M/s - 493/s - - -----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system-- -usr sys idl wai hiq siq| read writ| recv send| in out | int csw - 32 6 62 0 0 0| 296k 3880k| 334k 3144B| 0 0 | 21k 65k - 31 6 62 0 0 0| 0 3072k| 391k 318B| 0 0 | 51k 141k - 31 6 63 0 0 0| 16k 1080k| 344k 1988B| 0 0 | 35k 104k - 29 6 65 0 0 0| 136k 2608k| 175k 332B| 0 0 |9835 15k - 28 5 67 0 0 0| 408k 4368k| 285k 832B| 0 0 | 14k 17k - 33 5 62 0 0 0| 56k 3256k| 219k 99B| 0 0 | 22k 49k - 31 6 63 0 0 0| 188k 5120k| 158k 318B| 0 0 | 17k 29k - 30 6 64 0 0 0| 200k 6984k| 239k 988B| 0 0 | 16k 24k - 30 6 64 0 0 0| 168k 5504k| 159k 152B| 0 0 | 14k 20k - 28 7 65 0 0 0| 440k 12M| 236k 420B| 0 0 | 15k 18k - 29 6 65 0 0 0| 428k 6968k| 352k 310B| 0 0 | 19k 31k - 32 6 62 0 0 0| 64k 3480k| 288k 318B| 0 0 | 18k 55k - 32 6 62 0 0 0| 32k 2080k| 155k 318B| 0 0 | 20k 52k - - -bnewbold@wbgrp-svc500$ uptime - 22:00:42 up 28 days, 22:31, 6 users, load average: 7.94, 7.56, 7.72 - - -2018-06-27 21:57:36.102 UTC [401] LOG: checkpoints are occurring too frequently (13 seconds apart) -2018-06-27 21:57:36.102 UTC [401] HINT: Consider increasing the configuration parameter "max_wal_size". - - - relname | too_much_seq | case | rel_size | seq_scan | idx_scan ------------------+--------------+----------------+-------------+----------+----------- - changelog | 1274670 | Missing Index? 
| 39411712 | 1274670 | 0 - file_edit | 612386 | Missing Index? | 108298240 | 612386 | 0 - creator_edit | 612386 | Missing Index? | 285540352 | 612386 | 0 - container_edit | 612386 | Missing Index? | 4784128 | 612386 | 0 - release_edit | 612386 | Missing Index? | 1454489600 | 612386 | 0 - work_edit | 612386 | Missing Index? | 1454415872 | 612386 | 0 - release_contrib | 296675 | Missing Index? | 4725645312 | 296675 | 0 - release_ref | 296663 | Missing Index? | 8999837696 | 296663 | 0 - file_release | -113 | OK | 13918208 | 110 | 223 - container_rev | -979326 | OK | 16285696 | 63 | 979389 - file_ident | -3671516 | OK | 109002752 | 362 | 3671878 - file_rev | -3944155 | OK | 302940160 | 95 | 3944250 - creator_rev | -8420205 | OK | 318283776 | 1226 | 8421431 - creator_ident | -9525338 | OK | 309141504 | 52330 | 9577668 - container_ident | -20581876 | OK | 4833280 | 272457 | 20854333 - release_ident | -40548858 | OK | 1440948224 | 4160919 | 44709777 - work_rev | -42534913 | OK | 1124671488 | 1161 | 42536074 - editgroup | -48864662 | OK | 34136064 | 1 | 48864663 - work_ident | -65008911 | OK | 1503313920 | 1239 | 65010150 - release_rev | -185735794 | OK | 13649428480 | 128 | 185735922 - -## 2018-06-28 (after basic tuning + indexes) - -Early loading (manifest and 20x release): - - PostgreSQL 10.4 - wbgrp-svc500.us.archive.org - postgres@localhost:5432/postgres - Ref.: 2s - Size: 4.57G - 6.45M/s | TPS: 18812 - Mem.: 59.70% - 23.62G/49.14G | IO Max: 3601/s - Swap: 1.30% - 675.05M/50.00G | Read : 0.00B/s - 0/s - Load: 12.98 10.58 5.25 | Write: 2.65M/s - 677/s - - - PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND - 24045 webcrawl 20 0 908872 204948 11756 S 153.3 0.4 16:13.03 fatcatd - 24328 webcrawl 20 0 78148 45220 4324 R 87.1 0.1 8:44.16 perl - 24056 postgres 20 0 10.441g 3.906g 3.886g R 69.9 7.9 6:57.47 postgres - 24063 postgres 20 0 10.447g 3.899g 3.873g S 67.9 7.9 6:55.89 postgres - 24059 postgres 20 0 10.426g 3.888g 3.883g R 67.5 7.9 6:59.15 postgres - 24057 postgres 20 0 10.430g 3.883g 3.874g S 67.2 7.9 6:58.68 postgres - 24061 postgres 20 0 10.448g 3.909g 3.881g R 66.2 8.0 6:54.30 postgres - 24058 postgres 20 0 10.428g 3.883g 3.876g R 65.9 7.9 6:59.35 postgres - 24062 postgres 20 0 10.426g 5.516g 5.511g R 64.9 11.2 6:58.29 postgres - 24055 postgres 20 0 10.426g 3.878g 3.873g R 64.2 7.9 6:59.38 postgres - 24054 postgres 20 0 10.430g 5.499g 5.491g R 63.6 11.2 6:57.27 postgres - 24060 postgres 20 0 10.448g 3.900g 3.873g R 61.9 7.9 6:55.45 postgres - 21711 postgres 20 0 10.419g 5.762g 5.760g D 16.6 11.7 3:00.67 postgres - 21713 postgres 20 0 10.419g 21432 19512 S 11.3 0.0 3:25.11 postgres - 24392 webcrawl 20 0 5309636 400912 8696 S 7.9 0.8 0:53.18 python3 - 24383 webcrawl 20 0 5309436 400628 8648 S 7.6 0.8 0:52.29 python3 - 24387 webcrawl 20 0 5309776 402968 8620 S 7.3 0.8 0:52.81 python3 - 24394 webcrawl 20 0 5309624 400732 8644 S 7.3 0.8 0:53.30 python3 - 24384 webcrawl 20 0 5309916 400948 8600 S 7.0 0.8 0:53.18 python3 - -Still get a *lot* of: - - 2018-06-29 00:14:05.948 UTC [21711] LOG: checkpoints are occurring too frequently (1 second apart) - 2018-06-29 00:14:05.948 UTC [21711] HINT: Consider increasing the configuration parameter "max_wal_size". - -VACUUM is running basically continuously; should prevent that? 6 hours or -longer on release_rev and release_ref tables. An auto-approve batch method -would resovle this, I think (no update after insert). - -max_wal_size wasn't getting set correctly. 
- -The statements taking the most time are the complex inserts (multi-table -inserts); they take a fraction of a second though (mean less than a -milisecond). - -Manifest import runs really slow if release import is concurrent; much faster -to wait until release import is done first (like a factor of 10x or more). - -With some 60 million releases: - - bnewbold@wbgrp-svc500$ sudo du -sh /var/lib/postgresql/ - 184G /var/lib/postgresql/ - -TODO: slow query log doesn't seem to be working (let alone auto_explain) - diff --git a/notes/schema/alignments.csv b/notes/schema/alignments.csv new file mode 100644 index 00000000..b8619ddc --- /dev/null +++ b/notes/schema/alignments.csv @@ -0,0 +1,57 @@ +fatcat entity,fatcat field,description,Crossref,ORCID,Bibtex,CSL,BIBFRAME,Dublin Core (DCMES),Dublin Core (DCMI),oai_dc (OAI-PMH),resourceSync,Highwire Press (google scholar),MEDLINE (?) +WORK,,,,,,,Work,,,,,, +,work_type,,,,,,,Type,,,,, +RELEASE,,,,,,,Instance,,,,,, +,title,,title,,title,,,Title,,title,,citation_title, +,release_type,,type (???),,object type; howpublished,,,Type,,,,, +,release_status,"NEW; eg, preprint, final",,,,,,,,,,, +,container,,ISSN (via lookup),,ISSN (via lookup),,,Relation (?),,,,citation_issn (normalized), +,release_date,RENAME,published-print or published-online,,"year, month",issued,,Date,Issued (?),,,citation_publication_date, +,volume,,volume,,volume,,,,,,,citation_volume, +,pages,,page,,pages,,,,,,,"citation_firstpage, citation_lastpage", +,issue,,issue,,,,,,,,,citation_issue, +,doi,always lower-case,DOI,,doi,DOI,,Identifier,,,,citation_doi, +,isbn13,,ISBN (converted),,,ISBN,,Identifier,,,,citation_isbn, +,publisher,,publisher,,publisher,,,Publisher,,,,"citation_dissertation_institution, citation_technical_report_institution", +,language,NEW; RFC1766 (ISO 639-1 superset); may transition,,,,,,Language,,,,, +,ref:index,,reference:[index],,,Citation-number,,,,,,, +,ref:key,NEW,reference:key,,,citation-label,,,,,,, +,ref:raw,RENAME,reference:unstructured,,,,,,,,,, +,ref:container_title,NEW,reference:journal-title,,,,,,,,,, +,ref:year,NEW,reference:year,,,,,,,,,, +,ref:title,NEW,,,,,,,,,,, +,ref:locator,"NEW; (aka, page number)",,,,,,,,,,, +,contrib:role,,,,,,Role,,,,,, +,contrib:index,,author[index],,,,,,,,,, +,contrib:raw,RENAME,Author:{given+family},,author,,,,,,,, +,extra:number,tech reports,,,,,,,,,,citation_technical_report_number, +,extra:institution,for tech reports and dissertations,,,,,,,,,,, +,extra:abstract,,,,,,,Description,,,,citation_abstract, +,extra:pmcid,,,,,,,,,,,, +,extra:pmid,,,,,,,,,,,, +,extra:version,"eg, for software",,,,,,,,,,, +,,,,,,,,,,,,, +CREATOR,,,,,,,Agent,Creator / Contributor,,,,, +,display_name,NEW; usually western/latinized ,,.name.credit-name.value (?),,,,,,,,citation_author, +,orcid,,,.path,,,,,,,,, +,given_name,NEW,,.name.given-names.value,,,,,,,,, +,surname,NEW,,.name.family-names.value,,,,,,,,, +,extra:viaf,,,,,,,,,,,, +,extra:alt_names[],,,,,,,,,,,, +,,,,,,,,,,,,, +CONTAINER,,,,,,,N/A,relation.isPartOf,,,,, +,name,,,,,,,,,,,"citation_journal_title, citation_conference_title", +,publisher,,,,,,,,,,,, +,issnl,,,,,,,Identifier (?),,,,citation_issn (normalized), +,abbrev,,,,,,,,,,,citation_journal_abbrev, +,coden,,,,,,,,,,,, +,extra:issn-e,,,,,,,,,,,, +,extra:issn-p,,,,,,,,,,,, +FILE,,,,,,,Item,,,,,, +,release,,,,,,,,,,,, +,size,,,,,,,,,,length,, +,sha256,NEW,,,,,,,,,hash,, +,sha1,,,,,,,,,,hash,, +,md5,,,,,,,,,,hash,, +,url,,,,www_pdf_url,,,,,,href,citation_pdf_url, +,mimetype,NEW,,,,,,Format,,,type,, diff --git 
a/notes/schema/alignments.txt b/notes/schema/alignments.txt new file mode 100644 index 00000000..e2736268 --- /dev/null +++ b/notes/schema/alignments.txt @@ -0,0 +1,22 @@ +bibtex +BIBFRAME +schema.org: http://schema.org/CreativeWork +dublin core + +entity specific: +- crossref / release +- orcid / creator +- issn / container + + +## Dublin Core (original/simple) + +Usage guide: + +Quick descriptions of the "original 15" fields: + +## Citation Style Language + +Human-readable specification: + +Specifically, the "variables" and type definitions: diff --git a/notes/schema/contrib_types.txt b/notes/schema/contrib_types.txt new file mode 100644 index 00000000..01024b40 --- /dev/null +++ b/notes/schema/contrib_types.txt @@ -0,0 +1,14 @@ + +See also: + +author +editor +translator +illustrator +interviewer +reviewer + + +container-author +container-editor +reviewed-author diff --git a/notes/schema/work_release_types.txt b/notes/schema/work_release_types.txt new file mode 100644 index 00000000..6eff118b --- /dev/null +++ b/notes/schema/work_release_types.txt @@ -0,0 +1,28 @@ + +see also: + +basics: +- article-journal +- book +- chapter +- dataset +- manuscript +- paper-conference +- blog-post +- report +- thesis (aka, dissertation) + +- preprint +- conference proceeding +- book chapter +- technical report/memo +- dissertation/thesis +- blog post +- wiki page +- presentation (slides, recording) + +"publication status" instead of pre-print and manuscript types? + +abstracts as files? + +videos, slides diff --git a/notes/speed.txt b/notes/speed.txt deleted file mode 100644 index f885aea7..00000000 --- a/notes/speed.txt +++ /dev/null @@ -1,82 +0,0 @@ - -## Early Prototyping - -### 2018-04-23 - -- fatcat as marshmallow+sqlalchemy+flask, with API client -- no refs, contibs, files, release contribs, containers, etc -- no extra_json -- sqlite -- laptop -- editgroup every 250 edits - - - /data/crossref/crossref-works.2018-01-21.badsample_5k.json - - real 3m42.912s - user 0m20.448s - sys 0m2.852s - - ~22 lines per second - 12.5 hours per million - ~52 days for crossref (100 million) - -target: - crossref (100 million) loaded in 48 hours - 579 lines per second - this test in under 10 seconds - ... but could be in parallel - -same except postgres, via: - - docker run -p 5432:5432 postgres:latest - ./run.py --init-db --database-uri postgres://postgres@localhost:5432 - ./run.py --database-uri postgres://postgres@localhost:5432 - - API processing using 60-100% of a core. postgres 12% of a core; - docker-proxy similar (!). overall 70 of system CPU idle. - - real 2m27.771s - user 0m22.860s - sys 0m2.852s - -no profiling yet; need to look at database ops. probably don't even have any -indices! - -## Rust Updates (2018-05-23) - -Re-running with tweaked python code, 5k sample file, postgres 9.6 running locally (not in docker): - - real 2m27.598s - user 0m24.892s - sys 0m2.836s - -Using postgres and fatcat rust: - - real 0m44.443s - user 0m25.288s - sys 0m0.880s - -api_client about half a core; fatcatd 3x processes, about 10% each; postgres -very small. - -a bit faster, basically maxing out CPU: - - time cat /data/crossref/crossref-works.2018-01-21.badsample_5k.json | parallel -j4 --pipe ./fatcat_client.py --host-url http://localhost:9411 ic - - - real 0m28.998s - user 1m5.304s - sys 0m3.420s - - 200 lines per second; within a factor of 3; can perhaps hit target with - non-python client? - -python processes (clients) seem to be CPU limit in this case; all 4 cores -effectively maxed out. 
- -running python again in parallel mode: - - real 2m29.532s - user 0m47.692s - sys 0m4.840s - diff --git a/notes/work_release_types.txt b/notes/work_release_types.txt deleted file mode 100644 index 6eff118b..00000000 --- a/notes/work_release_types.txt +++ /dev/null @@ -1,28 +0,0 @@ - -see also: - -basics: -- article-journal -- book -- chapter -- dataset -- manuscript -- paper-conference -- blog-post -- report -- thesis (aka, dissertation) - -- preprint -- conference proceeding -- book chapter -- technical report/memo -- dissertation/thesis -- blog post -- wiki page -- presentation (slides, recording) - -"publication status" instead of pre-print and manuscript types? - -abstracts as files? - -videos, slides -- cgit v1.2.3