From 930c28e4e655966658190fa6c1ad118884ee34f6 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 17 Aug 2018 15:10:39 -0700
Subject: more auth thoughts

---
 notes/auth_thoughts.txt | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
(limited to 'notes')

diff --git a/notes/auth_thoughts.txt b/notes/auth_thoughts.txt
index 3ccaf668..4782dd0f 100644
--- a/notes/auth_thoughts.txt
+++ b/notes/auth_thoughts.txt
@@ -10,3 +10,45 @@ haven't been revoked.
 
 Could use portier with openid connect as an email-based option. Otherwise,
 orcid, github, google.
+---------
+
+Use macaroons!
+
+editor/user table has an "auth_epoch" timestamp; only macaroons generated
+after this timestamp are valid. revocation is done by bumping this
+timestamp ("touch").
+
+Rust CLI tool for managing users:
+- create editor
+
+Special users/editors that can create editor accounts via API; eg, one for
+fatcat-web.
+
+Associate one oauth2 id per domain per editor/user.
+
+Users come to fatcat-web and do oauth2 to login or create an account. All
+oauth2 internal to fatcat-web. If successful, fatcat-web does an
+(authenticated) lookup to API for that identifier. If found, requests a
+new macaroon to use as a cookie for auth. All future requests pass this
+cookie through as bearer auth. fatcat-web remains stateless! macaroon
+contains username (for display); no lookup per page. Need to logout/login
+for this to update?
+
+Later, can do an "add additional account" feature.
+
+Backend:
+- oauth2 account table, foreign key to editor table
+  => this is the only private table
+- auth_epoch timestamp column on editor table
+- lock editor by setting auth_epoch to deep future
+
+TODO: privacy policy
+
+fatcat API doesn't *require* auth, but if auth is provided, it will check
+the macaroon, and validate it against the editor table's timestamp.
+
+support oauth2 against:
+- orcid
+- git.archive.org
+- github
+? google
-- cgit v1.2.3


From 3c7d392bc4b36e8e5890d2ccb292f74e6b988e55 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 20 Aug 2018 14:49:06 -0700
Subject: notes on recent bulk import

---
 notes/import_timing_20180815.txt | 292 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 292 insertions(+)
 create mode 100644 notes/import_timing_20180815.txt
(limited to 'notes')

diff --git a/notes/import_timing_20180815.txt b/notes/import_timing_20180815.txt
new file mode 100644
index 00000000..1206cc41
--- /dev/null
+++ b/notes/import_timing_20180815.txt
@@ -0,0 +1,292 @@
+
+Schema changes since previous imports:
+- more fields (identifiers+indexes)
+- timestamps
+- UUIDs more places
+- fixed some crossref import bugs?
+- abstracts
+- file_urls as table (not single value)
+- TEXT -> CHAR in a few places
+- removed many work fields
+
+## Containers
+
+(python)webcrawl@wbgrp-svc500:/srv/fatcat/src/python$ time ./fatcat_import.py import-issn /srv/datasets/journal_extra_metadata.csv
+
+real    1m25.292s
+user    0m12.640s
+sys     0m0.412s
+
+## Creators
+
+time parallel --bar --pipepart -j8 -a /srv/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid -
+
+(times invalid due to hangs; got 3537837 creators, which is most of the way,
+so *shrug*)
+real    22m2.465s
+user    26m41.924s
+sys     1m33.844s
+
+## Releases
+
+xzcat /srv/datasets/crossref-works.2018-01-21.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py import-crossref - /srv/datasets/20180216.ISSN-to-ISSN-L.txt
+
+    128516.30 user
+    3905.14 system
+    44:17:05 elapsed
+    83% CPU
+
+Almost 44 hours... I think I remember more like 36 hours last time? Things
+slowed down a lot towards the end; many more ORCID cross-references?
+
+Looking in htop, postgres seems to be the primary bottleneck. At something
+like 12 hours in, had 44 million release_ident rows, which is about
+1000/second.
+
+Note: it seems like the more frequently `count(*)` is run, the faster it
+gets. Because the data ends up in memory?
+
+    2018-08-16 16:54:16.977 UTC [17996] postgres@fatcat_prod LOG: duration: 42949.549 ms statement: select count(id) from release_ident;
+
+    fatcat_prod=# select count(*) from release_ident;
+      count
+    ----------
+     44185608
+    (1 row)
+
+    Time: 2753.916 ms (00:02.754)
+    fatcat_prod=# select count(*) from release_ident;
+      count
+    ----------
+     44187937
+    (1 row)
+
+    Time: 2711.670 ms (00:02.712)
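+
+One way to test the "in memory?" hunch (a sketch; plain postgres tooling,
+nothing fatcat-specific):
+
+    -- BUFFERS shows how much of the scan was served from shared_buffers
+    -- ("shared hit=...") versus read from disk ("read=...")
+    EXPLAIN (ANALYZE, BUFFERS) SELECT count(*) FROM release_ident;
+
+A mostly-"hit" scan on repeat runs would explain the counts getting faster.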
+
+As expected, autovacuum is very busy. Only ~150 TPS; but that includes batch
+writes? 75061172 rows.
+
+## Files
+
+    time ./fatcat_import.py import-manifest /srv/datasets/idents_files_urls.sqlite
+
+    Done! Inserted 6607075
+
+    real    2152m28.822s => 36 hours (!)
+    user    401m46.464s
+    sys     21m45.724s
+
+Going pretty slow, < 100 transactions/sec. Lots of SELECTs, which seem slow,
+on the abstract table?
+
+    SELECT "release_rev_abstract"."id", "release_rev_abstract"."release_rev", "release_rev_abstract"."abstract_sha1", "release_rev_abstract"."mimetype", "release_rev_abstract"."lang", "abstracts"."sha1", "abstracts"."content" FROM ("release_rev_abstract" INNER JOIN "abstracts" ON "release_rev_abstract"."abstract_sha1" = "abstracts"."sha1") WHERE "release_rev_abstract"."release_rev" = 'ffffffc0-4dd2-47ce-a51d-44051f3699ce';
+
+Created index:
+
+    CREATE INDEX release_rev_abstract_rev_idx ON release_rev_abstract(release_rev);
+
+... and things sped way up. Re-ran some crossref imports to EXPLAIN and didn't
+see non-indexed queries. Maybe an ANALYZE does need to happen?
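+
+If so, something like this would confirm it (a sketch; the UUID is just the
+one from the slow query above):
+
+    -- refresh planner stats for the table, then check the plan
+    ANALYZE release_rev_abstract;
+    EXPLAIN SELECT * FROM release_rev_abstract
+        WHERE release_rev = 'ffffffc0-4dd2-47ce-a51d-44051f3699ce';
+    -- want "Index Scan using release_rev_abstract_rev_idx", not "Seq Scan"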
+
+This being single-threaded is going to be a problem in the future. ~50 million
+files would be ~2 weeks.
+
+## Post-Import Status
+
+    Size:  358.89G (postgres self-reported)
+    Mem.:  57.10% - 16.85G/49.14G
+
+Was 184G last time in late June; doubled in size (!).
+
+    bnewbold@wbgrp-svc500$ df -h /
+    Filesystem      Size  Used Avail Use% Mounted on
+    /dev/vda1       858G  529G  286G  65% /
+
+    bnewbold@wbgrp-svc500$ sudo du -sh /var/lib/postgresql/ /srv/datasets/ /srv/elastic-blah/
+    361G    /var/lib/postgresql/
+    83G     /srv/datasets/
+    77G     /srv/elastic-blah/
+
+    fatcat_prod=# select count(*) from changelog; => 2,085,067
+
+    SELECT
+        table_name,
+        pg_size_pretty(table_size) AS table_size,
+        pg_size_pretty(indexes_size) AS indexes_size,
+        pg_size_pretty(total_size) AS total_size
+    FROM (
+        SELECT
+            table_name,
+            pg_table_size(table_name) AS table_size,
+            pg_indexes_size(table_name) AS indexes_size,
+            pg_total_relation_size(table_name) AS total_size
+        FROM (
+            SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+            FROM information_schema.tables
+        ) AS all_tables
+        ORDER BY total_size DESC
+    ) AS pretty_sizes;
+
+             table_name             | table_size | indexes_size | total_size
+ -----------------------------------+------------+--------------+------------
+  "public"."release_ref"            | 159 GB     | 47 GB        | 206 GB
+  "public"."release_rev"            | 40 GB      | 10 GB        | 51 GB
+  "public"."release_contrib"        | 19 GB      | 20 GB        | 39 GB
+  "public"."release_ident"          | 5797 MB    | 6597 MB      | 12 GB
+  "public"."work_ident"             | 5787 MB    | 6394 MB      | 12 GB
+  "public"."release_edit"           | 6674 MB    | 4646 MB      | 11 GB
+  "public"."work_edit"              | 6674 MB    | 4646 MB      | 11 GB
+  "public"."work_rev"               | 3175 MB    | 2939 MB      | 6114 MB
+  "public"."file_rev_url"           | 1995 MB    | 275 MB       | 2270 MB
+  "public"."abstracts"              | 1665 MB    | 135 MB       | 1800 MB
+  "public"."file_rev"               | 829 MB     | 954 MB       | 1783 MB
+  "public"."file_ident"             | 498 MB     | 532 MB       | 1030 MB
+  "public"."file_release"           | 369 MB     | 642 MB       | 1011 MB
+  "public"."file_edit"              | 591 MB     | 410 MB       | 1002 MB
+  "public"."creator_rev"            | 337 MB     | 318 MB       | 655 MB
+  "public"."creator_ident"          | 280 MB     | 297 MB       | 577 MB
+  "public"."creator_edit"           | 316 MB     | 220 MB       | 536 MB
+  "public"."release_rev_abstract"   | 183 MB     | 84 MB        | 267 MB
+  "public"."changelog"              | 123 MB     | 125 MB       | 249 MB
+  "public"."editgroup"              | 139 MB     | 81 MB        | 220 MB
+  "public"."container_rev"          | 19 MB      | 6912 kB      | 26 MB
+  "public"."container_ident"        | 6896 kB    | 7016 kB      | 14 MB
+  "public"."container_edit"         | 8056 kB    | 5240 kB      | 13 MB
+
+In context, the full uncompressed crossref 2018-01-21 dump is about 285 GB.
+
+For many of these indexes, and the _ident tables, switching from UUID to
+BIGSERIAL would halve the size (8 bytes per value instead of 16).
+
+## Exports
+
+    time ./fatcat_export.py changelog - | pv -l | wc
+
+    As of:
+
+        159k 1:17:35 [34.3 /s]
+        159,740 lines
+        2,427,277,881 chars (bytes; 2.4GB)
+
+    real    77m35.183s
+    user    15m36.208s
+    sys     0m31.484s
+
+Running at about 100/sec; estimate 6 hours for completion. Could shard using
+start/end flags, but won't bother for now.
+
+Running `quick_dump.sql` (identifier tables, in a transaction):
+
+    251M  Aug 19 23:08  fatcat_ident_creators.tsv
+    5.9M  Aug 19 23:08  fatcat_ident_containers.tsv
+    467M  Aug 19 23:08  fatcat_ident_files.tsv
+    5.2G  Aug 19 23:10  fatcat_ident_releases.tsv
+    5.2G  Aug 19 23:11  fatcat_ident_works.tsv
+    12K   Aug 19 23:11  .
+    1.8G  Aug 19 23:12  fatcat_abstracts.json
+
+Work and Release tables in under 2 minutes each; say 5 minutes total.
+
+    time ./fatcat_export.py releases /tmp/fatcat_ident_releases.tsv - | pv -l | wc
+
+    172k 1:07:08 [42.7 /s]
+    172181 lines
+    1,118,166,293 chars (bytes; 1.1 GB)
+
+    real    67m8.340s
+    user    10m21.988s
+    sys     0m34.612s
+
+Running at only 10/sec or so, this would take forever even if sharded. :(
+
+Both exports/dumps are running in parallel. "Expand" queries might help with
+speed?
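+
+Roughly what an ident-table dump like `quick_dump.sql` boils down to (a
+sketch only; the paths and table list here are illustrative, the real file
+may differ):
+
+    -- one REPEATABLE READ transaction => consistent snapshot across tables
+    BEGIN ISOLATION LEVEL REPEATABLE READ;
+    COPY release_ident TO '/tmp/fatcat_ident_releases.tsv';
+    COPY work_ident    TO '/tmp/fatcat_ident_works.tsv';
+    -- ... same for creator/container/file ident tables ...
+    COMMIT;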
"Expand" queries might help with speed? + +## Postgres Analysis + +SELECT * +FROM + pg_stat_statements +ORDER BY + total_time DESC LIMIT 5; + +Summary: + + SELECT "creator_ident" by ORCID + 1,295,864 calls + 930,305,208 total time + 717.9 mean time <= this should be less than a ms! + + INSERT INTO release_rev + 75144055 calls + 111470961 total time + 1.483 mean time + + INSERT INTO work_rev + 75,144,055 calls + 82693994 total time + 1.1 mean time + + INSERT INTO release_contrib (creator_ident_id = DEFAULT) RETURNING * + 26,008,280 calls <= why so few? different query depending on number + of rows inserted + 18955782 total time + 0.728 mean time + + SELECT container_ident + 78,4143 calls + 17683156 total time + 22.55 mean time <= why so slow? + + INSERT INTO release_contrib + 15,072,820 calls + + INSERT INTO "release_contrib + + + relname | too_much_seq | case | rel_size | seq_scan | idx_scan +----------------------+--------------+----------------+--------------+----------+----------- + file_rev_url | 2391 | Missing Index? | 2091147264 | 2391 | 0 + file_release | -30670 | OK | 386899968 | 2 | 30672 + container_rev | -979948 | OK | 20242432 | 784146 | 1764094 + file_edit | -2206807 | OK | 619896832 | 6 | 2206813 + creator_edit | -2206810 | OK | 331079680 | 11 | 2206821 + work_edit | -2206811 | OK | 6996566016 | 14 | 2206825 + release_edit | -2206811 | OK | 6996582400 | 14 | 2206825 + container_edit | -2206816 | OK | 8216576 | 5 | 2206821 + changelog | -2209659 | OK | 129286144 | 10 | 2209669 + abstracts | -3486466 | OK | 1706237952 | 8 | 3486474 + release_rev_abstract | -4975493 | OK | 191602688 | 42919 | 5018412 + release_ref | -5032717 | OK | 170494861312 | 3 | 5032720 + release_contrib | -5032744 | OK | 20370251776 | 3 | 5032747 + creator_rev | -8400410 | OK | 353583104 | 1296507 | 9696917 + file_ident | -13483224 | OK | 522190848 | 7 | 13483231 + creator_ident | -16686744 | OK | 293625856 | 3 | 16686747 + file_rev | -32405557 | OK | 868515840 | 4 | 32405561 + container_ident | -69162337 | OK | 7028736 | 3 | 69162340 + work_rev | -150288161 | OK | 3328589824 | 1 | 150288162 + editgroup | -162783807 | OK | 146112512 | 9 | 162783816 + release_ident | -165676917 | OK | 6076841984 | 52 | 165676969 + work_ident | -229439828 | OK | 6066814976 | 3 | 229439831 + release_rev | -930140217 | OK | 43360542720 | 9 | 930140226 + +TODO changes: +- don't return all as often; in particular, inserting release_contrib, release_ref +x missing an index somewhere on file_rev_url, release_rev_abstract +x why so many seq_scan on container_rev, creator_rev + => running/EXPLAIN same query on psql hits index, not seq_scan + => seemed to be an issue with VALUE params getting sent separately; query + planner only looked at query and wasn't using index on ORCID/ISSN-L because + it didn't know those values were not-NULL? + => adding NOT NULL to query seems to have sped up case of there being a + "hit", but no hit still slow. 
+
+random DEBUG queries:
+
+    EXPLAIN ANALYSE SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_ident"."is_live" = true AND "creator_ident"."redirect_id" IS NULL LIMIT 1;
+
+    EXPLAIN VERBOSE SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_ident"."is_live" = true AND "creator_ident"."redirect_id" IS NULL VALUES ('0000-0002-8867-1669') LIMIT 1;
+
+    EXPLAIN SELECT "container_ident"."id", "container_ident"."is_live", "container_ident"."rev_id", "container_ident"."redirect_id", "container_rev"."id", "container_rev"."extra_json", "container_rev"."name", "container_rev"."publisher", "container_rev"."issnl", "container_rev"."wikidata_qid", "container_rev"."abbrev", "container_rev"."coden" FROM ("container_ident" INNER JOIN "container_rev" ON "container_ident"."rev_id" = "container_rev"."id") WHERE "container_rev"."issnl" = '0001-0782' AND "container_ident"."is_live" = true AND "container_ident"."redirect_id" IS NULL LIMIT 1;
+
+    SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1;
-- cgit v1.2.3


From 54fbdd96193f82adeb3d92095a6955656c67f5e3 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 20 Aug 2018 14:49:22 -0700
Subject: cost notes on hosting a fatcat mirror

---
 notes/cloud_instances.txt | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 notes/cloud_instances.txt
(limited to 'notes')

diff --git a/notes/cloud_instances.txt b/notes/cloud_instances.txt
new file mode 100644
index 00000000..4582c431
--- /dev/null
+++ b/notes/cloud_instances.txt
@@ -0,0 +1,8 @@
+
+digital ocean
+    48 GB RAM, 12 cores, 960 GB    $240/month (or more)
+
+aws
+    i3.2xlarge  61 GB RAM, 8 cores, 1900 GB NVMe, $455/month
+
-- cgit v1.2.3


From 8528f06157b0e60842c860f81e3f2a69aa07aae9 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 24 Aug 2018 12:59:28 -0700
Subject: WIP: autoaccept

---
 TODO                                          | 34 +++++-----
 fatcat-openapi2.yml                           | 25 +++++++
 notes/cloud_instances.txt                     |  2 +
 rust/fatcat-api/README.md                     |  2 +-
 rust/fatcat-api/api.yaml                      | 25 +++++++
 rust/fatcat-api/api/swagger.yaml              | 40 ++++++++++++
 rust/fatcat-api/examples/client.rs            | 10 +--
 rust/fatcat-api/examples/server_lib/server.rs | 48 +++++++++++---
 rust/fatcat-api/src/client.rs                 | 39 ++++++++---
 rust/fatcat-api/src/lib.rs                    | 93 +++++++++++++++++++++------
 rust/fatcat-api/src/server.rs                 | 15 +++--
 11 files changed, 269 insertions(+), 64 deletions(-)
(limited to 'notes')

diff --git a/TODO b/TODO
index d5e10629..b7aa470a 100644
--- a/TODO
+++ b/TODO
@@ -1,38 +1,43 @@
 ## Next Up
 
-- some significant slow-down has happened? transactions, or regexes?
+summer roadmap:
+- PUT/UPDATE, DELETE, and merge code paths
+- faster UPDATE-free bulk import code path
+- container import (extra?): lang, region, subject
+- basic API+webface creation, editing, merging, editgroup approval
+- elastic schema/transform for releases; bulk and continuous scripts
 
 features:
 - fast database dump command: both changelog-based and entity-based (rust)
   => lighter, more complete dumps for each entity type?
+- guide skeleton (mdbook; guide.fatcat.wiki)
 
 importers:
+- CORE
+- wikidata cross-ref (if they have a dump)
 - manifest: multiple URLs per SHA1
-- pubmed (medline)
+- pubmed (medline), if not in CORE
   => and/or, use pubmed ID lookups on crossref import
-- core
 - semantic scholar (up to 39 million; author de-dupe)
-- wikidata (if they have a dump)
 
 bugs:
 - test: release pointing to a collection that has been deleted/redirected
   => UI crash?
 
-july roadmap:
-- complete and test this round of schema changes
-- container import (extra?): lang, region, subject
-- re-run imports
-- basic API+webface creation, editing, merging, editgroup approval
-- elastic schema/transform for releases; bulk and continuous scripts
-
 ## Schema / Alignment / Scope
 
 - "container" -> "venue"?
-- release_type, release_status, url.rel enums (and others?)
+- release_type, release_status, url.rel write-time schema (and others?)
 
 name ref: https://www.w3.org/International/questions/qa-personal-names
 
+## API
+
+- how to send edit "extra" metadata?
+- hydrate entities in API
+  ? 
"expand" query param - ## Other - basic python hbase/elastic matcher diff --git a/fatcat-openapi2.yml b/fatcat-openapi2.yml index fda630dd..ea17e982 100644 --- a/fatcat-openapi2.yml +++ b/fatcat-openapi2.yml @@ -447,6 +447,11 @@ paths: type: boolean required: false description: "If true, and editor is authorized, batch is accepted all at once" + - name: editgroup + in: query + type: string + required: false + description: "Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True)" - name: entity_list in: body required: true @@ -540,6 +545,11 @@ paths: type: boolean required: false description: "If true, and editor is authorized, batch is accepted all at once" + - name: editgroup + in: query + type: string + required: false + description: "Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True)" - name: entity_list in: body required: true @@ -649,6 +659,11 @@ paths: type: boolean required: false description: "If true, and editor is authorized, batch is accepted all at once" + - name: editgroup + in: query + type: string + required: false + description: "Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True)" - name: entity_list in: body required: true @@ -742,6 +757,11 @@ paths: type: boolean required: false description: "If true, and editor is authorized, batch is accepted all at once" + - name: editgroup + in: query + type: string + required: false + description: "Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True)" - name: entity_list in: body required: true @@ -851,6 +871,11 @@ paths: type: boolean required: false description: "If true, and editor is authorized, batch is accepted all at once" + - name: editgroup + in: query + type: string + required: false + description: "Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True)" - name: entity_list in: body required: true diff --git a/notes/cloud_instances.txt b/notes/cloud_instances.txt index 4582c431..b7071758 100644 --- a/notes/cloud_instances.txt +++ b/notes/cloud_instances.txt @@ -6,3 +6,5 @@ digital ocean aws i3.2xlarge 61 GB RAM, 8 cores, 1900 GB NVMe, $455/month +OVH + MG-128 128 GB RAM, 16 cores, 2880 GB SSD (RAID), 500mbps unlimited b/w, $315/month diff --git a/rust/fatcat-api/README.md b/rust/fatcat-api/README.md index d0b266aa..7e4a2ec8 100644 --- a/rust/fatcat-api/README.md +++ b/rust/fatcat-api/README.md @@ -13,7 +13,7 @@ To see how to make this your own, look here: [README](https://github.com/swagger-api/swagger-codegen/blob/master/README.md) - API version: 0.1.0 -- Build date: 2018-08-20T08:47:01.260Z +- Build date: 2018-08-22T00:54:09.323Z This autogenerated project defines an API crate `fatcat` which contains: * An `Api` trait defining the API in Rust. 
diff --git a/rust/fatcat-api/api.yaml b/rust/fatcat-api/api.yaml index fda630dd..ea17e982 100644 --- a/rust/fatcat-api/api.yaml +++ b/rust/fatcat-api/api.yaml @@ -447,6 +447,11 @@ paths: type: boolean required: false description: "If true, and editor is authorized, batch is accepted all at once" + - name: editgroup + in: query + type: string + required: false + description: "Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True)" - name: entity_list in: body required: true @@ -540,6 +545,11 @@ paths: type: boolean required: false description: "If true, and editor is authorized, batch is accepted all at once" + - name: editgroup + in: query + type: string + required: false + description: "Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True)" - name: entity_list in: body required: true @@ -649,6 +659,11 @@ paths: type: boolean required: false description: "If true, and editor is authorized, batch is accepted all at once" + - name: editgroup + in: query + type: string + required: false + description: "Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True)" - name: entity_list in: body required: true @@ -742,6 +757,11 @@ paths: type: boolean required: false description: "If true, and editor is authorized, batch is accepted all at once" + - name: editgroup + in: query + type: string + required: false + description: "Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True)" - name: entity_list in: body required: true @@ -851,6 +871,11 @@ paths: type: boolean required: false description: "If true, and editor is authorized, batch is accepted all at once" + - name: editgroup + in: query + type: string + required: false + description: "Editgroup to auto-accept and apply to all entities (required if 'autoaccept' is True)" - name: entity_list in: body required: true diff --git a/rust/fatcat-api/api/swagger.yaml b/rust/fatcat-api/api/swagger.yaml index 11f789dc..3b8ed6e3 100644 --- a/rust/fatcat-api/api/swagger.yaml +++ b/rust/fatcat-api/api/swagger.yaml @@ -84,6 +84,14 @@ paths: type: "boolean" formatString: "{:?}" example: "Some(true)" + - name: "editgroup" + in: "query" + description: "Editgroup to auto-accept and apply to all entities (required\ + \ if 'autoaccept' is True)" + required: false + type: "string" + formatString: "{:?}" + example: "Some(\"editgroup_example\".to_string())" - in: "body" name: "entity_list" required: true @@ -387,6 +395,14 @@ paths: type: "boolean" formatString: "{:?}" example: "Some(true)" + - name: "editgroup" + in: "query" + description: "Editgroup to auto-accept and apply to all entities (required\ + \ if 'autoaccept' is True)" + required: false + type: "string" + formatString: "{:?}" + example: "Some(\"editgroup_example\".to_string())" - in: "body" name: "entity_list" required: true @@ -744,6 +760,14 @@ paths: type: "boolean" formatString: "{:?}" example: "Some(true)" + - name: "editgroup" + in: "query" + description: "Editgroup to auto-accept and apply to all entities (required\ + \ if 'autoaccept' is True)" + required: false + type: "string" + formatString: "{:?}" + example: "Some(\"editgroup_example\".to_string())" - in: "body" name: "entity_list" required: true @@ -1044,6 +1068,14 @@ paths: type: "boolean" formatString: "{:?}" example: "Some(true)" + - name: "editgroup" + in: "query" + description: "Editgroup to auto-accept and apply to all entities (required\ + \ if 'autoaccept' is True)" + required: false + type: "string" + formatString: 
"{:?}" + example: "Some(\"editgroup_example\".to_string())" - in: "body" name: "entity_list" required: true @@ -1398,6 +1430,14 @@ paths: type: "boolean" formatString: "{:?}" example: "Some(true)" + - name: "editgroup" + in: "query" + description: "Editgroup to auto-accept and apply to all entities (required\ + \ if 'autoaccept' is True)" + required: false + type: "string" + formatString: "{:?}" + example: "Some(\"editgroup_example\".to_string())" - in: "body" name: "entity_list" required: true diff --git a/rust/fatcat-api/examples/client.rs b/rust/fatcat-api/examples/client.rs index 06519232..34653196 100644 --- a/rust/fatcat-api/examples/client.rs +++ b/rust/fatcat-api/examples/client.rs @@ -95,7 +95,7 @@ fn main() { // println!("{:?} (X-Span-ID: {:?})", result, client.context().x_span_id.clone().unwrap_or(String::from(""))); // }, Some("CreateContainerBatch") => { - let result = client.create_container_batch(&Vec::new(), Some(true)).wait(); + let result = client.create_container_batch(&Vec::new(), Some(true), Some("editgroup_example".to_string())).wait(); println!("{:?} (X-Span-ID: {:?})", result, client.context().x_span_id.clone().unwrap_or(String::from(""))); } @@ -105,7 +105,7 @@ fn main() { // println!("{:?} (X-Span-ID: {:?})", result, client.context().x_span_id.clone().unwrap_or(String::from(""))); // }, Some("CreateCreatorBatch") => { - let result = client.create_creator_batch(&Vec::new(), Some(true)).wait(); + let result = client.create_creator_batch(&Vec::new(), Some(true), Some("editgroup_example".to_string())).wait(); println!("{:?} (X-Span-ID: {:?})", result, client.context().x_span_id.clone().unwrap_or(String::from(""))); } @@ -121,7 +121,7 @@ fn main() { // println!("{:?} (X-Span-ID: {:?})", result, client.context().x_span_id.clone().unwrap_or(String::from(""))); // }, Some("CreateFileBatch") => { - let result = client.create_file_batch(&Vec::new(), Some(true)).wait(); + let result = client.create_file_batch(&Vec::new(), Some(true), Some("editgroup_example".to_string())).wait(); println!("{:?} (X-Span-ID: {:?})", result, client.context().x_span_id.clone().unwrap_or(String::from(""))); } @@ -131,7 +131,7 @@ fn main() { // println!("{:?} (X-Span-ID: {:?})", result, client.context().x_span_id.clone().unwrap_or(String::from(""))); // }, Some("CreateReleaseBatch") => { - let result = client.create_release_batch(&Vec::new(), Some(true)).wait(); + let result = client.create_release_batch(&Vec::new(), Some(true), Some("editgroup_example".to_string())).wait(); println!("{:?} (X-Span-ID: {:?})", result, client.context().x_span_id.clone().unwrap_or(String::from(""))); } @@ -141,7 +141,7 @@ fn main() { // println!("{:?} (X-Span-ID: {:?})", result, client.context().x_span_id.clone().unwrap_or(String::from(""))); // }, Some("CreateWorkBatch") => { - let result = client.create_work_batch(&Vec::new(), Some(true)).wait(); + let result = client.create_work_batch(&Vec::new(), Some(true), Some("editgroup_example".to_string())).wait(); println!("{:?} (X-Span-ID: {:?})", result, client.context().x_span_id.clone().unwrap_or(String::from(""))); } diff --git a/rust/fatcat-api/examples/server_lib/server.rs b/rust/fatcat-api/examples/server_lib/server.rs index 32c7e97f..60e19847 100644 --- a/rust/fatcat-api/examples/server_lib/server.rs +++ b/rust/fatcat-api/examples/server_lib/server.rs @@ -38,13 +38,15 @@ impl Api for Server { &self, entity_list: &Vec, autoaccept: Option, + editgroup: Option, context: &Context, ) -> Box + Send> { let context = context.clone(); println!( - 
"create_container_batch({:?}, {:?}) - X-Span-ID: {:?}", + "create_container_batch({:?}, {:?}, {:?}) - X-Span-ID: {:?}", entity_list, autoaccept, + editgroup, context.x_span_id.unwrap_or(String::from("")).clone() ); Box::new(futures::failed("Generic failure".into())) @@ -56,12 +58,19 @@ impl Api for Server { Box::new(futures::failed("Generic failure".into())) } - fn create_creator_batch(&self, entity_list: &Vec, autoaccept: Option, context: &Context) -> Box + Send> { + fn create_creator_batch( + &self, + entity_list: &Vec, + autoaccept: Option, + editgroup: Option, + context: &Context, + ) -> Box + Send> { let context = context.clone(); println!( - "create_creator_batch({:?}, {:?}) - X-Span-ID: {:?}", + "create_creator_batch({:?}, {:?}, {:?}) - X-Span-ID: {:?}", entity_list, autoaccept, + editgroup, context.x_span_id.unwrap_or(String::from("")).clone() ); Box::new(futures::failed("Generic failure".into())) @@ -79,12 +88,19 @@ impl Api for Server { Box::new(futures::failed("Generic failure".into())) } - fn create_file_batch(&self, entity_list: &Vec, autoaccept: Option, context: &Context) -> Box + Send> { + fn create_file_batch( + &self, + entity_list: &Vec, + autoaccept: Option, + editgroup: Option, + context: &Context, + ) -> Box + Send> { let context = context.clone(); println!( - "create_file_batch({:?}, {:?}) - X-Span-ID: {:?}", + "create_file_batch({:?}, {:?}, {:?}) - X-Span-ID: {:?}", entity_list, autoaccept, + editgroup, context.x_span_id.unwrap_or(String::from("")).clone() ); Box::new(futures::failed("Generic failure".into())) @@ -96,12 +112,19 @@ impl Api for Server { Box::new(futures::failed("Generic failure".into())) } - fn create_release_batch(&self, entity_list: &Vec, autoaccept: Option, context: &Context) -> Box + Send> { + fn create_release_batch( + &self, + entity_list: &Vec, + autoaccept: Option, + editgroup: Option, + context: &Context, + ) -> Box + Send> { let context = context.clone(); println!( - "create_release_batch({:?}, {:?}) - X-Span-ID: {:?}", + "create_release_batch({:?}, {:?}, {:?}) - X-Span-ID: {:?}", entity_list, autoaccept, + editgroup, context.x_span_id.unwrap_or(String::from("")).clone() ); Box::new(futures::failed("Generic failure".into())) @@ -113,12 +136,19 @@ impl Api for Server { Box::new(futures::failed("Generic failure".into())) } - fn create_work_batch(&self, entity_list: &Vec, autoaccept: Option, context: &Context) -> Box + Send> { + fn create_work_batch( + &self, + entity_list: &Vec, + autoaccept: Option, + editgroup: Option, + context: &Context, + ) -> Box + Send> { let context = context.clone(); println!( - "create_work_batch({:?}, {:?}) - X-Span-ID: {:?}", + "create_work_batch({:?}, {:?}, {:?}) - X-Span-ID: {:?}", entity_list, autoaccept, + editgroup, context.x_span_id.unwrap_or(String::from("")).clone() ); Box::new(futures::failed("Generic failure".into())) diff --git a/rust/fatcat-api/src/client.rs b/rust/fatcat-api/src/client.rs index d71c9dab..628d8894 100644 --- a/rust/fatcat-api/src/client.rs +++ b/rust/fatcat-api/src/client.rs @@ -294,15 +294,18 @@ impl Api for Client { &self, param_entity_list: &Vec, param_autoaccept: Option, + param_editgroup: Option, context: &Context, ) -> Box + Send> { // Query parameters let query_autoaccept = param_autoaccept.map_or_else(String::new, |query| format!("autoaccept={autoaccept}&", autoaccept = query.to_string())); + let query_editgroup = param_editgroup.map_or_else(String::new, |query| format!("editgroup={editgroup}&", editgroup = query.to_string())); let url = format!( - 
"{}/v0/container/batch?{autoaccept}", + "{}/v0/container/batch?{autoaccept}{editgroup}", self.base_path, - autoaccept = utf8_percent_encode(&query_autoaccept, QUERY_ENCODE_SET) + autoaccept = utf8_percent_encode(&query_autoaccept, QUERY_ENCODE_SET), + editgroup = utf8_percent_encode(&query_editgroup, QUERY_ENCODE_SET) ); let body = serde_json::to_string(¶m_entity_list).expect("impossible to fail to serialize"); @@ -436,15 +439,18 @@ impl Api for Client { &self, param_entity_list: &Vec, param_autoaccept: Option, + param_editgroup: Option, context: &Context, ) -> Box + Send> { // Query parameters let query_autoaccept = param_autoaccept.map_or_else(String::new, |query| format!("autoaccept={autoaccept}&", autoaccept = query.to_string())); + let query_editgroup = param_editgroup.map_or_else(String::new, |query| format!("editgroup={editgroup}&", editgroup = query.to_string())); let url = format!( - "{}/v0/creator/batch?{autoaccept}", + "{}/v0/creator/batch?{autoaccept}{editgroup}", self.base_path, - autoaccept = utf8_percent_encode(&query_autoaccept, QUERY_ENCODE_SET) + autoaccept = utf8_percent_encode(&query_autoaccept, QUERY_ENCODE_SET), + editgroup = utf8_percent_encode(&query_editgroup, QUERY_ENCODE_SET) ); let body = serde_json::to_string(¶m_entity_list).expect("impossible to fail to serialize"); @@ -636,12 +642,19 @@ impl Api for Client { &self, param_entity_list: &Vec, param_autoaccept: Option, + param_editgroup: Option, context: &Context, ) -> Box + Send> { // Query parameters let query_autoaccept = param_autoaccept.map_or_else(String::new, |query| format!("autoaccept={autoaccept}&", autoaccept = query.to_string())); + let query_editgroup = param_editgroup.map_or_else(String::new, |query| format!("editgroup={editgroup}&", editgroup = query.to_string())); - let url = format!("{}/v0/file/batch?{autoaccept}", self.base_path, autoaccept = utf8_percent_encode(&query_autoaccept, QUERY_ENCODE_SET)); + let url = format!( + "{}/v0/file/batch?{autoaccept}{editgroup}", + self.base_path, + autoaccept = utf8_percent_encode(&query_autoaccept, QUERY_ENCODE_SET), + editgroup = utf8_percent_encode(&query_editgroup, QUERY_ENCODE_SET) + ); let body = serde_json::to_string(¶m_entity_list).expect("impossible to fail to serialize"); @@ -774,15 +787,18 @@ impl Api for Client { &self, param_entity_list: &Vec, param_autoaccept: Option, + param_editgroup: Option, context: &Context, ) -> Box + Send> { // Query parameters let query_autoaccept = param_autoaccept.map_or_else(String::new, |query| format!("autoaccept={autoaccept}&", autoaccept = query.to_string())); + let query_editgroup = param_editgroup.map_or_else(String::new, |query| format!("editgroup={editgroup}&", editgroup = query.to_string())); let url = format!( - "{}/v0/release/batch?{autoaccept}", + "{}/v0/release/batch?{autoaccept}{editgroup}", self.base_path, - autoaccept = utf8_percent_encode(&query_autoaccept, QUERY_ENCODE_SET) + autoaccept = utf8_percent_encode(&query_autoaccept, QUERY_ENCODE_SET), + editgroup = utf8_percent_encode(&query_editgroup, QUERY_ENCODE_SET) ); let body = serde_json::to_string(¶m_entity_list).expect("impossible to fail to serialize"); @@ -916,12 +932,19 @@ impl Api for Client { &self, param_entity_list: &Vec, param_autoaccept: Option, + param_editgroup: Option, context: &Context, ) -> Box + Send> { // Query parameters let query_autoaccept = param_autoaccept.map_or_else(String::new, |query| format!("autoaccept={autoaccept}&", autoaccept = query.to_string())); + let query_editgroup = param_editgroup.map_or_else(String::new, 
|query| format!("editgroup={editgroup}&", editgroup = query.to_string())); - let url = format!("{}/v0/work/batch?{autoaccept}", self.base_path, autoaccept = utf8_percent_encode(&query_autoaccept, QUERY_ENCODE_SET)); + let url = format!( + "{}/v0/work/batch?{autoaccept}{editgroup}", + self.base_path, + autoaccept = utf8_percent_encode(&query_autoaccept, QUERY_ENCODE_SET), + editgroup = utf8_percent_encode(&query_editgroup, QUERY_ENCODE_SET) + ); let body = serde_json::to_string(¶m_entity_list).expect("impossible to fail to serialize"); diff --git a/rust/fatcat-api/src/lib.rs b/rust/fatcat-api/src/lib.rs index 044b934b..5de3647b 100644 --- a/rust/fatcat-api/src/lib.rs +++ b/rust/fatcat-api/src/lib.rs @@ -446,26 +446,51 @@ pub trait Api { &self, entity_list: &Vec, autoaccept: Option, + editgroup: Option, context: &Context, ) -> Box + Send>; fn create_creator(&self, entity: models::CreatorEntity, context: &Context) -> Box + Send>; - fn create_creator_batch(&self, entity_list: &Vec, autoaccept: Option, context: &Context) -> Box + Send>; + fn create_creator_batch( + &self, + entity_list: &Vec, + autoaccept: Option, + editgroup: Option, + context: &Context, + ) -> Box + Send>; fn create_editgroup(&self, entity: models::Editgroup, context: &Context) -> Box + Send>; fn create_file(&self, entity: models::FileEntity, context: &Context) -> Box + Send>; - fn create_file_batch(&self, entity_list: &Vec, autoaccept: Option, context: &Context) -> Box + Send>; + fn create_file_batch( + &self, + entity_list: &Vec, + autoaccept: Option, + editgroup: Option, + context: &Context, + ) -> Box + Send>; fn create_release(&self, entity: models::ReleaseEntity, context: &Context) -> Box + Send>; - fn create_release_batch(&self, entity_list: &Vec, autoaccept: Option, context: &Context) -> Box + Send>; + fn create_release_batch( + &self, + entity_list: &Vec, + autoaccept: Option, + editgroup: Option, + context: &Context, + ) -> Box + Send>; fn create_work(&self, entity: models::WorkEntity, context: &Context) -> Box + Send>; - fn create_work_batch(&self, entity_list: &Vec, autoaccept: Option, context: &Context) -> Box + Send>; + fn create_work_batch( + &self, + entity_list: &Vec, + autoaccept: Option, + editgroup: Option, + context: &Context, + ) -> Box + Send>; fn get_changelog(&self, limit: Option, context: &Context) -> Box + Send>; @@ -520,25 +545,40 @@ pub trait ApiNoContext { fn create_container(&self, entity: models::ContainerEntity) -> Box + Send>; - fn create_container_batch(&self, entity_list: &Vec, autoaccept: Option) -> Box + Send>; + fn create_container_batch( + &self, + entity_list: &Vec, + autoaccept: Option, + editgroup: Option, + ) -> Box + Send>; fn create_creator(&self, entity: models::CreatorEntity) -> Box + Send>; - fn create_creator_batch(&self, entity_list: &Vec, autoaccept: Option) -> Box + Send>; + fn create_creator_batch( + &self, + entity_list: &Vec, + autoaccept: Option, + editgroup: Option, + ) -> Box + Send>; fn create_editgroup(&self, entity: models::Editgroup) -> Box + Send>; fn create_file(&self, entity: models::FileEntity) -> Box + Send>; - fn create_file_batch(&self, entity_list: &Vec, autoaccept: Option) -> Box + Send>; + fn create_file_batch(&self, entity_list: &Vec, autoaccept: Option, editgroup: Option) -> Box + Send>; fn create_release(&self, entity: models::ReleaseEntity) -> Box + Send>; - fn create_release_batch(&self, entity_list: &Vec, autoaccept: Option) -> Box + Send>; + fn create_release_batch( + &self, + entity_list: &Vec, + autoaccept: Option, + editgroup: Option, + ) -> 
Box + Send>; fn create_work(&self, entity: models::WorkEntity) -> Box + Send>; - fn create_work_batch(&self, entity_list: &Vec, autoaccept: Option) -> Box + Send>; + fn create_work_batch(&self, entity_list: &Vec, autoaccept: Option, editgroup: Option) -> Box + Send>; fn get_changelog(&self, limit: Option) -> Box + Send>; @@ -611,16 +651,26 @@ impl<'a, T: Api> ApiNoContext for ContextWrapper<'a, T> { self.api().create_container(entity, &self.context()) } - fn create_container_batch(&self, entity_list: &Vec, autoaccept: Option) -> Box + Send> { - self.api().create_container_batch(entity_list, autoaccept, &self.context()) + fn create_container_batch( + &self, + entity_list: &Vec, + autoaccept: Option, + editgroup: Option, + ) -> Box + Send> { + self.api().create_container_batch(entity_list, autoaccept, editgroup, &self.context()) } fn create_creator(&self, entity: models::CreatorEntity) -> Box + Send> { self.api().create_creator(entity, &self.context()) } - fn create_creator_batch(&self, entity_list: &Vec, autoaccept: Option) -> Box + Send> { - self.api().create_creator_batch(entity_list, autoaccept, &self.context()) + fn create_creator_batch( + &self, + entity_list: &Vec, + autoaccept: Option, + editgroup: Option, + ) -> Box + Send> { + self.api().create_creator_batch(entity_list, autoaccept, editgroup, &self.context()) } fn create_editgroup(&self, entity: models::Editgroup) -> Box + Send> { @@ -631,24 +681,29 @@ impl<'a, T: Api> ApiNoContext for ContextWrapper<'a, T> { self.api().create_file(entity, &self.context()) } - fn create_file_batch(&self, entity_list: &Vec, autoaccept: Option) -> Box + Send> { - self.api().create_file_batch(entity_list, autoaccept, &self.context()) + fn create_file_batch(&self, entity_list: &Vec, autoaccept: Option, editgroup: Option) -> Box + Send> { + self.api().create_file_batch(entity_list, autoaccept, editgroup, &self.context()) } fn create_release(&self, entity: models::ReleaseEntity) -> Box + Send> { self.api().create_release(entity, &self.context()) } - fn create_release_batch(&self, entity_list: &Vec, autoaccept: Option) -> Box + Send> { - self.api().create_release_batch(entity_list, autoaccept, &self.context()) + fn create_release_batch( + &self, + entity_list: &Vec, + autoaccept: Option, + editgroup: Option, + ) -> Box + Send> { + self.api().create_release_batch(entity_list, autoaccept, editgroup, &self.context()) } fn create_work(&self, entity: models::WorkEntity) -> Box + Send> { self.api().create_work(entity, &self.context()) } - fn create_work_batch(&self, entity_list: &Vec, autoaccept: Option) -> Box + Send> { - self.api().create_work_batch(entity_list, autoaccept, &self.context()) + fn create_work_batch(&self, entity_list: &Vec, autoaccept: Option, editgroup: Option) -> Box + Send> { + self.api().create_work_batch(entity_list, autoaccept, editgroup, &self.context()) } fn get_changelog(&self, limit: Option) -> Box + Send> { diff --git a/rust/fatcat-api/src/server.rs b/rust/fatcat-api/src/server.rs index 4e41b5e9..1ba9a218 100644 --- a/rust/fatcat-api/src/server.rs +++ b/rust/fatcat-api/src/server.rs @@ -301,6 +301,7 @@ where // Query parameters (note that non-required or collection query parameters will ignore garbage values, rather than causing a 400 response) let query_params = req.get::().unwrap_or_default(); let param_autoaccept = query_params.get("autoaccept").and_then(|list| list.first()).and_then(|x| x.parse::().ok()); + let param_editgroup = query_params.get("editgroup").and_then(|list| list.first()).and_then(|x| x.parse::().ok()); // Body 
parameters (note that non-required body parameters will ignore garbage // values, rather than causing a 400 response). Produce warning header and logs for @@ -326,7 +327,7 @@ where }; let param_entity_list = param_entity_list.ok_or_else(|| Response::with((status::BadRequest, "Missing required body parameter entity_list".to_string())))?; - match api.create_container_batch(param_entity_list.as_ref(), param_autoaccept, context).wait() { + match api.create_container_batch(param_entity_list.as_ref(), param_autoaccept, param_editgroup, context).wait() { Ok(rsp) => match rsp { CreateContainerBatchResponse::CreatedEntities(body) => { let body_string = serde_json::to_string(&body).expect("impossible to fail to serialize"); @@ -517,6 +518,7 @@ where // Query parameters (note that non-required or collection query parameters will ignore garbage values, rather than causing a 400 response) let query_params = req.get::().unwrap_or_default(); let param_autoaccept = query_params.get("autoaccept").and_then(|list| list.first()).and_then(|x| x.parse::().ok()); + let param_editgroup = query_params.get("editgroup").and_then(|list| list.first()).and_then(|x| x.parse::().ok()); // Body parameters (note that non-required body parameters will ignore garbage // values, rather than causing a 400 response). Produce warning header and logs for @@ -542,7 +544,7 @@ where }; let param_entity_list = param_entity_list.ok_or_else(|| Response::with((status::BadRequest, "Missing required body parameter entity_list".to_string())))?; - match api.create_creator_batch(param_entity_list.as_ref(), param_autoaccept, context).wait() { + match api.create_creator_batch(param_entity_list.as_ref(), param_autoaccept, param_editgroup, context).wait() { Ok(rsp) => match rsp { CreateCreatorBatchResponse::CreatedEntities(body) => { let body_string = serde_json::to_string(&body).expect("impossible to fail to serialize"); @@ -826,6 +828,7 @@ where // Query parameters (note that non-required or collection query parameters will ignore garbage values, rather than causing a 400 response) let query_params = req.get::().unwrap_or_default(); let param_autoaccept = query_params.get("autoaccept").and_then(|list| list.first()).and_then(|x| x.parse::().ok()); + let param_editgroup = query_params.get("editgroup").and_then(|list| list.first()).and_then(|x| x.parse::().ok()); // Body parameters (note that non-required body parameters will ignore garbage // values, rather than causing a 400 response). 
Produce warning header and logs for @@ -851,7 +854,7 @@ where }; let param_entity_list = param_entity_list.ok_or_else(|| Response::with((status::BadRequest, "Missing required body parameter entity_list".to_string())))?; - match api.create_file_batch(param_entity_list.as_ref(), param_autoaccept, context).wait() { + match api.create_file_batch(param_entity_list.as_ref(), param_autoaccept, param_editgroup, context).wait() { Ok(rsp) => match rsp { CreateFileBatchResponse::CreatedEntities(body) => { let body_string = serde_json::to_string(&body).expect("impossible to fail to serialize"); @@ -1042,6 +1045,7 @@ where // Query parameters (note that non-required or collection query parameters will ignore garbage values, rather than causing a 400 response) let query_params = req.get::().unwrap_or_default(); let param_autoaccept = query_params.get("autoaccept").and_then(|list| list.first()).and_then(|x| x.parse::().ok()); + let param_editgroup = query_params.get("editgroup").and_then(|list| list.first()).and_then(|x| x.parse::().ok()); // Body parameters (note that non-required body parameters will ignore garbage // values, rather than causing a 400 response). Produce warning header and logs for @@ -1067,7 +1071,7 @@ where }; let param_entity_list = param_entity_list.ok_or_else(|| Response::with((status::BadRequest, "Missing required body parameter entity_list".to_string())))?; - match api.create_release_batch(param_entity_list.as_ref(), param_autoaccept, context).wait() { + match api.create_release_batch(param_entity_list.as_ref(), param_autoaccept, param_editgroup, context).wait() { Ok(rsp) => match rsp { CreateReleaseBatchResponse::CreatedEntities(body) => { let body_string = serde_json::to_string(&body).expect("impossible to fail to serialize"); @@ -1258,6 +1262,7 @@ where // Query parameters (note that non-required or collection query parameters will ignore garbage values, rather than causing a 400 response) let query_params = req.get::().unwrap_or_default(); let param_autoaccept = query_params.get("autoaccept").and_then(|list| list.first()).and_then(|x| x.parse::().ok()); + let param_editgroup = query_params.get("editgroup").and_then(|list| list.first()).and_then(|x| x.parse::().ok()); // Body parameters (note that non-required body parameters will ignore garbage // values, rather than causing a 400 response). 
Produce warning header and logs for
@@ -1283,7 +1288,7 @@
         };
         let param_entity_list = param_entity_list.ok_or_else(|| Response::with((status::BadRequest, "Missing required body parameter entity_list".to_string())))?;
 
-            match api.create_work_batch(param_entity_list.as_ref(), param_autoaccept, context).wait() {
+            match api.create_work_batch(param_entity_list.as_ref(), param_autoaccept, param_editgroup, context).wait() {
                 Ok(rsp) => match rsp {
                     CreateWorkBatchResponse::CreatedEntities(body) => {
                         let body_string = serde_json::to_string(&body).expect("impossible to fail to serialize");
-- cgit v1.2.3


From 6a87d4b3ab252d76bb380a69ed53f21989761e9f Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 24 Aug 2018 13:23:30 -0700
Subject: NOTES => rust_libraries.txt

---
 notes/rust_libraries.txt | 19 +++++++++++++++
 rust/NOTES.txt           | 19 -------------------
 2 files changed, 19 insertions(+), 19 deletions(-)
 create mode 100644 notes/rust_libraries.txt
 delete mode 100644 rust/NOTES.txt
(limited to 'notes')

diff --git a/notes/rust_libraries.txt b/notes/rust_libraries.txt
new file mode 100644
index 00000000..7e6f33eb
--- /dev/null
+++ b/notes/rust_libraries.txt
@@ -0,0 +1,19 @@
+
+libs:
+- iron_slog
+- testing: keep it simple: iron-test
+  => if that is annoying, shiny? mockers if needed.
+- sentry
+- start with dotenv+clap, then config-rs?
+- cadence (emits statsd)
+- frank_jwt and JWT for (simple?) auth
+
+similar:
+- https://github.com/DavidBM/templic-backend
+- https://github.com/alexanderbanks/rust-api
+- https://mgattozzi.com/diesel-powered-rocket
+- https://www.reddit.com/r/rust/comments/8j1xbs/new_to_rust_and_gitlab_ci/
+- https://crate-ci.github.io/
+
+"cool tools":
+- cargo-watch
diff --git a/rust/NOTES.txt b/rust/NOTES.txt
deleted file mode 100644
index 7e6f33eb..00000000
--- a/rust/NOTES.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-
-libs:
-- iron_slog
-- testing: keep it simple: iron-test
-  => if that is annoying, shiny? mockers if needed.
-- sentry
-- start with dotenv+clap, then config-rs?
-- cadence (emits statsd)
-- frank_jwt and JWT for (simple?) auth
-
-similar:
-- https://github.com/DavidBM/templic-backend
-- https://github.com/alexanderbanks/rust-api
-- https://mgattozzi.com/diesel-powered-rocket
-- https://www.reddit.com/r/rust/comments/8j1xbs/new_to_rust_and_gitlab_ci/
-- https://crate-ci.github.io/
-
-"cool tools":
-- cargo-watch
-- cgit v1.2.3


From f997c5bcbcc800a8780a62dc56a4b7f4e5b68c3c Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 24 Aug 2018 13:29:29 -0700
Subject: split/move docs around

---
 notes/database_dumps_backups.txt | 31 +++++++++++++++
 rust/HACKING.md                  | 24 ++++++++++++
 rust/INSTALL.md                  | 36 +++++++++++++++++
 rust/README.md                   | 85 +---------------------------------------
 4 files changed, 92 insertions(+), 84 deletions(-)
 create mode 100644 notes/database_dumps_backups.txt
 create mode 100644 rust/HACKING.md
 create mode 100644 rust/INSTALL.md
(limited to 'notes')

diff --git a/notes/database_dumps_backups.txt b/notes/database_dumps_backups.txt
new file mode 100644
index 00000000..0b05b9b8
--- /dev/null
+++ b/notes/database_dumps_backups.txt
@@ -0,0 +1,31 @@
+
+## Dumps and Backups
+
+There are a few different database dump formats folks might want:
+
+- raw native database backups, for disaster recovery (would include
+  volatile/unsupported schema details, user API credentials, full history,
+  in-process edits, comments, etc)
+- a sanitized version of the above: roughly per-table dumps of the full state
+  of the database. Could use per-table SQL expressions with sub-queries to
+  pull in small tables ("partial transform") and export JSON for each table;
+  would be extra work to maintain, so not pursuing for now (rough sketch
+  after this list).
+- full history, full public schema exports, in a form that might be used to
+  mirror or entirely fork the project. Propose supplying the full "changelog"
+  in API schema format, in a single file to capture all entity history, without
+  "hydrating" any inter-entity references. Rely on separate dumps of
+  non-entity, non-versioned tables (editors, abstracts, etc). Note that a
+  variant of this could use the public interface, in particular to do
+  incremental updates (though that wouldn't capture schema changes).
+- transformed exports of the current state of the database (aka, without
+  history). Useful for data analysis, search engines, etc. Propose supplying
+  just the Release table in a fully "hydrated" state to start. Unclear if
+  should be on a work or release basis; will go with release for now. Harder to
+  do using public interface because of the need for transaction locking.
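+
+A rough sketch of the "partial transform" idea above (hypothetical: the
+editgroup/editor join and the column names are illustrative, not checked
+against the schema):
+
+    -- per-table JSON export, with a sub-query pulling in a small table
+    COPY (
+        SELECT row_to_json(rows) FROM (
+            SELECT editgroup.*,
+                   (SELECT row_to_json(editor) FROM editor
+                     WHERE editor.id = editgroup.editor_id) AS editor
+            FROM editgroup
+        ) AS rows
+    ) TO '/tmp/editgroup_dump.json';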
+
+Backing up the entire database using `pg_dump`, with parallelism 1 (use more
+on a larger machine with fast disks; try 4 or 8?), assuming the database name
+is 'fatcat', and the current user has access:
+
+    pg_dump -j1 -Fd -f test-dump fatcat
+
diff --git a/rust/HACKING.md b/rust/HACKING.md
new file mode 100644
index 00000000..a399164c
--- /dev/null
+++ b/rust/HACKING.md
@@ -0,0 +1,24 @@
+
+## Updating Schemas
+
+Regenerate API schemas after editing the fatcat-openapi2 schema. This will, as
+a side-effect, also run `cargo fmt` on the whole project, so don't run it with
+your editor open!
+
+    cargo install cargo-swagger  # uses docker
+    ./codegen_openapi2.sh
+
+Update the Rust database schema (after changing the raw SQL schema):
+
+    diesel database reset
+    diesel print-schema > src/database_schema.rs
+
+Debug SQL schema errors (if diesel commands fail):
+
+    psql fatcat_test < migrations/2018-05-12-001226_init/up.sql
+
+## Direct API Interaction
+
+Creating entities via API:
+
+    http --json post localhost:9411/v0/container name=asdf issn=1234-5678
diff --git a/rust/INSTALL.md b/rust/INSTALL.md
new file mode 100644
index 00000000..c2b86c51
--- /dev/null
+++ b/rust/INSTALL.md
@@ -0,0 +1,36 @@
+
+Canonical IA production/QA ansible scripts are in the journal-infra repo. These
+directions are likely to end up out-of-date.
+
+## Simple Deployment
+
+To install manually, on a bare server, as root:
+
+    adduser fatcat
+    apt install postgresql-9.6 postgresql-contrib postgresql-client-9.6 \
+        nginx build-essential git pkg-config libssl-dev libpq-dev \
+        htop screen
+    mkdir -p /srv/fatcat
+    chown fatcat:fatcat /srv/fatcat
+
+    # setup new postgres user
+    su - postgres
+    createuser -P -s fatcat    # strong random password
+    # DELETE: createdb fatcat
+
+    # as fatcat user
+    su - fatcat
+    ssh-keygen
+    curl https://sh.rustup.rs -sSf | sh
+    source $HOME/.cargo/env
+    cargo install diesel_cli --no-default-features --features "postgres"
+    cd /srv/fatcat
+    git clone git@git.archive.org:webgroup/fatcat
+    cd rust
+    cargo build
+    echo "DATABASE_URL=postgres://fatcat@localhost/fatcat" > .env
+    diesel database reset
+
+    # as fatcat, in a screen or something
+    cd /srv/fatcat/fatcat/rust
+    cargo run
diff --git a/rust/README.md b/rust/README.md
index a6873345..c061a1f9 100644
--- a/rust/README.md
+++ b/rust/README.md
@@ -29,87 +29,4 @@ Tests:
 
     cargo test -- --test-threads 1
 
-## Simple Deployment
-
-Canonical ansible scripts are in the journal-infra repo.
To install manually, -on a bare server, as root: - - adduser fatcat - apt install postgresql-9.6 postgresql-contrib postgresql-client-9.6 \ - nginx build-essential git pkg-config libssl-dev libpq-dev \ - htop screen - mkdir -p /srv/fatcat - chown fatcat:fatcat /srv/fatcat - - # setup new postgres user - su - postgres - createuser -P -s fatcat # strong random password - # DELETE: createdb fatcat - - # as fatcat user - su - fatcat - ssh-keygen - curl https://sh.rustup.rs -sSf | sh - source $HOME/.cargo/env - cargo install diesel_cli --no-default-features --features "postgres" - cd /srv/fatcat - git clone git@git.archive.org:webgroup/fatcat - cd rust - cargo build - echo "DATABASE_URL=postgres://fatcat@localhost/fatcat" > .env - diesel database reset - - # as fatcat, in a screen or something - cd /srv/fatcat/fatcat/rust - cargo run - -### Dumps and Backups - -There are a few different databaase dump formats folks might want: - -- raw native database backups, for disaster recovery (would include - volatile/unsupported schema details, user API credentials, full history, - in-process edits, comments, etc) -- a sanitized version of the above: roughly per-table dumps of the full state - of the database. Could use per-table SQL expressions with sub-queries to pull - in small tables ("partial transform") and export JSON for each table; would - be extra work to maintain, so not pursuing for now. -- full history, full public schema exports, in a form that might be used to - mirror or enitrely fork the project. Propose supplying the full "changelog" - in API schema format, in a single file to capture all entity history, without - "hydrating" any inter-entity references. Rely on separate dumps of - non-entity, non-versioned tables (editors, abstracts, etc). Note that a - variant of this could use the public interface, in particular to do - incremental updates (though that wouldn't capture schema changes). -- transformed exports of the current state of the database (aka, without - history). Useful for data analysis, search engines, etc. Propose supplying - just the Release table in a fully "hydrated" state to start. Unclear if - should be on a work or release basis; will go with release for now. Harder to - do using public interface because of the need for transaction locking. - -Backing up the entire database using `pg_dump`, with parallelism 1 (use more on -larger machine with fast disks; try 4 or 8?), assuming the database name is -'fatcat', and the current user has access: - - pg_dump -j1 -Fd -f test-dump fatcat - -### Special Tricks - -Regenerate API schemas (this will, as a side-effect, also run `cargo fmt` on -the whole project, so don't run it with your editor open): - - cargo install cargo-swagger # uses docker - ./codegen_openapi2.sh - -Regenerate SQL schema: - - diesel database reset - diesel print-schema > src/database_schema.rs - -Debugging SQL schema errors: - - psql fatcat_test < migrations/2018-05-12-001226_init/up.sql - -Creating entities via API: - - http --json post localhost:9411/v0/container name=asdf issn=1234-5678 +See `HACKING` for some more advanced tips and commands. 
+
+This being single-threaded is going to be a problem in the future. At this
+rate, ~50 million files would take ~2 weeks.
+
+## Post-Import Status
+
+    Size:  358.89G (postgres self-reported)
+    Mem.:  57.10% - 16.85G/49.14G
+
+Was 184G last time in late June; doubled in size (!).
+
+    bnewbold@wbgrp-svc500$ df -h /
+    Filesystem      Size  Used Avail Use% Mounted on
+    /dev/vda1       858G  529G  286G  65% /
+
+    bnewbold@wbgrp-svc500$ sudo du -sh /var/lib/postgresql/ /srv/datasets/ /srv/elastic-blah/
+    361G    /var/lib/postgresql/
+    83G     /srv/datasets/
+    77G     /srv/elastic-blah/
+
+    fatcat_prod=# select count(*) from changelog;  => 2,085,067
+
+    SELECT
+        table_name,
+        pg_size_pretty(table_size) AS table_size,
+        pg_size_pretty(indexes_size) AS indexes_size,
+        pg_size_pretty(total_size) AS total_size
+    FROM (
+        SELECT
+            table_name,
+            pg_table_size(table_name) AS table_size,
+            pg_indexes_size(table_name) AS indexes_size,
+            pg_total_relation_size(table_name) AS total_size
+        FROM (
+            SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+            FROM information_schema.tables
+        ) AS all_tables
+        ORDER BY total_size DESC
+    ) AS pretty_sizes;
+
+                table_name                | table_size | indexes_size | total_size
+    --------------------------------------+------------+--------------+------------
+     "public"."release_ref"               | 159 GB     | 47 GB        | 206 GB
+     "public"."release_rev"               | 40 GB      | 10 GB        | 51 GB
+     "public"."release_contrib"           | 19 GB      | 20 GB        | 39 GB
+     "public"."release_ident"             | 5797 MB    | 6597 MB      | 12 GB
+     "public"."work_ident"                | 5787 MB    | 6394 MB      | 12 GB
+     "public"."release_edit"              | 6674 MB    | 4646 MB      | 11 GB
+     "public"."work_edit"                 | 6674 MB    | 4646 MB      | 11 GB
+     "public"."work_rev"                  | 3175 MB    | 2939 MB      | 6114 MB
+     "public"."file_rev_url"              | 1995 MB    | 275 MB       | 2270 MB
+     "public"."abstracts"                 | 1665 MB    | 135 MB       | 1800 MB
+     "public"."file_rev"                  | 829 MB     | 954 MB       | 1783 MB
+     "public"."file_ident"                | 498 MB     | 532 MB       | 1030 MB
+     "public"."file_release"              | 369 MB     | 642 MB       | 1011 MB
+     "public"."file_edit"                 | 591 MB     | 410 MB       | 1002 MB
+     "public"."creator_rev"               | 337 MB     | 318 MB       | 655 MB
+     "public"."creator_ident"             | 280 MB     | 297 MB       | 577 MB
+     "public"."creator_edit"              | 316 MB     | 220 MB       | 536 MB
+     "public"."release_rev_abstract"      | 183 MB     | 84 MB        | 267 MB
+     "public"."changelog"                 | 123 MB     | 125 MB       | 249 MB
+     "public"."editgroup"                 | 139 MB     | 81 MB        | 220 MB
+     "public"."container_rev"             | 19 MB      | 6912 kB      | 26 MB
+     "public"."container_ident"           | 6896 kB    | 7016 kB      | 14 MB
+     "public"."container_edit"            | 8056 kB    | 5240 kB      | 13 MB
+
+In context, the full uncompressed crossref 2018-01-21 dump is about 285 GB.
+
+For many of these indexes, and for the _ident tables, switching from UUID to
+BIGSERIAL would halve the size.
+
+## Exports
+
+    time ./fatcat_export.py changelog - | pv -l | wc
+
+    As of:
+
+        159k 1:17:35 [34.3 /s]
+        159,740 lines
+        2,427,277,881 chars (bytes; 2.4 GB)
+
+    real    77m35.183s
+    user    15m36.208s
+    sys     0m31.484s
+
+Running at about 100/sec; estimate 6 hours for completion. Could shard using
+start/end flags, but am not doing that here.
+
+Running `quick_dump.sql` (identifier tables, in a transaction):
+
+    251M Aug 19 23:08 fatcat_ident_creators.tsv
+    5.9M Aug 19 23:08 fatcat_ident_containers.tsv
+    467M Aug 19 23:08 fatcat_ident_files.tsv
+    5.2G Aug 19 23:10 fatcat_ident_releases.tsv
+    5.2G Aug 19 23:11 fatcat_ident_works.tsv
+    12K  Aug 19 23:11 .
+    1.8G Aug 19 23:12 fatcat_abstracts.json
+
+Work and Release tables in under 2 minutes each; say 5 minutes total.
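+
+An identifier dump of this sort is basically one COPY per table; a minimal
+sketch, with the column list assumed from the debug queries near the end of
+these notes:
+
+    -- column list assumed, not checked against the actual schema
+    COPY (SELECT id, is_live, rev_id, redirect_id FROM release_ident)
+        TO '/tmp/fatcat_ident_releases.tsv';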
+
+    time ./fatcat_export.py releases /tmp/fatcat_ident_releases.tsv - | pv -l | wc
+
+        172k 1:07:08 [42.7 /s]
+        172,181 lines
+        1,118,166,293 chars (bytes; 1.1 GB)
+
+    real    67m8.340s
+    user    10m21.988s
+    sys     0m34.612s
+
+Running at only 10/sec or so, this would take forever even if sharded. :(
+
+Both exports/dumps are running in parallel. "Expand" queries might help with
+speed?
+
+## Postgres Analysis
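+
+This uses the pg_stat_statements extension; if it isn't already enabled, the
+standard (not fatcat-specific) one-time setup is:
+
+    -- after setting shared_preload_libraries = 'pg_stat_statements' in
+    -- postgresql.conf and restarting postgres:
+    CREATE EXTENSION pg_stat_statements;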
+
+SELECT *
+FROM
+    pg_stat_statements
+ORDER BY
+    total_time DESC LIMIT 5;
+
+Summary:
+
+    SELECT "creator_ident" by ORCID
+        1,295,864 calls
+        930,305,208 total time
+        717.9 mean time    <= this should be less than a ms!
+
+    INSERT INTO release_rev
+        75,144,055 calls
+        111,470,961 total time
+        1.483 mean time
+
+    INSERT INTO work_rev
+        75,144,055 calls
+        82,693,994 total time
+        1.1 mean time
+
+    INSERT INTO release_contrib (creator_ident_id = DEFAULT) RETURNING *
+        26,008,280 calls    <= why so few? different query depending on
+                               number of rows inserted
+        18,955,782 total time
+        0.728 mean time
+
+    SELECT container_ident
+        784,143 calls
+        17,683,156 total time
+        22.55 mean time    <= why so slow?
+
+    INSERT INTO release_contrib
+        15,072,820 calls
+
+    relname              | too_much_seq |      case      |   rel_size   | seq_scan |  idx_scan
+   ----------------------+--------------+----------------+--------------+----------+-----------
+    file_rev_url         |         2391 | Missing Index? |   2091147264 |     2391 |         0
+    file_release         |       -30670 | OK             |    386899968 |        2 |     30672
+    container_rev        |      -979948 | OK             |     20242432 |   784146 |   1764094
+    file_edit            |     -2206807 | OK             |    619896832 |        6 |   2206813
+    creator_edit         |     -2206810 | OK             |    331079680 |       11 |   2206821
+    work_edit            |     -2206811 | OK             |   6996566016 |       14 |   2206825
+    release_edit         |     -2206811 | OK             |   6996582400 |       14 |   2206825
+    container_edit       |     -2206816 | OK             |      8216576 |        5 |   2206821
+    changelog            |     -2209659 | OK             |    129286144 |       10 |   2209669
+    abstracts            |     -3486466 | OK             |   1706237952 |        8 |   3486474
+    release_rev_abstract |     -4975493 | OK             |    191602688 |    42919 |   5018412
+    release_ref          |     -5032717 | OK             | 170494861312 |        3 |   5032720
+    release_contrib      |     -5032744 | OK             |  20370251776 |        3 |   5032747
+    creator_rev          |     -8400410 | OK             |    353583104 |  1296507 |   9696917
+    file_ident           |    -13483224 | OK             |    522190848 |        7 |  13483231
+    creator_ident        |    -16686744 | OK             |    293625856 |        3 |  16686747
+    file_rev             |    -32405557 | OK             |    868515840 |        4 |  32405561
+    container_ident      |    -69162337 | OK             |      7028736 |        3 |  69162340
+    work_rev             |   -150288161 | OK             |   3328589824 |        1 | 150288162
+    editgroup            |   -162783807 | OK             |    146112512 |        9 | 162783816
+    release_ident        |   -165676917 | OK             |   6076841984 |       52 | 165676969
+    work_ident           |   -229439828 | OK             |   6066814976 |        3 | 229439831
+    release_rev          |   -930140217 | OK             |  43360542720 |        9 | 930140226
+
+TODO changes:
+- don't return everything (RETURNING *) as often; in particular when
+  inserting release_contrib, release_ref
+x missing an index somewhere on file_rev_url, release_rev_abstract (sketch
+  below)
+x why so many seq_scan on container_rev, creator_rev
+  => running/EXPLAIN-ing the same query in psql hits the index, not a seq_scan
+  => seemed to be an issue with VALUE params getting sent separately; the
+     query planner only looked at the query and wasn't using the index on
+     ORCID/ISSN-L because it didn't know those values were not-NULL?
+  => adding NOT NULL to the query seems to have sped up the case where there
+     is a "hit", but the no-hit case is still slow. Might need to change
+     indices or something for the (perhaps common in future) case of DOI
+     lookups with invalid DOIs (eg, CORE import)
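+
+For the missing file_rev_url index, presumably the same shape as the
+release_rev_abstract fix above (the file_rev column name is assumed, not
+checked):
+
+    CREATE INDEX file_rev_url_rev_idx ON file_rev_url(file_rev);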
+
+random DEBUG queries:
+
+    EXPLAIN ANALYSE SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_ident"."is_live" = true AND "creator_ident"."redirect_id" IS NULL LIMIT 1;
+
+    EXPLAIN VERBOSE SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_ident"."is_live" = true AND "creator_ident"."redirect_id" IS NULL VALUES ('0000-0002-8867-1669') LIMIT 1;
+
+    EXPLAIN SELECT "container_ident"."id", "container_ident"."is_live", "container_ident"."rev_id", "container_ident"."redirect_id", "container_rev"."id", "container_rev"."extra_json", "container_rev"."name", "container_rev"."publisher", "container_rev"."issnl", "container_rev"."wikidata_qid", "container_rev"."abbrev", "container_rev"."coden" FROM ("container_ident" INNER JOIN "container_rev" ON "container_ident"."rev_id" = "container_rev"."id") WHERE "container_rev"."issnl" = '0001-0782' AND "container_ident"."is_live" = true AND "container_ident"."redirect_id" IS NULL LIMIT 1;
+
+    SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1;
-- cgit v1.2.3


From 7cc6337d57063ca17bde2ff36445df480773325e Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 20 Aug 2018 14:49:22 -0700
Subject: cost notes on hosting a fatcat mirror

---
 notes/cloud_instances.txt | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 notes/cloud_instances.txt

(limited to 'notes')

diff --git a/notes/cloud_instances.txt b/notes/cloud_instances.txt
new file mode 100644
index 00000000..4582c431
--- /dev/null
+++ b/notes/cloud_instances.txt
@@ -0,0 +1,8 @@
+
+digital ocean
+    48 GB RAM, 12 cores, 960 GB disk    $240/month (or more)
+
+aws
+    i3.2xlarge    61 GB RAM, 8 cores, 1900 GB NVMe    $455/month
+
-- cgit v1.2.3


From ca00cb327cb066c85a0f11a947b4497655ddf0de Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 24 Aug 2018 13:37:43 -0700
Subject: database dump notes

---
 notes/database_dumps_backups.txt | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'notes')

diff --git a/notes/database_dumps_backups.txt b/notes/database_dumps_backups.txt
index 0b05b9b8..60d4bba0 100644
--- a/notes/database_dumps_backups.txt
+++ b/notes/database_dumps_backups.txt
@@ -23,9 +23,31 @@ There are a few different database dump formats folks might want:
   should be on a work or release basis; will go with release for now. Harder to
   do using the public interface because of the need for transaction locking.
 
+## Full Postgres Backup
+
 Backing up the entire database using `pg_dump`, with parallelism 1 (use more
 on a larger machine with fast disks; try 4 or 8?), assuming the database name
 is 'fatcat', and the current user has access:
 
     pg_dump -j1 -Fd -f test-dump fatcat
 
+## Identifier Dumps
+
+The `extras/quick_dump.sql` script will dump abstracts and identifiers as TSV
+files to `/tmp/`. Pretty quick; takes about 15 GB of disk space (uncompressed).
+
+## Releases Export
+
+    # simple command
+    ./fatcat_export.py releases /tmp/fatcat_ident_releases.tsv /tmp/releases-dump.json
+
+    # usual command
+    time ./fatcat_export.py releases /tmp/fatcat_ident_releases.tsv - | pv -l | wc
+
+## Changelog Export
+
+    # simple command
+    ./fatcat_export.py changelog /tmp/changelog-dump.json
+
+    # usual command
+    time ./fatcat_export.py changelog - | pv -l | wc
-- cgit v1.2.3


From 8cccbcdef11e7ddc761ec494cb894a8d49a0d510 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 30 Aug 2018 20:02:26 -0700
Subject: autoaccept notes

---
 notes/autoaccept_api.txt | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 notes/autoaccept_api.txt

(limited to 'notes')

diff --git a/notes/autoaccept_api.txt b/notes/autoaccept_api.txt
new file mode 100644
index 00000000..b7e0a824
--- /dev/null
+++ b/notes/autoaccept_api.txt
@@ -0,0 +1,31 @@
+
+Currently only on batch creation (POST) for entities.
+
+For all bulk operations, an optional 'editgroup' query parameter overrides
+the individual editgroup parameters.
+
+If the autoaccept flag is set and editgroup is not, a new editgroup is
+automatically created and used for all entities inserted. Note that this is
+different behavior from the "use current or create new" default behavior for
+regular creation.
+
+Unfortunately, "true" and "false" are the only acceptable values for boolean
+rust/openapi2 query parameters.
+
+THOUGHT: doing an UPDATE in a transaction is probably not expensive
+
+Intent:
+- check can_autoaccept flag on editor table
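+
+Hypothetical usage sketch (the batch path and exact parameter spelling here
+are assumptions, not verified against the API spec):
+
+    # assumes a /v0/release/batch endpoint that takes a JSON list of entities
+    http --json post "localhost:9411/v0/release/batch?autoaccept=true" < releases.json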
+
+---------
+
+Crude benchmarking...
+
+cat /data/crossref/crossref-works.2018-01-21.badsample_5k.json | time ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt
+
+autoaccept:  7.47user 0.48system 0:30.64elapsed 25%CPU
+master:      5.70user 0.34system 0:25.61elapsed 23%CPU
+    batch creation: ~153ms+
+    accept: ~5ms
+
+uh...
-- cgit v1.2.3