Diffstat (limited to 'python')
41 files changed, 1966 insertions, 742 deletions
diff --git a/python/Pipfile b/python/Pipfile index eebdab36..b04bb91a 100644 --- a/python/Pipfile +++ b/python/Pipfile @@ -32,6 +32,7 @@ python-dateutil = "*" sickle = "*" python-snappy = "*" pymacaroons = "*" +ftfy= "*" [requires] # Python 3.5 is the bundled (system) version of python for Ubuntu 16.04 diff --git a/python/Pipfile.lock b/python/Pipfile.lock index 296079f0..f2d39a99 100644 --- a/python/Pipfile.lock +++ b/python/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "c3deb49cf4c122c2aed3f4f944e9763cfcf40c85891ca3d3e9cabc3debbb9075" + "sha256": "8f98bb3f6a3083c8b03cb68d1ee48b25449a950dd8a9d15189f2eb4fae48f760" }, "pipfile-spec": 6, "requires": { @@ -96,27 +96,27 @@ }, "cryptography": { "hashes": [ - "sha256:05a6052c6a9f17ff78ba78f8e6eb1d777d25db3b763343a1ae89a7a8670386dd", - "sha256:0eb83a24c650a36f68e31a6d0a70f7ad9c358fa2506dc7b683398b92e354a038", - "sha256:0ff4a3d6ea86aa0c9e06e92a9f986de7ee8231f36c4da1b31c61a7e692ef3378", - "sha256:1699f3e916981df32afdd014fb3164db28cdb61c757029f502cb0a8c29b2fdb3", - "sha256:1b1f136d74f411f587b07c076149c4436a169dc19532e587460d9ced24adcc13", - "sha256:21e63dd20f5e5455e8b34179ac43d95b3fb1ffa54d071fd2ed5d67da82cfe6dc", - "sha256:2454ada8209bbde97065453a6ca488884bbb263e623d35ba183821317a58b46f", - "sha256:3cdc5f7ca057b2214ce4569e01b0f368b3de9d8ee01887557755ccd1c15d9427", - "sha256:418e7a5ec02a7056d3a4f0c0e7ea81df374205f25f4720bb0e84189aa5fd2515", - "sha256:471a097076a7c4ab85561d7fa9a1239bd2ae1f9fd0047520f13d8b340bf3210b", - "sha256:5ecaf9e7db3ca582c6de6229525d35db8a4e59dc3e8a40a331674ed90e658cbf", - "sha256:63b064a074f8dc61be81449796e2c3f4e308b6eba04a241a5c9f2d05e882c681", - "sha256:6afe324dfe6074822ccd56d80420df750e19ac30a4e56c925746c735cf22ae8b", - "sha256:70596e90398574b77929cd87e1ac6e43edd0e29ba01e1365fed9c26bde295aa5", - "sha256:70c2b04e905d3f72e2ba12c58a590817128dfca08949173faa19a42c824efa0b", - "sha256:8908f1db90be48b060888e9c96a0dee9d842765ce9594ff6a23da61086116bb6", - "sha256:af12dfc9874ac27ebe57fc28c8df0e8afa11f2a1025566476b0d50cdb8884f70", - "sha256:b4fc04326b2d259ddd59ed8ea20405d2e695486ab4c5e1e49b025c484845206e", - "sha256:da5b5dda4aa0d5e2b758cc8dfc67f8d4212e88ea9caad5f61ba132f948bab859" - ], - "version": "==2.4.2" + "sha256:05b3ded5e88747d28ee3ef493f2b92cbb947c1e45cf98cfef22e6d38bb67d4af", + "sha256:06826e7f72d1770e186e9c90e76b4f84d90cdb917b47ff88d8dc59a7b10e2b1e", + "sha256:08b753df3672b7066e74376f42ce8fc4683e4fd1358d34c80f502e939ee944d2", + "sha256:2cd29bd1911782baaee890544c653bb03ec7d95ebeb144d714b0f5c33deb55c7", + "sha256:31e5637e9036d966824edaa91bf0aa39dc6f525a1c599f39fd5c50340264e079", + "sha256:42fad67d7072216a49e34f923d8cbda9edacbf6633b19a79655e88a1b4857063", + "sha256:4946b67235b9d2ea7d31307be9d5ad5959d6c4a8f98f900157b47abddf698401", + "sha256:522fdb2809603ee97a4d0ef2f8d617bc791eb483313ba307cb9c0a773e5e5695", + "sha256:6f841c7272645dd7c65b07b7108adfa8af0aaea57f27b7f59e01d41f75444c85", + "sha256:7d335e35306af5b9bc0560ca39f740dfc8def72749645e193dd35be11fb323b3", + "sha256:8504661ffe324837f5c4607347eeee4cf0fcad689163c6e9c8d3b18cf1f4a4ad", + "sha256:9260b201ce584d7825d900c88700aa0bd6b40d4ebac7b213857bd2babee9dbca", + "sha256:9a30384cc402eac099210ab9b8801b2ae21e591831253883decdb4513b77a3cd", + "sha256:9e29af877c29338f0cab5f049ccc8bd3ead289a557f144376c4fbc7d1b98914f", + "sha256:ab50da871bc109b2d9389259aac269dd1b7c7413ee02d06fe4e486ed26882159", + "sha256:b13c80b877e73bcb6f012813c6f4a9334fcf4b0e96681c5a15dac578f2eedfa0", + "sha256:bfe66b577a7118e05b04141f0f1ed0959552d45672aa7ecb3d91e319d846001e", + 
"sha256:e091bd424567efa4b9d94287a952597c05d22155a13716bf5f9f746b9dc906d3", + "sha256:fa2b38c8519c5a3aa6e2b4e1cf1a549b54acda6adb25397ff542068e73d1ed00" + ], + "version": "==2.5" }, "fatcat-client": { "editable": true, @@ -152,6 +152,14 @@ "index": "pypi", "version": "==0.2" }, + "ftfy": { + "hashes": [ + "sha256:84a1614190173bb447ac9d581e50185c6aa35b538754b6bedaba0cc0f83d8e80", + "sha256:fa74757fb7cb444366fa6a79c2feabd40281a44dfbf6eaed492a804764ee26b2" + ], + "index": "pypi", + "version": "==5.5.1" + }, "idna": { "hashes": [ "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", @@ -366,6 +374,13 @@ ], "version": "==1.24.1" }, + "wcwidth": { + "hashes": [ + "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", + "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c" + ], + "version": "==0.1.7" + }, "werkzeug": { "hashes": [ "sha256:c3fd7a7d41976d9f44db327260e263132466836cef6f91512889ed60ad26557c", @@ -558,10 +573,10 @@ }, "parso": { "hashes": [ - "sha256:35704a43a3c113cce4de228ddb39aab374b8004f4f2407d070b6a2ca784ce8a2", - "sha256:895c63e93b94ac1e1690f5fdd40b65f07c8171e3e53cbd7793b5b96c0e0a7f24" + "sha256:4b8f9ed80c3a4a3191aa3261505d868aa552dd25649cb13a7d73b6b7315edf2d", + "sha256:5a120be2e8863993b597f1c0437efca799e90e0793c98ae5d4e34ebd00140e31" ], - "version": "==0.3.1" + "version": "==0.3.2" }, "pathlib2": { "hashes": [ @@ -595,10 +610,10 @@ }, "pluggy": { "hashes": [ - "sha256:447ba94990e8014ee25ec853339faf7b0fc8050cdc3289d4d71f7f410fb90095", - "sha256:bde19360a8ec4dfd8a20dcb811780a30998101f078fc7ded6162f0076f50508f" + "sha256:8ddc32f03971bfdf900a81961a48ccf2fb677cf7715108f85295c67405798616", + "sha256:980710797ff6a041e9a73a5787804f848996ecaa6f8a1b1e08224a5894f2074a" ], - "version": "==0.8.0" + "version": "==0.8.1" }, "prompt-toolkit": { "hashes": [ @@ -610,38 +625,38 @@ }, "psycopg2": { "hashes": [ - "sha256:10e391687b171878181e71736d0effe3772314a339d9ae30995ec8171a0c834e", - "sha256:1283f9d45e458c2dcb15ba89367923563f90ef636fe78ee22df75183484a0237", - "sha256:1a9c32e4d140bea225f9821d993b2e53c913e717ea97b851246aa9b300095d8f", - "sha256:1be6f2438d2b71fec7b07c3c0949dd321b04349c382907ea76b36120edec8300", - "sha256:20ca6f29e118b8dd7133e8708b3fba2881e70a4e0841f874ed23985b7201a076", - "sha256:227c115b3c1f65d61385e51ac690b91b584640aefb45bffacd4bd33d02ed7221", - "sha256:27959abe64ca1fc6d8cd11a71a1f421d8287831a3262bd4cacd43bbf43cc3c82", - "sha256:2b2daf1fe30a58300542aea679fd87d1e1c2afd36e7644837b7954fa2dbacb92", - "sha256:36e51a51f295fdf67bcf05e7b1877011a6b39e6622b0013fe31c5025241873a3", - "sha256:3992b9b914f2eb77dc07e8045d2ca979e491612808bc5c7cd68f307469acf9f6", - "sha256:39a11de2335ad45ececed43ab851d36a4c52843d756471b940804f301792781e", - "sha256:3c2afe9ef0d1649005e3ccf93c1aaccd6f8ee379530e763d3b3b77f406b7c0ae", - "sha256:3fb18e0e52807fe3a300dc1b5421aa492d5e759550918f597d61863419482535", - "sha256:55eab94de96ee9702f23283e9c8b03cfdb0001e2b14d5d2e1bd5ff8114b96b9f", - "sha256:7e95c0ab7e7e6e452586f35d4d8966b1e924c8dd2c23977e3ea4968770ff1d26", - "sha256:7f47514dbddf604f196fcfe5da955537f04691bef8124aff5632316a78d992b7", - "sha256:8345370356bb4bddf93acbcfd0357163dd6b09471937adcfb38a2fbb49bdce53", - "sha256:8bc6ecb220c0b88d3742042013129c817c44459795c97e9ce1bca70a3f37a53b", - "sha256:8df623f248be15d1725faf5f333791678775047f12f17a90d29b5d22573f5cdc", - "sha256:9645f1305e4268cc0fc88c823cd6c91de27c003e183c233a6a230e5e963039ee", - "sha256:a68719ed5be8373dd72c9e45d55f7a202285e05a2e392eaa8872a67ea47d7d20", - 
"sha256:aca0edf062ec09e954fdf0cc93d3a872362701210983a1442549e703aedec25d", - "sha256:b0dd2114d93d8f424bb8ae76e0dc540f104b70ca9163172c05e7700b1459d4c9", - "sha256:b2c09359d6802279efb9efb3f91a9c94567151baee95175f9b637ea628f35244", - "sha256:ca7bc37b1efb7cc25271bf10f398462ed975d95259af1406d38fcb268466e34f", - "sha256:e64235d9013ebf6319cb9654e08f5066112c34d8c4cc41186254ab9c3d6d5b9b", - "sha256:ec9be679c0065667503851141c31fa699e1cc69ded3ba8e5d3673dd5a6eb1370", - "sha256:eca00d0f91fcb44d88b12f1fd16ad138e38fa07debb79587e2b7ff1fe80d72b9", - "sha256:f256e807b8b2b45b6af60d7f2bb5194aab2f4acc861241c4d8ef942a55f5030d", - "sha256:fce7612a3bd6a7ba95799f88285653bf130bd7ca066b52674d5f850108b2aec0" - ], - "version": "==2.7.6.1" + "sha256:02445ebbb3a11a3fe8202c413d5e6faf38bb75b4e336203ee144ca2c46529f94", + "sha256:0e9873e60f98f0c52339abf8f0339d1e22bfe5aae0bcf7aabd40c055175035ec", + "sha256:1148a5eb29073280bf9057c7fc45468592c1bb75a28f6df1591adb93c8cb63d0", + "sha256:259a8324e109d4922b0fcd046e223e289830e2568d6f4132a3702439e5fd532b", + "sha256:28dffa9ed4595429e61bacac41d3f9671bb613d1442ff43bcbec63d4f73ed5e8", + "sha256:314a74302d4737a3865d40ea50e430ce1543c921ba10f39d562e807cfe2edf2a", + "sha256:36b60201b6d215d7658a71493fdf6bd5e60ad9a0cffed39906627ff9f4f3afd3", + "sha256:3f9d532bce54c4234161176ff3b8688ff337575ca441ea27597e112dfcd0ee0c", + "sha256:5d222983847b40af989ad96c07fc3f07e47925e463baa5de716be8f805b41d9b", + "sha256:6757a6d2fc58f7d8f5d471ad180a0bd7b4dd3c7d681f051504fbea7ae29c8d6f", + "sha256:6a0e0f1e74edb0ab57d89680e59e7bfefad2bfbdf7c80eb38304d897d43674bb", + "sha256:6ca703ccdf734e886a1cf53eb702261110f6a8b0ed74bcad15f1399f74d3f189", + "sha256:8513b953d8f443c446aa79a4cc8a898bd415fc5e29349054f03a7d696d495542", + "sha256:9262a5ce2038570cb81b4d6413720484cb1bc52c064b2f36228d735b1f98b794", + "sha256:97441f851d862a0c844d981cbee7ee62566c322ebb3d68f86d66aa99d483985b", + "sha256:a07feade155eb8e69b54dd6774cf6acf2d936660c61d8123b8b6b1f9247b67d6", + "sha256:a9b9c02c91b1e3ec1f1886b2d0a90a0ea07cc529cb7e6e472b556bc20ce658f3", + "sha256:ae88216f94728d691b945983140bf40d51a1ff6c7fe57def93949bf9339ed54a", + "sha256:b360ffd17659491f1a6ad7c928350e229c7b7bd83a2b922b6ee541245c7a776f", + "sha256:b4221957ceccf14b2abdabef42d806e791350be10e21b260d7c9ce49012cc19e", + "sha256:b90758e49d5e6b152a460d10b92f8a6ccf318fcc0ee814dcf53f3a6fc5328789", + "sha256:c669ea986190ed05fb289d0c100cc88064351f2b85177cbfd3564c4f4847d18c", + "sha256:d1b61999d15c79cf7f4f7cc9021477aef35277fc52452cf50fd13b713c84424d", + "sha256:de7bb043d1adaaf46e38d47e7a5f703bb3dab01376111e522b07d25e1a79c1e1", + "sha256:e393568e288d884b94d263f2669215197840d097c7e5b0acd1a51c1ea7d1aba8", + "sha256:ed7e0849337bd37d89f2c2b0216a0de863399ee5d363d31b1e5330a99044737b", + "sha256:f153f71c3164665d269a5d03c7fa76ba675c7a8de9dc09a4e2c2cdc9936a7b41", + "sha256:f1fb5a8427af099beb7f65093cbdb52e021b8e6dbdfaf020402a623f4181baf5", + "sha256:f36b333e9f86a2fba960c72b90c34be6ca71819e300f7b1fc3d2b0f0b2c546cd", + "sha256:f4526d078aedd5187d0508aa5f9a01eae6a48a470ed678406da94b4cd6524b7e" + ], + "version": "==2.7.7" }, "ptyprocess": { "hashes": [ @@ -674,11 +689,11 @@ }, "pytest": { "hashes": [ - "sha256:3e65a22eb0d4f1bdbc1eacccf4a3198bf8d4049dea5112d70a0c61b00e748d02", - "sha256:5924060b374f62608a078494b909d341720a050b5224ff87e17e12377486a71d" + "sha256:41568ea7ecb4a68d7f63837cf65b92ce8d0105e43196ff2b26622995bb3dc4b2", + "sha256:c3c573a29d7c9547fb90217ece8a8843aa0c1328a797e200290dc3d0b4b823be" ], "index": "pypi", - "version": "==4.1.0" + "version": "==4.1.1" }, "pytest-cov": { "hashes": [ @@ -727,30 
+742,30 @@ }, "typed-ast": { "hashes": [ - "sha256:0555eca1671ebe09eb5f2176723826f6f44cca5060502fea259de9b0e893ab53", - "sha256:0ca96128ea66163aea13911c9b4b661cb345eb729a20be15c034271360fc7474", - "sha256:16ccd06d614cf81b96de42a37679af12526ea25a208bce3da2d9226f44563868", - "sha256:1e21ae7b49a3f744958ffad1737dfbdb43e1137503ccc59f4e32c4ac33b0bd1c", - "sha256:37670c6fd857b5eb68aa5d193e14098354783b5138de482afa401cc2644f5a7f", - "sha256:46d84c8e3806619ece595aaf4f37743083f9454c9ea68a517f1daa05126daf1d", - "sha256:5b972bbb3819ece283a67358103cc6671da3646397b06e7acea558444daf54b2", - "sha256:6306ffa64922a7b58ee2e8d6f207813460ca5a90213b4a400c2e730375049246", - "sha256:6cb25dc95078931ecbd6cbcc4178d1b8ae8f2b513ae9c3bd0b7f81c2191db4c6", - "sha256:7e19d439fee23620dea6468d85bfe529b873dace39b7e5b0c82c7099681f8a22", - "sha256:7f5cd83af6b3ca9757e1127d852f497d11c7b09b4716c355acfbebf783d028da", - "sha256:81e885a713e06faeef37223a5b1167615db87f947ecc73f815b9d1bbd6b585be", - "sha256:94af325c9fe354019a29f9016277c547ad5d8a2d98a02806f27a7436b2da6735", - "sha256:b1e5445c6075f509d5764b84ce641a1535748801253b97f3b7ea9d948a22853a", - "sha256:cb061a959fec9a514d243831c514b51ccb940b58a5ce572a4e209810f2507dcf", - "sha256:cc8d0b703d573cbabe0d51c9d68ab68df42a81409e4ed6af45a04a95484b96a5", - "sha256:da0afa955865920edb146926455ec49da20965389982f91e926389666f5cf86a", - "sha256:dc76738331d61818ce0b90647aedde17bbba3d3f9e969d83c1d9087b4f978862", - "sha256:e7ec9a1445d27dbd0446568035f7106fa899a36f55e52ade28020f7b3845180d", - "sha256:f741ba03feb480061ab91a465d1a3ed2d40b52822ada5b4017770dfcb88f839f", - "sha256:fe800a58547dd424cd286b7270b967b5b3316b993d86453ede184a17b5a6b17d" + "sha256:023625bfa9359e29bd6e24cac2a4503495b49761d48a5f1e38333fc4ac4d93fe", + "sha256:07591f7a5fdff50e2e566c4c1e9df545c75d21e27d98d18cb405727ed0ef329c", + "sha256:153e526b0f4ffbfada72d0bb5ffe8574ba02803d2f3a9c605c8cf99dfedd72a2", + "sha256:3ad2bdcd46a4a1518d7376e9f5016d17718a9ed3c6a3f09203d832f6c165de4a", + "sha256:3ea98c84df53ada97ee1c5159bb3bc784bd734231235a1ede14c8ae0775049f7", + "sha256:51a7141ccd076fa561af107cfb7a8b6d06a008d92451a1ac7e73149d18e9a827", + "sha256:52c93cd10e6c24e7ac97e8615da9f224fd75c61770515cb323316c30830ddb33", + "sha256:6344c84baeda3d7b33e157f0b292e4dd53d05ddb57a63f738178c01cac4635c9", + "sha256:64699ca1b3bd5070bdeb043e6d43bc1d0cebe08008548f4a6bee782b0ecce032", + "sha256:74903f2e56bbffe29282ef8a5487d207d10be0f8513b41aff787d954a4cf91c9", + "sha256:7891710dba83c29ee2bd51ecaa82f60f6bede40271af781110c08be134207bf2", + "sha256:91976c56224e26c256a0de0f76d2004ab885a29423737684b4f7ebdd2f46dde2", + "sha256:9bad678a576ecc71f25eba9f1e3fd8d01c28c12a2834850b458428b3e855f062", + "sha256:b4726339a4c180a8b6ad9d8b50d2b6dc247e1b79b38fe2290549c98e82e4fd15", + "sha256:ba36f6aa3f8933edf94ea35826daf92cbb3ec248b89eccdc053d4a815d285357", + "sha256:bbc96bde544fd19e9ef168e4dfa5c3dfe704bfa78128fa76f361d64d6b0f731a", + "sha256:c0c927f1e44469056f7f2dada266c79b577da378bbde3f6d2ada726d131e4824", + "sha256:c0f9a3708008aa59f560fa1bd22385e05b79b8e38e0721a15a8402b089243442", + "sha256:f0bf6f36ff9c5643004171f11d2fdc745aa3953c5aacf2536a0685db9ceb3fb1", + "sha256:f5be39a0146be663cbf210a4d95c3c58b2d7df7b043c9047c5448e358f0550a2", + "sha256:fcd198bf19d9213e5cbf2cde2b9ef20a9856e716f76f9476157f90ae6de06cc6" ], "markers": "python_version < '3.7' and implementation_name == 'cpython'", - "version": "==1.1.1" + "version": "==1.2.0" }, "urllib3": { "hashes": [ @@ -768,9 +783,9 @@ }, "wrapt": { "hashes": [ - "sha256:d4d560d479f2c21e1b5443bbd15fe7ec4b37fe7e53d335d3b9b0a7b1226fe3c6" + 
"sha256:4aea003270831cceb8a90ff27c4031da6ead7ec1886023b80ce0dfe0adf61533" ], - "version": "==1.10.11" + "version": "==1.11.1" } } } diff --git a/python/README_import.md b/python/README_import.md index cc9a94e1..2465940b 100644 --- a/python/README_import.md +++ b/python/README_import.md @@ -26,11 +26,13 @@ the others: wget https://archive.org/download/ia_papers_manifest_2018-01-25/index/idents_files_urls.sqlite.gz wget https://archive.org/download/ia_journal_metadata_explore_2018-04-05/journal_extra_metadata.csv wget https://archive.org/download/issn_issnl_mappings/20180216.ISSN-to-ISSN-L.txt - wget https://archive.org/download/orcid-dump-2017/public_profiles_API-2.0_2017_10_json.tar.gz + wget https://archive.org/download/orcid-dump-2017/public_profiles_1_2_json.all.json.gz wget https://archive.org/download/ia_journal_pid_map_munge_20180908/release_ids.ia_munge_20180908.sqlite3.gz wget https://archive.org/download/ia_test_paper_matches/2018-08-27-2352.17-matchcrossref.insertable.json.gz wget https://archive.org/download/ia_papers_manifest_2018-01-25_matched/ia_papers_manifest_2018-01-25.matched.json.gz + gunzip public_profiles_1_2_json.all.json.gz + ## ISSN From CSV file: @@ -54,13 +56,14 @@ Usually 24 hours or so on fast production machine. ## Matched -Unknown speed! +These each take 2-4 hours: # No file update for the first import... - zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched --no-file-updates - + time zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched --no-file-updates - # ... but do on the second zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched - # GROBID extracted (release+file) time zcat /srv/fatcat/datasets/2018-09-23-0405.30-dumpgrobidmetainsertable.longtail_join.filtered.tsv.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py grobid-metadata - + diff --git a/python/env.example b/python/env.example index c986b9d2..75fc5238 100644 --- a/python/env.example +++ b/python/env.example @@ -1,4 +1,5 @@ -FLASK_SECRET_KEY="" +FLASK_SECRET_KEY="TODO-REPLACE-ME" +FATCAT_DOMAIN="dev.fatcat.wiki" # This key used in tests FATCAT_API_AUTH_TOKEN="AgEPZGV2LmZhdGNhdC53aWtpAhYyMDE5MDEwMS1kZXYtZHVtbXkta2V5AAImZWRpdG9yX2lkID0gYWFhYWFhYWFhYWFhYmt2a2FhYWFhYWFhYWkAAht0aW1lID4gMjAxOS0wMS0wOVQwMDo1Nzo1MloAAAYgnroNha1hSftChtxHGTnLEmM/pY8MeQS/jBSV0UNvXug=" FATCAT_API_HOST="http://localhost:9411/v0" @@ -14,6 +15,5 @@ SENTRY_DSN="" # FATCAT_API_AUTH_TOKEN FATCAT_AUTH_WORKER_CROSSREF="" FATCAT_AUTH_WORKER_ORCID="" -FATCAT_AUTH_WORKER_ISSN="" -FATCAT_AUTH_WORKER_MATCHED="" -FATCAT_AUTH_WORKER_GROBID_METADATA="" +FATCAT_AUTH_WORKER_PUBMED="" +FATCAT_AUTH_WORKER_DATACITE="" diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 0e176b2c..a47aa175 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -1,47 +1,40 @@ #!/usr/bin/env python3 -""" -""" - import os, sys, argparse from fatcat_tools import authenticated_api -from fatcat_tools.importers import CrossrefImporter, OrcidImporter, \ - IssnImporter, MatchedImporter, GrobidMetadataImporter, make_kafka_consumer +from fatcat_tools.importers import * def run_crossref(args): - fci = CrossrefImporter(args.api, args.issn_map_file, + fci = CrossrefImporter(args.api, + args.issn_map_file, 
extid_map_file=args.extid_map_file, - create_containers=(not args.no_create_containers), - check_existing=(not args.no_release_updates)) + edit_batch_size=args.batch_size, + bezerk_mode=args.bezerk_mode) if args.kafka_mode: - consumer = make_kafka_consumer( - args.kafka_hosts, args.kafka_env, "api-crossref", "fatcat-import") - fci.process_batch(consumer, size=args.batch_size, decode_kafka=True) + KafkaJsonPusher(fci, args.kafka_hosts, args.kafka_env, "api-crossref", "fatcat-import").run() else: - fci.process_batch(args.json_file, size=args.batch_size) - fci.describe_run() + JsonLinePusher(fci).run() def run_orcid(args): - foi = OrcidImporter(args.api) - foi.process_batch(args.json_file, size=args.batch_size) - foi.describe_run() + foi = OrcidImporter(args.api, + edit_batch_size=args.batch_size) + JsonLinePusher(foi, args.json_file).run() -def run_issn(args): - fii = IssnImporter(args.api) - fii.process_csv_batch(args.csv_file, size=args.batch_size) - fii.describe_run() +def run_journal_metadata(args): + fii = JournalMetadataImporter(args.api, + edit_batch_size=args.batch_size) + CsvLinePusher(fii, args.csv_file).run() def run_matched(args): fmi = MatchedImporter(args.api, - skip_file_updates=args.no_file_updates) - fmi.process_batch(args.json_file, size=args.batch_size) - fmi.describe_run() + bezerk_mode=args.bezerk_mode, + edit_batch_size=args.batch_size) + JsonLinePusher(fmi, args.json_file).run() def run_grobid_metadata(args): - fmi = GrobidMetadataImporter(args.api) - fmi.process_source(args.tsv_file, group_size=args.group_size) - fmi.describe_run() + fmi = GrobidMetadataImporter(args.api, edit_batch_size=args.batch_size, longtail_oa=args.longtail_oa) + LinePusher(fmi, args.tsv_file).run() def main(): parser = argparse.ArgumentParser() @@ -73,18 +66,15 @@ def main(): sub_crossref.add_argument('--extid-map-file', help="DOI-to-other-identifiers sqlite3 database", default=None, type=str) - sub_crossref.add_argument('--no-create-containers', - action='store_true', - help="skip creation of new container entities based on ISSN") sub_crossref.add_argument('--batch-size', help="size of batch to send", default=50, type=int) sub_crossref.add_argument('--kafka-mode', action='store_true', help="consume from kafka topic (not stdin)") - sub_crossref.add_argument('--no-release-updates', + sub_crossref.add_argument('--bezerk-mode', action='store_true', - help="don't lookup existing DOIs, just insert (only for bootstrap)") + help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)") sub_orcid = subparsers.add_parser('orcid') sub_orcid.set_defaults( @@ -98,37 +88,37 @@ def main(): help="size of batch to send", default=50, type=int) - sub_issn = subparsers.add_parser('issn') - sub_issn.set_defaults( - func=run_issn, - auth_var="FATCAT_AUTH_WORKER_ISSN", + sub_journal_metadata = subparsers.add_parser('journal-metadata') + sub_journal_metadata.set_defaults( + func=run_journal_metadata, + auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA", ) - sub_issn.add_argument('csv_file', + sub_journal_metadata.add_argument('csv_file', help="Journal ISSN CSV metadata file to import from (or stdin)", default=sys.stdin, type=argparse.FileType('r')) - sub_issn.add_argument('--batch-size', + sub_journal_metadata.add_argument('--batch-size', help="size of batch to send", default=50, type=int) sub_matched = subparsers.add_parser('matched') sub_matched.set_defaults( func=run_matched, - auth_var="FATCAT_AUTH_WORKER_MATCHED", + auth_var="FATCAT_API_AUTH_TOKEN", ) sub_matched.add_argument('json_file', 
help="JSON file to import from (or stdin)", default=sys.stdin, type=argparse.FileType('r')) - sub_matched.add_argument('--no-file-updates', - action='store_true', - help="don't lookup existing files, just insert (only for bootstrap)") sub_matched.add_argument('--batch-size', help="size of batch to send", default=50, type=int) + sub_matched.add_argument('--bezerk-mode', + action='store_true', + help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)") sub_grobid_metadata = subparsers.add_parser('grobid-metadata') sub_grobid_metadata.set_defaults( func=run_grobid_metadata, - auth_var="FATCAT_AUTH_WORKER_GROBID_METADATA", + auth_var="FATCAT_API_AUTH_TOKEN", ) sub_grobid_metadata.add_argument('tsv_file', help="TSV file to import from (or stdin)", @@ -136,6 +126,9 @@ def main(): sub_grobid_metadata.add_argument('--group-size', help="editgroup group size to use", default=75, type=int) + sub_matched.add_argument('--longtail-oa', + action='store_true', + help="if this is an import of longtail OA content (sets an 'extra' flag)") args = parser.parse_args() if not args.__dict__.get("func"): @@ -144,6 +137,7 @@ def main(): args.api = authenticated_api( args.host_url, + # token is an optional kwarg (can be empty string, None, etc) token=os.environ.get(args.auth_var)) args.func(args) diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index e6f081e5..70f38f5b 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -1,7 +1,22 @@ -from .common import FatcatImporter, make_kafka_consumer +""" +To run an import you combine two classes; one each of: + +- RecordSource: somehow iterates over a source of raw records (eg, from a + database, Kafka, files on disk, stdin) and pushes into an entity importer. +- EntityImporter: class that a record iterator pushes raw (unparsed) records + into. The entity importer parses and decides what to do (ignore, update, + insert, etc). There is usually a primary entity type, though related entities + can be created along the way. Maintains API connection and editgroup/batch + state. + +""" + +from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, KafkaJsonPusher, make_kafka_consumer, clean from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP from .grobid_metadata import GrobidMetadataImporter -from .issn import IssnImporter +from .journal_metadata import JournalMetadataImporter from .matched import MatchedImporter from .orcid import OrcidImporter +#from .kafka_source import KafkaSource +#from .file_source import FileSource diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 06897bee..89203a4f 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -3,6 +3,7 @@ import re import sys import csv import json +import ftfy import itertools import subprocess from collections import Counter @@ -12,30 +13,66 @@ import fatcat_client from fatcat_client.rest import ApiException -# from: https://docs.python.org/3/library/itertools.html -def grouper(iterable, n, fillvalue=None): - "Collect data into fixed-length chunks or blocks" - args = [iter(iterable)] * n - return itertools.zip_longest(*args, fillvalue=fillvalue) +def clean(thing, force_xml=False): + """ + This function is appropriate to be called on any random, non-markup string, + such as author names, titles, etc. 
-def make_kafka_consumer(hosts, env, topic_suffix, group):
-    topic_name = "fatcat-{}.{}".format(env, topic_suffix).encode('utf-8')
-    client = pykafka.KafkaClient(hosts=hosts, broker_version="1.0.0")
-    consume_topic = client.topics[topic_name]
-    print("Consuming from kafka topic {}, group {}".format(topic_name, group))
+    It will try to clean up common unicode mangles, HTML characters, etc.
-    consumer = consume_topic.get_balanced_consumer(
-        consumer_group=group.encode('utf-8'),
-        managed=True,
-        auto_commit_enable=True,
-        auto_commit_interval_ms=30000, # 30 seconds
-        compacted_topic=True,
-    )
-    return consumer
+    This will detect XML/HTML and "do the right thing" (aka, not remove
+    entities like '&amp;' if there are tags in the string), unless you pass the
+    'force_xml' parameter, which might be appropriate for, eg, names and
+    titles, which generally should be projected down to plain text.
+
+    Also strips extra whitespace.
+    """
+    if not thing:
+        return thing
+    fix_entities = 'auto'
+    if force_xml:
+        fix_entities = True
+    fixed = ftfy.fix_text(thing, fix_entities=fix_entities).strip()
+    if not fixed:
+        # wasn't zero-length before, but is now; return None
+        return None
+    return fixed
+
+def test_clean():
-class FatcatImporter:
+    assert clean(None) == None
+    assert clean('') == ''
+    assert clean('123') == '123'
+    assert clean('a&amp;b') == 'a&b'
+    assert clean('<b>a&amp;b</b>') == '<b>a&amp;b</b>'
+    assert clean('<b>a&amp;b</b>', force_xml=True) == '<b>a&b</b>'
+
+class EntityImporter:
     """
-    Base class for fatcat importers
+    Base class for fatcat entity importers.
+
+    The API exposed to the record iterator is:
+
+        push_record(raw_record)
+        finish()
+
+    The API that implementations are expected to fill in is:
+
+        want(raw_record) -> boolean
+        parse(raw_record) -> entity
+        try_update(entity) -> boolean
+        insert_batch([entity]) -> None
+
+    This class exposes helpers for implementations:
+
+        self.api
+        self.create_<entity>(entity) -> EntityEdit
+            for related entity types
+        self.push_entity(entity)
+        self.counts['exists'] += 1
+            if didn't update or insert because of an existing entity
+        self.counts['update'] += 1
+            if updated an entity
     """
     def __init__(self, api, **kwargs):
@@ -43,87 +80,135 @@ class FatcatImporter:
         eg_extra = kwargs.get('editgroup_extra', dict())
         eg_extra['git_rev'] = eg_extra.get('git_rev',
             subprocess.check_output(["git", "describe", "--always"]).strip()).decode('utf-8')
-        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FatcatImporter')
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.EntityImporter')
         self.api = api
-        self._editgroup_description = kwargs.get('editgroup_description')
-        self._editgroup_extra = kwargs.get('editgroup_extra')
-        issn_map_file = kwargs.get('issn_map_file')
+        self.bezerk_mode = kwargs.get('bezerk_mode', False)
+        self.edit_batch_size = kwargs.get('edit_batch_size', 100)
+        self.editgroup_description = kwargs.get('editgroup_description')
+        self.editgroup_extra = kwargs.get('editgroup_extra')
+        self.reset()
         self._issnl_id_map = dict()
         self._orcid_id_map = dict()
-        self._doi_id_map = dict()
-        if issn_map_file:
-            self.read_issn_map_file(issn_map_file)
         self._orcid_regex = re.compile("^\\d{4}-\\d{4}-\\d{4}-\\d{3}[\\dX]$")
-        self.counts = Counter({'insert': 0, 'update': 0, 'processed_lines': 0})
+        self._doi_id_map = dict()
-    def _editgroup(self):
-        eg = fatcat_client.Editgroup(
-            description=self._editgroup_description,
-            extra=self._editgroup_extra,
-        )
-        return self.api.create_editgroup(eg)
+    def reset(self):
+        self.counts = Counter({'skip': 0, 'insert': 0, 'update': 0,
'exists': 0}) + self._edit_count = 0 + self._editgroup_id = None + self._entity_queue = [] - def describe_run(self): - print("Processed {} lines, inserted {}, updated {}.".format( - self.counts['processed_lines'], self.counts['insert'], self.counts['update'])) + def push_record(self, raw_record): + """ + Returns nothing. + """ + if (not raw_record) or (not self.want(raw_record)): + self.counts['skip'] += 1 + return + entity = self.parse_record(raw_record) + if not entity: + self.counts['skip'] += 1 + return + if self.bezerk_mode: + self.push_entity(entity) + return + if self.try_update(entity): + self.push_entity(entity) + return - def create_row(self, row, editgroup_id=None): - # sub-classes expected to implement this - raise NotImplementedError + def finish(self): + if self._edit_count > 0: + self.api.accept_editgroup(self._editgroup_id) + self._editgroup_id = None + self._edit_count = 0 + + if self._entity_queue: + self.insert_batch(self._entity_queue) + self.counts['insert'] += len(self._entity_queue) + self._entity_queue = [] + + self.counts['total'] = 0 + for key in ('skip', 'insert', 'update', 'exists'): + self.counts['total'] += self.counts[key] + return self.counts + + def _get_editgroup(self, edits=1): + if self._edit_count >= self.edit_batch_size: + self.api.accept_editgroup(self._editgroup_id) + self._editgroup_id = None + self._edit_count = 0 - def create_batch(self, rows, editgroup_id=None): - # sub-classes expected to implement this + if not self._editgroup_id: + eg = self.api.create_editgroup( + fatcat_client.Editgroup( + description=self.editgroup_description, + extra=self.editgroup_extra)) + self._editgroup_id = eg.editgroup_id + + self._edit_count += edits + return self._editgroup_id + + def create_container(self, entity): + eg_id = self._get_editgroup() + self.counts['inserted.container'] += 1 + return self.api.create_container(entity, editgroup_id=eg_id) + + def create_release(self, entity): + eg_id = self._get_editgroup() + self.counts['inserted.release'] += 1 + return self.api.create_release(entity, editgroup_id=eg_id) + + def create_file(self, entity): + eg_id = self._get_editgroup() + self.counts['inserted.file'] += 1 + return self.api.create_file(entity, editgroup_id=eg_id) + + def updated(self): + """ + Implementations should call this from try_update() if the update was successful + """ + self.counts['update'] += 1 + + def push_entity(self, entity): + self._entity_queue.append(entity) + if len(self._entity_queue) >= self.edit_batch_size: + self.insert_batch(self._entity_queue) + self.counts['insert'] += len(_entity_queue) + self._entity_queue = 0 + + def want(self, raw_record): + """ + Implementations can override for optional fast-path to drop a record. + Must have no side-effects; returns bool. + """ + return True + + def parse(self, raw_record): + """ + Returns an entity class type, or None if we should skip this one. + + May have side-effects (eg, create related entities), but shouldn't + update/mutate the actual entity. 
+ """ raise NotImplementedError - def process_source(self, source, group_size=100): - """Creates and auto-accepts editgroup every group_size rows""" - eg = self._editgroup() - i = 0 - for i, row in enumerate(source): - self.create_row(row, editgroup_id=eg.editgroup_id) - if i > 0 and (i % group_size) == 0: - self.api.accept_editgroup(eg.editgroup_id) - eg = self._editgroup() - self.counts['processed_lines'] += 1 - if i == 0 or (i % group_size) != 0: - self.api.accept_editgroup(eg.editgroup_id) - - def process_batch(self, source, size=50, decode_kafka=False): - """Reads and processes in batches (not API-call-per-)""" - for rows in grouper(source, size): - if decode_kafka: - rows = [msg.value.decode('utf-8') for msg in rows] - self.counts['processed_lines'] += len(rows) - #eg = self._editgroup() - #self.create_batch(rows, editgroup_id=eg.editgroup_id) - self.create_batch(rows) - - def process_csv_source(self, source, group_size=100, delimiter=','): - reader = csv.DictReader(source, delimiter=delimiter) - self.process_source(reader, group_size) - - def process_csv_batch(self, source, size=50, delimiter=','): - reader = csv.DictReader(source, delimiter=delimiter) - self.process_batch(reader, size) + def try_update(self, raw_record): + """ + Passed the output of parse(). Should try to find an existing entity and + update it (PUT), decide we should do nothing (based on the existing + record), or create a new one. - def is_issnl(self, issnl): - return len(issnl) == 9 and issnl[4] == '-' + Implementations must update the exists/updated/skip counts + appropriately in this method. - def lookup_issnl(self, issnl): - """Caches calls to the ISSN-L lookup API endpoint in a local dict""" - if issnl in self._issnl_id_map: - return self._issnl_id_map[issnl] - container_id = None - try: - rv = self.api.lookup_container(issnl=issnl) - container_id = rv.ident - except ApiException as ae: - # If anything other than a 404 (not found), something is wrong - assert ae.status == 404 - self._issnl_id_map[issnl] = container_id # might be None - return container_id + Returns boolean: True if the entity should still be inserted, False otherwise + """ + raise NotImplementedError + + def insert_batch(self, raw_record): + raise NotImplementedError def is_orcid(self, orcid): return self._orcid_regex.match(orcid) is not None @@ -163,6 +248,23 @@ class FatcatImporter: self._doi_id_map[doi] = release_id # might be None return release_id + def is_issnl(self, issnl): + return len(issnl) == 9 and issnl[4] == '-' + + def lookup_issnl(self, issnl): + """Caches calls to the ISSN-L lookup API endpoint in a local dict""" + if issnl in self._issnl_id_map: + return self._issnl_id_map[issnl] + container_id = None + try: + rv = self.api.lookup_container(issnl=issnl) + container_id = rv.ident + except ApiException as ae: + # If anything other than a 404 (not found), something is wrong + assert ae.status == 404 + self._issnl_id_map[issnl] = container_id # might be None + return container_id + def read_issn_map_file(self, issn_map_file): print("Loading ISSN map file...") self._issn_issnl_map = dict() @@ -179,3 +281,117 @@ class FatcatImporter: if issn is None: return None return self._issn_issnl_map.get(issn) + + +class RecordPusher: + """ + Base class for different importer sources. Pretty trivial interface, just + wraps an importer and pushes records in to it. 
+ """ + + def __init__(self, importer, **kwargs): + self.importer = importer + + def run(self): + """ + This will look something like: + + for line in sys.stdin: + record = json.loads(line) + self.importer.push_record(record) + print(self.importer.finish()) + """ + raise NotImplementedError + + +class JsonLinePusher(RecordPusher): + + def __init__(self, importer, json_file, **kwargs): + self.importer = importer + self.json_file = json_file + + def run(self): + for line in self.json_file: + if not line: + continue + record = json.loads(line) + self.importer.push_record(record) + counts = self.importer.finish() + print(counts) + return counts + + +class CsvPusher(RecordPusher): + + def __init__(self, importer, csv_file, **kwargs): + self.importer = importer + self.reader = csv.DictReader(csv_file, delimiter=kwargs.get('delimiter', ',')) + + def run(self): + for line in self.reader: + if not line: + continue + self.importer.push_record(line) + counts = self.importer.finish() + print(counts) + return counts + + +class LinePusher(RecordPusher): + + def __init__(self, importer, text_file, **kwargs): + self.importer = importer + self.text_file = text_file + + def run(self): + for line in self.text_file: + if not line: + continue + self.importer.push_record(line) + counts = self.importer.finish() + print(counts) + return counts + + +class KafkaJsonPusher(RecordPusher): + + def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs): + self.importer = importer + self.consumer = make_kafka_consumer( + kafka_hosts, + kafka_env, + topic_suffix, + group, + ) + + def run(self): + count = 0 + for msg in self.consumer: + if not msg: + continue + record = json.loads(msg.value.decode('utf-8')) + self.importer.push_record(record) + count += 1 + if count % 500 == 0: + print("Import counts: {}".format(self.importer.counts)) + # TODO: should catch UNIX signals (HUP?) 
to shutdown cleanly, and/or
+            # commit the current batch if it has been lingering
+        counts = self.importer.finish()
+        print(counts)
+        return counts
+
+
+def make_kafka_consumer(hosts, env, topic_suffix, group):
+    topic_name = "fatcat-{}.{}".format(env, topic_suffix).encode('utf-8')
+    client = pykafka.KafkaClient(hosts=hosts, broker_version="1.0.0")
+    consume_topic = client.topics[topic_name]
+    print("Consuming from kafka topic {}, group {}".format(topic_name, group))
+
+    consumer = consume_topic.get_balanced_consumer(
+        consumer_group=group.encode('utf-8'),
+        managed=True,
+        auto_commit_enable=True,
+        auto_commit_interval_ms=30000, # 30 seconds
+        compacted_topic=True,
+    )
+    return consumer
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 6365e491..00c719f1 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -6,7 +6,7 @@
 import datetime
 import itertools
 import subprocess
 import fatcat_client
-from .common import FatcatImporter
+from .common import EntityImporter, clean
 # The docs/guide should be the canonical home for these mappings; update there
@@ -32,7 +32,32 @@ CROSSREF_TYPE_MAP = {
     'standard': 'standard',
 }
-class CrossrefImporter(FatcatImporter):
+CONTAINER_TYPE_MAP = {
+    'article-journal': 'journal',
+    'paper-conference': 'conference',
+    'book': 'book-series',
+}
+
+# TODO:
+LICENSE_SLUG_MAP = {
+    "http://creativecommons.org/licenses/by/3.0/": "CC-BY",
+    "http://creativecommons.org/licenses/by/4.0/": "CC-BY",
+    "http://creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
+    "http://creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
+    "http://creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND",
+    "http://creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND",
+    "http://creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC",
+    "http://creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
+    "http://creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA",
+    "http://creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA",
+    "http://creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND",
+    "http://creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND",
+    "http://www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0",
+    # http://onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license
+    # http://www.springer.com/tdm doesn't seem like a license
+}
+
+class CrossrefImporter(EntityImporter):
     """
     Importer for Crossref metadata.
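Before continuing with the CrossrefImporter changes: the pusher/importer split introduced in common.py above (JsonLinePusher, CsvPusher, LinePusher, or KafkaJsonPusher feeding an EntityImporter subclass) is what fatcat_import.py now wires together. A minimal usage sketch, assuming an authenticated API client and a hypothetical local JSON-lines file (not part of this patch):

    # illustrative sketch only; the input file path is made up
    from fatcat_tools import authenticated_api
    from fatcat_tools.importers import OrcidImporter, JsonLinePusher

    api = authenticated_api("http://localhost:9411/v0", token=None)
    importer = OrcidImporter(api, edit_batch_size=50)
    with open("orcid_profiles_sample.json") as f:
        counts = JsonLinePusher(importer, f).run()
    # run() drains the file, calls importer.finish(), and returns the importer's
    # Counter of 'insert'/'update'/'exists'/'skip' totals
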
@@ -51,9 +76,9 @@ class CrossrefImporter(FatcatImporter): issn_map_file=issn_map_file, editgroup_description=eg_desc, editgroup_extra=eg_extra) + + self.create_containers = kwargs.get('create_containers') extid_map_file = kwargs.get('extid_map_file') - create_containers = kwargs.get('create_containers') - check_existing = kwargs.get('check_existing') self.extid_map_db = None if extid_map_file: db_uri = "file:{}?mode=ro".format(extid_map_file) @@ -61,36 +86,46 @@ class CrossrefImporter(FatcatImporter): self.extid_map_db = sqlite3.connect(db_uri, uri=True) else: print("Not using external ID map") - self.create_containers = create_containers - self.check_existing = check_existing + + self.read_issn_map_file(issn_map_file) def lookup_ext_ids(self, doi): if self.extid_map_db is None: - return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None) + return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]).fetchone() if row is None: - return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None) + return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) row = [str(cell or '') or None for cell in row] return dict( core_id=row[0], pmid=row[1], pmcid=row[2], - wikidata_qid=row[3]) + wikidata_qid=row[3], + # TODO: + arxiv_id=None, + jstor_id=None, + ) def map_release_type(self, crossref_type): return CROSSREF_TYPE_MAP.get(crossref_type) - def parse_crossref_dict(self, obj): + def map_container_type(self, crossref_type): + return CONTAINER_TYPE_MAP.get(crossref_type) + + def want(self, obj): + if not obj.get('title'): + return False + + # do most of these checks in-line below + return True + + def parse_record(self, obj): """ obj is a python dict (parsed from json). 
returns a ReleaseEntity """ - # Do require the 'title' keys to exsit, as release entities do - if (not 'title' in obj) or (not obj['title']): - return None - # Ways to be out of scope (provisionally) # journal-issue and journal-volume map to None, but allowed for now if obj.get('type') in (None, 'journal', 'proceedings', @@ -98,20 +133,12 @@ class CrossrefImporter(FatcatImporter): 'book-track', 'proceedings-series'): return None - # lookup existing DOI - existing_release = None - if self.check_existing: - try: - existing_release = self.api.lookup_release(doi=obj['DOI'].lower()) - except fatcat_client.rest.ApiException as err: - if err.status != 404: - raise err - - # eventually we'll want to support "updates", but for now just skip if - # entity already exists - if existing_release: + # Do require the 'title' keys to exsit, as release entities do + if (not 'title' in obj) or (not obj['title']): return None + release_type = self.map_release_type(obj['type']) + # contribs def do_contribs(obj_list, ctype): contribs = [] @@ -132,18 +159,23 @@ class CrossrefImporter(FatcatImporter): index = i else: index = None + raw_affiliation = None if am.get('affiliation'): - # note: affiliation => affiliations - extra['affiliations'] = am.get('affiliation') + if len(am.get('affiliation')) > 0: + raw_affiliation = am.get('affiliation')[0]['name'] + if len(am.get('affiliation')) > 1: + # note: affiliation => more_affiliations + extra['more_affiliations'] = [clean(a['name']) for a in am.get('affiliation')[1:]] if am.get('sequence') and am.get('sequence') != "additional": - extra['sequence'] = am.get('sequence') + extra['seq'] = clean(am.get('sequence')) if not extra: extra = None assert ctype in ("author", "editor", "translator") contribs.append(fatcat_client.ReleaseContrib( creator_id=creator_id, index=index, - raw_name=raw_name, + raw_name=clean(raw_name), + raw_affiliation=clean(raw_affiliation), role=ctype, extra=extra)) return contribs @@ -159,28 +191,40 @@ class CrossrefImporter(FatcatImporter): container_id = self.lookup_issnl(issnl) publisher = obj.get('publisher') - ce = None if (container_id is None and self.create_containers and (issnl is not None) and obj.get('container-title') and len(obj['container-title']) > 0): ce = fatcat_client.ContainerEntity( issnl=issnl, - publisher=publisher, - name=obj['container-title'][0]) + publisher=clean(publisher), + container_type=self.map_container_type(release_type), + name=clean(obj['container-title'][0], force_xml=True)) + ce_edit = self.create_container(ce) + container_id = ce_edit.ident + + # license slug + license_slug = None + license_extra = [] + for l in obj.get('license', []): + if l['content-version'] not in ('vor', 'unspecified'): + continue + slug = LICENSE_SLUG_MAP.get(l['URL']) + if slug: + license_slug = slug + if 'start' in l: + l['start'] = l['start']['date-time'] + license_extra.append(l) # references refs = [] for i, rm in enumerate(obj.get('reference', [])): try: year = int(rm.get('year')) - # NOTE: will need to update/config in the future! + # TODO: will need to update/config in the future! # NOTE: are there crossref works with year < 100? 
if year > 2025 or year < 100: year = None except: year = None - extra = rm.copy() - if rm.get('DOI'): - extra['doi'] = rm.get('DOI').lower() key = rm.get('key') if key and key.startswith(obj['DOI'].upper()): key = key.replace(obj['DOI'].upper() + "-", '') @@ -188,14 +232,18 @@ class CrossrefImporter(FatcatImporter): container_name = rm.get('volume-title') if not container_name: container_name = rm.get('journal-title') - extra.pop('DOI', None) - extra.pop('key', None) - extra.pop('year', None) - extra.pop('volume-name', None) - extra.pop('journal-title', None) - extra.pop('title', None) - extra.pop('first-page', None) - extra.pop('doi-asserted-by', None) + elif rm.get('journal-title'): + extra['journal-title'] = rm['journal-title'] + extra = dict() + if rm.get('DOI'): + extra['doi'] = rm.get('DOI').lower() + # TODO: what fields here? CSL citation stuff + for k in ('author', 'editor', 'edition', 'authority', 'version', + 'genre', 'url', 'event', 'issue', 'volume', 'date', + 'accessed_date', 'issued', 'page', 'medium', + 'collection_title', 'chapter_number'): + if clean(rm.get(k)): + extra[k] = clean(rm[k]) if extra: extra = dict(crossref=extra) else: @@ -206,9 +254,9 @@ class CrossrefImporter(FatcatImporter): target_release_id=None, key=key, year=year, - container_name=container_name, - title=rm.get('title'), - locator=rm.get('first-page'), + container_name=clean(container_name), + title=clean(rm.get('title')), + locator=clean(rm.get('first-page')), # TODO: just dump JSON somewhere here? extra=extra)) @@ -217,25 +265,24 @@ class CrossrefImporter(FatcatImporter): if obj.get('abstract') != None: abstracts.append(fatcat_client.ReleaseEntityAbstracts( mimetype="application/xml+jats", - content=obj.get('abstract'))) + content=clean(obj.get('abstract')))) # extra fields extra = dict() - for key in ('subject', 'type', 'license', 'alternative-id', - 'container-title', 'original-title', 'subtitle', 'archive', - 'funder', 'group-title'): - # TODO: unpack "container-title" array + for key in ('subject', 'type', 'alternative-id', 'container-title', + 'subtitle', 'archive', 'funder', 'group-title'): + # TODO: unpack "container-title" array? 
val = obj.get(key) if val: - extra[key] = val - if 'license' in extra and extra['license']: - for i in range(len(extra['license'])): - if 'start' in extra['license'][i]: - extra['license'][i]['start'] = extra['license'][i]['start']['date-time'] + if type(val) == str: + extra[key] = clean(val) + else: + extra[key] = val + if license_extra: + extra['license'] = license_extra + if len(obj['title']) > 1: - extra['other-titles'] = obj['title'][1:] - # TODO: this should be top-level - extra['is_kept'] = len(obj.get('archive', [])) > 0 + extra['other-titles'] = [clean(t) for t in obj['title'][1:]] # ISBN isbn13 = None @@ -277,59 +324,57 @@ class CrossrefImporter(FatcatImporter): re = fatcat_client.ReleaseEntity( work_id=None, - title=obj.get('title', [None])[0], - contribs=contribs, - refs=refs, container_id=container_id, - publisher=publisher, - release_type=self.map_release_type(obj['type']), + title=clean(obj.get('title', [None])[0], force_xml=True), + original_title=clean(obj.get('original-title', [None])[0]), + release_type=release_type, release_status=release_status, + release_date=release_date, + release_year=release_year, + publisher=clean(publisher), doi=obj['DOI'].lower(), - isbn13=isbn13, - core_id=extids['core_id'], pmid=extids['pmid'], pmcid=extids['pmcid'], wikidata_qid=extids['wikidata_qid'], - release_date=release_date, - release_year=release_year, - issue=obj.get('issue'), - volume=obj.get('volume'), - pages=obj.get('page'), + isbn13=isbn13, + core_id=extids['core_id'], + arxiv_id=extids['arxiv_id'], + jstor_id=extids['jstor_id'], + volume=clean(obj.get('volume')), + issue=clean(obj.get('issue')), + pages=clean(obj.get('page')), + language=None, # crossref doesn't supply language info + license_slug=license_slug, + extra=dict(crossref=extra), abstracts=abstracts, - extra=dict(crossref=extra)) - return (re, ce) + contribs=contribs, + refs=refs, + ) + return re + + def try_update(self, re): + + # lookup existing DOI (don't need to try other ext idents for crossref) + existing = None + try: + existing = self.api.lookup_release(doi=re.doi) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + # doesn't exist, need to update + return True + + # eventually we'll want to support "updates", but for now just skip if + # entity already exists + if existing: + self.counts['exists'] += 1 + return False + + return True + + def insert_batch(self, batch): + self.api.create_release_batch(batch, + autoaccept=True, + description=self.editgroup_description, + extra=json.dumps(self.editgroup_extra)) - def create_row(self, row, editgroup_id=None): - if row is None: - return - obj = json.loads(row) - entities = self.parse_crossref_dict(obj) - if entities is not None: - (re, ce) = entities - if ce is not None: - container = self.api.create_container(ce, editgroup_id=editgroup_id) - re.container_id = container.ident - self._issnl_id_map[ce.issnl] = container.ident - self.api.create_release(re, editgroup_id=editgroup_id) - self.counts['insert'] += 1 - - def create_batch(self, batch): - """Current work/release pairing disallows batch creation of releases. 
- Could do batch work creation and then match against releases, but meh.""" - release_batch = [] - for row in batch: - if row is None: - continue - obj = json.loads(row) - entities = self.parse_crossref_dict(obj) - if entities is not None: - (re, ce) = entities - if ce is not None: - ce_eg = self.api.create_editgroup(fatcat_client.Editgroup()) - container = self.api.create_container(ce, editgroup_id=ce_eg.editgroup_id) - self.api.accept_editgroup(ce_eg.editgroup_id) - re.container_id = container.ident - self._issnl_id_map[ce.issnl] = container.ident - release_batch.append(re) - self.api.create_release_batch(release_batch, autoaccept="true") - self.counts['insert'] += len(release_batch) diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index 5e61a154..9d95fe0b 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -5,12 +5,22 @@ import json import base64 import datetime import fatcat_client -from .common import FatcatImporter +from .common import EntityImporter, clean MAX_ABSTRACT_BYTES=4096 -class GrobidMetadataImporter(FatcatImporter): +class GrobidMetadataImporter(EntityImporter): + """ + This is a complex case: we need to parse and create both file and release entities. + + The "primary" entity here is really File, not Release. If a matching File + exists, we bail in want(); if not we insert the Release during parsing, and + insert both. + + TODO: should instead check if the File has any releases; if not, insert and update. + TODO: relaxing 'None' constraint on parse_record() might make this refactor-able. + """ def __init__(self, api, **kwargs): @@ -22,6 +32,45 @@ class GrobidMetadataImporter(FatcatImporter): editgroup_description=eg_desc, editgroup_extra=eg_extra) self.default_link_rel = kwargs.get("default_link_rel", "web") + self.longtail_oa = kwargs.get("longtail_oa", False) + + def want(self, raw_record): + return True + + def parse_record(self, row): + + fields = row.split('\t') + sha1_key = fields[0] + cdx = json.loads(fields[1]) + mimetype = fields[2] + file_size = int(fields[3]) + grobid_meta = json.loads(fields[4]) + fe = self.parse_file_metadata(sha1_key, cdx, mimetype, file_size) + re = self.parse_grobid_json(grobid_meta) + + if not (fe and re): + return None + + # lookup existing file SHA1 + existing = None + try: + existing = self.api.lookup_file(sha1=fe.sha1) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + + # if file is already in here, presumably not actually long-tail + # HACK: this is doing an exists check in parse_record(), which is weird + # TODO: this is where we should check if the file actually has + # release_ids and/or URLs associated with it + if existing and not self.bezerk_mode: + self.counts['exists'] += 1 + self.counts['skip'] -= 1 + return None + + release_edit = self.create_release(re) + fe.release_ids.append(release_edit.ident) + return fe def parse_grobid_json(self, obj): @@ -34,7 +83,7 @@ class GrobidMetadataImporter(FatcatImporter): abobj = dict( mimetype="text/plain", language=None, - content=obj.get('abstract').strip()) + content=clean(obj.get('abstract'))) abstracts = [abobj] else: abstracts = None @@ -43,17 +92,18 @@ class GrobidMetadataImporter(FatcatImporter): for i, a in enumerate(obj.get('authors', [])): contribs.append(fatcat_client.ReleaseContrib( index=i, - raw_name=a['name'], + raw_name=clean(a['name']), role="author", extra=None)) + # XXX: why is this a dict()? 
not covered by tests? refs = [] for raw in obj.get('citations', []): cite_extra = dict() ref = dict() - ref['key'] = raw.get('id') + ref['key'] = clean(raw.get('id')) if raw.get('title'): - ref['title'] = raw['title'].strip() + ref['title'] = clean(raw['title']) if raw.get('date'): try: year = int(raw['date'].strip()[:4]) @@ -62,9 +112,9 @@ class GrobidMetadataImporter(FatcatImporter): pass for key in ('volume', 'url', 'issue', 'publisher'): if raw.get(key): - cite_extra[key] = raw[key].strip() + cite_extra[key] = clean(raw[key]) if raw.get('authors'): - cite_extra['authors'] = [a['name'] for a in raw['authors']] + cite_extra['authors'] = [clean(a['name']) for a in raw['authors']] if cite_extra: cite_extra = dict(grobid=cite_extra) else: @@ -81,27 +131,28 @@ class GrobidMetadataImporter(FatcatImporter): if obj.get('doi'): extra['doi'] = obj['doi'] if obj['journal'] and obj['journal'].get('name'): - extra['container_name'] = obj['journal']['name'] - - extra['is_longtail_oa'] = True + extra['container_name'] = clean(obj['journal']['name']) # TODO: ISSN/eISSN handling? or just journal name lookup? + if self.longtail_oa: + extra['longtail_oa'] = True + if extra: extra = dict(grobid=extra) else: extra = None re = fatcat_client.ReleaseEntity( - title=obj['title'].strip(), + title=clean(obj['title'], force_xml=True), release_type="article-journal", release_date=release_date, release_year=release_year, contribs=contribs, refs=refs, - publisher=obj['journal'].get('publisher'), - volume=obj['journal'].get('volume'), - issue=obj['journal'].get('issue'), + publisher=clean(obj['journal'].get('publisher')), + volume=clean(obj['journal'].get('volume')), + issue=clean(obj['journal'].get('issue')), abstracts=abstracts, extra=extra) return re @@ -122,17 +173,6 @@ class GrobidMetadataImporter(FatcatImporter): sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower() - # lookup existing SHA1, or create new entity - try: - existing_file = self.api.lookup_file(sha1=sha1) - except fatcat_client.rest.ApiException as err: - if err.status != 404: - raise err - existing_file = None - - if existing_file: - # if file is already in here, presumably not actually long-tail - return None fe = fatcat_client.FileEntity( sha1=sha1, size=int(file_size), @@ -143,6 +183,7 @@ class GrobidMetadataImporter(FatcatImporter): # parse URLs and CDX original = cdx['url'] + assert len(cdx['dt']) >= 8 wayback = "https://web.archive.org/web/{}/{}".format( cdx['dt'], original) @@ -154,23 +195,13 @@ class GrobidMetadataImporter(FatcatImporter): return fe - def create_row(self, row, editgroup_id=None): - if not row: - return - fields = row.split('\t') - sha1_key = fields[0] - cdx = json.loads(fields[1]) - mimetype = fields[2] - file_size = int(fields[3]) - grobid_meta = json.loads(fields[4]) - fe = self.parse_file_metadata(sha1_key, cdx, mimetype, file_size) - re = self.parse_grobid_json(grobid_meta) - if fe and re: - release_entity = self.api.create_release(re, editgroup_id=editgroup_id) - # release ident can't already be in release list because we just - # created it - fe.release_ids.append(release_entity.ident) - file_entity = self.api.create_file(fe, editgroup_id=editgroup_id) - self.counts['insert'] += 1 - - # NB: batch mode not implemented + def try_update(self, entity): + # did the exists check in 'parse_record()', because we needed to create a release + return True + + def insert_batch(self, batch): + self.api.create_file_batch(batch, + autoaccept=True, + description=self.editgroup_description, + 
extra=json.dumps(self.editgroup_extra)) + diff --git a/python/fatcat_tools/importers/issn.py b/python/fatcat_tools/importers/issn.py deleted file mode 100644 index f4d525a4..00000000 --- a/python/fatcat_tools/importers/issn.py +++ /dev/null @@ -1,89 +0,0 @@ - -import sys -import json -import itertools -import fatcat_client -from .common import FatcatImporter - - -def or_none(s): - if s is None: - return None - if len(s) == 0: - return None - return s - -def truthy(s): - if s is None: - return None - s = s.lower() - - if s in ('true', 't', 'yes', 'y', '1'): - return True - elif s in ('false', 'f', 'no', 'n', '0'): - return False - else: - return None - -class IssnImporter(FatcatImporter): - """ - Imports journal metadata ("containers") by ISSN, currently from a custom - (data munged) .csv file format - - CSV format (generated from git.archive.org/webgroup/oa-journal-analysis): - - ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count - """ - - def __init__(self, api, **kwargs): - - eg_desc = kwargs.get('editgroup_description', - "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IssnImporter') - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra) - - def parse_issn_row(self, row): - """ - row is a python dict (parsed from CSV). - returns a ContainerEntity (or None if invalid or couldn't parse) - """ - title = or_none(row['title']) - issnl = or_none(row['ISSN-L']) - if title is None or issnl is None: - return None - extra = dict( - in_doaj=truthy(row['in_doaj']), - in_road=truthy(row['in_road']), - in_norwegian=truthy(row['in_norwegian']), - language=or_none(row['lang']), - url=or_none(row['url']), - ISSNp=or_none(row['ISSN-print']), - ISSNe=or_none(row['ISSN-electronic']), - is_oa=truthy(row['is_oa']), - is_kept=truthy(row['is_kept']), - ) - ce = fatcat_client.ContainerEntity( - issnl=issnl, - name=title, - publisher=or_none(row['publisher']), - abbrev=None, - coden=None, - extra=extra) - return ce - - def create_row(self, row, editgroup_id=None): - ce = self.parse_issn_row(row) - if ce is not None: - self.api.create_container(ce, editgroup_id=editgroup_id) - self.counts['insert'] += 1 - - def create_batch(self, batch): - """Reads and processes in batches (not API-call-per-line)""" - objects = [self.parse_issn_row(l) - for l in batch if (l is not None)] - objects = [o for o in objects if (o is not None)] - self.api.create_container_batch(objects, autoaccept="true") - self.counts['insert'] += len(objects) diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py new file mode 100644 index 00000000..cf3971b5 --- /dev/null +++ b/python/fatcat_tools/importers/journal_metadata.py @@ -0,0 +1,183 @@ + +import sys +import json +import itertools +import fatcat_client +from .common import EntityImporter, clean + + +def or_none(s): + if s is None: + return None + if len(s) == 0: + return None + return s + +def truthy(s): + if s is None: + return None + s = s.lower() + + if s in ('true', 't', 'yes', 'y', '1'): + return True + elif s in ('false', 'f', 'no', 'n', '0'): + return False + else: + return None + +class 
JournalMetadataImporter(EntityImporter): + """ + Imports journal metadata ("containers") by ISSN, currently from a custom + (data munged) .csv file format + + CSV format (generated from git.archive.org/webgroup/oa-journal-analysis): + + ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count + + + 'extra' fields: + + doaj + as_of: datetime of most recent check; if not set, not actually in DOAJ + seal: bool + work_level: bool (are work-level publications deposited with DOAJ?) + archiving: array, can include 'library' or 'other' + road + as_of: datetime of most recent check; if not set, not actually in ROAD + pubmed (TODO: delete?) + as_of: datetime of most recent check; if not set, not actually indexed in pubmed + norwegian (TODO: drop this?) + as_of: datetime of most recent check; if not set, not actually indexed in pubmed + id (integer) + level (integer; 0-2) + kbart + lockss + year_rle + volume_rle + portico + ... + clockss + ... + sherpa_romeo + color + jstor + year_rle + volume_rle + scopus + id + TODO: print/electronic distinction? + wos + id + doi + crossref_doi: DOI of the title in crossref (if exists) + prefixes: array of strings (DOI prefixes, up to the '/'; any registrar, not just Crossref) + ia + sim + nap_id + year_rle + volume_rle + longtail: boolean + homepage + as_of: datetime of last attempt + url + status: HTTP/heritrix status of homepage crawl + + issnp: string + issne: string + coden: string + abbrev: string + oclc_id: string (TODO: lookup?) + lccn_id: string (TODO: lookup?) + dblb_id: string + default_license: slug + original_name: native name (if name is translated) + platform: hosting platform: OJS, wordpress, scielo, etc + mimetypes: array of strings (eg, 'application/pdf', 'text/html') + first_year: year (integer) + last_year: if publishing has stopped + primary_language: single ISO code, or 'mixed' + languages: array of ISO codes + region: TODO: continent/world-region + nation: shortcode of nation + discipline: TODO: highest-level subject; "life science", "humanities", etc + field: TODO: narrower description of field + subjects: TODO? + url: homepage + is_oa: boolean. If true, can assume all releases under this container are "Open Access" + TODO: domains, if exclusive? + TODO: fulltext_regex, if a known pattern? + + For KBART, etc: + We "over-count" on the assumption that "in-progress" status works will soon actually be preserved. + year and volume spans are run-length-encoded arrays, using integers: + - if an integer, means that year is preserved + - if an array of length 2, means everything between the two numbers (inclusive) is preserved + """ + + def __init__(self, api, **kwargs): + + eg_desc = kwargs.get('editgroup_description', + "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.") + eg_extra = kwargs.get('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JournalMetadataImporter') + super().__init__(api, + editgroup_description=eg_desc, + editgroup_extra=eg_extra) + + def want(self, raw_record): + if raw_record.get('ISSN-L'): + return True + return False + + def parse_record(self, row): + """ + row is a python dict (parsed from CSV). 
+ returns a ContainerEntity (or None if invalid or couldn't parse) + """ + title = or_none(row['title']) + issnl = or_none(row['ISSN-L']) + if title is None or issnl is None: + return None + extra = dict( + in_doaj=truthy(row['in_doaj']), + in_road=truthy(row['in_road']), + in_norwegian=truthy(row['in_norwegian']), + language=or_none(row['lang']), + url=or_none(row['url']), + ISSNp=or_none(row['ISSN-print']), + ISSNe=or_none(row['ISSN-electronic']), + is_oa=truthy(row['is_oa']), + is_kept=truthy(row['is_kept']), + ) + ce = fatcat_client.ContainerEntity( + issnl=issnl, + name=clean(title), + publisher=or_none(clean(row['publisher'])), + extra=extra) + return ce + + def try_update(self, ce): + + existing = None + try: + existing = self.api.lookup_container(issnl=ce.issnl) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + # doesn't exist, need to update + return True + + # eventually we'll want to support "updates", but for now just skip if + # entity already exists + if existing: + self.counts['exists'] += 1 + return False + + return True + + def insert_batch(self, batch): + self.api.create_container_batch(batch, + autoaccept=True, + description=self.editgroup_description, + extra=json.dumps(self.editgroup_extra)) + diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py index 1e5c22f7..2ec6c95d 100644 --- a/python/fatcat_tools/importers/matched.py +++ b/python/fatcat_tools/importers/matched.py @@ -4,16 +4,10 @@ import json import sqlite3 import itertools import fatcat_client -from .common import FatcatImporter +from .common import EntityImporter, clean -#row = row.split('\t') -#assert len(row) == 2 -#sha1 = row[0].replace('sha1:') -#sha1 = base64.b16encode(base64.b32decode(sha1)).lower() -#print(sha1) -#dois = [d.lower() for d in json.loads(row[1])] -class MatchedImporter(FatcatImporter): +class MatchedImporter(EntityImporter): """ Importer for "file to crossref DOI" matches. 
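For reference, a minimal sketch (not part of this changeset) of expanding the run-length-encoded year/volume spans described in the JournalMetadataImporter docstring above, where a bare integer marks a single preserved year and a two-element array marks an inclusive range; the helper name is illustrative only.
def expand_year_rle(year_rle):
    # Expand a KBART-style RLE span list into a sorted list of preserved years.
    # A bare integer marks one preserved year; a [start, end] pair marks every
    # year in the inclusive range as preserved.
    years = set()
    for span in (year_rle or []):
        if isinstance(span, int):
            years.add(span)
        else:
            start, end = span
            years.update(range(start, end + 1))
    return sorted(years)
# e.g. expand_year_rle([1995, [1998, 2001]]) -> [1995, 1998, 1999, 2000, 2001]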
@@ -48,7 +42,6 @@ class MatchedImporter(FatcatImporter): editgroup_extra=eg_extra) self.default_link_rel = kwargs.get("default_link_rel", "web") self.default_mime = kwargs.get("default_mime", None) - self.skip_file_updates = kwargs.get("skip_file_updates", False) def make_url(self, raw): rel = self.default_link_rel @@ -59,26 +52,13 @@ class MatchedImporter(FatcatImporter): rel = "repository" elif "//web.archive.org/" in raw or "//archive.is/" in raw: rel = "webarchive" - return fatcat_client.FileEntityUrls(url=raw, rel=rel) + return (rel, raw) - def parse_matched_dict(self, obj): - sha1 = obj['sha1'] - dois = [d.lower() for d in obj.get('dois', [])] + def want(self, raw_record): + return True - # lookup sha1, or create new entity - fe = None - if not self.skip_file_updates: - try: - fe = self.api.lookup_file(sha1=sha1) - except fatcat_client.rest.ApiException as err: - if err.status != 404: - raise err - if fe is None: - fe = fatcat_client.FileEntity( - sha1=sha1, - release_ids=[], - urls=[], - ) + def parse_record(self, obj): + dois = [d.lower() for d in obj.get('dois', [])] # lookup dois re_list = set() @@ -93,67 +73,77 @@ class MatchedImporter(FatcatImporter): print("DOI not found: {}".format(doi)) else: re_list.add(re.ident) - if len(re_list) == 0: + release_ids = list(re_list) + if len(release_ids) == 0: return None - if fe.release_ids == set(re_list): - return None - re_list.update(fe.release_ids) - fe.release_ids = list(re_list) # parse URLs and CDX - existing_urls = [feu.url for feu in fe.urls] + urls = set() for url in obj.get('url', []): - if url not in existing_urls: - url = self.make_url(url) - if url != None: - fe.urls.append(url) + url = self.make_url(url) + if url != None: + urls.add(url) for cdx in obj.get('cdx', []): original = cdx['url'] wayback = "https://web.archive.org/web/{}/{}".format( cdx['dt'], original) - if wayback not in existing_urls: - fe.urls.append( - fatcat_client.FileEntityUrls(url=wayback, rel="webarchive")) - if original not in existing_urls: - url = self.make_url(original) - if url != None: - fe.urls.append(url) - - if obj.get('size') != None: - fe.size = int(obj['size']) - fe.sha256 = obj.get('sha256', fe.sha256) - fe.md5 = obj.get('md5', fe.sha256) - if obj.get('mimetype') is None: - if fe.mimetype is None: - fe.mimetype = self.default_mime - else: - fe.mimetype = obj.get('mimetype') + urls.add(("webarchive", wayback)) + url = self.make_url(original) + if url != None: + urls.add(url) + urls = [fatcat_client.FileEntityUrls(rel, url) for (rel, url) in urls] + if len(urls) == 0: + return None + + size = obj.get('size') + if size: + size = int(size) + + fe = fatcat_client.FileEntity( + md5=obj.get('md5'), + sha1=obj['sha1'], + sha256=obj.get('sha256'), + size=size, + mimetype=obj.get('mimetype'), + release_ids=release_ids, + urls=urls, + ) return fe - def create_row(self, row, editgroup_id=None): - obj = json.loads(row) - fe = self.parse_matched_dict(obj) - if fe is not None: - if fe.ident is None: - self.api.create_file(fe, editgroup_id=editgroup_id) - self.counts['insert'] += 1 - else: - self.api.update_file(fe.ident, fe, editgroup_id=editgroup_id) - self.counts['update'] += 1 - - def create_batch(self, batch): - """Reads and processes in batches (not API-call-per-line)""" - objects = [self.parse_matched_dict(json.loads(l)) - for l in batch if l != None] - new_objects = [o for o in objects if o != None and o.ident == None] - update_objects = [o for o in objects if o != None and o.ident != None] - if len(update_objects): - update_eg = 
self._editgroup().editgroup_id - for obj in update_objects: - self.api.update_file(obj.ident, obj, editgroup_id=update_eg) - self.api.accept_editgroup(update_eg) - if len(new_objects) > 0: - self.api.create_file_batch(new_objects, autoaccept="true") - self.counts['update'] += len(update_objects) - self.counts['insert'] += len(new_objects) + def try_update(self, fe): + # lookup sha1, or create new entity + existing = None + try: + existing = self.api.lookup_file(sha1=fe.sha1) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + + if not existing: + return True + + fe.release_ids = list(set(fe.release_ids + existing.release_ids)) + if set(fe.release_ids) == set(existing.release_ids) and len(existing.urls) > 0: + # no new release matches *and* there are already existing URLs + self.counts['exists'] += 1 + return False + + # merge the existing into this one and update + existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls])) + existing.urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in existing.urls] + existing.release_ids = list(set(fe.release_ids + existing.release_ids)) + existing.mimetype = existing.mimetype or fe.mimetype + existing.size = existing.size or fe.size + existing.md5 = existing.md5 or fe.md5 + existing.sha256 = existing.sha256 or fe.sha256 + self.api.update_file(existing.ident, existing, editgroup_id=self._get_editgroup()) + self.counts['update'] += 1 + return False + + def insert_batch(self, batch): + self.api.create_file_batch(batch, + autoaccept=True, + description=self.editgroup_description, + extra=json.dumps(self.editgroup_extra)) + diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py index 0c8b1d62..02c9bf00 100644 --- a/python/fatcat_tools/importers/orcid.py +++ b/python/fatcat_tools/importers/orcid.py @@ -3,7 +3,7 @@ import sys import json import itertools import fatcat_client -from .common import FatcatImporter +from .common import EntityImporter, clean def value_or_none(e): if type(e) == dict: @@ -20,7 +20,7 @@ def value_or_none(e): return None return e -class OrcidImporter(FatcatImporter): +class OrcidImporter(EntityImporter): def __init__(self, api, **kwargs): @@ -32,14 +32,16 @@ class OrcidImporter(FatcatImporter): editgroup_description=eg_desc, editgroup_extra=eg_extra) - def parse_orcid_dict(self, obj): + def want(self, raw_record): + return True + + def parse_record(self, obj): """ obj is a python dict (parsed from json). 
returns a CreatorEntity """ name = obj['person']['name'] - if name is None: - return None + assert name extra = None given = value_or_none(name.get('given-names')) sur = value_or_none(name.get('family-name')) @@ -61,23 +63,30 @@ class OrcidImporter(FatcatImporter): return None ce = fatcat_client.CreatorEntity( orcid=orcid, - given_name=given, - surname=sur, - display_name=display, + given_name=clean(given), + surname=clean(sur), + display_name=clean(display), extra=extra) return ce - def create_row(self, row, editgroup_id=None): - obj = json.loads(row) - ce = self.parse_orcid_dict(obj) - if ce is not None: - self.api.create_creator(ce, editgroup_id=editgroup_id) - self.counts['insert'] += 1 + def try_update(self, raw_record): + existing = None + try: + existing = self.api.lookup_creator(orcid=raw_record.orcid) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + + # eventually we'll want to support "updates", but for now just skip if + # entity already exists + if existing: + self.counts['exists'] += 1 + return False + + return True - def create_batch(self, batch): - """Reads and processes in batches (not API-call-per-line)""" - objects = [self.parse_orcid_dict(json.loads(l)) - for l in batch if l != None] - objects = [o for o in objects if o != None] - self.api.create_creator_batch(objects, autoaccept="true") - self.counts['insert'] += len(objects) + def insert_batch(self, batch): + self.api.create_creator_batch(batch, + autoaccept=True, + description=self.editgroup_description, + extra=json.dumps(self.editgroup_extra)) diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms.py index 0f957f9a..2493b1ab 100644 --- a/python/fatcat_tools/transforms.py +++ b/python/fatcat_tools/transforms.py @@ -1,4 +1,5 @@ + import collections from fatcat_client import ReleaseEntity, ApiClient @@ -26,25 +27,43 @@ def release_to_elasticsearch(release): Raises exception on error (never returns None) """ - if release.state != 'active': - raise ValueError("Entity is not 'active'") + if release.state in ('redirect', 'deleted'): + return dict( + ident = release.ident, + state = release.state, + ) + elif release.state != 'active': + raise ValueError("Unhandled release state: {}".format(release.state)) # First, the easy ones (direct copy) t = dict( ident = release.ident, + state = release.state, revision = release.revision, title = release.title, + original_title = release.original_title, release_type = release.release_type, release_status = release.release_status, language = release.language, + license = release.license_slug, doi = release.doi, pmid = release.pmid, pmcid = release.pmcid, isbn13 = release.isbn13, + wikidata_qid = release.wikidata_qid, core_id = release.core_id, - wikidata_qid = release.wikidata_qid + arxiv_id = release.arxiv_id, + jstor_id = release.jstor_id, ) + is_oa = None + is_longtail_oa = None + in_kbart = None + in_web = False + in_dweb = False + in_ia = False + in_shadow = False + if release.release_date: # .isoformat() results in, eg, '2010-10-22' (YYYY-MM-DD) t['release_date'] = release.release_date.isoformat() @@ -53,52 +72,99 @@ if release.release_year is not None: t['release_year'] = release.release_year + t['any_abstract'] = len(release.abstracts) > 0 + t['ref_count'] = len(release.refs or []) + t['contrib_count'] = len(release.contribs or []) + contrib_names = [] + for c in (release.contribs or []): + if c.raw_name: + contrib_names.append(c.raw_name) + t['contrib_names'] = contrib_names + 
container = release.container - container_is_kept = False if container: t['publisher'] = container.publisher t['container_name'] = container.name t['container_issnl'] = container.issnl - container_extra = container.extra - if container_extra: - t['container_is_oa'] = container_extra.get('is_oa') - container_is_kept = container_extra.get('is_kept', False) - t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa') + t['container_type'] = container.container_type + if container.extra: + if container.extra.get('is_oa') or container.extra.get('in_doaj'): + is_oa = True + if container.extra.get('in_kbart'): + # TODO: better KBART check goes here + in_kbart = True + if container.extra.get('ia'): + # TODO: container longtail check goes here + # TODO: sim/microfilm check goes here + pass + # TODO: SHERPA/Romeo goes here else: t['publisher'] = release.publisher files = release.files or [] t['file_count'] = len(files) - in_wa = False - in_ia = False - t['file_pdf_url'] = None + t['fileset_count'] = len(release.filesets or []) + t['webcapture_count'] = len(release.webcaptures or []) + any_pdf_url = None + good_pdf_url = None + best_pdf_url = None + ia_pdf_url = None for f in files: + if f.extra and f.extra.get('shadows'): + # TODO: shadow check goes here + in_shadow = True is_pdf = 'pdf' in (f.mimetype or '') for url in (f.urls or []): - if url.rel == 'webarchive': - in_wa = True - if '//web.archive.org/' in (url.url or '') or '//archive.org/' in (url.url or ''): + if url.url.lower().startswith('http'): + in_web = True + if url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'): + # TODO: not sure what rel will be + in_dweb = True + if is_pdf: + any_pdf_url = url.url + if is_pdf and url.rel in ('webarchive', 'repository'): + is_preserved = True + good_pdf_url = url.url + if '//web.archive.org/' in url.url or '//archive.org/' in url.url: in_ia = True if is_pdf: - t['file_pdf_url'] = url.url - if not t['file_pdf_url'] and is_pdf: - t['file_pdf_url'] = url.url - t['file_in_webarchive'] = in_wa - t['file_in_ia'] = in_ia + best_pdf_url = url.url + ia_pdf_url = url.url + # here is where we bake-in priority; IA-specific + t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url + t['ia_pdf_url'] = ia_pdf_url + + if release.license_slug: + # TODO: more/better checks here, particularly strict *not* OA licenses + if release.license_slug.startswith("CC-"): + is_oa = True extra = release.extra or dict() if extra: - t['in_shadow'] = extra.get('in_shadow') - if extra.get('grobid') and extra['grobid'].get('is_longtail_oa'): - t['container_is_longtail_oa'] = True - t['any_abstract'] = bool(release.abstracts) - t['is_kept'] = container_is_kept or extra.get('is_kept', False) + # TODO: longtail OA check from GROBID here + if extra.get('in_kbart'): + # NOTE: not actually setting this anywhere + in_kbart = True + if extra.get('is_oa'): + # NOTE: not actually setting this anywhere + is_oa = True + if extra.get('grobid'): + if not t.get('container_name'): + t['container_name'] = extra['grobid'].get('container_name') + if extra['grobid'].get('longtail_oa'): + is_longtail_oa = True + if extra.get('crossref'): + if extra['crossref'].get('archive'): + # all crossref archives are KBART, I believe + in_kbart = True - t['ref_count'] = len(release.refs or []) - t['contrib_count'] = len(release.contribs or []) - contrib_names = [] - for c in (release.contribs or []): - if c.raw_name: - contrib_names.append(c.raw_name) - t['contrib_names'] = contrib_names + if is_longtail_oa: + is_oa = True + t['is_oa'] 
= is_oa + t['is_longtail_oa'] = is_longtail_oa + t['in_kbart'] = in_kbart + t['in_web'] = in_web + t['in_dweb'] = in_dweb + t['in_ia'] = in_ia + t['is_preserved'] = in_ia or in_kbart return t diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index 8690a791..636ed304 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -93,7 +93,7 @@ class EntityUpdatesWorker(FatcatWorker): release_edits = cle['editgroup']['edits']['releases'] for re in release_edits: ident = re['ident'] - release = self.api.get_release(ident, expand="files,container") + release = self.api.get_release(ident, expand="files,filesets,webcaptures,container") release_dict = self.api.api_client.sanitize_for_serialization(release) producer.produce( message=json.dumps(release_dict).encode('utf-8'), diff --git a/python/fatcat_web/auth.py b/python/fatcat_web/auth.py index 8035cbe5..03964c92 100644 --- a/python/fatcat_web/auth.py +++ b/python/fatcat_web/auth.py @@ -90,7 +90,10 @@ def handle_ia_xauth(email, password): 'secret': Config.IA_XAUTH_CLIENT_SECRET, }) if resp.status_code == 401 or (not resp.json().get('success')): - flash("Internet Archive email/password didn't match: {}".format(resp.json()['values']['reason'])) + try: + flash("Internet Archive email/password didn't match: {}".format(resp.json()['values']['reason'])) + except: + print("IA XAuth fail: {}".format(resp.content)) return render_template('auth_ia_login.html', email=email), resp.status_code elif resp.status_code != 200: flash("Internet Archive login failed (internal error?)") diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py index a5927d9b..926d5340 100644 --- a/python/fatcat_web/routes.py +++ b/python/fatcat_web/routes.py @@ -4,7 +4,7 @@ import json from flask import Flask, render_template, send_from_directory, request, \ url_for, abort, g, redirect, jsonify, session, flash from flask_login import login_required -from fatcat_web import app, api, auth_api +from fatcat_web import app, api, auth_api, priv_api from fatcat_web.auth import handle_token_login, handle_logout, load_user, handle_ia_xauth from fatcat_client.rest import ApiException from fatcat_web.search import do_search @@ -368,6 +368,8 @@ def search(): @app.route('/auth/login') def login(): # show the user a list of login options + if not priv_api: + flash("This web interface not configured with credentials to actually allow login (other than via token)") return render_template('auth_login.html') @app.route('/auth/ia/login', methods=['GET', 'POST']) diff --git a/python/fatcat_web/templates/container_view.html b/python/fatcat_web/templates/container_view.html index 29f0b9d9..4a175a5d 100644 --- a/python/fatcat_web/templates/container_view.html +++ b/python/fatcat_web/templates/container_view.html @@ -15,12 +15,6 @@ <p><b>Publisher:</b> {% if container.publisher != None %}{{ container.publisher }}{% else %}<i>Unknown</i>{% endif %} -{% if container.coden != None %} -<br><b>CODEN<sup><a href="https://en.wikipedia.org/wiki/CODEN">?</a></sup>:</b> <code>{{ container.coden }}</code> -{% endif %} -{% if container.abbrev != None %} -<br><b>Abbrev.:</b> <code>{{ container.abbrev }}</code> -{% endif %} {% if (container.extra != None) and (container.extra['url'] != None) and (container.extra['url']|length > 0) %} <br><b>Homepage:</b> <a href="{{ container.extra['url'] }}"> <code>{{ container.extra['url'] }}</code></a> {% endif %} diff --git a/python/fatcat_web/templates/release_view.html 
b/python/fatcat_web/templates/release_view.html index fd86b7c9..4e24b281 100644 --- a/python/fatcat_web/templates/release_view.html +++ b/python/fatcat_web/templates/release_view.html @@ -143,7 +143,7 @@ Raw Object: {% endif %} <br> -{% if release.refs.size != 0 %} +{% if release.refs != None and release.refs.size != 0 %} <h3>References</h3> This release citing other releases. <ol> diff --git a/python/fatcat_web/web_config.py b/python/fatcat_web/web_config.py index cbe519b0..9ce32ed7 100644 --- a/python/fatcat_web/web_config.py +++ b/python/fatcat_web/web_config.py @@ -19,7 +19,7 @@ class Config(object): GIT_REVISION = subprocess.check_output(["git", "describe", "--always"]).strip().decode('utf-8') # This is, effectively, the QA/PROD flag - FATCAT_DOMAIN = os.environ.get("FATCAT_DOMAIN", default="qa.fatcat.wiki") + FATCAT_DOMAIN = os.environ.get("FATCAT_DOMAIN", default="dev.fatcat.wiki") FATCAT_API_AUTH_TOKEN = os.environ.get("FATCAT_API_AUTH_TOKEN", default=None) FATCAT_API_HOST = os.environ.get("FATCAT_API_HOST", default="https://{}/v0".format(FATCAT_DOMAIN)) @@ -39,10 +39,11 @@ class Config(object): IA_XAUTH_CLIENT_SECRET = os.environ.get("IA_XAUTH_CLIENT_SECRET", default=None) # protect cookies (which include API tokens) - SESSION_COOKIE_HTTPONLY = True - SESSION_COOKIE_SECURE = True - SESSION_COOKIE_SAMESITE = 'Lax' - PERMANENT_SESSION_LIFETIME = 2678400 # 31 days, in seconds + if FATCAT_DOMAIN != "dev.fatcat.wiki": + SESSION_COOKIE_HTTPONLY = True + SESSION_COOKIE_SECURE = True + SESSION_COOKIE_SAMESITE = 'Lax' + PERMANENT_SESSION_LIFETIME = 2678400 # 31 days, in seconds try: GIT_RELEASE = raven.fetch_git_sha('..') diff --git a/python/tests/api_annotations.py b/python/tests/api_annotations.py new file mode 100644 index 00000000..0d3c5046 --- /dev/null +++ b/python/tests/api_annotations.py @@ -0,0 +1,39 @@ + +import json +import pytest +from copy import copy + +from fatcat_client import * +from fatcat_client.rest import ApiException +from fixtures import * + + +def test_annotations(api): + + eg = quick_eg(api) + + # ensure no annotations on this object + a = api.get_editgroup_annotations(eg.editgroup_id) + assert a == [] + + # create an annotation! + api.create_editgroup_annotation( + eg.editgroup_id, + EditgroupAnnotation( + comment_markdown="some *annotation*", + extra=dict(thing="thang"))) + + # check that we can fetch it all sorts of ways + a = api.get_editgroup_annotations(eg.editgroup_id) + assert len(a) == 1 + assert a[0].extra['thing'] == "thang" + + # the editor persists, so this is a hack to find a "recent" one + a2 = api.get_editor_annotations(eg.editor_id, limit=100) + found = None + for thing in a2: + if thing.annotation_id == a[0].annotation_id: + found = thing + break + assert found + assert found.extra['thing'] == "thang" diff --git a/python/tests/api_containers.py b/python/tests/api_containers.py new file mode 100644 index 00000000..674ae3b8 --- /dev/null +++ b/python/tests/api_containers.py @@ -0,0 +1,48 @@ + +import json +import pytest +from copy import copy + +from fatcat_client import * +from fatcat_client.rest import ApiException +from fixtures import * + + +def test_container(api): + eg = quick_eg(api) + + # all the fields! 
+ c1 = ContainerEntity( + name="some container name", + container_type="journal", + publisher="some container publisher", + issnl="1234-567X", + wikidata_qid="Q954248", + extra=dict(a=1, b=2), + ) + + c1edit = api.create_container(c1, editgroup_id=eg.editgroup_id) + api.accept_editgroup(eg.editgroup_id) + c2 = api.get_container(c1edit.ident) + + # check that fields match + assert c1.name == c2.name + assert c1.container_type == c2.container_type + assert c1.publisher == c2.publisher + assert c1.issnl == c2.issnl + assert c1.wikidata_qid == c2.wikidata_qid + assert c1.extra == c2.extra + + # expansion + # TODO: via release + # lookup + # TODO: via issnl; but need to generate random identifiers + +def test_container_examples(api): + + api.lookup_container(issnl='1549-1277') + + c1 = api.get_container('aaaaaaaaaaaaaeiraaaaaaaaam') + assert c1.name == "PLOS Medicine" + assert c1.issnl == "1549-1277" + diff --git a/python/tests/api_creators.py b/python/tests/api_creators.py new file mode 100644 index 00000000..7443675b --- /dev/null +++ b/python/tests/api_creators.py @@ -0,0 +1,44 @@ + +import json +import pytest +from copy import copy + +from fatcat_client import * +from fatcat_client.rest import ApiException +from fixtures import * + + +def test_creators(api): + eg = quick_eg(api) + + # all the fields! + c1 = CreatorEntity( + display_name="Emma Smith", + given_name="emma", + surname="smith", + orcid="0000-0002-1825-0097", + wikidata_qid="Q9542248", + extra=dict(a=1, b=5), + ) + + c1edit = api.create_creator(c1, editgroup_id=eg.editgroup_id) + api.accept_editgroup(eg.editgroup_id) + c2 = api.get_creator(c1edit.ident) + + # check that fields match + assert c1.display_name == c2.display_name + assert c1.given_name == c2.given_name + assert c1.surname == c2.surname + assert c1.orcid == c2.orcid + assert c1.wikidata_qid == c2.wikidata_qid + assert c1.extra == c2.extra + + # expansion + # TODO: via release + # lookup + # TODO: via issnl; but need to generate random identifiers + +def test_creators_examples(api): + # TODO: aaaaaaaaaaaaaircaaaaaaaaam + + api.lookup_creator(orcid='0000-0003-3118-6859') diff --git a/python/tests/api_editgroups.py b/python/tests/api_editgroups.py new file mode 100644 index 00000000..722d8686 --- /dev/null +++ b/python/tests/api_editgroups.py @@ -0,0 +1,140 @@ + +import json +import pytest +import datetime +from copy import copy + +from fatcat_client import * +from fatcat_client.rest import ApiException +from fixtures import * + + +def test_editgroup_submit(api): + # 1. check that edit group can be submitted/unsubmitted, and shows up in reviewable appropriately + # 2. 
accepted edits don't show up as reviewable and can't be submitted + + c1 = CreatorEntity(display_name="test updates") + eg = quick_eg(api) + c1 = api.get_creator(api.create_creator(c1, editgroup_id=eg.editgroup_id).ident) + + eg2 = api.get_editgroup(eg.editgroup_id) + assert not eg2.submitted + assert not eg2.changelog_index + + reviewable = api.get_editgroups_reviewable(limit=100) + assert eg.editgroup_id not in [v.editgroup_id for v in reviewable] + wip = api.get_editor_editgroups(eg.editor_id, limit=100) + assert eg.editgroup_id in [v.editgroup_id for v in wip] + + api.update_editgroup(eg.editgroup_id, eg2, submit=True) + eg3 = api.get_editgroup(eg.editgroup_id) + assert eg3.submitted + reviewable = api.get_editgroups_reviewable(limit=100) + assert eg.editgroup_id in [v.editgroup_id for v in reviewable] + + api.update_editgroup(eg.editgroup_id, eg2, submit=False) + eg3 = api.get_editgroup(eg.editgroup_id) + assert not eg3.submitted + reviewable = api.get_editgroups_reviewable(limit=100) + assert eg.editgroup_id not in [v.editgroup_id for v in reviewable] + + # put back in reviewable + api.update_editgroup(eg.editgroup_id, eg2, submit=True) + reviewable = api.get_editgroups_reviewable(limit=100) + assert eg.editgroup_id in [v.editgroup_id for v in reviewable] + + # shouldn't be reviewable if accepted + api.accept_editgroup(eg.editgroup_id) + reviewable = api.get_editgroups_reviewable(limit=100) + assert eg.editgroup_id not in [v.editgroup_id for v in reviewable] + eg3 = api.get_editgroup(eg.editgroup_id) + #print(eg3) + assert eg3.submitted + assert eg3.changelog_index + + with pytest.raises(fatcat_client.rest.ApiException): + api.update_editgroup(eg.editgroup_id, eg3, submit=True) + with pytest.raises(fatcat_client.rest.ApiException): + eg3.description = "something" + api.update_editgroup(eg.editgroup_id, eg3) + + +def test_editgroup_ordering(api): + + eg1 = quick_eg(api) + eg2 = quick_eg(api) + api.update_editgroup( + eg1.editgroup_id, + Editgroup(editgroup_id=eg1.editgroup_id, description="FAIL"), + submit=True) + api.update_editgroup( + eg2.editgroup_id, + Editgroup(editgroup_id=eg2.editgroup_id, description="FAIL"), + submit=True) + + r1 = api.get_editgroups_reviewable() + #print(r1) + assert not r1[0].description + assert not r1[1].description + assert r1[0].submitted >= r1[1].submitted + + # should be no editgroups "in the future" (since now + 1sec) + r1 = api.get_editgroups_reviewable(since=(datetime.datetime.utcnow() + datetime.timedelta(seconds=1)).isoformat()+"Z") + assert not r1 + + r1 = api.get_editgroups_reviewable(since=(datetime.datetime.utcnow() - datetime.timedelta(seconds=5)).isoformat()+"Z") + assert r1[0].submitted <= r1[1].submitted + + +def test_editgroup_autoaccept(api): + # autoaccept changes: editgroups required when, in what combination + + eg = quick_eg(api) + c1 = CreatorEntity(display_name="test autoaccept") + c2 = CreatorEntity(display_name="test another autoaccept") + + with pytest.raises(fatcat_client.rest.ApiException): + edits = api.create_creator_batch([c1, c2]) + + with pytest.raises(fatcat_client.rest.ApiException): + edits = api.create_creator_batch([c1, c2], editgroup_id=eg.editgroup_id, autoaccept=True) + + edits1 = api.create_creator_batch([c1, c2], editgroup_id=eg.editgroup_id) + edits2 = api.create_creator_batch([c1, c2], autoaccept=True) + + assert edits1[0].editgroup_id == eg.editgroup_id + assert edits1[0].editgroup_id != edits2[1].editgroup_id + eg1 = api.get_editgroup(edits1[0].editgroup_id) + eg2 = 
api.get_editgroup(edits2[0].editgroup_id) + + assert not eg1.changelog_index + assert eg2.changelog_index + #print(edits1) + #print(eg1.edits.creators) + assert eg1.edits.creators[0].ident in [t.ident for t in edits1] + assert eg2.edits.creators[0].ident in [t.ident for t in edits2] + + +def test_batch_params(api): + + eg = quick_eg(api) + c1 = CreatorEntity(display_name="test autoaccept") + c2 = CreatorEntity(display_name="test another autoaccept") + + with pytest.raises(fatcat_client.rest.ApiException): + edits = api.create_creator_batch([c1, c2]) + + desc = "test description" + extra = dict(a=75, q="thing") + edits = api.create_creator_batch([c1, c2], autoaccept=True, description=desc, extra=json.dumps(extra)) + eg = api.get_editgroup(edits[0].editgroup_id) + + assert eg.description == desc + assert eg.extra == extra + + # currently must manually json dumps() extra field + with pytest.raises(fatcat_client.rest.ApiException): + api.create_creator_batch([c1, c2], autoaccept=True, description=desc, extra=extra) + + with pytest.raises(fatcat_client.rest.ApiException): + api.create_creator_batch([c1, c2], autoaccept=True, description=desc, extra="{") diff --git a/python/tests/api_files.py b/python/tests/api_files.py new file mode 100644 index 00000000..033538ef --- /dev/null +++ b/python/tests/api_files.py @@ -0,0 +1,52 @@ + +import json +import pytest +from copy import copy + +from fatcat_client import * +from fatcat_client.rest import ApiException +from fixtures import * + + +def test_file(api): + + eg = quick_eg(api) + + # all the fields! + f1 = FileEntity( + size=89238, + md5="7ce6615b2a5904939576d9567bd5f68e", + sha1="027e7ed3ea1a40e92dd2657a1e3c992b5dc45dd2", + sha256="f1f4f18a904e76818863ccbc6141fce92b0dcb47b0d6041aec98bc6806e393c3", + mimetype="application/pdf", + extra=dict(a=2, b=5), + urls=[ + FileEntityUrls(url="https://web.archive.org/web/12345542/something.com/blah.pdf", rel="webarchive"), + ], + release_ids=[], + ) + + f1edit = api.create_file(f1, editgroup_id=eg.editgroup_id) + api.accept_editgroup(eg.editgroup_id) + f2 = api.get_file(f1edit.ident) + + # check that fields match + assert f1.size == f2.size + assert f1.md5 == f2.md5 + assert f1.sha1 == f2.sha1 + assert f1.sha256 == f2.sha256 + assert f1.mimetype == f2.mimetype + assert f1.extra == f2.extra + assert f1.urls == f2.urls + assert f1.release_ids == f2.release_ids + + # expansion + # TODO: via release + # lookup + # TODO: via hashes; but need to generate random? 
+ +def test_file_examples(api): + + api.lookup_file(sha256='ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362') + + f1 = api.get_file('aaaaaaaaaaaaamztaaaaaaaaam') diff --git a/python/tests/api_filesets.py b/python/tests/api_filesets.py new file mode 100644 index 00000000..966b85ca --- /dev/null +++ b/python/tests/api_filesets.py @@ -0,0 +1,79 @@ + +import json +import pytest +from copy import copy + +from fatcat_client import * +from fatcat_client.rest import ApiException +from fixtures import * + + +def test_fileset(api): + + eg = quick_eg(api) + r1 = ReleaseEntity(title="test fileset release") + r1edit = api.create_release(r1, editgroup_id=eg.editgroup_id) + + fs1 = FilesetEntity( + manifest = [ + FilesetEntityManifest( + path="data/thing.tar.gz", + size=54321, + md5="540da3ea6e448d8dfb057c05225f853a", + sha1="1dab6a0e110f9b5d70b18db0abf051f7f93faf06", + sha256="c7b49f3e84cd1b7cb0b0e3e9f632b7be7e21b4dc229df23331f880a8a7dfa75a", + extra={"a": 1, "b": 3}, + ), + FilesetEntityManifest( + path="README.md", + size=54210, + md5="5f83592b5249671719bbed6ce91ecfa8", + sha1="455face3598611458efe1f072e58624790a67266", + sha256="429bcafa4d3d0072d5b2511e12c85c1aac1d304011d1c406da14707f7b9cd905", + extra={"x": 1, "y": "q"}, + ), + ], + urls = [ + FileEntityUrls(url="https://archive.org/download/fileset-123/", rel="repository"), + FileEntityUrls(url="https://humble-host.com/~user123/dataset/", rel="web"), + ], + release_ids = [r1edit.ident], + ) + + fs1edit = api.create_fileset(fs1, editgroup_id=eg.editgroup_id) + api.accept_editgroup(eg.editgroup_id) + fs2 = api.get_fileset(fs1edit.ident) + + # check that fields match + assert fs1.urls == fs2.urls + assert fs1.manifest == fs2.manifest + assert fs1.release_ids == fs2.release_ids + + # expansion + r1 = api.get_release(r1edit.ident, expand="filesets") + assert r1.filesets[0].manifest == fs1.manifest + +def test_fileset_examples(api): + fs3 = api.get_fileset('aaaaaaaaaaaaaztgaaaaaaaaam') + + assert fs3.urls[0].url == 'http://other-personal-blog.name/dataset/' + assert fs3.urls[1].rel == 'archive' + assert fs3.manifest[1].md5 == 'f4de91152c7ab9fdc2a128f962faebff' + assert fs3.manifest[1].extra['mimetype'] == 'application/gzip' + +def test_bad_fileset(api): + + eg = quick_eg(api) + + bad_list = [ + # good (for testing test itself) + #FilesetEntity(manifest=[FilesetEntityManifest(path="123.jpg", size=1234)]), + #FilesetEntity(urls=[FileEntityUrls(url="thing", rel="blah")]), + FilesetEntity(manifest=[FilesetEntityManifest(path="123.jpg", size="big")]), + FilesetEntity(release_ids=["asdf"]), + ] + + for b in bad_list: + with pytest.raises(fatcat_client.rest.ApiException): + api.create_fileset(b, editgroup_id=eg.editgroup_id) + diff --git a/python/tests/api_misc.py b/python/tests/api_misc.py index 3510ea82..0a0f16da 100644 --- a/python/tests/api_misc.py +++ b/python/tests/api_misc.py @@ -8,14 +8,6 @@ from fatcat_client.rest import ApiException from fixtures import * -def test_lookups(api): - - api.lookup_creator(orcid='0000-0003-3118-6859') - api.lookup_container(issnl='1549-1277') - api.lookup_file(sha256='ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362') - api.lookup_release(pmid='54321') - api.lookup_release(isbn13='978-3-16-148410-0') - def test_lookup_hide_extend(api): r = api.lookup_release(doi='10.1371/journal.pmed.0020124') diff --git a/python/tests/api_releases.py b/python/tests/api_releases.py new file mode 100644 index 00000000..ed6f24a4 --- /dev/null +++ b/python/tests/api_releases.py @@ -0,0 +1,103 @@ + +import json 
+import pytest +import datetime +from copy import copy + +from fatcat_client import * +from fatcat_client.rest import ApiException +from fixtures import * + + +def test_release(api): + + eg = quick_eg(api) + + # all the fields! + r1 = ReleaseEntity( + title="some title", + original_title="оригинальное название", + release_type="post-weblog", + release_status="pre-print", + release_date=datetime.datetime.utcnow().date(), + release_year=2015, + doi="10.5555/12345678", + pmid="12345", + pmcid="PMC4321", + wikidata_qid="Q1234", + isbn13="978-3-16-148410-0", + core_id="187348", + arxiv_id="aslkdjfh", + jstor_id="8328424", + volume="84", + issue="XII", + pages="4-99", + publisher="some publisher", + language="en", + license_slug="CC-0", + extra=dict(a=1, b=2), + contribs=[], + refs=[], + abstracts=[ + ReleaseEntityAbstracts( + content="this is some abstract", + mimetype="text/plain", + lang="en"), + ReleaseEntityAbstracts( + content="this is some other abstract", + mimetype="text/plain", + lang="de"), + ], + ) + + r1edit = api.create_release(r1, editgroup_id=eg.editgroup_id) + api.accept_editgroup(eg.editgroup_id) + r2 = api.get_release(r1edit.ident) + + # check that fields match + assert r1.title == r2.title + assert r1.original_title == r2.original_title + assert r1.release_type == r2.release_type + assert r1.release_date == r2.release_date + assert r1.release_year == r2.release_year + assert r1.doi == r2.doi + assert r1.pmid == r2.pmid + assert r1.pmcid == r2.pmcid + assert r1.wikidata_qid == r2.wikidata_qid + assert r1.isbn13 == r2.isbn13 + assert r1.core_id == r2.core_id + assert r1.arxiv_id == r2.arxiv_id + assert r1.jstor_id == r2.jstor_id + assert r1.volume == r2.volume + assert r1.issue == r2.issue + assert r1.pages == r2.pages + assert r1.publisher == r2.publisher + assert r1.language == r2.language + assert r1.license_slug == r2.license_slug + assert r1.extra == r2.extra + + for i in range(len(r1.abstracts)): + assert r1.abstracts[i].content == r2.abstracts[i].content + assert r1.abstracts[i].mimetype == r2.abstracts[i].mimetype + assert r1.abstracts[i].lang == r2.abstracts[i].lang + for i in range(len(r1.contribs)): + assert r1.contribs[i] == r2.contribs[i] + for i in range(len(r1.refs)): + assert r1.refs[i] == r2.refs[i] + + # expansion + # TODO: via work + # lookup + # TODO: via all; but need to generate random identifiers + +def test_release_examples(api): + + api.lookup_release(pmid='54321') + api.lookup_release(isbn13='978-3-16-148410-0') + + r1 = api.get_release('aaaaaaaaaaaaarceaaaaaaaaai') + assert r1.title == "bigger example" + assert len(r1.refs) == 5 + assert r1.contribs[0].role == "editor" + assert r1.abstracts[0].mimetype == "application/xml+jats" + diff --git a/python/tests/api_webcaptures.py b/python/tests/api_webcaptures.py new file mode 100644 index 00000000..dc1754b3 --- /dev/null +++ b/python/tests/api_webcaptures.py @@ -0,0 +1,96 @@ + +import json +import pytest +import datetime +from copy import copy + +from fatcat_client import * +from fatcat_client.rest import ApiException +from fixtures import * + + +def test_webcapture(api): + + eg = quick_eg(api) + r1 = ReleaseEntity(title="test webcapture release") + r1edit = api.create_release(r1, editgroup_id=eg.editgroup_id) + + wc1 = WebcaptureEntity( + original_url = "http://example.site", + #timestamp = "2012-01-02T03:04:05Z", + timestamp = datetime.datetime.now(datetime.timezone.utc), + cdx = [ + WebcaptureEntityCdx( + surt="site,example,)/data/thing.tar.gz", + #timestamp="2012-01-02T03:04:05Z", + 
timestamp=datetime.datetime.now(datetime.timezone.utc), + url="http://example.site/data/thing.tar.gz", + mimetype="application/gzip", + status_code=200, + sha1="455face3598611458efe1f072e58624790a67266", + sha256="c7b49f3e84cd1b7cb0b0e3e9f632b7be7e21b4dc229df23331f880a8a7dfa75a", + ), + WebcaptureEntityCdx( + surt="site,example,)/README.md", + #timestamp="2012-01-02T03:04:05Z", + timestamp=datetime.datetime.now(datetime.timezone.utc), + url="http://example.site/README.md", + mimetype="text/markdown", + status_code=200, + sha1="455face3598611458efe1f072e58624790a67266", + sha256="429bcafa4d3d0072d5b2511e12c85c1aac1d304011d1c406da14707f7b9cd905", + ), + ], + archive_urls = [ + FileEntityUrls(rel="wayback", url="https://web.archive.org/web/"), + ], + release_ids = [r1edit.ident], + ) + + wc1edit = api.create_webcapture(wc1, editgroup_id=eg.editgroup_id) + api.accept_editgroup(eg.editgroup_id) + wc2 = api.get_webcapture(wc1edit.ident) + + # check that fields match + # I don't know why these aren't equal... + #print(wc1.archive_urls) + #print(wc2.archive_urls) + #assert wc1.archive_urls == wc2.archive_urls + assert wc1.archive_urls[0].rel == wc2.archive_urls[0].rel + assert wc1.archive_urls[0].url == wc2.archive_urls[0].url + assert wc1.cdx == wc2.cdx + assert wc1.release_ids == wc2.release_ids + assert wc1.timestamp == wc2.timestamp + assert wc1.original_url == wc2.original_url + + # TODO: check release expansion + r1 = api.get_release(r1edit.ident, expand="webcaptures") + print(r1) + assert r1.webcaptures[0].cdx == wc1.cdx + +def test_webcapture_examples(api): + wc3 = api.get_webcapture('aaaaaaaaaaaaa53xaaaaaaaaam') + + assert wc3.cdx[0].surt == 'org,asheesh)/' + assert wc3.cdx[1].sha1 == 'a637f1d27d9bcb237310ed29f19c07e1c8cf0aa5' + assert wc3.archive_urls[1].rel == 'warc' + + +def test_bad_webcapture(api): + + eg = quick_eg(api) + + bad_list = [ + # good (for testing test itself) + WebcaptureEntity(cdx=[ + WebcaptureEntityCdx( + surt="site,example,)/123.jpg", + url="http://example.site/123.jpg", + sha1="455face3598611458efe1f072e58624790a67266", + timestamp=201506071122)]), + ] + + for b in bad_list: + with pytest.raises(fatcat_client.rest.ApiException): + api.create_webcapture(b, editgroup_id=eg.editgroup_id) + diff --git a/python/tests/citation_efficiency.py b/python/tests/citation_efficiency.py new file mode 100644 index 00000000..fe5006cc --- /dev/null +++ b/python/tests/citation_efficiency.py @@ -0,0 +1,113 @@ + +import json +import pytest +from copy import copy + +from fatcat_client import * +from fatcat_client.rest import ApiException +from fixtures import * + + +def test_citation_indexing(api): + # indexing is consistent and reacts to change + + eg = quick_eg(api) + r1 = ReleaseEntity(title="the target") + r1.refs = [ + ReleaseRef(key="first", title="the first title"), + ReleaseRef(key="second", title="the second title"), + ReleaseRef(key="third", title="a third title"), + ] + r1 = api.get_release(api.create_release(r1, editgroup_id=eg.editgroup_id).ident) + api.accept_editgroup(eg.editgroup_id) + + assert r1.refs[0].index == 0 + assert r1.refs[0].key == "first" + assert r1.refs[1].index == 1 + assert r1.refs[1].key == "second" + assert r1.refs[2].index == 2 + assert r1.refs[2].key == "third" + + r1.refs.pop(1) + eg = quick_eg(api) + api.update_release(r1.ident, r1, editgroup_id=eg.editgroup_id) + api.accept_editgroup(eg.editgroup_id) + r1 = api.get_release(r1.ident) + + assert r1.refs[0].index == 0 + assert r1.refs[0].key == "first" + assert r1.refs[1].index == 1 + assert 
r1.refs[1].key == "third" + +def test_citation_targets(api): + # invariant to linking citations + # also, updates work + + eg = quick_eg(api) + r1 = ReleaseEntity(title="the target") + r1 = api.get_release(api.create_release(r1, editgroup_id=eg.editgroup_id).ident) + r2 = ReleaseEntity(title="the citer") + r2.refs = [ + ReleaseRef(key="first", title="something else"), + ReleaseRef(key="second", title="the target title"), + ] + r2 = api.get_release(api.create_release(r2, editgroup_id=eg.editgroup_id).ident) + api.accept_editgroup(eg.editgroup_id) + + eg = quick_eg(api) + r2.refs[1].target_release_id = r1.ident + api.update_release(r2.ident, r2, editgroup_id=eg.editgroup_id) + api.accept_editgroup(eg.editgroup_id) + r2 = api.get_release(r2.ident) + assert r2.refs[0].key == "first" + assert r2.refs[1].key == "second" + assert r2.refs[0].index == 0 # TODO: one-indexing? + assert r2.refs[1].index == 1 + assert r2.refs[0].target_release_id == None + assert r2.refs[1].target_release_id == r1.ident + assert len(r2.refs) == 2 + +def test_citation_empty_array(api): + # distinction between empty array (no citations) and no array (hidden) + + r1 = ReleaseEntity(title="citation null") + r2 = ReleaseEntity(title="citation empty array") + r1.refs = None + r2.refs = [] + + eg = quick_eg(api) + r1 = api.get_release(api.create_release(r1, editgroup_id=eg.editgroup_id).ident) + r2 = api.get_release(api.create_release(r2, editgroup_id=eg.editgroup_id).ident) + api.accept_editgroup(eg.editgroup_id) + + print(r1.refs) + print(r2.refs) + assert r1.refs == [] + assert r1.refs == r2.refs + + r1b = api.get_release(r1.ident, hide="refs") + assert r1b.refs == None + +def test_citation_encoding(api): + # escape-only changes (eg, \u1234 whatever for ASCII) + + r1 = ReleaseEntity(title="citation encoding") + title = "title-unicode \\u0050 \\\" " + container = "container-unicode ☃︎ ä ö ü スティー" + extra = extra={'a': 1, 'b': 2, 'ö': 3} + locator = "p123" + r1.refs = [ + ReleaseRef(key="1", year=1923, title=title, container_name=container, + extra=extra, locator=locator), + ReleaseRef(key="2"), + ] + + eg = quick_eg(api) + r1 = api.get_release(api.create_release(r1, editgroup_id=eg.editgroup_id).ident) + api.accept_editgroup(eg.editgroup_id) + + assert title == r1.refs[0].title + assert container == r1.refs[0].container_name + assert extra == r1.refs[0].extra + assert locator == r1.refs[0].locator + diff --git a/python/tests/cli.sh b/python/tests/cli.sh index eba6d3a7..19d8a85b 100755 --- a/python/tests/cli.sh +++ b/python/tests/cli.sh @@ -14,7 +14,7 @@ set -x ./fatcat_import.py crossref tests/files/crossref-works.2018-01-21.badsample.json tests/files/ISSN-to-ISSN-L.snip.txt ./fatcat_import.py orcid tests/files/0000-0001-8254-7103.json -./fatcat_import.py issn tests/files/journal_extra_metadata.snip.csv +./fatcat_import.py journal-metadata tests/files/journal_extra_metadata.snip.csv ./fatcat_import.py matched tests/files/matched_sample.json ./fatcat_import.py matched tests/files/example_matched.json ./fatcat_import.py grobid-metadata tests/files/example_grobid_metadata_lines.tsv diff --git a/python/tests/files/crossref-works.single.json b/python/tests/files/crossref-works.single.json index 2af2b358..e3d2e05c 100644 --- a/python/tests/files/crossref-works.single.json +++ b/python/tests/files/crossref-works.single.json @@ -84,7 +84,7 @@ { "given": "Carlos G.", "family": "Diaz", - "affiliation": ["Some University"] + "affiliation": [{"name": "Some University"}, {"name": "Some Department"}] }, { "given": "Francisco M.", diff 
--git a/python/tests/fixtures.py b/python/tests/fixtures.py index 6a880c48..3cc275b3 100644 --- a/python/tests/fixtures.py +++ b/python/tests/fixtures.py @@ -28,6 +28,7 @@ def api(): conf.api_key["Authorization"] = os.getenv("FATCAT_API_AUTH_TOKEN") conf.api_key_prefix["Authorization"] = "Bearer" api_client = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf)) + api_client.editor_id = "aaaaaaaaaaaabkvkaaaaaaaaae" return api_client def test_get_changelog_entry(api): @@ -38,33 +39,6 @@ def test_get_changelog_entry(api): ## Helpers ################################################################## def quick_eg(api_inst): - eg = api_inst.create_editgroup( - fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae')) + eg = api_inst.create_editgroup(fatcat_client.Editgroup()) return eg -# TODO: what are these even here for? -def check_entity_fields(e): - for key in ('rev', 'is_live', 'redirect_id'): - assert key in e - for key in ('id',): - assert e[key] is not None - -def check_release(e): - for key in ('work', 'release_type'): - assert key in e - for key in ('title', ): - assert e[key] is not None - for key in ('refs', 'creators'): - assert type(e[key]) == list - -def check_creator(e): - for key in ('name',): - assert e[key] is not None - -def check_container(e): - for key in ('name',): - assert e[key] is not None - -def check_file(e): - for key in ('size', 'sha1'): - assert e[key] is not None diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py index e2ca6122..193f78f6 100644 --- a/python/tests/import_crossref.py +++ b/python/tests/import_crossref.py @@ -1,35 +1,51 @@ import json import pytest -from fatcat_tools.importers import CrossrefImporter +from fatcat_tools.importers import CrossrefImporter, JsonLinePusher from fixtures import api @pytest.fixture(scope="function") def crossref_importer(api): with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: - yield CrossrefImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', check_existing=False) + yield CrossrefImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=True) @pytest.fixture(scope="function") def crossref_importer_existing(api): with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: - yield CrossrefImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', check_existing=True) + yield CrossrefImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', bezerk_mode=False) def test_crossref_importer_batch(crossref_importer): with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f: - crossref_importer.process_batch(f) + JsonLinePusher(crossref_importer, f).run() def test_crossref_importer(crossref_importer): + last_index = crossref_importer.api.get_changelog(limit=1)[0].index with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f: - crossref_importer.process_source(f) + crossref_importer.bezerk_mode = True + counts = JsonLinePusher(crossref_importer, f).run() + assert counts['insert'] == 14 + assert counts['exists'] == 0 + assert counts['skip'] == 0 + # fetch most recent editgroup - changes = crossref_importer.api.get_changelog(limit=1) - eg = changes[0].editgroup + change = crossref_importer.api.get_changelog_entry(index=last_index+1) + eg = change.editgroup assert eg.description assert "crossref" in eg.description.lower() assert eg.extra['git_rev'] assert "fatcat_tools.CrossrefImporter" in eg.extra['agent'] + last_index = 
crossref_importer.api.get_changelog(limit=1)[0].index + with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f: + crossref_importer.bezerk_mode = False + crossref_importer.reset() + counts = JsonLinePusher(crossref_importer, f).run() + assert counts['insert'] == 0 + assert counts['exists'] == 14 + assert counts['skip'] == 0 + assert last_index == crossref_importer.api.get_changelog(limit=1)[0].index + def test_crossref_mappings(crossref_importer): assert crossref_importer.map_release_type('journal-article') == "article-journal" assert crossref_importer.map_release_type('asdf') is None @@ -39,13 +55,13 @@ def test_crossref_mappings(crossref_importer): def test_crossref_importer_create(crossref_importer): crossref_importer.create_containers = True with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f: - crossref_importer.process_source(f) + JsonLinePusher(crossref_importer, f).run() def test_crossref_dict_parse(crossref_importer): with open('tests/files/crossref-works.single.json', 'r') as f: # not a single line raw = json.loads(f.read()) - (r, c) = crossref_importer.parse_crossref_dict(raw) + r = crossref_importer.parse_record(raw) extra = r.extra['crossref'] assert r.title == "Renormalized perturbation theory by the moment method for degenerate states: Anharmonic oscillators" assert r.doi == "10.1002/(sici)1097-461x(1998)66:4<261::aid-qua1>3.0.co;2-t" @@ -61,7 +77,8 @@ def test_crossref_dict_parse(crossref_importer): assert len(r.contribs) == 5 assert r.contribs[0].raw_name == "Marcelo D. Radicioni" assert r.contribs[0].index == 0 - assert r.contribs[1].extra['affiliations'] == ["Some University"] + assert r.contribs[1].raw_affiliation == "Some University" + assert r.contribs[1].extra['more_affiliations'] == ["Some Department"] assert r.contribs[1].role == "author" assert r.contribs[3].role == "editor" assert r.contribs[3].index is None @@ -78,8 +95,10 @@ def test_crossref_dict_parse(crossref_importer): def test_stateful_checking(crossref_importer_existing): with open('tests/files/crossref-works.single.json', 'r') as f: # not a single line, a whole document - raw = json.loads(f.read()) + raw = f.read() # might not exist yet... - crossref_importer_existing.process_source([json.dumps(raw)]) - # ok, make sure we get 'None' back - assert crossref_importer_existing.parse_crossref_dict(raw) is None + crossref_importer_existing.push_record(json.loads(raw)) + crossref_importer_existing.finish() + # make sure we wouldn't insert again + entity = crossref_importer_existing.parse_record(json.loads(raw)) + assert crossref_importer_existing.try_update(entity) is False diff --git a/python/tests/import_grobid_metadata.py b/python/tests/import_grobid_metadata.py index 97ebcaef..4fed4aaa 100644 --- a/python/tests/import_grobid_metadata.py +++ b/python/tests/import_grobid_metadata.py @@ -3,7 +3,7 @@ import os import json import base64 import pytest -from fatcat_tools.importers import GrobidMetadataImporter +from fatcat_tools.importers import GrobidMetadataImporter, LinePusher from fixtures import api """ @@ -15,10 +15,6 @@ side-effects. Should probably be disabled or re-written. def grobid_metadata_importer(api): yield GrobidMetadataImporter(api) -# TODO: use API to check that entities actually created... 
-#def test_grobid_metadata_importer_batch(grobid_metadata_importer): -# with open('tests/files/example_grobid_metadata_lines.tsv', 'r') as f: -# grobid_metadata_importer.process_batch(f) def test_grobid_metadata_parse(grobid_metadata_importer): with open('tests/files/example_grobid_metadata_lines.tsv', 'r') as f: @@ -30,7 +26,8 @@ def test_grobid_metadata_parse(grobid_metadata_importer): print(re.contribs) assert re.contribs[0].raw_name == "Wahyu Ary" assert re.publisher == None - assert re.extra.get('container_name') == None + if re.extra: + assert re.extra.get('container_name') == None assert len(re.refs) == 27 def test_file_metadata_parse(grobid_metadata_importer): @@ -53,13 +50,28 @@ def test_file_metadata_parse(grobid_metadata_importer): assert len(fe.release_ids) == 0 def test_grobid_metadata_importer(grobid_metadata_importer): + last_index = grobid_metadata_importer.api.get_changelog(limit=1)[0].index with open('tests/files/example_grobid_metadata_lines.tsv', 'r') as f: - grobid_metadata_importer.process_source(f) + grobid_metadata_importer.bezerk_mode = True + counts = LinePusher(grobid_metadata_importer, f).run() + assert counts['insert'] == 10 + assert counts['inserted.release'] == 10 + assert counts['exists'] == 0 + assert counts['skip'] == 0 # fetch most recent editgroup - changes = grobid_metadata_importer.api.get_changelog(limit=1) - eg = changes[0].editgroup + change = grobid_metadata_importer.api.get_changelog_entry(index=last_index+1) + eg = change.editgroup assert eg.description assert "grobid" in eg.description.lower() assert eg.extra['git_rev'] assert "fatcat_tools.GrobidMetadataImporter" in eg.extra['agent'] + + with open('tests/files/example_grobid_metadata_lines.tsv', 'r') as f: + grobid_metadata_importer.reset() + grobid_metadata_importer.bezerk_mode = False + counts = LinePusher(grobid_metadata_importer, f).run() + assert counts['insert'] == 0 + assert counts['inserted.release'] == 0 + assert counts['exists'] == 10 + assert counts['skip'] == 0 diff --git a/python/tests/import_issn.py b/python/tests/import_issn.py deleted file mode 100644 index 6b5978d9..00000000 --- a/python/tests/import_issn.py +++ /dev/null @@ -1,26 +0,0 @@ - -import pytest -from fatcat_tools.importers import IssnImporter -from fixtures import api - - -@pytest.fixture(scope="function") -def issn_importer(api): - yield IssnImporter(api) - -# TODO: use API to check that entities actually created... -def test_issn_importer_batch(issn_importer): - with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f: - issn_importer.process_csv_batch(f) - -def test_issn_importer(issn_importer): - with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f: - issn_importer.process_csv_source(f) - - # fetch most recent editgroup - changes = issn_importer.api.get_changelog(limit=1) - eg = changes[0].editgroup - assert eg.description - assert "container" in eg.description.lower() - assert eg.extra['git_rev'] - assert "fatcat_tools.IssnImporter" in eg.extra['agent'] diff --git a/python/tests/import_journal_metadata.py b/python/tests/import_journal_metadata.py new file mode 100644 index 00000000..a2b10a65 --- /dev/null +++ b/python/tests/import_journal_metadata.py @@ -0,0 +1,39 @@ + +import pytest +from fatcat_tools.importers import JournalMetadataImporter, CsvPusher +from fixtures import api + + +@pytest.fixture(scope="function") +def journal_metadata_importer(api): + yield JournalMetadataImporter(api) + +# TODO: use API to check that entities actually created... 
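One hedged way to address the TODO above, sketched here rather than implemented in this changeset: verify via the lookup API that a container really was created. lookup_container(issnl=...) is the same call the importer's try_update() uses; the helper name and the ISSN-L value below are placeholders, not necessarily present in the snip CSV.
def assert_container_created(api, issnl):
    # Raises ApiException (404) if no container with this ISSN-L exists.
    found = api.lookup_container(issnl=issnl)
    assert found.issnl == issnl
# usage sketch, e.g. after CsvPusher(...).run():
# assert_container_created(journal_metadata_importer.api, '1234-5678')  # placeholder ISSN-L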
+def test_journal_metadata_importer_batch(journal_metadata_importer): + with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f: + CsvPusher(journal_metadata_importer, f).run() + +def test_journal_metadata_importer(journal_metadata_importer): + last_index = journal_metadata_importer.api.get_changelog(limit=1)[0].index + with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f: + journal_metadata_importer.bezerk_mode = True + counts = CsvPusher(journal_metadata_importer, f).run() + assert counts['insert'] == 9 + assert counts['exists'] == 0 + assert counts['skip'] == 0 + + # fetch most recent editgroup + change = journal_metadata_importer.api.get_changelog_entry(index=last_index+1) + eg = change.editgroup + assert eg.description + assert "container" in eg.description.lower() + assert eg.extra['git_rev'] + assert "fatcat_tools.JournalMetadataImporter" in eg.extra['agent'] + + with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f: + journal_metadata_importer.reset() + journal_metadata_importer.bezerk_mode = False + counts = CsvPusher(journal_metadata_importer, f).run() + assert counts['insert'] == 0 + assert counts['exists'] == 9 + assert counts['skip'] == 0 diff --git a/python/tests/import_matched.py b/python/tests/import_matched.py index 080674ac..8f694456 100644 --- a/python/tests/import_matched.py +++ b/python/tests/import_matched.py @@ -1,7 +1,7 @@ import json import pytest -from fatcat_tools.importers import MatchedImporter +from fatcat_tools.importers import MatchedImporter, JsonLinePusher from fixtures import api @@ -10,26 +10,40 @@ def matched_importer(api): yield MatchedImporter(api) # TODO: use API to check that entities actually created... -def test_matched_importer_batch(matched_importer): +def test_matched_importer(matched_importer): with open('tests/files/example_matched.json', 'r') as f: - matched_importer.process_batch(f) + JsonLinePusher(matched_importer, f).run() def test_matched_importer(matched_importer): + last_index = matched_importer.api.get_changelog(limit=1)[0].index with open('tests/files/example_matched.json', 'r') as f: - matched_importer.process_source(f) + matched_importer.bezerk_mode = True + counts = JsonLinePusher(matched_importer, f).run() + assert counts['insert'] == 2 + assert counts['exists'] == 0 + assert counts['skip'] == 11 # fetch most recent editgroup - changes = matched_importer.api.get_changelog(limit=1) - eg = changes[0].editgroup + change = matched_importer.api.get_changelog_entry(index=last_index+1) + eg = change.editgroup assert eg.description assert "file-to-release" in eg.description.lower() assert eg.extra['git_rev'] assert "fatcat_tools.MatchedImporter" in eg.extra['agent'] + # re-insert; should skip + with open('tests/files/example_matched.json', 'r') as f: + matched_importer.reset() + matched_importer.bezerk_mode = False + counts = JsonLinePusher(matched_importer, f).run() + assert counts['insert'] == 0 + assert counts['exists'] == 2 + assert counts['skip'] == 11 + def test_matched_dict_parse(matched_importer): with open('tests/files/example_matched.json', 'r') as f: raw = json.loads(f.readline()) - f = matched_importer.parse_matched_dict(raw) + f = matched_importer.parse_record(raw) assert f.sha1 == "00242a192acc258bdfdb151943419437f440c313" assert f.md5 == "f4de91152c7ab9fdc2a128f962faebff" assert f.mimetype == "application/pdf" diff --git a/python/tests/import_orcid.py b/python/tests/import_orcid.py index 717a1328..57886b52 100644 --- a/python/tests/import_orcid.py +++ b/python/tests/import_orcid.py @@ 
-1,7 +1,7 @@ import json import pytest -from fatcat_tools.importers import OrcidImporter +from fatcat_tools.importers import OrcidImporter, JsonLinePusher from fixtures import api @@ -9,37 +9,46 @@ from fixtures import api def orcid_importer(api): yield OrcidImporter(api) -# TODO: use API to check that entities actually created... -def test_orcid_importer_batch(orcid_importer): - with open('tests/files/0000-0001-8254-7103.json', 'r') as f: - orcid_importer.process_batch(f) - def test_orcid_importer_badid(orcid_importer): with open('tests/files/0000-0001-8254-710X.json', 'r') as f: - orcid_importer.process_batch(f) + JsonLinePusher(orcid_importer, f).run() +# TODO: use API to check that entities actually created... def test_orcid_importer(orcid_importer): + last_index = orcid_importer.api.get_changelog(limit=1)[0].index with open('tests/files/0000-0001-8254-7103.json', 'r') as f: - orcid_importer.process_source(f) + orcid_importer.bezerk_mode = True + counts = JsonLinePusher(orcid_importer, f).run() + assert counts['insert'] == 1 + assert counts['exists'] == 0 + assert counts['skip'] == 0 # fetch most recent editgroup - changes = orcid_importer.api.get_changelog(limit=1) - eg = changes[0].editgroup + change = orcid_importer.api.get_changelog_entry(index=last_index+1) + eg = change.editgroup assert eg.description assert "orcid" in eg.description.lower() assert eg.extra['git_rev'] assert "fatcat_tools.OrcidImporter" in eg.extra['agent'] + with open('tests/files/0000-0001-8254-7103.json', 'r') as f: + orcid_importer.reset() + orcid_importer.bezerk_mode = False + counts = JsonLinePusher(orcid_importer, f).run() + assert counts['insert'] == 0 + assert counts['exists'] == 1 + assert counts['skip'] == 0 + def test_orcid_importer_x(orcid_importer): with open('tests/files/0000-0003-3953-765X.json', 'r') as f: - orcid_importer.process_source(f) + JsonLinePusher(orcid_importer, f).run() c = orcid_importer.api.lookup_creator(orcid="0000-0003-3953-765X") assert c is not None def test_orcid_dict_parse(orcid_importer): with open('tests/files/0000-0001-8254-7103.json', 'r') as f: raw = json.loads(f.readline()) - c = orcid_importer.parse_orcid_dict(raw) + c = orcid_importer.parse_record(raw) assert c.given_name == "Man-Hui" assert c.surname == "Li" assert c.display_name == "Man-Hui Li" diff --git a/python/tests/importer.py b/python/tests/importer.py index 34efa5d8..9308ba84 100644 --- a/python/tests/importer.py +++ b/python/tests/importer.py @@ -1,13 +1,13 @@ import pytest -from fatcat_tools.importers import FatcatImporter +from fatcat_tools.importers import CrossrefImporter, OrcidImporter from fixtures import api def test_issnl_mapping_lookup(api): with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: - fi = FatcatImporter(api, issn_map_file=issn_file) + fi = CrossrefImporter(api, issn_map_file=issn_file) assert fi.issn2issnl('0000-0027') == '0002-0027' assert fi.issn2issnl('0002-0027') == '0002-0027' @@ -18,20 +18,18 @@ def test_issnl_mapping_lookup(api): def test_identifiers(api): with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: - fi = FatcatImporter(api, issn_map_file=issn_file) - - assert fi.is_issnl("1234-5678") == True - assert fi.is_issnl("1234-5678.") == False - assert fi.is_issnl("12345678") == False - assert fi.is_issnl("1-2345678") == False - - assert fi.is_doi("10.1234/56789") == True - assert fi.is_doi("101234/56789") == False - assert fi.is_doi("10.1234_56789") == False - - assert fi.is_orcid("0000-0003-3118-6591") == True - assert 
fi.is_orcid("0000-0003-3953-765X") == True - assert fi.is_orcid("0000-00x3-3118-659") == False - assert fi.is_orcid("0000-00033118-659") == False - assert fi.is_orcid("0000-0003-3118-659.") == False + ci = CrossrefImporter(api, issn_map_file=issn_file) + + assert ci.is_issnl("1234-5678") == True + assert ci.is_issnl("1234-5678.") == False + assert ci.is_issnl("12345678") == False + assert ci.is_issnl("1-2345678") == False + + oi = OrcidImporter(api) + + assert oi.is_orcid("0000-0003-3118-6591") == True + assert oi.is_orcid("0000-0003-3953-765X") == True + assert oi.is_orcid("0000-00x3-3118-659") == False + assert oi.is_orcid("0000-00033118-659") == False + assert oi.is_orcid("0000-0003-3118-659.") == False diff --git a/python/tests/transform_tests.py b/python/tests/transform_tests.py index e9d23250..6d6c6c82 100644 --- a/python/tests/transform_tests.py +++ b/python/tests/transform_tests.py @@ -11,7 +11,7 @@ def test_elasticsearch_convert(crossref_importer): with open('tests/files/crossref-works.single.json', 'r') as f: # not a single line raw = json.loads(f.read()) - (r, c) = crossref_importer.parse_crossref_dict(raw) + r = crossref_importer.parse_record(raw) r.state = 'active' release_to_elasticsearch(r) |