23 files changed, 330 insertions, 32 deletions
diff --git a/extra/stats/2019-01-07-prod-stats.json b/extra/stats/2019-01-07-prod-stats.json new file mode 100644 index 00000000..6a1d93a1 --- /dev/null +++ b/extra/stats/2019-01-07-prod-stats.json @@ -0,0 +1 @@ +{"changelog":{"latest":{"index":3091330,"timestamp":"2020-01-08T00:06:10.718421+00:00"}},"container":{"total":147021},"papers":{"in_kbart":60418765,"in_web":18284474,"in_web_not_kbart":8356539,"is_oa":10473319,"total":100551479},"release":{"refs_total":804237857,"total":124697305}} diff --git a/extra/stats/2019-01-07-prod-table-sizes.txt b/extra/stats/2019-01-07-prod-table-sizes.txt new file mode 100644 index 00000000..4c205d38 --- /dev/null +++ b/extra/stats/2019-01-07-prod-table-sizes.txt @@ -0,0 +1,35 @@ + table_name | table_size | indexes_size | total_size +--------------------------------------------------------------+------------+--------------+------------ + "public"."refs_blob" | 83 GB | 2875 MB | 86 GB + "public"."release_rev" | 47 GB | 29 GB | 76 GB + "public"."release_contrib" | 35 GB | 31 GB | 66 GB + "public"."release_edit" | 12 GB | 19 GB | 31 GB + "public"."work_edit" | 12 GB | 19 GB | 30 GB + "public"."release_ident" | 8257 MB | 14 GB | 22 GB + "public"."work_ident" | 8054 MB | 14 GB | 22 GB + "public"."abstracts" | 12 GB | 918 MB | 13 GB + "public"."file_rev_url" | 9398 MB | 3469 MB | 13 GB + "public"."work_rev" | 5226 MB | 5825 MB | 11 GB + "public"."release_ref" | 3950 MB | 5626 MB | 9577 MB + "public"."file_rev" | 3081 MB | 4450 MB | 7531 MB + "public"."file_edit" | 2544 MB | 3805 MB | 6348 MB + "public"."file_ident" | 1668 MB | 2360 MB | 4028 MB + "public"."file_rev_release" | 1538 MB | 2387 MB | 3926 MB + "public"."release_rev_abstract" | 1370 MB | 1697 MB | 3067 MB + "public"."creator_edit" | 702 MB | 942 MB | 1643 MB + "public"."creator_rev" | 695 MB | 719 MB | 1413 MB + "public"."creator_ident" | 474 MB | 648 MB | 1121 MB + "public"."editgroup" | 663 MB | 369 MB | 1032 MB + "public"."release_rev_extid" | 200 MB | 312 MB | 512 MB + "public"."changelog" | 187 MB | 205 MB | 393 MB + "public"."container_rev" | 75 MB | 22 MB | 97 MB + "public"."container_edit" | 24 MB | 31 MB | 55 MB + "public"."container_ident" | 11 MB | 19 MB | 29 MB + "public"."webcapture_rev_cdx" | 64 kB | 32 kB | 96 kB + "public"."fileset_rev_file" | 48 kB | 32 kB | 80 kB + "public"."editor" | 16 kB | 48 kB | 64 kB + "public"."editgroup_annotation" | 16 kB | 48 kB | 64 kB + "public"."auth_oidc" | 16 kB | 48 kB | 64 kB + "public"."fileset_edit" | 16 kB | 48 kB | 64 kB + "public"."webcapture_edit" | 16 kB | 48 kB | 64 kB +[...] diff --git a/guide/src/entity_release.md b/guide/src/entity_release.md index f6233c13..242a3c72 100644 --- a/guide/src/entity_release.md +++ b/guide/src/entity_release.md @@ -24,11 +24,11 @@ publicly available. Blank if only year is known. - `release_year` (integer): year when this release was first made publicly available; should match `release_date` if both are known. -- `withdrawn_status` (string, controlled set): -- `release_date` (string, ISO date format): when this release was first made - publicly available. Blank if only year is known. -- `release_year` (integer): year when this release was first made - publicly available; should match `release_date` if both are known. +- `withdrawn_status` (optional, string, controlled set): +- `withdrawn_date` (optional, string, ISO date format): when this release was withdrawn. + Blank if only year is known. 
+- `withdrawn_year` (optional, integer): year when this release was withdrawn; should + match `withdrawn_date` if both are known. - `ext_ids` (key/value object of string-to-string mappings): external identifiers. At least an empty `ext_ids` object is always required for release entities, so individual identifiers can be accessed directly. diff --git a/notes/bulk_edits/2019-12-20_orcid.md b/notes/bulk_edits/2019-12-20_orcid.md new file mode 100644 index 00000000..33dde32f --- /dev/null +++ b/notes/bulk_edits/2019-12-20_orcid.md @@ -0,0 +1,43 @@ + +Newer ORCID dumps are XML, not JSON. But there is a conversion tool! + + https://github.com/ORCID/orcid-conversion-lib + +Commands: + + wget https://github.com/ORCID/orcid-conversion-lib/raw/master/target/orcid-conversion-lib-0.0.2-full.jar + java -jar orcid-conversion-lib-0.0.2-full.jar OPTIONS + + java -jar orcid-conversion-lib-0.0.2-full.jar --tarball -i ORCID_2019_summaries.tar.gz -v v3_0rc1 -o ORCID_2019_summaries_json.tar.gz + + # [...] + # Sat Dec 21 04:43:50 UTC 2019 done 7300000 + # Sat Dec 21 04:44:08 UTC 2019 done 7310000 + # Sat Dec 21 04:44:17 UTC 2019 finished errors 0 + +Importing in QA, ran in to some lines like: + + {"response-code":409,"developer-message":"409 Conflict: The ORCID record is locked and cannot be edited. ORCID https://orcid.org/0000-0003-0014-6598","user-message":"The ORCID record is locked.","error-code":9018,"more-info":"https://members.orcid.org/api/resources/troubleshooting"} + {"response-code":409,"developer-message":"409 Conflict: The ORCID record is locked and cannot be edited. ORCID https://orcid.org/0000-0003-3750-5654","user-message":"The ORCID record is locked.","error-code":9018,"more-info":"https://members.orcid.org/api/resources/troubleshooting"} + {"response-code":409,"developer-message":"409 Conflict: The ORCID record is locked and cannot be edited. ORCID https://orcid.org/0000-0003-1424-4826","user-message":"The ORCID record is locked.","error-code":9018,"more-info":"https://members.orcid.org/api/resources/troubleshooting"} + {"response-code":409,"developer-message":"409 Conflict: The ORCID record is locked and cannot be edited. ORCID https://orcid.org/0000-0002-5340-9665","user-message":"The ORCID record is locked.","error-code":9018,"more-info":"https://members.orcid.org/api/resources/troubleshooting"} + +Needed to patch to filter those out. Then ran ok like: + + zcat /srv/fatcat/datasets/ORCID_2019_summaries.sample_10k.json.gz | ./fatcat_import.py orcid - + Counter({'total': 10000, 'exists': 5323, 'insert': 4493, 'skip': 184, 'skip-no-person': 160, 'update': 0}) + +New dump is about 7.3 million rows, so expecting about 3.2 million new +entities, 250k skips. + +Doing bulk run like: + + time zcat /srv/fatcat/datasets/ORCID_2019_summaries.json.gz | parallel -j8 --round-robin --pipe ./fatcat_import.py orcid - + +Prod timing: + + Counter({'total': 910643, 'exists': 476812, 'insert': 416583, 'skip': 17248, 'update': 0}) + + real 47m27.658s + user 245m44.272s + sys 14m50.836s diff --git a/notes/bulk_edits/2019-12-20_updates.md b/notes/bulk_edits/2019-12-20_updates.md index a8f62ea9..83c8d9da 100644 --- a/notes/bulk_edits/2019-12-20_updates.md +++ b/notes/bulk_edits/2019-12-20_updates.md @@ -80,3 +80,13 @@ x fix bad DOI error (real error, skip these) x remove newline after "unparsable medline date" error x remove extra line like "existing.ident, existing.ext_ids.pmid, re.ext_ids.pmid))" in warning +## Chocula + +Command: + + export FATCAT_AUTH_WORKER_JOURNAL_METADATA=[...] 
+ ./fatcat_import.py chocula /srv/fatcat/datasets/export_fatcat.2019-12-26.json + +Result: + + Counter({'total': 144455, 'exists': 139807, 'insert': 2384, 'skip': 2264, 'skip-unknown-new-issnl': 2264, 'exists-by-issnl': 306, 'update': 0}) diff --git a/notes/bulk_edits/CHANGELOG.md b/notes/bulk_edits/CHANGELOG.md index 80760938..2db0c72d 100644 --- a/notes/bulk_edits/CHANGELOG.md +++ b/notes/bulk_edits/CHANGELOG.md @@ -9,8 +9,19 @@ this file should probably get merged into the guide at some point. This file should not turn in to a TODO list! +## 2020-01 + +Imported around 2,500 new containers (journals, by ISSN-L) from chocula +analysis script. + ## 2019-12 +Started continuous harvesting Datacite DOI metadata; first date harvested was +`2019-12-13`. No importer running yet. + +Imported about 3.3m new ORCID identifiers from 2019 bulk dump (after converting +from XML to JSON): <https://archive.org/details/orcid-dump-2019> + Inserted about 154k new arxiv release entities. Still no automatic daily harvesting. @@ -45,22 +56,9 @@ invalid ISSN checksum). Imported files (matched to releases by DOI) from Semantic Scholar (`DIRECT-OA-CRAWL-2019` crawl). - Arabesque importer - crawl-bot - `s2_doi.sqlite` - TODO: archive.org link - TODO: rough count - TODO: date - Imported files (matched to releases by DOI) from pre-1923/pre-1909 items uploaded by a user to archive.org. - Matched importer - internetarchive-bot (TODO:) - TODO: archive.org link - TODO: counts - TODO: date - Imported files (matched to releases by DOI) from CORE.ac.uk (`DIRECT-OA-CRAWL-2019` crawl). diff --git a/proposals/20190509_schema_tweaks.md b/proposals/20190509_v03_schema_tweaks.md index 7e372959..150ce525 100644 --- a/proposals/20190509_schema_tweaks.md +++ b/proposals/20190509_v03_schema_tweaks.md @@ -1,4 +1,6 @@ +Status: implemented + # SQL (and API) schema changes Intend to make these changes at the same time as bumping OpenAPI schema from @@ -139,4 +141,4 @@ Do these as separate commits, after merging back in to master, for v0.3: `release_month`: apprently pretty common to know the year and month but not date. I have avoided so far, seems like unnecessary complexity. Could start -as an `extra_json` field? +as an `extra_json` field? NOT IMPLEMENTED diff --git a/proposals/20190510_editgroup_endpoint_prefix.md b/proposals/20190510_editgroup_endpoint_prefix.md index f517383b..6794266e 100644 --- a/proposals/20190510_editgroup_endpoint_prefix.md +++ b/proposals/20190510_editgroup_endpoint_prefix.md @@ -1,4 +1,6 @@ +Status: implemented + # Editgroup API Endpoint Prefixes In summary, change the API URL design such that entity mutations (create, diff --git a/proposals/20190510_release_ext_ids.md b/proposals/20190510_release_ext_ids.md index 1d2b912a..8953448c 100644 --- a/proposals/20190510_release_ext_ids.md +++ b/proposals/20190510_release_ext_ids.md @@ -1,4 +1,6 @@ +Status: implemented + # Release External ID Refactor Goal is to make the external identifier "namespace" (number of external diff --git a/proposals/20190514_fatcat_identifiers.md b/proposals/20190514_fatcat_identifiers.md new file mode 100644 index 00000000..325e48f5 --- /dev/null +++ b/proposals/20190514_fatcat_identifiers.md @@ -0,0 +1,27 @@ + +Status: brainstorm + +Fatcat Identifiers +======================= + +AKA, `fcid` + +## Public Use / Reference + +When referencing identifiers in external databases, should prefix with the +entity type. 
Eg: + + release_hsmo6p4smrganpb3fndaj2lon4 + editgroup_qinmjr2lbvgd3mbt7mifir23fy + +Or with a prefix: + + fatcat:release_hsmo6p4smrganpb3fndaj2lon4 + +As a usability affordance, the public web interface (though not API) should do +permanent redirects HTTP (301 or 308) to the canonical page like: + + https://fatcat.wiki/release_hsmo6p4smrganpb3fndaj2lon4 + HTTP 301 => https://fatcat.wiki/release/hsmo6p4smrganpb3fndaj2lon4 + +However, no intention to use identifiers in this schema in the API itself? diff --git a/proposals/20190911_search_query_parsing.md b/proposals/20190911_search_query_parsing.md new file mode 100644 index 00000000..f1fb0128 --- /dev/null +++ b/proposals/20190911_search_query_parsing.md @@ -0,0 +1,28 @@ + +Status: brainstorm + +## Search Query Parsing + +The default "release" search on fatcat.wiki currently uses the elasticsearch +built-in `query_string` parser, which is explicitly not recommended for +public/production use. + +The best way forward is likely a custom query parser (eg, PEG-generated parser) +that generates a complete elasticsearch query JSON structure. + +A couple search issues this would help with: + +- better parsing of keywords (year, year-range, DOI, ISSN, etc) in complex + queries and turning these in to keyword term sub-queries +- queries including terms from multiple fields which aren't explicitly tagged + (eg, "lovelace computer" vs. "author:lovelace title:computer") +- avoiding unsustainably expensive queries (eg, prefix wildcard, regex) +- handling single-character mispellings and synonyms +- collapsing multiple releases under the same work in search results + +In the near future, we may also create a fulltext search index, which will have +it's own issues. + +## Tech Changes + +If we haven't already, should also switch to using elasticsearch client library. diff --git a/proposals/20190911_v04_schema_tweaks.md b/proposals/20190911_v04_schema_tweaks.md index 8ccbac79..eaf39474 100644 --- a/proposals/20190911_v04_schema_tweaks.md +++ b/proposals/20190911_v04_schema_tweaks.md @@ -1,5 +1,7 @@ -status: work-in-progress +Status: planned + +## Schema Changes for v0.4 Release Proposed schema changes for next fatcat iteration (v0.4? v0.5?). @@ -17,6 +19,9 @@ SQL (and API, and elasticsearch): - TODO: release: switch how pages work? first/last? - TODO: indication of peer-review process? at release or container level? - TODO: container: separate canonical and disambiguating titles (?) +- TODO: release inter-references using SCHOLIX/Datacite schema + https://zenodo.org/record/1120265 + https://support.datacite.org/docs/connecting-research-outputs#section-related-identifiers API tweaks: diff --git a/proposals/20191018_bigger_db.md b/proposals/20191018_bigger_db.md index cd5f6e7b..7a5216d0 100644 --- a/proposals/20191018_bigger_db.md +++ b/proposals/20191018_bigger_db.md @@ -1,4 +1,8 @@ +Status: brainstorm + +## Catalog Database Scaling + How can we scale the fatcat backend to support: - one billion release entities diff --git a/proposals/20200103_py37_refactors.md b/proposals/20200103_py37_refactors.md new file mode 100644 index 00000000..f0321b33 --- /dev/null +++ b/proposals/20200103_py37_refactors.md @@ -0,0 +1,101 @@ + +status: planning + +If we update fatcat python code to python3.7, what code refactoring changes can +we make? We currently use/require python3.5. + +Nice features in python3 I know of are: + +- dataclasses (python3.7) +- async/await (mature in python3.7?) 
+- type annotations (python3.5) +- format strings (python3.6) +- walrus assignment (python3.8) + +Not sure if the walrus operator is worth jumping all the way to python3.8. + +While we might be at it, what other superficial factorings might we want to do? + +- strict lint style (eg, maximum column width) with `black` (python3.6) +- logging/debugging/verbose +- type annotations and checking +- use named dicts or structs in place of dicts + +## Linux Distro Support + +The default python version shipped by current and planned linux releases are: + +- ubuntu xenial 16.04 LTS: python3.5 +- ubuntu bionic 18.04 LTS: python3.6 +- ubuntu focal 20.04 LTS: python3.8 (planned) +- debian buster 10 2019: python3.7 + +Python 3.7 is the default in debian buster (10). + +There are apt PPA package repositories that allow backporting newer pythons to +older releases. As far as I know this is safe and doesn't override any system +usage if we are careful not to set the defaults (aka, `python3` command should +be the older version unless inside a virtualenv). + +It would also be possible to use `pyenv` to have `virtualenv`s with custom +python versions. We should probably do that for OS X and/or windows support if +we wanted those. But having a system package is probably a lot faster to +install. + +## Dataclasses + +`dataclasses` are a user-friendly way to create struct-like objects. They are +pretty similar to the existing `namedtuple`, but can be mutable and have +methods attached to them (they are just classes), plus several other usability +improvements. + +Most places we are throwing around dicts with structure we could be using +dataclasses instead. There are some instances of this in fatcat, but many more +in sandcrawler. + +## Async/Await + +Where might we actually use async/await? I think more in sandcrawler than in +the python tools or web apps. The GROBID, ingest, and ML workers in particular +should be async over batches, as should all fetches from CDX/wayback. + +Some of the kafka workers *could* be aync, but i'm not sure how much speedup +there would actually be. For example, the entity updates worker could fetch +entities for an editgroup concurrently. + +Inserts (importers) should probably mostly happen serially, at least the kafka +importers, one editgroup at a time, so progress is correctly recorded in kafka. +Parallelization should probably happen at the partition level; would need to +think through whether async would actually help with code simplicity vs. thread +or process parallelization. + +## Type Annotations + +The meta-goals of (gradual) type annotations would be catching more bugs at +development time, and having code be more self-documenting and easier to +understand. + +The two big wins I see with type annotation would be having annotations +auto-generated for the openapi classes and API calls, and to make string +munging in importer code less buggy. + +## Format Strings + +Eg, replace code like: + + "There are {} out of {} objects".format(found, total) + +With: + + f"There are {found} out of {total} objects" + +## Walrus Operator + +New operator allows checking and assignment together: + + if (n := len(a)) > 10: + print(f"List is too long ({n} elements, expected <= 10)") + +I feel like we would actually use this pattern *a ton* in importer code, where +we do a lot of lookups or cleaning then check if we got a `None`. 
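To make the dataclass, type-annotation, and walrus points above concrete, here is a minimal sketch of how importer-style cleaning code might read; `CleanedRecord` and `lookup_doi` are illustrative names (not existing fatcat code), and the walrus line assumes python3.8:

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class CleanedRecord:
        # struct-like container instead of an untyped dict
        doi: Optional[str]
        title: str
        year: Optional[int] = None

    def lookup_doi(raw: dict) -> Optional[CleanedRecord]:
        # the lookup-then-check-for-None pattern described above
        if (doi := raw.get("doi")) is None:  # walrus assignment; python3.8 only
            return None
        return CleanedRecord(doi=doi.lower(), title=raw.get("title", ""))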
+ diff --git a/proposals/README.md b/proposals/README.md new file mode 100644 index 00000000..5e6747b1 --- /dev/null +++ b/proposals/README.md @@ -0,0 +1,11 @@ + +This folder contains proposals for larger changes to the fatcat system. These +might be schema changes, new projects, technical details, etc. Any change which +is large enough to require planning and documentation. + +Each should be tagged with a date first drafted, and labeled with a status: + +- brainstorm: just putting ideas down; might not even happen +- planned: commited to happening, but not started yet +- work-in-progress: currently being worked on +- implemented: completed, merged to master/production/live diff --git a/python/Pipfile b/python/Pipfile index 01c1eb3d..1a19a145 100644 --- a/python/Pipfile +++ b/python/Pipfile @@ -8,7 +8,7 @@ verify_ssl = true name = "pypi" [dev-packages] -pytest = ">=4,<5.0.0" +pytest = ">=5,<6.0.0" pytest-pythonpath = "*" pytest-pylint = "*" ipython = "<7.0.0" diff --git a/python/Pipfile.lock b/python/Pipfile.lock index 35125b67..a4408cdd 100644 --- a/python/Pipfile.lock +++ b/python/Pipfile.lock @@ -1,7 +1,11 @@ { "_meta": { "hash": { +<<<<<<< HEAD + "sha256": "03fc6c65c7bcbf96a5ef90afba8b6a0264a248a67b31ed339f399470b5f3d5fc" +======= "sha256": "fb9c3d2307483efe01d9c28a306bad319c84a94a4253d5c7c25bcfe2dad20c5d" +>>>>>>> martin-datacite-import }, "pipfile-spec": 6, "requires": { @@ -298,6 +302,8 @@ ], "version": "==2.5.0" }, +<<<<<<< HEAD +======= "langdetect": { "hashes": [ "sha256:91a170d5f0ade380db809b3ba67f08e95fe6c6c8641f96d67a51ff7e98a9bf30" @@ -305,6 +311,7 @@ "index": "pypi", "version": "==1.0.7" }, +>>>>>>> martin-datacite-import "loginpass": { "hashes": [ "sha256:717c87c1870a7e00547fd9d989aea9b22232b2f48826f552d79c34a47f9618c9", @@ -617,7 +624,12 @@ }, "wcwidth": { "hashes": [ +<<<<<<< HEAD + "sha256:8fd29383f539be45b20bd4df0dc29c20ba48654a41e661925e612311e9f3c603", + "sha256:f28b3e8a6483e5d49e7f8949ac1a78314e740333ae305b4ba5defd3e74fb37a8" +======= "sha256:8fd29383f539be45b20bd4df0dc29c20ba48654a41e661925e612311e9f3c603" +>>>>>>> martin-datacite-import ], "version": "==0.1.8" }, @@ -645,13 +657,6 @@ ], "version": "==2.3.3" }, - "atomicwrites": { - "hashes": [ - "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4", - "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6" - ], - "version": "==1.3.0" - }, "attrs": { "hashes": [ "sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c", @@ -805,7 +810,6 @@ "sha256:b84b238cce0d9adad5ed87e745778d20a3f8487d0f0cb8b8a586816c7496458d", "sha256:c833ef592a0324bcc6a60e48440da07645063c453880c9477ceb22490aec1564" ], - "markers": "python_version > '2.7'", "version": "==8.0.2" }, "packaging": { @@ -923,11 +927,19 @@ }, "pytest": { "hashes": [ +<<<<<<< HEAD + "sha256:6b571215b5a790f9b41f19f3531c53a45cf6bb8ef2988bc1ff9afb38270b25fa", + "sha256:e41d489ff43948babd0fad7ad5e49b8735d5d55e26628a58673c39ff61d95de4" + ], + "index": "pypi", + "version": "==5.3.2" +======= "sha256:6192875be8af57b694b7c4904e909680102befcb99e610ef3d9f786952f795aa", "sha256:f8447ebf8fd3d362868a5d3f43a9df786dfdfe9608843bd9002a2d47a104808f" ], "index": "pypi", "version": "==4.6.8" +>>>>>>> martin-datacite-import }, "pytest-cov": { "hashes": [ @@ -1032,7 +1044,12 @@ }, "wcwidth": { "hashes": [ +<<<<<<< HEAD + "sha256:8fd29383f539be45b20bd4df0dc29c20ba48654a41e661925e612311e9f3c603", + "sha256:f28b3e8a6483e5d49e7f8949ac1a78314e740333ae305b4ba5defd3e74fb37a8" +======= 
"sha256:8fd29383f539be45b20bd4df0dc29c20ba48654a41e661925e612311e9f3c603" +>>>>>>> martin-datacite-import ], "version": "==0.1.8" }, diff --git a/python/fatcat_import.py b/python/fatcat_import.py index ea7e12f2..fb8830ca 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -61,7 +61,8 @@ def run_journal_metadata(args): def run_chocula(args): fii = ChoculaImporter(args.api, - edit_batch_size=args.batch_size) + edit_batch_size=args.batch_size, + do_updates=args.do_updates) JsonLinePusher(fii, args.json_file).run() def run_matched(args): @@ -315,6 +316,9 @@ def main(): sub_chocula.add_argument('json_file', help="chocula JSON entities file (or stdin)", default=sys.stdin, type=argparse.FileType('r')) + sub_chocula.add_argument('--do-updates', + action='store_true', + help="update pre-existing container entities") sub_matched = subparsers.add_parser('matched', help="add file entities matched against existing releases; custom JSON format") diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py index acfc2b87..c71b33e9 100644 --- a/python/fatcat_tools/importers/arabesque.py +++ b/python/fatcat_tools/importers/arabesque.py @@ -47,6 +47,7 @@ class ArabesqueMatchImporter(EntityImporter): eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.ArabesqueMatchImporter') if kwargs.get('crawl_id'): eg_extra['crawl_id'] = kwargs.get('crawl_id') + kwargs['do_updates'] = kwargs.get("do_updates", False) super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, @@ -56,7 +57,6 @@ class ArabesqueMatchImporter(EntityImporter): self.default_link_rel = kwargs.get("default_link_rel", "web") assert self.default_link_rel self.default_mimetype = kwargs.get("default_mimetype", None) - self.do_updates = kwargs.get("do_updates", False) self.require_grobid = require_grobid if self.require_grobid: print("Requiring GROBID status == 200") diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py index eea50314..375b6051 100644 --- a/python/fatcat_tools/importers/chocula.py +++ b/python/fatcat_tools/importers/chocula.py @@ -109,6 +109,9 @@ class ChoculaImporter(EntityImporter): # decide whether to update do_update = False + if not self.do_updates: + self.counts['exists'] += 1 + return False if not existing.extra: existing.extra = dict() if set(ce.extra.get('urls', [])) != set(existing.extra.get('urls', [])): diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index be5db8d8..8d103372 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -287,6 +287,7 @@ class EntityImporter: eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.EntityImporter') self.api = api + self.do_updates = bool(kwargs.get('do_updates', True)) self.bezerk_mode = kwargs.get('bezerk_mode', False) self.submit_mode = kwargs.get('submit_mode', False) self.edit_batch_size = kwargs.get('edit_batch_size', 100) diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index 33c40eff..16643eb5 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -14,13 +14,13 @@ class IngestFileResultImporter(EntityImporter): eg_desc = kwargs.pop('editgroup_description', None) or "Files crawled from web using sandcrawler ingest tool" eg_extra = kwargs.pop('editgroup_extra', dict()) eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFileResultImporter') + kwargs['do_updates'] 
= kwargs.get("do_updates", False) super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.default_link_rel = kwargs.get("default_link_rel", "web") assert self.default_link_rel - self.do_updates = kwargs.get("do_updates", False) self.require_grobid = require_grobid if self.require_grobid: print("Requiring GROBID status == 200") diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 3611a299..c32ce34a 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -715,6 +715,10 @@ class PubmedImporter(EntityImporter): re.ext_ids.doi = None re.work_id = existing.work_id + if existing and not self.do_updates: + self.counts['exists'] += 1 + return False + if existing and existing.ext_ids.pmid and (existing.refs or not re.refs): # TODO: any other reasons to do an update? # don't update if it already has PMID |
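The importer changes above all follow one `do_updates` pattern: the shared `EntityImporter` base class now owns the flag (defaulting to allowing updates), importers that should not touch existing entities flip the default before calling `super().__init__()`, and update paths bail out early when the flag is off. A simplified sketch of that pattern (class and method shapes trimmed down from the real fatcat importers):

    from collections import Counter

    class EntityImporter:
        def __init__(self, api, **kwargs):
            self.api = api
            self.counts = Counter()
            # base class owns the flag; default is to allow updates
            self.do_updates = bool(kwargs.get('do_updates', True))

    class IngestFileResultImporter(EntityImporter):
        def __init__(self, api, **kwargs):
            # crawl-result importers default to *not* updating existing entities
            kwargs['do_updates'] = kwargs.get('do_updates', False)
            super().__init__(api, **kwargs)

        def try_update(self, re, existing):
            # skip early when updates are disabled and the entity already exists
            if existing and not self.do_updates:
                self.counts['exists'] += 1
                return False
            return True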