Diffstat (limited to 'python/fatcat_tools')
-rw-r--r--   python/fatcat_tools/importers/__init__.py          |  19
-rw-r--r--   python/fatcat_tools/importers/common.py             | 390
-rw-r--r--   python/fatcat_tools/importers/crossref.py           | 263
-rw-r--r--   python/fatcat_tools/importers/grobid_metadata.py    | 123
-rw-r--r--   python/fatcat_tools/importers/issn.py               |  89
-rw-r--r--   python/fatcat_tools/importers/journal_metadata.py   | 183
-rw-r--r--   python/fatcat_tools/importers/matched.py            | 150
-rw-r--r--   python/fatcat_tools/importers/orcid.py              |  51
-rw-r--r--   python/fatcat_tools/transforms.py                   | 130
-rw-r--r--   python/fatcat_tools/workers/changelog.py            |   2
10 files changed, 933 insertions, 467 deletions
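Note: the refactor below replaces the old FatcatImporter base class with a two-part design: EntityImporter subclasses parse raw records and decide whether to insert, update, or skip, while RecordPusher subclasses iterate a raw source (JSON lines, CSV, plain lines, Kafka) and push records into an importer. As a rough orientation before the diff, a minimal sketch of how the two halves combine is shown here; the choice of the ORCID importer, the stdin source, and the API-client construction are illustrative assumptions, not part of this commit.

# Minimal sketch (assumed wiring, not from this commit): a RecordPusher
# iterates raw records and pushes them into an EntityImporter, which parses
# them, batches edits into editgroups, and reports counts when finished.
import sys

import fatcat_client
from fatcat_tools.importers import JsonLinePusher, OrcidImporter

# assumed: default client pointing at a fatcat API instance
api = fatcat_client.DefaultApi(fatcat_client.ApiClient())

importer = OrcidImporter(api, edit_batch_size=50)
pusher = JsonLinePusher(importer, sys.stdin)  # one JSON record per line

counts = pusher.run()  # calls importer.push_record() per line, then finish()
print(counts)          # Counter with 'insert', 'update', 'exists', 'skip', 'total'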
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py index e6f081e5..70f38f5b 100644 --- a/python/fatcat_tools/importers/__init__.py +++ b/python/fatcat_tools/importers/__init__.py @@ -1,7 +1,22 @@ -from .common import FatcatImporter, make_kafka_consumer +""" +To run an import you combine two classes; one each of: + +- RecordSource: somehow iterates over a source of raw records (eg, from a + database, Kafka, files on disk, stdin) and pushes into an entity importer. +- EntityImporter: class that a record iterator pushes raw (unparsed) records + into. The entity importer parses and decides what to do (ignore, update, + insert, etc). There is usually a primary entity type, though related entities + can be created along the way. Maintains API connection and editgroup/batch + state. + +""" + +from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, KafkaJsonPusher, make_kafka_consumer, clean from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP from .grobid_metadata import GrobidMetadataImporter -from .issn import IssnImporter +from .journal_metadata import JournalMetadataImporter from .matched import MatchedImporter from .orcid import OrcidImporter +#from .kafka_source import KafkaSource +#from .file_source import FileSource diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 06897bee..89203a4f 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -3,6 +3,7 @@ import re import sys import csv import json +import ftfy import itertools import subprocess from collections import Counter @@ -12,30 +13,66 @@ import fatcat_client from fatcat_client.rest import ApiException -# from: https://docs.python.org/3/library/itertools.html -def grouper(iterable, n, fillvalue=None): - "Collect data into fixed-length chunks or blocks" - args = [iter(iterable)] * n - return itertools.zip_longest(*args, fillvalue=fillvalue) +def clean(thing, force_xml=False): + """ + This function is appropriate to be called on any random, non-markup string, + such as author names, titles, etc. -def make_kafka_consumer(hosts, env, topic_suffix, group): - topic_name = "fatcat-{}.{}".format(env, topic_suffix).encode('utf-8') - client = pykafka.KafkaClient(hosts=hosts, broker_version="1.0.0") - consume_topic = client.topics[topic_name] - print("Consuming from kafka topic {}, group {}".format(topic_name, group)) + It will try to clean up commong unicode mangles, HTML characters, etc. - consumer = consume_topic.get_balanced_consumer( - consumer_group=group.encode('utf-8'), - managed=True, - auto_commit_enable=True, - auto_commit_interval_ms=30000, # 30 seconds - compacted_topic=True, - ) - return consumer + This will detect XML/HTML and "do the right thing" (aka, not remove + entities like '&' if there are tags in the string), unless you pass the + 'force_xml' parameter, which might be appropriate for, eg, names and + titles, which generally should be projected down to plain text. + + Also strips extra whitespace. 
+ """ + if not thing: + return thing + fix_entities = 'auto' + if force_xml: + fix_entities = True + fixed = ftfy.fix_text(thing, fix_entities=fix_entities).strip() + if not fixed: + # wasn't zero-length before, but is now; return None + return None + return fixed + +def test_clean(): -class FatcatImporter: + assert clean(None) == None + assert clean('') == '' + assert clean('123') == '123' + assert clean('a&b') == 'a&b' + assert clean('<b>a&b</b>') == '<b>a&b</b>' + assert clean('<b>a&b</b>', force_xml=True) == '<b>a&b</b>' + +class EntityImporter: """ - Base class for fatcat importers + Base class for fatcat entity importers. + + The API exposed to record iterator is: + + push_record(raw_record) + finish() + + The API that implementations are expected to fill in are: + + want(raw_record) -> boolean + parse(raw_record) -> entity + try_update(entity) -> boolean + insert_batch([entity]) -> None + + This class exposes helpers for implementations: + + self.api + self.create_<entity>(entity) -> EntityEdit + for related entity types + self.push_entity(entity) + self.counts['exists'] += 1 + if didn't update or insert because of existing) + self.counts['update'] += 1 + if updated an entity """ def __init__(self, api, **kwargs): @@ -43,87 +80,135 @@ class FatcatImporter: eg_extra = kwargs.get('editgroup_extra', dict()) eg_extra['git_rev'] = eg_extra.get('git_rev', subprocess.check_output(["git", "describe", "--always"]).strip()).decode('utf-8') - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FatcatImporter') + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.EntityImporter') self.api = api - self._editgroup_description = kwargs.get('editgroup_description') - self._editgroup_extra = kwargs.get('editgroup_extra') - issn_map_file = kwargs.get('issn_map_file') + self.bezerk_mode = kwargs.get('bezerk_mode', False) + self.edit_batch_size = kwargs.get('edit_batch_size', 100) + self.editgroup_description = kwargs.get('editgroup_description') + self.editgroup_extra = kwargs.get('editgroup_extra') + self.reset() self._issnl_id_map = dict() self._orcid_id_map = dict() - self._doi_id_map = dict() - if issn_map_file: - self.read_issn_map_file(issn_map_file) self._orcid_regex = re.compile("^\\d{4}-\\d{4}-\\d{4}-\\d{3}[\\dX]$") - self.counts = Counter({'insert': 0, 'update': 0, 'processed_lines': 0}) + self._doi_id_map = dict() - def _editgroup(self): - eg = fatcat_client.Editgroup( - description=self._editgroup_description, - extra=self._editgroup_extra, - ) - return self.api.create_editgroup(eg) + def reset(self): + self.counts = Counter({'skip': 0, 'insert': 0, 'update': 0, 'exists': 0}) + self._edit_count = 0 + self._editgroup_id = None + self._entity_queue = [] - def describe_run(self): - print("Processed {} lines, inserted {}, updated {}.".format( - self.counts['processed_lines'], self.counts['insert'], self.counts['update'])) + def push_record(self, raw_record): + """ + Returns nothing. 
+ """ + if (not raw_record) or (not self.want(raw_record)): + self.counts['skip'] += 1 + return + entity = self.parse_record(raw_record) + if not entity: + self.counts['skip'] += 1 + return + if self.bezerk_mode: + self.push_entity(entity) + return + if self.try_update(entity): + self.push_entity(entity) + return - def create_row(self, row, editgroup_id=None): - # sub-classes expected to implement this - raise NotImplementedError + def finish(self): + if self._edit_count > 0: + self.api.accept_editgroup(self._editgroup_id) + self._editgroup_id = None + self._edit_count = 0 + + if self._entity_queue: + self.insert_batch(self._entity_queue) + self.counts['insert'] += len(self._entity_queue) + self._entity_queue = [] + + self.counts['total'] = 0 + for key in ('skip', 'insert', 'update', 'exists'): + self.counts['total'] += self.counts[key] + return self.counts + + def _get_editgroup(self, edits=1): + if self._edit_count >= self.edit_batch_size: + self.api.accept_editgroup(self._editgroup_id) + self._editgroup_id = None + self._edit_count = 0 - def create_batch(self, rows, editgroup_id=None): - # sub-classes expected to implement this + if not self._editgroup_id: + eg = self.api.create_editgroup( + fatcat_client.Editgroup( + description=self.editgroup_description, + extra=self.editgroup_extra)) + self._editgroup_id = eg.editgroup_id + + self._edit_count += edits + return self._editgroup_id + + def create_container(self, entity): + eg_id = self._get_editgroup() + self.counts['inserted.container'] += 1 + return self.api.create_container(entity, editgroup_id=eg_id) + + def create_release(self, entity): + eg_id = self._get_editgroup() + self.counts['inserted.release'] += 1 + return self.api.create_release(entity, editgroup_id=eg_id) + + def create_file(self, entity): + eg_id = self._get_editgroup() + self.counts['inserted.file'] += 1 + return self.api.create_file(entity, editgroup_id=eg_id) + + def updated(self): + """ + Implementations should call this from try_update() if the update was successful + """ + self.counts['update'] += 1 + + def push_entity(self, entity): + self._entity_queue.append(entity) + if len(self._entity_queue) >= self.edit_batch_size: + self.insert_batch(self._entity_queue) + self.counts['insert'] += len(_entity_queue) + self._entity_queue = 0 + + def want(self, raw_record): + """ + Implementations can override for optional fast-path to drop a record. + Must have no side-effects; returns bool. + """ + return True + + def parse(self, raw_record): + """ + Returns an entity class type, or None if we should skip this one. + + May have side-effects (eg, create related entities), but shouldn't + update/mutate the actual entity. 
+ """ raise NotImplementedError - def process_source(self, source, group_size=100): - """Creates and auto-accepts editgroup every group_size rows""" - eg = self._editgroup() - i = 0 - for i, row in enumerate(source): - self.create_row(row, editgroup_id=eg.editgroup_id) - if i > 0 and (i % group_size) == 0: - self.api.accept_editgroup(eg.editgroup_id) - eg = self._editgroup() - self.counts['processed_lines'] += 1 - if i == 0 or (i % group_size) != 0: - self.api.accept_editgroup(eg.editgroup_id) - - def process_batch(self, source, size=50, decode_kafka=False): - """Reads and processes in batches (not API-call-per-)""" - for rows in grouper(source, size): - if decode_kafka: - rows = [msg.value.decode('utf-8') for msg in rows] - self.counts['processed_lines'] += len(rows) - #eg = self._editgroup() - #self.create_batch(rows, editgroup_id=eg.editgroup_id) - self.create_batch(rows) - - def process_csv_source(self, source, group_size=100, delimiter=','): - reader = csv.DictReader(source, delimiter=delimiter) - self.process_source(reader, group_size) - - def process_csv_batch(self, source, size=50, delimiter=','): - reader = csv.DictReader(source, delimiter=delimiter) - self.process_batch(reader, size) + def try_update(self, raw_record): + """ + Passed the output of parse(). Should try to find an existing entity and + update it (PUT), decide we should do nothing (based on the existing + record), or create a new one. - def is_issnl(self, issnl): - return len(issnl) == 9 and issnl[4] == '-' + Implementations must update the exists/updated/skip counts + appropriately in this method. - def lookup_issnl(self, issnl): - """Caches calls to the ISSN-L lookup API endpoint in a local dict""" - if issnl in self._issnl_id_map: - return self._issnl_id_map[issnl] - container_id = None - try: - rv = self.api.lookup_container(issnl=issnl) - container_id = rv.ident - except ApiException as ae: - # If anything other than a 404 (not found), something is wrong - assert ae.status == 404 - self._issnl_id_map[issnl] = container_id # might be None - return container_id + Returns boolean: True if the entity should still be inserted, False otherwise + """ + raise NotImplementedError + + def insert_batch(self, raw_record): + raise NotImplementedError def is_orcid(self, orcid): return self._orcid_regex.match(orcid) is not None @@ -163,6 +248,23 @@ class FatcatImporter: self._doi_id_map[doi] = release_id # might be None return release_id + def is_issnl(self, issnl): + return len(issnl) == 9 and issnl[4] == '-' + + def lookup_issnl(self, issnl): + """Caches calls to the ISSN-L lookup API endpoint in a local dict""" + if issnl in self._issnl_id_map: + return self._issnl_id_map[issnl] + container_id = None + try: + rv = self.api.lookup_container(issnl=issnl) + container_id = rv.ident + except ApiException as ae: + # If anything other than a 404 (not found), something is wrong + assert ae.status == 404 + self._issnl_id_map[issnl] = container_id # might be None + return container_id + def read_issn_map_file(self, issn_map_file): print("Loading ISSN map file...") self._issn_issnl_map = dict() @@ -179,3 +281,117 @@ class FatcatImporter: if issn is None: return None return self._issn_issnl_map.get(issn) + + +class RecordPusher: + """ + Base class for different importer sources. Pretty trivial interface, just + wraps an importer and pushes records in to it. 
+ """ + + def __init__(self, importer, **kwargs): + self.importer = importer + + def run(self): + """ + This will look something like: + + for line in sys.stdin: + record = json.loads(line) + self.importer.push_record(record) + print(self.importer.finish()) + """ + raise NotImplementedError + + +class JsonLinePusher(RecordPusher): + + def __init__(self, importer, json_file, **kwargs): + self.importer = importer + self.json_file = json_file + + def run(self): + for line in self.json_file: + if not line: + continue + record = json.loads(line) + self.importer.push_record(record) + counts = self.importer.finish() + print(counts) + return counts + + +class CsvPusher(RecordPusher): + + def __init__(self, importer, csv_file, **kwargs): + self.importer = importer + self.reader = csv.DictReader(csv_file, delimiter=kwargs.get('delimiter', ',')) + + def run(self): + for line in self.reader: + if not line: + continue + self.importer.push_record(line) + counts = self.importer.finish() + print(counts) + return counts + + +class LinePusher(RecordPusher): + + def __init__(self, importer, text_file, **kwargs): + self.importer = importer + self.text_file = text_file + + def run(self): + for line in self.text_file: + if not line: + continue + self.importer.push_record(line) + counts = self.importer.finish() + print(counts) + return counts + + +class KafkaJsonPusher(RecordPusher): + + def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs): + self.importer = importer + self.consumer = make_kafka_consumer( + kafka_hosts, + kafka_env, + topic_suffix, + group, + ) + + def run(self): + count = 0 + for msg in self.consumer: + if not msg: + continue + record = json.loads(msg.value.decode('utf-8')) + self.importer.push_record(record) + count += 1 + if count % 500 == 0: + print("Import counts: {}".format(self.importer.counts)) + # TODO: should catch UNIX signals (HUP?) 
to shutdown cleanly, and/or + # commit the current batch if it has been lingering + counts = self.importer.finish() + print(counts) + return counts + + +def make_kafka_consumer(hosts, env, topic_suffix, group): + topic_name = "fatcat-{}.{}".format(env, topic_suffix).encode('utf-8') + client = pykafka.KafkaClient(hosts=hosts, broker_version="1.0.0") + consume_topic = client.topics[topic_name] + print("Consuming from kafka topic {}, group {}".format(topic_name, group)) + + consumer = consume_topic.get_balanced_consumer( + consumer_group=group.encode('utf-8'), + managed=True, + auto_commit_enable=True, + auto_commit_interval_ms=30000, # 30 seconds + compacted_topic=True, + ) + return consumer diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index 6365e491..00c719f1 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -6,7 +6,7 @@ import datetime import itertools import subprocess import fatcat_client -from .common import FatcatImporter +from .common import EntityImporter, clean # The docs/guide should be the cannonical home for these mappings; update there @@ -32,7 +32,32 @@ CROSSREF_TYPE_MAP = { 'standard': 'standard', } -class CrossrefImporter(FatcatImporter): +CONTAINER_TYPE_MAP = { + 'article-journal': 'journal', + 'paper-conference': 'conference', + 'book': 'book-series', +} + +# TODO: +LICENSE_SLUG_MAP = { + "http://creativecommons.org/licenses/by/3.0/": "CC-BY", + "http://creativecommons.org/licenses/by/4.0/": "CC-BY", + "http://creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA", + "http://creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA", + "http://creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND", + "http://creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND", + "http://creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC", + "http://creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC", + "http://creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA", + "http://creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA", + "http://creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND", + "http://creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND", + "http://www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0", + # http://onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license + # http://www.springer.com/tdm doesn't seem like a license +} + +class CrossrefImporter(EntityImporter): """ Importer for Crossref metadata. 
@@ -51,9 +76,9 @@ class CrossrefImporter(FatcatImporter): issn_map_file=issn_map_file, editgroup_description=eg_desc, editgroup_extra=eg_extra) + + self.create_containers = kwargs.get('create_containers') extid_map_file = kwargs.get('extid_map_file') - create_containers = kwargs.get('create_containers') - check_existing = kwargs.get('check_existing') self.extid_map_db = None if extid_map_file: db_uri = "file:{}?mode=ro".format(extid_map_file) @@ -61,36 +86,46 @@ class CrossrefImporter(FatcatImporter): self.extid_map_db = sqlite3.connect(db_uri, uri=True) else: print("Not using external ID map") - self.create_containers = create_containers - self.check_existing = check_existing + + self.read_issn_map_file(issn_map_file) def lookup_ext_ids(self, doi): if self.extid_map_db is None: - return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None) + return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]).fetchone() if row is None: - return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None) + return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) row = [str(cell or '') or None for cell in row] return dict( core_id=row[0], pmid=row[1], pmcid=row[2], - wikidata_qid=row[3]) + wikidata_qid=row[3], + # TODO: + arxiv_id=None, + jstor_id=None, + ) def map_release_type(self, crossref_type): return CROSSREF_TYPE_MAP.get(crossref_type) - def parse_crossref_dict(self, obj): + def map_container_type(self, crossref_type): + return CONTAINER_TYPE_MAP.get(crossref_type) + + def want(self, obj): + if not obj.get('title'): + return False + + # do most of these checks in-line below + return True + + def parse_record(self, obj): """ obj is a python dict (parsed from json). 
returns a ReleaseEntity """ - # Do require the 'title' keys to exsit, as release entities do - if (not 'title' in obj) or (not obj['title']): - return None - # Ways to be out of scope (provisionally) # journal-issue and journal-volume map to None, but allowed for now if obj.get('type') in (None, 'journal', 'proceedings', @@ -98,20 +133,12 @@ class CrossrefImporter(FatcatImporter): 'book-track', 'proceedings-series'): return None - # lookup existing DOI - existing_release = None - if self.check_existing: - try: - existing_release = self.api.lookup_release(doi=obj['DOI'].lower()) - except fatcat_client.rest.ApiException as err: - if err.status != 404: - raise err - - # eventually we'll want to support "updates", but for now just skip if - # entity already exists - if existing_release: + # Do require the 'title' keys to exsit, as release entities do + if (not 'title' in obj) or (not obj['title']): return None + release_type = self.map_release_type(obj['type']) + # contribs def do_contribs(obj_list, ctype): contribs = [] @@ -132,18 +159,23 @@ class CrossrefImporter(FatcatImporter): index = i else: index = None + raw_affiliation = None if am.get('affiliation'): - # note: affiliation => affiliations - extra['affiliations'] = am.get('affiliation') + if len(am.get('affiliation')) > 0: + raw_affiliation = am.get('affiliation')[0]['name'] + if len(am.get('affiliation')) > 1: + # note: affiliation => more_affiliations + extra['more_affiliations'] = [clean(a['name']) for a in am.get('affiliation')[1:]] if am.get('sequence') and am.get('sequence') != "additional": - extra['sequence'] = am.get('sequence') + extra['seq'] = clean(am.get('sequence')) if not extra: extra = None assert ctype in ("author", "editor", "translator") contribs.append(fatcat_client.ReleaseContrib( creator_id=creator_id, index=index, - raw_name=raw_name, + raw_name=clean(raw_name), + raw_affiliation=clean(raw_affiliation), role=ctype, extra=extra)) return contribs @@ -159,28 +191,40 @@ class CrossrefImporter(FatcatImporter): container_id = self.lookup_issnl(issnl) publisher = obj.get('publisher') - ce = None if (container_id is None and self.create_containers and (issnl is not None) and obj.get('container-title') and len(obj['container-title']) > 0): ce = fatcat_client.ContainerEntity( issnl=issnl, - publisher=publisher, - name=obj['container-title'][0]) + publisher=clean(publisher), + container_type=self.map_container_type(release_type), + name=clean(obj['container-title'][0], force_xml=True)) + ce_edit = self.create_container(ce) + container_id = ce_edit.ident + + # license slug + license_slug = None + license_extra = [] + for l in obj.get('license', []): + if l['content-version'] not in ('vor', 'unspecified'): + continue + slug = LICENSE_SLUG_MAP.get(l['URL']) + if slug: + license_slug = slug + if 'start' in l: + l['start'] = l['start']['date-time'] + license_extra.append(l) # references refs = [] for i, rm in enumerate(obj.get('reference', [])): try: year = int(rm.get('year')) - # NOTE: will need to update/config in the future! + # TODO: will need to update/config in the future! # NOTE: are there crossref works with year < 100? 
if year > 2025 or year < 100: year = None except: year = None - extra = rm.copy() - if rm.get('DOI'): - extra['doi'] = rm.get('DOI').lower() key = rm.get('key') if key and key.startswith(obj['DOI'].upper()): key = key.replace(obj['DOI'].upper() + "-", '') @@ -188,14 +232,18 @@ class CrossrefImporter(FatcatImporter): container_name = rm.get('volume-title') if not container_name: container_name = rm.get('journal-title') - extra.pop('DOI', None) - extra.pop('key', None) - extra.pop('year', None) - extra.pop('volume-name', None) - extra.pop('journal-title', None) - extra.pop('title', None) - extra.pop('first-page', None) - extra.pop('doi-asserted-by', None) + elif rm.get('journal-title'): + extra['journal-title'] = rm['journal-title'] + extra = dict() + if rm.get('DOI'): + extra['doi'] = rm.get('DOI').lower() + # TODO: what fields here? CSL citation stuff + for k in ('author', 'editor', 'edition', 'authority', 'version', + 'genre', 'url', 'event', 'issue', 'volume', 'date', + 'accessed_date', 'issued', 'page', 'medium', + 'collection_title', 'chapter_number'): + if clean(rm.get(k)): + extra[k] = clean(rm[k]) if extra: extra = dict(crossref=extra) else: @@ -206,9 +254,9 @@ class CrossrefImporter(FatcatImporter): target_release_id=None, key=key, year=year, - container_name=container_name, - title=rm.get('title'), - locator=rm.get('first-page'), + container_name=clean(container_name), + title=clean(rm.get('title')), + locator=clean(rm.get('first-page')), # TODO: just dump JSON somewhere here? extra=extra)) @@ -217,25 +265,24 @@ class CrossrefImporter(FatcatImporter): if obj.get('abstract') != None: abstracts.append(fatcat_client.ReleaseEntityAbstracts( mimetype="application/xml+jats", - content=obj.get('abstract'))) + content=clean(obj.get('abstract')))) # extra fields extra = dict() - for key in ('subject', 'type', 'license', 'alternative-id', - 'container-title', 'original-title', 'subtitle', 'archive', - 'funder', 'group-title'): - # TODO: unpack "container-title" array + for key in ('subject', 'type', 'alternative-id', 'container-title', + 'subtitle', 'archive', 'funder', 'group-title'): + # TODO: unpack "container-title" array? 
val = obj.get(key) if val: - extra[key] = val - if 'license' in extra and extra['license']: - for i in range(len(extra['license'])): - if 'start' in extra['license'][i]: - extra['license'][i]['start'] = extra['license'][i]['start']['date-time'] + if type(val) == str: + extra[key] = clean(val) + else: + extra[key] = val + if license_extra: + extra['license'] = license_extra + if len(obj['title']) > 1: - extra['other-titles'] = obj['title'][1:] - # TODO: this should be top-level - extra['is_kept'] = len(obj.get('archive', [])) > 0 + extra['other-titles'] = [clean(t) for t in obj['title'][1:]] # ISBN isbn13 = None @@ -277,59 +324,57 @@ class CrossrefImporter(FatcatImporter): re = fatcat_client.ReleaseEntity( work_id=None, - title=obj.get('title', [None])[0], - contribs=contribs, - refs=refs, container_id=container_id, - publisher=publisher, - release_type=self.map_release_type(obj['type']), + title=clean(obj.get('title', [None])[0], force_xml=True), + original_title=clean(obj.get('original-title', [None])[0]), + release_type=release_type, release_status=release_status, + release_date=release_date, + release_year=release_year, + publisher=clean(publisher), doi=obj['DOI'].lower(), - isbn13=isbn13, - core_id=extids['core_id'], pmid=extids['pmid'], pmcid=extids['pmcid'], wikidata_qid=extids['wikidata_qid'], - release_date=release_date, - release_year=release_year, - issue=obj.get('issue'), - volume=obj.get('volume'), - pages=obj.get('page'), + isbn13=isbn13, + core_id=extids['core_id'], + arxiv_id=extids['arxiv_id'], + jstor_id=extids['jstor_id'], + volume=clean(obj.get('volume')), + issue=clean(obj.get('issue')), + pages=clean(obj.get('page')), + language=None, # crossref doesn't supply language info + license_slug=license_slug, + extra=dict(crossref=extra), abstracts=abstracts, - extra=dict(crossref=extra)) - return (re, ce) + contribs=contribs, + refs=refs, + ) + return re + + def try_update(self, re): + + # lookup existing DOI (don't need to try other ext idents for crossref) + existing = None + try: + existing = self.api.lookup_release(doi=re.doi) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + # doesn't exist, need to update + return True + + # eventually we'll want to support "updates", but for now just skip if + # entity already exists + if existing: + self.counts['exists'] += 1 + return False + + return True + + def insert_batch(self, batch): + self.api.create_release_batch(batch, + autoaccept=True, + description=self.editgroup_description, + extra=json.dumps(self.editgroup_extra)) - def create_row(self, row, editgroup_id=None): - if row is None: - return - obj = json.loads(row) - entities = self.parse_crossref_dict(obj) - if entities is not None: - (re, ce) = entities - if ce is not None: - container = self.api.create_container(ce, editgroup_id=editgroup_id) - re.container_id = container.ident - self._issnl_id_map[ce.issnl] = container.ident - self.api.create_release(re, editgroup_id=editgroup_id) - self.counts['insert'] += 1 - - def create_batch(self, batch): - """Current work/release pairing disallows batch creation of releases. 
- Could do batch work creation and then match against releases, but meh.""" - release_batch = [] - for row in batch: - if row is None: - continue - obj = json.loads(row) - entities = self.parse_crossref_dict(obj) - if entities is not None: - (re, ce) = entities - if ce is not None: - ce_eg = self.api.create_editgroup(fatcat_client.Editgroup()) - container = self.api.create_container(ce, editgroup_id=ce_eg.editgroup_id) - self.api.accept_editgroup(ce_eg.editgroup_id) - re.container_id = container.ident - self._issnl_id_map[ce.issnl] = container.ident - release_batch.append(re) - self.api.create_release_batch(release_batch, autoaccept="true") - self.counts['insert'] += len(release_batch) diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index 5e61a154..9d95fe0b 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -5,12 +5,22 @@ import json import base64 import datetime import fatcat_client -from .common import FatcatImporter +from .common import EntityImporter, clean MAX_ABSTRACT_BYTES=4096 -class GrobidMetadataImporter(FatcatImporter): +class GrobidMetadataImporter(EntityImporter): + """ + This is a complex case: we need to parse and create both file and release entities. + + The "primary" entity here is really File, not Release. If a matching File + exists, we bail in want(); if not we insert the Release during parsing, and + insert both. + + TODO: should instead check if the File has any releases; if not, insert and update. + TODO: relaxing 'None' constraint on parse_record() might make this refactor-able. + """ def __init__(self, api, **kwargs): @@ -22,6 +32,45 @@ class GrobidMetadataImporter(FatcatImporter): editgroup_description=eg_desc, editgroup_extra=eg_extra) self.default_link_rel = kwargs.get("default_link_rel", "web") + self.longtail_oa = kwargs.get("longtail_oa", False) + + def want(self, raw_record): + return True + + def parse_record(self, row): + + fields = row.split('\t') + sha1_key = fields[0] + cdx = json.loads(fields[1]) + mimetype = fields[2] + file_size = int(fields[3]) + grobid_meta = json.loads(fields[4]) + fe = self.parse_file_metadata(sha1_key, cdx, mimetype, file_size) + re = self.parse_grobid_json(grobid_meta) + + if not (fe and re): + return None + + # lookup existing file SHA1 + existing = None + try: + existing = self.api.lookup_file(sha1=fe.sha1) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + + # if file is already in here, presumably not actually long-tail + # HACK: this is doing an exists check in parse_record(), which is weird + # TODO: this is where we should check if the file actually has + # release_ids and/or URLs associated with it + if existing and not self.bezerk_mode: + self.counts['exists'] += 1 + self.counts['skip'] -= 1 + return None + + release_edit = self.create_release(re) + fe.release_ids.append(release_edit.ident) + return fe def parse_grobid_json(self, obj): @@ -34,7 +83,7 @@ class GrobidMetadataImporter(FatcatImporter): abobj = dict( mimetype="text/plain", language=None, - content=obj.get('abstract').strip()) + content=clean(obj.get('abstract'))) abstracts = [abobj] else: abstracts = None @@ -43,17 +92,18 @@ class GrobidMetadataImporter(FatcatImporter): for i, a in enumerate(obj.get('authors', [])): contribs.append(fatcat_client.ReleaseContrib( index=i, - raw_name=a['name'], + raw_name=clean(a['name']), role="author", extra=None)) + # XXX: why is this a dict()? 
not covered by tests? refs = [] for raw in obj.get('citations', []): cite_extra = dict() ref = dict() - ref['key'] = raw.get('id') + ref['key'] = clean(raw.get('id')) if raw.get('title'): - ref['title'] = raw['title'].strip() + ref['title'] = clean(raw['title']) if raw.get('date'): try: year = int(raw['date'].strip()[:4]) @@ -62,9 +112,9 @@ class GrobidMetadataImporter(FatcatImporter): pass for key in ('volume', 'url', 'issue', 'publisher'): if raw.get(key): - cite_extra[key] = raw[key].strip() + cite_extra[key] = clean(raw[key]) if raw.get('authors'): - cite_extra['authors'] = [a['name'] for a in raw['authors']] + cite_extra['authors'] = [clean(a['name']) for a in raw['authors']] if cite_extra: cite_extra = dict(grobid=cite_extra) else: @@ -81,27 +131,28 @@ class GrobidMetadataImporter(FatcatImporter): if obj.get('doi'): extra['doi'] = obj['doi'] if obj['journal'] and obj['journal'].get('name'): - extra['container_name'] = obj['journal']['name'] - - extra['is_longtail_oa'] = True + extra['container_name'] = clean(obj['journal']['name']) # TODO: ISSN/eISSN handling? or just journal name lookup? + if self.longtail_oa: + extra['longtail_oa'] = True + if extra: extra = dict(grobid=extra) else: extra = None re = fatcat_client.ReleaseEntity( - title=obj['title'].strip(), + title=clean(obj['title'], force_xml=True), release_type="article-journal", release_date=release_date, release_year=release_year, contribs=contribs, refs=refs, - publisher=obj['journal'].get('publisher'), - volume=obj['journal'].get('volume'), - issue=obj['journal'].get('issue'), + publisher=clean(obj['journal'].get('publisher')), + volume=clean(obj['journal'].get('volume')), + issue=clean(obj['journal'].get('issue')), abstracts=abstracts, extra=extra) return re @@ -122,17 +173,6 @@ class GrobidMetadataImporter(FatcatImporter): sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower() - # lookup existing SHA1, or create new entity - try: - existing_file = self.api.lookup_file(sha1=sha1) - except fatcat_client.rest.ApiException as err: - if err.status != 404: - raise err - existing_file = None - - if existing_file: - # if file is already in here, presumably not actually long-tail - return None fe = fatcat_client.FileEntity( sha1=sha1, size=int(file_size), @@ -143,6 +183,7 @@ class GrobidMetadataImporter(FatcatImporter): # parse URLs and CDX original = cdx['url'] + assert len(cdx['dt']) >= 8 wayback = "https://web.archive.org/web/{}/{}".format( cdx['dt'], original) @@ -154,23 +195,13 @@ class GrobidMetadataImporter(FatcatImporter): return fe - def create_row(self, row, editgroup_id=None): - if not row: - return - fields = row.split('\t') - sha1_key = fields[0] - cdx = json.loads(fields[1]) - mimetype = fields[2] - file_size = int(fields[3]) - grobid_meta = json.loads(fields[4]) - fe = self.parse_file_metadata(sha1_key, cdx, mimetype, file_size) - re = self.parse_grobid_json(grobid_meta) - if fe and re: - release_entity = self.api.create_release(re, editgroup_id=editgroup_id) - # release ident can't already be in release list because we just - # created it - fe.release_ids.append(release_entity.ident) - file_entity = self.api.create_file(fe, editgroup_id=editgroup_id) - self.counts['insert'] += 1 - - # NB: batch mode not implemented + def try_update(self, entity): + # did the exists check in 'parse_record()', because we needed to create a release + return True + + def insert_batch(self, batch): + self.api.create_file_batch(batch, + autoaccept=True, + description=self.editgroup_description, + 
extra=json.dumps(self.editgroup_extra)) + diff --git a/python/fatcat_tools/importers/issn.py b/python/fatcat_tools/importers/issn.py deleted file mode 100644 index f4d525a4..00000000 --- a/python/fatcat_tools/importers/issn.py +++ /dev/null @@ -1,89 +0,0 @@ - -import sys -import json -import itertools -import fatcat_client -from .common import FatcatImporter - - -def or_none(s): - if s is None: - return None - if len(s) == 0: - return None - return s - -def truthy(s): - if s is None: - return None - s = s.lower() - - if s in ('true', 't', 'yes', 'y', '1'): - return True - elif s in ('false', 'f', 'no', 'n', '0'): - return False - else: - return None - -class IssnImporter(FatcatImporter): - """ - Imports journal metadata ("containers") by ISSN, currently from a custom - (data munged) .csv file format - - CSV format (generated from git.archive.org/webgroup/oa-journal-analysis): - - ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count - """ - - def __init__(self, api, **kwargs): - - eg_desc = kwargs.get('editgroup_description', - "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.") - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IssnImporter') - super().__init__(api, - editgroup_description=eg_desc, - editgroup_extra=eg_extra) - - def parse_issn_row(self, row): - """ - row is a python dict (parsed from CSV). - returns a ContainerEntity (or None if invalid or couldn't parse) - """ - title = or_none(row['title']) - issnl = or_none(row['ISSN-L']) - if title is None or issnl is None: - return None - extra = dict( - in_doaj=truthy(row['in_doaj']), - in_road=truthy(row['in_road']), - in_norwegian=truthy(row['in_norwegian']), - language=or_none(row['lang']), - url=or_none(row['url']), - ISSNp=or_none(row['ISSN-print']), - ISSNe=or_none(row['ISSN-electronic']), - is_oa=truthy(row['is_oa']), - is_kept=truthy(row['is_kept']), - ) - ce = fatcat_client.ContainerEntity( - issnl=issnl, - name=title, - publisher=or_none(row['publisher']), - abbrev=None, - coden=None, - extra=extra) - return ce - - def create_row(self, row, editgroup_id=None): - ce = self.parse_issn_row(row) - if ce is not None: - self.api.create_container(ce, editgroup_id=editgroup_id) - self.counts['insert'] += 1 - - def create_batch(self, batch): - """Reads and processes in batches (not API-call-per-line)""" - objects = [self.parse_issn_row(l) - for l in batch if (l is not None)] - objects = [o for o in objects if (o is not None)] - self.api.create_container_batch(objects, autoaccept="true") - self.counts['insert'] += len(objects) diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py new file mode 100644 index 00000000..cf3971b5 --- /dev/null +++ b/python/fatcat_tools/importers/journal_metadata.py @@ -0,0 +1,183 @@ + +import sys +import json +import itertools +import fatcat_client +from .common import EntityImporter, clean + + +def or_none(s): + if s is None: + return None + if len(s) == 0: + return None + return s + +def truthy(s): + if s is None: + return None + s = s.lower() + + if s in ('true', 't', 'yes', 'y', '1'): + return True + elif s in ('false', 'f', 'no', 'n', '0'): + return False + else: + return None + +class 
JournalMetadataImporter(EntityImporter): + """ + Imports journal metadata ("containers") by ISSN, currently from a custom + (data munged) .csv file format + + CSV format (generated from git.archive.org/webgroup/oa-journal-analysis): + + ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count + + + 'extra' fields: + + doaj + as_of: datetime of most recent check; if not set, not actually in DOAJ + seal: bool + work_level: bool (are work-level publications deposited with DOAJ?) + archiving: array, can include 'library' or 'other' + road + as_of: datetime of most recent check; if not set, not actually in ROAD + pubmed (TODO: delete?) + as_of: datetime of most recent check; if not set, not actually indexed in pubmed + norwegian (TODO: drop this?) + as_of: datetime of most recent check; if not set, not actually indexed in pubmed + id (integer) + level (integer; 0-2) + kbart + lockss + year_rle + volume_rle + portico + ... + clockss + ... + sherpa_romeo + color + jstor + year_rle + volume_rle + scopus + id + TODO: print/electronic distinction? + wos + id + doi + crossref_doi: DOI of the title in crossref (if exists) + prefixes: array of strings (DOI prefixes, up to the '/'; any registrar, not just Crossref) + ia + sim + nap_id + year_rle + volume_rle + longtail: boolean + homepage + as_of: datetime of last attempt + url + status: HTTP/heritrix status of homepage crawl + + issnp: string + issne: string + coden: string + abbrev: string + oclc_id: string (TODO: lookup?) + lccn_id: string (TODO: lookup?) + dblb_id: string + default_license: slug + original_name: native name (if name is translated) + platform: hosting platform: OJS, wordpress, scielo, etc + mimetypes: array of strings (eg, 'application/pdf', 'text/html') + first_year: year (integer) + last_year: if publishing has stopped + primary_language: single ISO code, or 'mixed' + languages: array of ISO codes + region: TODO: continent/world-region + nation: shortcode of nation + discipline: TODO: highest-level subject; "life science", "humanities", etc + field: TODO: narrower description of field + subjects: TODO? + url: homepage + is_oa: boolean. If true, can assume all releases under this container are "Open Access" + TODO: domains, if exclusive? + TODO: fulltext_regex, if a known pattern? + + For KBART, etc: + We "over-count" on the assumption that "in-progress" status works will soon actually be preserved. + year and volume spans are run-length-encoded arrays, using integers: + - if an integer, means that year is preserved + - if an array of length 2, means everything between the two numbers (inclusive) is preserved + """ + + def __init__(self, api, **kwargs): + + eg_desc = kwargs.get('editgroup_description', + "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.") + eg_extra = kwargs.get('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.JournalMetadataImporter') + super().__init__(api, + editgroup_description=eg_desc, + editgroup_extra=eg_extra) + + def want(self, raw_record): + if raw_record.get('ISSN-L'): + return True + return False + + def parse_record(self, row): + """ + row is a python dict (parsed from CSV). 
+ returns a ContainerEntity (or None if invalid or couldn't parse) + """ + title = or_none(row['title']) + issnl = or_none(row['ISSN-L']) + if title is None or issnl is None: + return None + extra = dict( + in_doaj=truthy(row['in_doaj']), + in_road=truthy(row['in_road']), + in_norwegian=truthy(row['in_norwegian']), + language=or_none(row['lang']), + url=or_none(row['url']), + ISSNp=or_none(row['ISSN-print']), + ISSNe=or_none(row['ISSN-electronic']), + is_oa=truthy(row['is_oa']), + is_kept=truthy(row['is_kept']), + ) + ce = fatcat_client.ContainerEntity( + issnl=issnl, + name=clean(title), + publisher=or_none(clean(row['publisher'])), + extra=extra) + return ce + + def try_update(self, ce): + + existing = None + try: + existing = self.api.lookup_container(issnl=ce.issnl) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + # doesn't exist, need to update + return True + + # eventually we'll want to support "updates", but for now just skip if + # entity already exists + if existing: + self.counts['exists'] += 1 + return False + + return True + + def insert_batch(self, batch): + self.api.create_container_batch(batch, + autoaccept=True, + description=self.editgroup_description, + extra=json.dumps(self.editgroup_extra)) + diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py index 1e5c22f7..2ec6c95d 100644 --- a/python/fatcat_tools/importers/matched.py +++ b/python/fatcat_tools/importers/matched.py @@ -4,16 +4,10 @@ import json import sqlite3 import itertools import fatcat_client -from .common import FatcatImporter +from .common import EntityImporter, clean -#row = row.split('\t') -#assert len(row) == 2 -#sha1 = row[0].replace('sha1:') -#sha1 = base64.b16encode(base64.b32decode(sha1)).lower() -#print(sha1) -#dois = [d.lower() for d in json.loads(row[1])] -class MatchedImporter(FatcatImporter): +class MatchedImporter(EntityImporter): """ Importer for "file to crossref DOI" matches. 
@@ -48,7 +42,6 @@ class MatchedImporter(FatcatImporter): editgroup_extra=eg_extra) self.default_link_rel = kwargs.get("default_link_rel", "web") self.default_mime = kwargs.get("default_mime", None) - self.skip_file_updates = kwargs.get("skip_file_updates", False) def make_url(self, raw): rel = self.default_link_rel @@ -59,26 +52,13 @@ class MatchedImporter(FatcatImporter): rel = "repository" elif "//web.archive.org/" in raw or "//archive.is/" in raw: rel = "webarchive" - return fatcat_client.FileEntityUrls(url=raw, rel=rel) + return (rel, raw) - def parse_matched_dict(self, obj): - sha1 = obj['sha1'] - dois = [d.lower() for d in obj.get('dois', [])] + def want(self, raw_record): + return True - # lookup sha1, or create new entity - fe = None - if not self.skip_file_updates: - try: - fe = self.api.lookup_file(sha1=sha1) - except fatcat_client.rest.ApiException as err: - if err.status != 404: - raise err - if fe is None: - fe = fatcat_client.FileEntity( - sha1=sha1, - release_ids=[], - urls=[], - ) + def parse_record(self, obj): + dois = [d.lower() for d in obj.get('dois', [])] # lookup dois re_list = set() @@ -93,67 +73,77 @@ class MatchedImporter(FatcatImporter): print("DOI not found: {}".format(doi)) else: re_list.add(re.ident) - if len(re_list) == 0: + release_ids = list(re_list) + if len(release_ids) == 0: return None - if fe.release_ids == set(re_list): - return None - re_list.update(fe.release_ids) - fe.release_ids = list(re_list) # parse URLs and CDX - existing_urls = [feu.url for feu in fe.urls] + urls = set() for url in obj.get('url', []): - if url not in existing_urls: - url = self.make_url(url) - if url != None: - fe.urls.append(url) + url = self.make_url(url) + if url != None: + urls.add(url) for cdx in obj.get('cdx', []): original = cdx['url'] wayback = "https://web.archive.org/web/{}/{}".format( cdx['dt'], original) - if wayback not in existing_urls: - fe.urls.append( - fatcat_client.FileEntityUrls(url=wayback, rel="webarchive")) - if original not in existing_urls: - url = self.make_url(original) - if url != None: - fe.urls.append(url) - - if obj.get('size') != None: - fe.size = int(obj['size']) - fe.sha256 = obj.get('sha256', fe.sha256) - fe.md5 = obj.get('md5', fe.sha256) - if obj.get('mimetype') is None: - if fe.mimetype is None: - fe.mimetype = self.default_mime - else: - fe.mimetype = obj.get('mimetype') + urls.add(("webarchive", wayback)) + url = self.make_url(original) + if url != None: + urls.add(url) + urls = [fatcat_client.FileEntityUrls(rel, url) for (rel, url) in urls] + if len(urls) == 0: + return None + + size = obj.get('size') + if size: + size = int(size) + + fe = fatcat_client.FileEntity( + md5=obj.get('md5'), + sha1=obj['sha1'], + sha256=obj.get('sha256'), + size=size, + mimetype=obj.get('mimetype'), + release_ids=release_ids, + urls=urls, + ) return fe - def create_row(self, row, editgroup_id=None): - obj = json.loads(row) - fe = self.parse_matched_dict(obj) - if fe is not None: - if fe.ident is None: - self.api.create_file(fe, editgroup_id=editgroup_id) - self.counts['insert'] += 1 - else: - self.api.update_file(fe.ident, fe, editgroup_id=editgroup_id) - self.counts['update'] += 1 - - def create_batch(self, batch): - """Reads and processes in batches (not API-call-per-line)""" - objects = [self.parse_matched_dict(json.loads(l)) - for l in batch if l != None] - new_objects = [o for o in objects if o != None and o.ident == None] - update_objects = [o for o in objects if o != None and o.ident != None] - if len(update_objects): - update_eg = 
self._editgroup().editgroup_id - for obj in update_objects: - self.api.update_file(obj.ident, obj, editgroup_id=update_eg) - self.api.accept_editgroup(update_eg) - if len(new_objects) > 0: - self.api.create_file_batch(new_objects, autoaccept="true") - self.counts['update'] += len(update_objects) - self.counts['insert'] += len(new_objects) + def try_update(self, fe): + # lookup sha1, or create new entity + existing = None + try: + existing = self.api.lookup_file(sha1=fe.sha1) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + + if not existing: + return True + + fe.release_ids = list(set(fe.release_ids + existing.release_ids)) + if set(fe.release_ids) == set(existing.release_ids) and len(existing.urls) > 0: + # no new release matches *and* there are already existing URLs + self.counts['exists'] += 1 + return False + + # merge the existing into this one and update + existing.urls = list(set([(u.rel, u.url) for u in fe.urls + existing.urls])) + existing.urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in existing.urls] + existing.release_ids = list(set(fe.release_ids + existing.release_ids)) + existing.mimetype = existing.mimetype or fe.mimetype + existing.size = existing.size or fe.size + existing.md5 = existing.md5 or fe.md5 + existing.sha256 = existing.sha256 or fe.sha256 + self.api.update_file(existing.ident, existing, editgroup_id=self._get_editgroup()) + self.counts['update'] += 1 + return False + + def insert_batch(self, batch): + self.api.create_file_batch(batch, + autoaccept=True, + description=self.editgroup_description, + extra=json.dumps(self.editgroup_extra)) + diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py index 0c8b1d62..02c9bf00 100644 --- a/python/fatcat_tools/importers/orcid.py +++ b/python/fatcat_tools/importers/orcid.py @@ -3,7 +3,7 @@ import sys import json import itertools import fatcat_client -from .common import FatcatImporter +from .common import EntityImporter, clean def value_or_none(e): if type(e) == dict: @@ -20,7 +20,7 @@ def value_or_none(e): return None return e -class OrcidImporter(FatcatImporter): +class OrcidImporter(EntityImporter): def __init__(self, api, **kwargs): @@ -32,14 +32,16 @@ class OrcidImporter(FatcatImporter): editgroup_description=eg_desc, editgroup_extra=eg_extra) - def parse_orcid_dict(self, obj): + def want(self, raw_record): + return True + + def parse_record(self, obj): """ obj is a python dict (parsed from json). 
returns a CreatorEntity """ name = obj['person']['name'] - if name is None: - return None + assert name extra = None given = value_or_none(name.get('given-names')) sur = value_or_none(name.get('family-name')) @@ -61,23 +63,30 @@ class OrcidImporter(FatcatImporter): return None ce = fatcat_client.CreatorEntity( orcid=orcid, - given_name=given, - surname=sur, - display_name=display, + given_name=clean(given), + surname=clean(sur), + display_name=clean(display), extra=extra) return ce - def create_row(self, row, editgroup_id=None): - obj = json.loads(row) - ce = self.parse_orcid_dict(obj) - if ce is not None: - self.api.create_creator(ce, editgroup_id=editgroup_id) - self.counts['insert'] += 1 + def try_update(self, raw_record): + existing = None + try: + existing = self.api.lookup_creator(orcid=raw_record.orcid) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + + # eventually we'll want to support "updates", but for now just skip if + # entity already exists + if existing: + self.counts['exists'] += 1 + return False + + return True - def create_batch(self, batch): - """Reads and processes in batches (not API-call-per-line)""" - objects = [self.parse_orcid_dict(json.loads(l)) - for l in batch if l != None] - objects = [o for o in objects if o != None] - self.api.create_creator_batch(objects, autoaccept="true") - self.counts['insert'] += len(objects) + def insert_batch(self, batch): + self.api.create_creator_batch(batch, + autoaccept=True, + description=self.editgroup_description, + extra=json.dumps(self.editgroup_extra)) diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms.py index 0f957f9a..2493b1ab 100644 --- a/python/fatcat_tools/transforms.py +++ b/python/fatcat_tools/transforms.py @@ -1,4 +1,5 @@ + import collections from fatcat_client import ReleaseEntity, ApiClient @@ -26,25 +27,43 @@ def release_to_elasticsearch(release): Raises exception on error (never returns None) """ - if release.state != 'active': - raise ValueError("Entity is not 'active'") + if release.state in ('redirect', 'deleted'): + return dict( + ident = release.ident, + state = release.state, + ) + elif release.state != 'active': + raise ValueError("Unhandled release state: {}".format(release.state)) # First, the easy ones (direct copy) t = dict( ident = release.ident, + state = release.state, revision = release.revision, title = release.title, + original_title = release.original_title, release_type = release.release_type, release_status = release.release_status, language = release.language, + license = release.license_slug, doi = release.doi, pmid = release.pmid, pmcid = release.pmcid, isbn13 = release.isbn13, + wikidata_qid = release.wikidata_qid, core_id = release.core_id, - wikidata_qid = release.wikidata_qid + arxiv_id = release.core_id, + jstor_id = release.jstor_id, ) + is_oa = None + is_longtail_oa = None + in_kbart = None + in_web = False + in_dweb = False + in_ia = False + in_shadow = False + if release.release_date: # .isoformat() results in, eg, '2010-10-22' (YYYY-MM-DD) t['release_date'] = release.release_date.isoformat() @@ -53,52 +72,99 @@ def release_to_elasticsearch(release): if release.release_year is not None: t['release_year'] = release.release_year + t['any_abstract'] = len(release.abstracts) > 0 + t['ref_count'] = len(release.refs or []) + t['contrib_count'] = len(release.contribs or []) + contrib_names = [] + for c in (release.contribs or []): + if c.raw_name: + contrib_names.append(c.raw_name) + t['contrib_names'] = contrib_names + 
container = release.container - container_is_kept = False if container: t['publisher'] = container.publisher t['container_name'] = container.name t['container_issnl'] = container.issnl - container_extra = container.extra - if container_extra: - t['container_is_oa'] = container_extra.get('is_oa') - container_is_kept = container_extra.get('is_kept', False) - t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa') + t['container_type'] = container.container_type + if container.extra: + if container.extra.get('is_oa') or container.extra.get('in_doaj'): + is_oa = True + if container.extra.get('in_kbart'): + # TODO: better KBART check goes here + in_kbart = True + if container.extra.get('ia'): + # TODO: container longtail check goes here + # TODO: sim/microfilm check goes here + pass + # TODO: SHERPA/Romeo goes here else: t['publisher'] = release.publisher files = release.files or [] t['file_count'] = len(files) - in_wa = False - in_ia = False - t['file_pdf_url'] = None + t['fileset_count'] = len(release.filesets or []) + t['webcapture_count'] = len(release.webcaptures or []) + any_pdf_url = None + good_pdf_url = None + best_pdf_url = None + ia_pdf_url = None for f in files: + if f.extra and f.extra.get('shadows'): + # TODO: shadow check goes here + in_shadows = True is_pdf = 'pdf' in (f.mimetype or '') for url in (f.urls or []): - if url.rel == 'webarchive': - in_wa = True - if '//web.archive.org/' in (url.url or '') or '//archive.org/' in (url.url or ''): + if url.url.lower().startswith('http'): + in_web = True + if url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'): + # TODO: not sure what rel will be + in_dweb = True + if is_pdf: + any_pdf_url = url.url + if is_pdf and url.rel in ('webarchive', 'repository') and is_pdf: + is_preserved = True + good_pdf_url = url.url + if '//web.archive.org/' in url.url or '//archive.org/' in url.url: in_ia = True if is_pdf: - t['file_pdf_url'] = url.url - if not t['file_pdf_url'] and is_pdf: - t['file_pdf_url'] = url.url - t['file_in_webarchive'] = in_wa - t['file_in_ia'] = in_ia + best_pdf_url = url.url + ia_pdf_url = url.url + # here is where we bake-in priority; IA-specific + t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url + t['ia_pdf_url'] = ia_pdf_url + + if release.license_slug: + # TODO: more/better checks here, particularly strict *not* OA licenses + if release.license_slug.startswith("CC-"): + is_oa = True extra = release.extra or dict() if extra: - t['in_shadow'] = extra.get('in_shadow') - if extra.get('grobid') and extra['grobid'].get('is_longtail_oa'): - t['container_is_longtail_oa'] = True - t['any_abstract'] = bool(release.abstracts) - t['is_kept'] = container_is_kept or extra.get('is_kept', False) + # TODO: longtail OA check from GROBID here + if extra.get('in_kbart'): + # NOTE: not actually setting this anywhere + in_kbart = True + if extra.get('is_oa'): + # NOTE: not actually setting this anywhere + is_oa = True + if extra.get('grobid'): + if not t.get('container_name'): + t['container_name'] = extra['grobid'].get('container_name') + if extra['grobid'].get('longtail_oa'): + is_longtail_oa = True + if extra.get('crossref'): + if extra['crossref'].get('archive'): + # all crossref archives are KBART, I believe + in_kbart = True - t['ref_count'] = len(release.refs or []) - t['contrib_count'] = len(release.contribs or []) - contrib_names = [] - for c in (release.contribs or []): - if c.raw_name: - contrib_names.append(c.raw_name) - t['contrib_names'] = contrib_names + if is_longtail_oa: + is_oa = True + t['is_oa'] 
= is_oa + t['is_longtail_oa'] = is_longtail_oa + t['in_kbart'] = in_kbart + t['in_web'] = in_web + t['in_dweb'] = in_dweb + t['in_ia'] = in_ia + t['is_preserved'] = in_ia or in_kbart return t diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index 8690a791..636ed304 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -93,7 +93,7 @@ class EntityUpdatesWorker(FatcatWorker): release_edits = cle['editgroup']['edits']['releases'] for re in release_edits: ident = re['ident'] - release = self.api.get_release(ident, expand="files,container") + release = self.api.get_release(ident, expand="files,filesets,webcaptures,container") release_dict = self.api.api_client.sanitize_for_serialization(release) producer.produce( message=json.dumps(release_dict).encode('utf-8'),
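For context on the one-line changelog worker change at the end: releases are now fetched with filesets and webcaptures expanded, so the Elasticsearch transform above can fill in fileset_count and webcapture_count alongside the new is_oa/in_kbart/in_ia flags. A simplified sketch of that flow follows; the wrapper function is illustrative, and only the expand string and release_to_elasticsearch() come from this diff.

# Simplified sketch of the changelog -> search-index path touched by this
# commit (Kafka plumbing and error handling omitted).
from fatcat_tools.transforms import release_to_elasticsearch

def release_edit_to_es_doc(api, ident):
    # expand related entities so the transform can count files/filesets/webcaptures
    release = api.get_release(ident, expand="files,filesets,webcaptures,container")
    # returns a flat dict with flags like is_oa, in_kbart, in_ia, is_preserved
    return release_to_elasticsearch(release)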