diff options
-rw-r--r-- | fatcat/models.py | 37 | ||||
-rw-r--r-- | next_thoughts.txt | 15 |
2 files changed, 45 insertions, 7 deletions
diff --git a/fatcat/models.py b/fatcat/models.py index dd84070d..2f0bfd72 100644 --- a/fatcat/models.py +++ b/fatcat/models.py @@ -16,16 +16,34 @@ release_contrib = db.Table("release_contrib", db.Column("stub", db.String, nullable=True)) class WorkId(db.Model): + """ + If revision_id is null, this was deleted. + If redirect_id is not null, this has been merged with the given id. In this + case revision_id is a "cached" copy of the redirect's revision_id, as + an optimization. If the merged work is "deleted", revision_id can be + null and redirect_id not-null. + """ __tablename__ = 'work_id' - id = db.Column(db.Integer, primary_key=True) - revision_id = db.Column(db.ForeignKey('work_revision.id')) + id = db.Column(db.Integer, primary_key=True, nullable=False) + revision_id = db.Column(db.ForeignKey('work_revision.id'), nullable=True) + redirect_id = db.Column(db.ForeignKey('work_id.id'), nullable=True) + +class WorkLog(db.Model): + __tablename__ = 'work_log' + # ID is a monotonic int here; important for ordering! + id = db.Column(db.Integer, primary_key=True, nullable=False) + work_id = db.Column(db.ForeignKey('work_id.id'), nullable=False) + old_revision_id = db.Column(db.ForeignKey('work_revision.id'), nullable=True) + old_redirect_id = db.Column(db.ForeignKey('work_id.id'), nullable=True) + new_revision_id = db.Column(db.ForeignKey('work_revision.id'), nullable=True) + new_redirect_id = db.Column(db.ForeignKey('work_id.id'), nullable=True) + # TODO: is this right? + edit_id = db.Column(db.ForeignKey('edit.id')) class WorkRevision(db.Model): __tablename__ = 'work_revision' id = db.Column(db.Integer, primary_key=True) previous = db.Column(db.ForeignKey('work_revision.id'), nullable=True) - state = db.Column(db.String) - redirect_id = db.Column(db.ForeignKey('work_id.id'), nullable=True) edit_id = db.Column(db.ForeignKey('edit.id')) extra_json = db.Column(db.ForeignKey('extra_json.sha1'), nullable=True) #work_ids = db.relationship("WorkId", backref="revision", lazy=True) @@ -130,9 +148,14 @@ class ReleaseFil(db.Model): class Edit(db.Model): __tablename__ = 'edit' id = db.Column(db.Integer, primary_key=True, autoincrement=True) - edit_group = db.Column(db.ForeignKey('edit_group.id')) - editor = db.Column(db.ForeignKey('editor.id')) - description = db.Column(db.String) + edit_group = db.Column(db.ForeignKey('edit_group.id'), nullable=True) + editor = db.Column(db.ForeignKey('editor.id'), nullable=False) + comment = db.Column(db.String, nullable=True) + extra_json = db.Column(db.ForeignKey('extra_json.sha1'), nullable=True) + # WARNING: polymorphic. Represents the id that should end up pointing to + # this revision. + # TODO: this doesn't work + entity_id = db.Column(db.Integer, nullable=True) class EditGroup(db.Model): __tablename__ = 'edit_group' diff --git a/next_thoughts.txt b/next_thoughts.txt index 0e89249a..8c7d12fc 100644 --- a/next_thoughts.txt +++ b/next_thoughts.txt @@ -1,3 +1,4 @@ + Should probably just UUID all the (public) ids. Instead of having a separate id pointer table, could have an extra "mutable" @@ -17,3 +18,17 @@ Trade-off: easy querying for common case (wanting "active" rows) vs. robust handling of redirects (likely to be pretty common). Also, having UUID handling across more than one table. +## Scaling database + +Two scaling issues: size of database due to edits (likely billions of rows) and +desire to do complex queries/reports ("analytics"). The later is probably not a +concern, and could be handled by dumping and working on a cluster (or secondary +views, etc). So just a distraction? Simpler to have all rolled up. + +Cockroach is postgres-like; might be able to use that for HA and scaling? +Bottlenecks are probably complex joins (mitigated by "interleave"?) and bulk +import performance (one-time?). + +Using elastic for most (eg, non-logged-in) views could keep things fast. + +Cockroach seems more resourced/polished than TiDB? |