aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2018-04-11 15:03:18 -0700
committerBryan Newbold <bnewbold@robocracy.org>2018-04-11 15:03:18 -0700
commit2c769d488334bc6aab9f65a7c66e18442949d482 (patch)
treef469db7c98da3cce46c7cb2756d897834a168a6c
parentf8920f46b46282670229bdb7cf12052d167c26e8 (diff)
downloadfatcat-2c769d488334bc6aab9f65a7c66e18442949d482.tar.gz
fatcat-2c769d488334bc6aab9f65a7c66e18442949d482.zip
commit old work
-rw-r--r--fatcat/models.py37
-rw-r--r--next_thoughts.txt15
2 files changed, 45 insertions, 7 deletions
diff --git a/fatcat/models.py b/fatcat/models.py
index dd84070d..2f0bfd72 100644
--- a/fatcat/models.py
+++ b/fatcat/models.py
@@ -16,16 +16,34 @@ release_contrib = db.Table("release_contrib",
db.Column("stub", db.String, nullable=True))
class WorkId(db.Model):
+ """
+ If revision_id is null, this was deleted.
+ If redirect_id is not null, this has been merged with the given id. In this
+ case revision_id is a "cached" copy of the redirect's revision_id, as
+ an optimization. If the merged work is "deleted", revision_id can be
+ null and redirect_id not-null.
+ """
__tablename__ = 'work_id'
- id = db.Column(db.Integer, primary_key=True)
- revision_id = db.Column(db.ForeignKey('work_revision.id'))
+ id = db.Column(db.Integer, primary_key=True, nullable=False)
+ revision_id = db.Column(db.ForeignKey('work_revision.id'), nullable=True)
+ redirect_id = db.Column(db.ForeignKey('work_id.id'), nullable=True)
+
+class WorkLog(db.Model):
+ __tablename__ = 'work_log'
+ # ID is a monotonic int here; important for ordering!
+ id = db.Column(db.Integer, primary_key=True, nullable=False)
+ work_id = db.Column(db.ForeignKey('work_id.id'), nullable=False)
+ old_revision_id = db.Column(db.ForeignKey('work_revision.id'), nullable=True)
+ old_redirect_id = db.Column(db.ForeignKey('work_id.id'), nullable=True)
+ new_revision_id = db.Column(db.ForeignKey('work_revision.id'), nullable=True)
+ new_redirect_id = db.Column(db.ForeignKey('work_id.id'), nullable=True)
+ # TODO: is this right?
+ edit_id = db.Column(db.ForeignKey('edit.id'))
class WorkRevision(db.Model):
__tablename__ = 'work_revision'
id = db.Column(db.Integer, primary_key=True)
previous = db.Column(db.ForeignKey('work_revision.id'), nullable=True)
- state = db.Column(db.String)
- redirect_id = db.Column(db.ForeignKey('work_id.id'), nullable=True)
edit_id = db.Column(db.ForeignKey('edit.id'))
extra_json = db.Column(db.ForeignKey('extra_json.sha1'), nullable=True)
#work_ids = db.relationship("WorkId", backref="revision", lazy=True)
@@ -130,9 +148,14 @@ class ReleaseFil(db.Model):
class Edit(db.Model):
__tablename__ = 'edit'
id = db.Column(db.Integer, primary_key=True, autoincrement=True)
- edit_group = db.Column(db.ForeignKey('edit_group.id'))
- editor = db.Column(db.ForeignKey('editor.id'))
- description = db.Column(db.String)
+ edit_group = db.Column(db.ForeignKey('edit_group.id'), nullable=True)
+ editor = db.Column(db.ForeignKey('editor.id'), nullable=False)
+ comment = db.Column(db.String, nullable=True)
+ extra_json = db.Column(db.ForeignKey('extra_json.sha1'), nullable=True)
+ # WARNING: polymorphic. Represents the id that should end up pointing to
+ # this revision.
+ # TODO: this doesn't work
+ entity_id = db.Column(db.Integer, nullable=True)
class EditGroup(db.Model):
__tablename__ = 'edit_group'
diff --git a/next_thoughts.txt b/next_thoughts.txt
index 0e89249a..8c7d12fc 100644
--- a/next_thoughts.txt
+++ b/next_thoughts.txt
@@ -1,3 +1,4 @@
+
Should probably just UUID all the (public) ids.
Instead of having a separate id pointer table, could have an extra "mutable"
@@ -17,3 +18,17 @@ Trade-off: easy querying for common case (wanting "active" rows) vs. robust
handling of redirects (likely to be pretty common). Also, having UUID handling
across more than one table.
+## Scaling database
+
+Two scaling issues: size of database due to edits (likely billions of rows) and
+desire to do complex queries/reports ("analytics"). The later is probably not a
+concern, and could be handled by dumping and working on a cluster (or secondary
+views, etc). So just a distraction? Simpler to have all rolled up.
+
+Cockroach is postgres-like; might be able to use that for HA and scaling?
+Bottlenecks are probably complex joins (mitigated by "interleave"?) and bulk
+import performance (one-time?).
+
+Using elastic for most (eg, non-logged-in) views could keep things fast.
+
+Cockroach seems more resourced/polished than TiDB?