aboutsummaryrefslogtreecommitdiffstats
path: root/papers/sleep.latex
blob: cd695d06a730b573841df08ba62d3fd0b662ae91 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
\documentclass[a4paperpaper,twocolumn]{article}
\usepackage{lmodern}
\usepackage{amssymb,amsmath}
\usepackage{ifxetex,ifluatex}
\usepackage{fixltx2e} % provides \textsubscript
\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex
  \usepackage[T1]{fontenc}
  \usepackage[utf8]{inputenc}
\else % if luatex or xelatex
  \ifxetex
    \usepackage{mathspec}
  \else
    \usepackage{fontspec}
  \fi
  \defaultfontfeatures{Ligatures=TeX,Scale=MatchLowercase}
\fi
% use upquote if available, for straight quotes in verbatim environments
\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
% use microtype if available
\IfFileExists{microtype.sty}{%
\usepackage{microtype}
\UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts
}{}
\usepackage[unicode=true]{hyperref}
\hypersetup{
            pdftitle={SLEEP - Syncable Ledger of Exact Events Protocol},
            pdfauthor={Mathias Buus Madsen, Maxwell Ogden, Code for Science},
            pdfborder={0 0 0},
            breaklinks=true}
\urlstyle{same}  % don't use monospace font for urls
\IfFileExists{parskip.sty}{%
\usepackage{parskip}
}{% else
\setlength{\parindent}{0pt}
\setlength{\parskip}{6pt plus 2pt minus 1pt}
}
\setlength{\emergencystretch}{3em}  % prevent overfull lines
\providecommand{\tightlist}{%
  \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
\setcounter{secnumdepth}{0}
% Redefines (sub)paragraphs to behave more like sections
\ifx\paragraph\undefined\else
\let\oldparagraph\paragraph
\renewcommand{\paragraph}[1]{\oldparagraph{#1}\mbox{}}
\fi
\ifx\subparagraph\undefined\else
\let\oldsubparagraph\subparagraph
\renewcommand{\subparagraph}[1]{\oldsubparagraph{#1}\mbox{}}
\fi

\title{SLEEP - Syncable Ledger of Exact Events Protocol}
\author{Mathias Buus Madsen, Maxwell Ogden, Code for Science}
\date{August 2017}

\begin{document}
\maketitle

\subsection{SLEEP}\label{sleep}

This document is a technical description of the SLEEP format intended
for implementers. SLEEP is the the on-disk format that Dat produces and
uses. It is a set of 9 files that hold all of the metadata needed to
list the contents of a Dat repository and verify the integrity of the
data you receive. SLEEP is designed to work with REST, allowing servers
to be plain HTTP file servers serving the static SLEEP files, meaning
you can implement a Dat protocol client using HTTP with a static HTTP
file server as the backend.

SLEEP files contain metadata about the data inside a Dat repository,
including cryptographic hashes, cryptographic signatures, filenames and
file permissions. The SLEEP format is specifically designed to allow
efficient access to subsets of the metadata and/or data in the
repository, even on very large repositories, which enables Dat's peer to
peer networking to be fast.

The acronym SLEEP is a slumber related pun on REST and stands for
Syncable Ledger of Exact Events Protocol. The Syncable part refers to
how SLEEP files are append-only in nature, meaning they grow over time
and new updates can be subscribed to as a realtime feed of events
through the Dat protocol.

The SLEEP version described here, used in Dat as of 2017 is SLEEP V2.
SLEEP V1 is documented at http://specs.okfnlabs.org/sleep.

\subsubsection{SLEEP Files}\label{sleep-files}

SLEEP is a set of 9 files that should be stored with the following
names. In Dat, the files are stored in a folder called \texttt{.dat} in
the top level of the repository.

\begin{verbatim}
metadata.key
metadata.signatures
metadata.bitfield
metadata.tree
metadata.data
content.key
content.signatures
content.bitfield
content.tree
\end{verbatim}

The files prefixed with \texttt{content} store metadata about the
primary data in a Dat repository, for example the raw binary contents of
the files. The files prefixed with \texttt{metadata} store metadata
about the files in the repository, for example the filenames, file
sizes, and file permissions. The \texttt{content} and \texttt{metadata}
files are both Hypercore registers, making SLEEP a set of two Hypercore
registers.

\subsubsection{SLEEP File Headers}\label{sleep-file-headers}

The following structured binary format is used for \texttt{signatures},
\texttt{bitfield}, and \texttt{tree} files. The header contains metadata
as well as information needed to decode the rest of the files after the
header. SLEEP files are designed to be easy to append new data, easy to
read arbitrary byte offsets in the middle, and are relatively flat,
simple files that rely on the filesystem for the heavy lifting.

SLEEP files are laid out like this:

\begin{verbatim}
<32 byte header>
<fixed-size entry 1>
<fixed-size entry 2>
<fixed-size entry ...>
<fixed-size entry n>
\end{verbatim}

\begin{itemize}
\tightlist
\item
  32 byte header
\item
  4 bytes Uint32BE (``Big-Endian'') - magic byte (value varies depending
  on which file, used to quickly identify which file type it is)
\item
  1 byte - version number of the file header protocol, current version
  is 0
\item
  2 byte Uint16BE - entry size, describes how long each entry in the
  file is
\item
  1 byte - length prefix for body
\item
  rest of 32 byte header - string describing key or hash algorithm.
  length of this string matches the length in the previous length prefix
  field. This string must fit within the 32 byte header limitation (24
  bytes reserved for string). Unused bytes should be filled with zeroes.
\end{itemize}

Possible values in the Dat implementation for the body field are:

\begin{verbatim}
Ed25519
BLAKE2b
\end{verbatim}

To calculate the offset of some entry position, first read the header
and get the entry size, then do
\texttt{32\ +\ entrySize\ *\ entryIndex}. To calculate how many entries
are in a file, you can use the entry size and the filesize on disk and
do \texttt{(fileSize\ -\ 32)\ /\ entrySize}.

As mentioned above, \texttt{signatures}, \texttt{bitfield} and
\texttt{tree} are the three SLEEP files. There are two additional files,
\texttt{key}, and \texttt{data}, which do not contain SLEEP file headers
and store plain serialized data for easy access. \texttt{key} stores the
public key that is described by the \texttt{signatures} file, and
\texttt{data} stores the raw chunk data that the \texttt{tree} file
contains the hashes and metadata for.

\subsubsection{File Descriptions}\label{file-descriptions}

\paragraph{key}\label{key}

The public key used to verify the signatures in the \texttt{signatures}
file, stored in binary as a single buffer written to disk. To find out
what format of key is stored in this file, read the header of
\texttt{signatures}. In Dat, it's always a ed25519 public key, but other
implementations can specify other key types using a string value in that
header.

\paragraph{tree}\label{tree}

A SLEEP formatted 32 byte header with data entries representing a
serialized Merkle tree based on the data in the data storage layer. All
the fixed size nodes written in in-order tree notation. The header
algorithm string for \texttt{tree} files is \texttt{BLAKE2b}. The entry
size is 40 bytes. Entries are formatted like this:

\begin{verbatim}
<32 byte header>
  <4 byte magic string: 0x05025702>
  <1 byte version number: 0>
  <2 byte entry size: 40>
  <1 byte algorithm name length prefix: 7>
  <7 byte algorithm name: BLAKE2b>
  <17 zeroes>
<40 byte entries>
  <32 byte BLAKE2b hash>
  <8 byte Uint64BE children leaf byte length>
\end{verbatim}

The children leaf byte length is the byte size containing the sum byte
length of all leaf nodes in the tree below this node.

This file uses the in-order notation, meaning even entries are leaf
nodes and odd entries are parent nodes (non-leaf).

To prevent pre-image attacks, all hashes start with a one byte type
descriptor:

\begin{verbatim}
0 - LEAF
1 - PARENT
2 - ROOT
\end{verbatim}

To calculate leaf node entries (the hashes of the data entries) we hash
this data:

\begin{verbatim}
BLAKE2b(
  <1 byte type>
    0
  <8 bytes Uint64BE>
    length of entry data
  <entry data>
)
\end{verbatim}

Then we take this 32 byte hash and write it to the tree as 40 bytes like
this:

\begin{verbatim}
<32 bytes>
  BLAKE2b hash
<8 bytes Uint64BE>
  length of data
\end{verbatim}

Note that the Uint64 of length of data is included both in the hashed
data and written at the end of the entry. This is to expose more
metadata to Dat for advanced use cases such as verifying data length in
sparse replication scenarios.

To calculate parent node entries (the hashes of the leaf nodes) we hash
this data:

\begin{verbatim}
BLAKE2b(
  <1 byte>
    1
  <8 bytes Uint64BE>
    left child length + right child length
  <32 bytes>
    left child hash
  <32 bytes>
    right child hash
)
\end{verbatim}

Then we take this 32 byte hash and write it to the tree as 40 bytes like
this:

\begin{verbatim}
<32 bytes>
  BLAKE2b hash
<8 bytes Uint64BE>
  left child length + right child length
\end{verbatim}

The reason the tree entries contain data lengths is to allow for sparse
mode replication. Encoding lengths (and including lengths in all hashes)
means you can verify the Merkle subtrees independent of the rest of the
tree, which happens during sparse replication scenarios.

The tree file corresponds directly to the \texttt{data} file.

\paragraph{data}\label{data}

The \texttt{data} file is only included in the SLEEP format for the
\texttt{metadata.*} prefixed files which contains filesystem metadata
and not actual file data. For the \texttt{content.*} files, the data is
stored externally (in Dat it is stored as normal files on the filesystem
and not in a SLEEP file). However you can configure Dat to use a
\texttt{content.data} file if you want and it will still work. If you
want to store the full history of all versions of all files, using the
\texttt{content.data} file would provide that guarantee, but would have
the disadvantage of storing files as chunks merged into one huge file
(not as user friendly).

The \texttt{data} file does not contain a SLEEP file header. It just
contains a bunch of concatenated data entries. Entries are written in
the same order as they appear in the \texttt{tree} file. To read a
\texttt{data} file, first decode the \texttt{tree} file and for every
leaf in the \texttt{tree} file you can calculate a data offset for the
data described by that leaf node in the \texttt{data} file.

\subparagraph{Index Lookup}\label{index-lookup}

For example, if we wanted to seek to a specific entry offset (say entry
42):

\begin{itemize}
\tightlist
\item
  First, read the header of the \texttt{tree} file and get the entry
  size, then do \texttt{32\ +\ entrySize\ *\ 42} to get the raw tree
  index: \texttt{32\ +\ (40\ *\ 42)}
\item
  Since we want the leaf entry (even node in the in-order layout), we
  multiply the entry index by 2: \texttt{32\ +\ (40\ *\ (42\ *\ 2))}
\item
  Read the 40 bytes at that offset in the \texttt{tree} file to get the
  leaf node entry.
\item
  Read the last 8 bytes of the entry to get the length of the data entry
\item
  To calculate the offset of where in the \texttt{data} file your entry
  begins, you need to sum all the lengths of all the earlier entries in
  the tree. The most efficient way to do this is to sum all the previous
  parent node (non-leaf) entry lengths. You can also sum all leaf node
  lengths, but parent nodes contain the sum of their children's lengths
  so it's more efficient to use parents. During Dat replication, these
  nodes are fetched as part of the Merkle tree verification so you will
  already have them locally. This is a log(N) operation where N is the
  entry index. Entries are also small and therefore easily cacheable.
\item
  Once you get the offset, you use the length you decoded above and read
  N bytes (where N is the decoded length) at the offset in the
  \texttt{data} file. You can verify the data integrity using the 32
  byte hash from the \texttt{tree} entry.
\end{itemize}

\subparagraph{Byte Lookup}\label{byte-lookup}

The above method illustrates how to resolve a chunk position index to a
byte offset. You can also do the reverse operation, resolving a byte
offset to a chunk position index. This is used to stream arbitrary
random access regions of files in sparse replication scenarios.

\begin{itemize}
\tightlist
\item
  First, you start by calculating the current Merkle roots
\item
  Each node in the tree (including these root nodes) stores the
  aggregate file size of all byte sizes of the nodes below it. So the
  roots cumulatively will describe all possible byte ranges for this
  repository.
\item
  Find the root that contains the byte range of the offset you are
  looking for and get the node information for all of that nodes
  children using the Index Lookup method, and recursively repeat this
  step until you find the lowest down child node that describes this
  byte range.
\item
  The chunk described by this child node will contain the byte range you
  are looking for. You can use the \texttt{byteOffset} field in the
  \texttt{Stat} metadata object to seek to the correct position in the
  content file for the start of this chunk.
\end{itemize}

\subparagraph{Metadata Overhead}\label{metadata-overhead}

Using this scheme, if you write 4GB of data using on average 64KB data
chunks (note: chunks can be variable length and do not need to be the
same size), your tree file will be around 5MB (0.0125\% overhead).

\paragraph{signatures}\label{signatures}

A SLEEP formatted 32 byte header with data entries being 64 byte
signatures.

\begin{verbatim}
<32 byte header>
  <4 byte magic string: 0x05025701>
  <1 byte version number: 0>
  <2 byte entry size: 64>
  <1 byte algorithm name length prefix: 7>
  <7 byte algorithm name: Ed25519>
  <17 zeroes>
<64 byte entries>
  <64 byte Ed25519 signature>
\end{verbatim}

Every time the tree is updated we sign the current roots of the Merkle
tree, and append them to the signatures file. The signatures file starts
with no entries. Each time a new leaf is appended to the \texttt{tree}
file (aka whenever data is added to a Dat), we take all root hashes at
the current state of the Merkle tree and hash and sign them, then append
them as a new entry to the signatures file.

\begin{verbatim}
Ed25519 sign(
  BLAKE2b(
    <1 byte>
      2 // root type
    for (every root node left-to-right) {
      <32 byte root hash>
      <8 byte Uint64BE root tree index>
      <8 byte Uint64BE child byte lengths>
    }
  )
)
\end{verbatim}

The reason we hash all the root nodes is that the BLAKE2b hash above is
only calculable if you have all of the pieces of data required to
generate all the intermediate hashes. This is the crux of Dat's data
integrity guarantees.

\paragraph{bitfield}\label{bitfield}

A SLEEP formatted 32 byte header followed by a series of 3328 byte long
entries.

\begin{verbatim}
<32 byte header>
  <4 byte magic string: 0x05025700>
  <1 byte version number: 0>
  <2 byte entry size: 3328>
  <1 byte algorithm name length: 0>
  <1 byte algorithm name: 0>
  <24 zeroes>
<3328 byte entries> // (2048 + 1024 + 256)
\end{verbatim}

The bitfield describes which pieces of data you have, and which nodes in
the \texttt{tree} file have been written. This file exists as an index
of the \texttt{tree} and \texttt{data} to quickly figure out which
pieces of data you have or are missing. This file can be regenerated if
you delete it, so it is considered a materialized index.

The \texttt{bitfield} file actually contains three bitfields of
different sizes. A bitfield (AKA bitmap) is defined as a set of bits
where each bit (0 or 1) represents if you have or do not have a piece of
data at that bit index. So if there is a dataset of 10 cat pictures, and
you have pictures 1, 3, and 5 but are missing the rest, your bitfield
would look like \texttt{1010100000}.

Each entry contains three objects:

\begin{itemize}
\tightlist
\item
  Data Bitfield (1024 bytes) - 1 bit for for each data entry that you
  have synced (1 for every entry in \texttt{data}).
\item
  Tree Bitfield (2048 bytes) - 1 bit for every tree entry (all nodes in
  \texttt{tree})
\item
  Bitfield Index (256 bytes) - This is an index of the Data Bitfield
  that makes it efficient to figure out which pieces of data are missing
  from the Data Bitfield without having to do a linear scan.
\end{itemize}

The Data Bitfield is 1Kb somewhat arbitrarily, but the idea is that
because most filesystems work in 4Kb chunk sizes, we can fit the Data,
Tree and Index in less then 4Kb of data for efficient writes to the
filesystem. The Tree and Index sizes are based on the Data size (the
Tree has twice the entries as the Data, odd and even nodes vs just even
nodes in \texttt{tree}, and Index is always 1/4th the size).

To generate the Index, you take pairs of 2 bytes at a time from the Data
Bitfield, check if all bits in the 2 bytes are the same, and generate 4
bits of Index metadata~for every 2 bytes of Data (hence how 1024 bytes
of Data ends up as 256 bytes of Index).

First you generate a 2 bit tuple for the 2 bytes of Data:

\begin{verbatim}
if (data is all 1's) then [1,1]
if (data is all 0's) then [0,0]
if (data is not all the same) then [1, 0]
\end{verbatim}

The Index itself is an in-order binary tree, not a traditional bitfield.
To generate the tree, you take the tuples you generate above and then
write them into a tree like the following example, where non-leaf nodes
are generated using the above scheme by looking at the results of the
relative even child tuples for each odd parent tuple:

\begin{verbatim}
// for e.g. 16 bytes (8 tuples) of
// sparsely replicated data
0 - [00 00 00 00]
1 -     [10 10 10 10]
2 - [11 11 11 11]
\end{verbatim}

The tuples at entry \texttt{1} above are \texttt{{[}1,0{]}} because the
relative child tuples are not uniform. In the following example, all
non-leaf nodes are \texttt{{[}1,1{]}} because their relative children
are all uniform (\texttt{{[}1,1{]}})

\begin{verbatim}
// for e.g. 32 bytes (16 tuples) of
// fully replicated data (all 1's)
0 - [11 11 11 11]
1 -     [11 11 11 11]
2 - [11 11 11 11]
3 -         [11 11 11 11]
4 - [11 11 11 11]
5 -     [11 11 11 11]
6 - [11 11 11 11]
\end{verbatim}

Using this scheme, it takes at most 8 bytes of Index to represent 32
bytes of data. In this example the Index can compresses well because it
consists of all one bits. Similarly, an empty bitfield is all zero bits.

If you write 4GB of data using on average 64KB data chunk size, your
bitfield will be at most 32KB.

\paragraph{metadata.data}\label{metadata.data}

This file is used to store content described by the rest of the
\texttt{metadata.*} hypercore SLEEP files. Whereas the
\texttt{content.*} SLEEP files describe the data stored in the actual
data cloned in the Dat repository filesystem, the \texttt{metadata} data
feed is stored inside the \texttt{.dat} folder along with the rest of
the SLEEP files.

The contents of this file is a series of versions of the Dat filesystem
tree. As this is a hypercore data feed, it's just an append only log of
binary data entries. The challenge is representing a tree in a
one-dimensional way to make it representable as a Hypercore register.
For example, imagine three files:

\begin{verbatim}
~/dataset $ ls
figures
  graph1.png
  graph2.png
results.csv

1 directory, 3 files
\end{verbatim}

We want to take this structure and map it to a serialized representation
that gets written into an append only log in a way that still allows for
efficient random access by file path.

To do this, we convert the filesystem metadata into entries in a feed
like this:

\begin{verbatim}
{
  "path": "/results.csv",
  trie: [[]],
  sequence: 0
}
{
  "path": "/figures/graph1.png",
  trie: [[0], []],
  sequence: 1
}
{
  "path": "/figures/graph2.png",
  trie: [[0], [1]],
  sequence: 2
}
\end{verbatim}

\subparagraph{Filename Resolution}\label{filename-resolution}

Each sequence represents adding one of the files to the register, so at
sequence 0 the filesystem state only has a single file,
\texttt{results.csv} in it. At sequence 1, there are only 2 files added
to the register, and at sequence 3 all files are finally added. The
\texttt{children} field represents a shorthand way of declaring which
other files at every level of the directory hierarchy exist alongside
the file being added at that revision. For example at the time of
sequence 1, children is \texttt{{[}{[}0{]},\ {[}{]}{]}}. The first
sub-array, \texttt{{[}0{]}}, represents the first folder in the
\texttt{path}, which is the root folder \texttt{/}. In this case
\texttt{{[}0{]}} means the root folder at this point in time only has a
single file, the file that is the subject of sequence \texttt{0}. The
second subarray is empty \texttt{{[}{]}} because there are no other
existing files in the second folder in the \texttt{path},
\texttt{figures}.

To look up a file by filename, you fetch the latest entry in the log,
then use the \texttt{children} metadata in that entry to look up the
longest common ancestor based on the parent folders of the filename you
are querying. You can then recursively repeat this operation until you
find the \texttt{path} entry you are looking for (or you exhaust all
options which means the file does not exist). This is a
\texttt{O(number\ of\ slashes\ in\ your\ path)} operation.

For example, if you wanted to look up \texttt{/results.csv} given the
above register, you would start by grabbing the metadata at sequence 2.
The longest common ancestor between \texttt{/results.csv} and
\texttt{/figures/graph2} is \texttt{/}. You then grab the corresponding
entry in the children array for \texttt{/}, which in this case is the
first entry, \texttt{{[}0{]}}. You then repeat this with all of the
children entries until you find a child that is closer to the entry you
are looking for. In this example, the first entry happens to be the
match we are looking for.

You can also perform lookups relative to a point in time by starting
from a specific sequence number in the register. For example to get the
state of some file relative to an old sequence number, similar to
checking out an old version of a repository in Git.

\subparagraph{Data Serialization}\label{data-serialization}

The format of the \texttt{metadata.data} file is as follows:

\begin{verbatim}
<Header>
<Node 1>
<Node 2>
<Node ...>
<Node N>
\end{verbatim}

Each entry in the file is encoded using Protocol Buffers (Varda 2008).

The first message we write to the file is of a type called Header which
uses this schema:

\begin{verbatim}
message Header {
  required string type = 1;
  optional bytes content = 2;
}
\end{verbatim}

This is used to declare two pieces of metadata used by Dat. It includes
a \texttt{type} string with the value \texttt{hyperdrive} and
\texttt{content} binary value that holds the public key of the content
register that this metadata register represents. When you share a Dat,
the metadata key is the main key that gets used, and the content
register key is linked from here in the metadata.

After the header the file will contain many filesystem \texttt{Node}
entries:

\begin{verbatim}
message Node {
  required string path = 1;
  optional Stat value = 2;
  optional bytes trie = 3;
  repeated Writer writers = 4;
  optional uint64 writersSequence = 5;
}

message Writer {
  required bytes publicKey = 1;
  optional string permission = 2;
}
\end{verbatim}

The \texttt{Node} object has five fields

\begin{itemize}
\tightlist
\item
  \texttt{path} - the string of the absolute file path of this file.
\item
  \texttt{Stat} - a Stat encoded object representing the file metadata
\item
  \texttt{trie} - a compressed list of the sequence numbers as described
  earlier
\item
  \texttt{writers} - a list of the writers who are allowed to write to
  this dat
\item
  \texttt{writersSequence} - a reference to the last sequence where the
  writers array was modified. you can use this to quickly find the value
  of the writers keys.
\end{itemize}

The \texttt{trie} value is encoded by starting with the nested array of
sequence numbers, e.g.
\texttt{{[}{[}{[}0,\ 3{]}{]},\ {[}{[}0,\ 2{]},\ {[}0,\ 1{]}{]}{]}}. Each
entry is a tuple where the first item is the index of the feed in the
\texttt{writers} array and the second value is the sequence number.
Finally you prepend the trie value with a version number varint.

To write these subarrays we use variable width integers (varints), using
a repeating pattern like this, one for each array:

\begin{verbatim}
<varint of 0>
<varint of 3>
<varint of 0>
<varint of 2>
<varint of 0>
<varint of 1>
\end{verbatim}

This encoding is designed for efficiency as it reduces the filesystem
path + feed index metadata down to a series of small integers.

The \texttt{Stat} objects use this encoding:

\begin{verbatim}
message Stat {
  required uint32 mode = 1;
  optional uint32 uid = 2;
  optional uint32 gid = 3;
  optional uint64 size = 4;
  optional uint64 blocks = 5;
  optional uint64 offset = 6;
  optional uint64 byteOffset = 7;
  optional uint64 mtime = 8;
  optional uint64 ctime = 9;
}
\end{verbatim}

These are the field definitions:

\begin{itemize}
\tightlist
\item
  \texttt{mode} - POSIX file mode bitmask
\item
  \texttt{uid} - POSIX user id
\item
  \texttt{gid} - POSIX group id
\item
  \texttt{size} - file size in bytes
\item
  \texttt{blocks} - number of data chunks that make up this file
\item
  \texttt{offset} - the data feed entry index for the first chunk in
  this file
\item
  \texttt{byteOffset} - the data feed file byte offset for the first
  chunk in this file
\item
  \texttt{mtime} - POSIX modified\_at time
\item
  \texttt{mtime} - POSIX created\_at time
\end{itemize}

\subsection*{References}\label{references}
\addcontentsline{toc}{subsection}{References}

\hypertarget{refs}{}
\hypertarget{ref-varda2008protocol}{}
Varda, Kenton. 2008. ``Protocol Buffers: Google's Data Interchange
Format.'' \emph{Google Open Source Blog, Available at Least as Early as
Jul}.

\end{document}