aboutsummaryrefslogtreecommitdiffstats
path: root/papers/dat-paper.txt
blob: cf8cd85afc0837091e45f4c16d46abf68bface06 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
\documentclass[a4paperpaper,twocolumn]{article}
\usepackage{lmodern}
\usepackage{amssymb,amsmath}
\usepackage{ifxetex,ifluatex}
\usepackage{fixltx2e} % provides \textsubscript
\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex
  \usepackage[T1]{fontenc}
  \usepackage[utf8]{inputenc}
\else % if luatex or xelatex
  \ifxetex
    \usepackage{mathspec}
  \else
    \usepackage{fontspec}
  \fi
  \defaultfontfeatures{Ligatures=TeX,Scale=MatchLowercase}
\fi
% use upquote if available, for straight quotes in verbatim environments
\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
% use microtype if available
\IfFileExists{microtype.sty}{%
\usepackage{microtype}
\UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts
}{}
\usepackage[unicode=true]{hyperref}
\hypersetup{
            pdftitle={Dat - Distributed Dataset Synchronization And Versioning},
            pdfauthor={Maxwell Ogden, Karissa McKelvey, Mathias Buus Madsen, Code for Science},
            pdfborder={0 0 0},
            breaklinks=true}
\urlstyle{same}  % don't use monospace font for urls
\IfFileExists{parskip.sty}{%
\usepackage{parskip}
}{% else
\setlength{\parindent}{0pt}
\setlength{\parskip}{6pt plus 2pt minus 1pt}
}
\setlength{\emergencystretch}{3em}  % prevent overfull lines
\providecommand{\tightlist}{%
  \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
\setcounter{secnumdepth}{0}
% Redefines (sub)paragraphs to behave more like sections
\ifx\paragraph\undefined\else
\let\oldparagraph\paragraph
\renewcommand{\paragraph}[1]{\oldparagraph{#1}\mbox{}}
\fi
\ifx\subparagraph\undefined\else
\let\oldsubparagraph\subparagraph
\renewcommand{\subparagraph}[1]{\oldsubparagraph{#1}\mbox{}}
\fi

% set default figure placement to htbp
\makeatletter
\def\fps@figure{htbp}
\makeatother


\title{Dat - Distributed Dataset Synchronization And Versioning}
\author{Maxwell Ogden, Karissa McKelvey, Mathias Buus Madsen, Code for Science}
\date{May 2017}

\begin{document}
\maketitle

\section{Abstract}\label{abstract}

Dat is a protocol designed for syncing folders of data, even if they are
large or changing constantly. Dat uses a cryptographically secure
register of changes to prove that the requested data version is
distributed. A byte range of any file's version can be efficiently
streamed from a Dat repository over a network connection. Consumers can
choose to fully or partially replicate the contents of a remote Dat
repository, and can also subscribe to live changes. To ensure writer and
reader privacy, Dat uses public key cryptography to encrypt network
traffic. A group of Dat clients can connect to each other to form a
public or private decentralized network to exchange data between each
other. A reference implementation is provided in JavaScript.

\section{1. Background}\label{background}

Many datasets are shared online today using HTTP and FTP, which lack
built in support for version control or content addressing of data. This
results in link rot and content drift as files are moved, updated or
deleted, leading to an alarming rate of disappearing data references in
areas such as
\href{http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0115253}{published
scientific literature}.

Cloud storage services like S3 ensure availability of data, but they
have a centralized hub-and-spoke networking model and are therefore
limited by their bandwidth, meaning popular files can become very
expensive to share. Services like Dropbox and Google Drive provide
version control and synchronization on top of cloud storage services
which fixes many issues with broken links but rely on proprietary code
and services requiring users to store their data on centralized cloud
infrastructure which has implications on cost, transfer speeds, vendor
lock-in and user privacy.

Distributed file sharing tools can become faster as files become more
popular, removing the bandwidth bottleneck and making file distribution
cheaper. They also use link resolution and discovery systems which can
prevent broken links meaning if the original source goes offline other
backup sources can be automatically discovered. However these file
sharing tools today are not supported by Web browsers, do not have good
privacy guarantees, and do not provide a mechanism for updating files
without redistributing a new dataset which could mean entire
redownloading data you already have.

\section{2. Dat}\label{dat}

Dat is a dataset synchronization protocol that does not assume a dataset
is static or that the entire dataset will be downloaded. The main
reference implementation is available from from npm as
\texttt{npm\ install\ dat\ -g}.

The protocol is agnostic to the underlying transport e.g.~you could
implement Dat over carrier pigeon. The key properties of the Dat design
are explained in this section.

\begin{itemize}
\tightlist
\item
  2.1 \textbf{Content Integrity} - Data and publisher integrity is
  verified through use of signed hashes of the content.
\item
  2.2 \textbf{Decentralized Mirroring} - Users sharing the same Dat
  automatically discover each other and exhange data in a swarm.
\item
  2.3 \textbf{Network Privacy} - Dat provides certain privacy guarantees
  including end-to-end encryption.
\item
  2.4 \textbf{Incremental Versioning} - Datasets can be efficiently
  synced, even in real time, to other peers.
\end{itemize}

\subsection{2.1 Content Integrity}\label{content-integrity}

Content integrity means being able to verify the data you received is
the exact same version of the data that you expected. This is imporant
in a distributed system as this mechanism will catch incorrect data sent
by bad peers. It also has implications for reproducibility as it lets
you refer to a specific version of a dataset.

Link rot, when links online stop resolving, and content drift, when data
changes but the link to the data remains the same, are two common issues
in data analysis. For example, one day a file called data.zip might
change, but a typical HTTP link to the file does not include a hash of
the content, or provide a way to get updated metadata, so clients that
only have the HTTP link have no way to check if the file changed without
downloading the entire file again. Referring to a file by the hash of
its content is called content addressability, and lets users not only
verify that the data they receive is the version of the data they want,
but also lets people cite specific versions of the data by referring to
a specific hash.

Dat uses BLAKE2b (Aumasson et al. 2013) cryptographically secure hashes
to address content. Hashes are arranged in a Merkle tree (Mykletun,
Narasimha, and Tsudik 2003), a tree where each non-leaf node is the hash
of all child nodes. Leaf nodes contain pieces of the dataset. Due to the
properties of secure cryptographic hashes the top hash can only be
produced if all data below it matches exactly. If two trees have
matching top hashes then you know that all other nodes in the tree must
match as well, and you can conclude that your dataset is synchronized.
Trees are chosen as the primary data structure in Dat as they have a
number of properties that allow for efficient access to subsets of the
metadata, which allows Dat to work efficiently over a network
connection.

\subsubsection{Dat Links}\label{dat-links}

Dat links are Ed25519 (Bernstein et al. 2012) public keys which have a
length of 32 bytes (64 characters when Hex encoded). You can represent
your Dat link in the following ways and Dat clients will be able to
understand them:

\begin{itemize}
\tightlist
\item
  The standalone public key:
\end{itemize}

\texttt{8e1c7189b1b2dbb5c4ec2693787884771201da9...}

\begin{itemize}
\tightlist
\item
  Using the dat:// protocol:
\end{itemize}

\texttt{dat://8e1c7189b1b2dbb5c4ec2693787884771...}

\begin{itemize}
\tightlist
\item
  As part of an HTTP URL:
\end{itemize}

\texttt{https://datproject.org/8e1c7189b1b2dbb5...}

All messages in the Dat protocol are encrypted and signed using the
public key during transport. This means that unless you know the public
key (e.g.~unless the Dat link was shared with you) then you will not be
able to discover or communicate with any member of the swarm for that
Dat. Anyone with the public key can verify that messages (such as
entries in a Dat Stream) were created by a holder of the private key.

Every Dat repository has a corresponding private key that is kept in your
home folder and never shared. Dat never exposes either the public or
private key over the network. During the discovery phase the BLAKE2b
hash of the public key is used as the discovery key. This means that the
original key is impossible to discover (unless it was shared publicly
through a separate channel) since only the hash of the key is exposed
publicly.

Dat does not provide an authentication mechanism at this time. Instead
it provides a capability system. Anyone with the Dat link is currently
considered able to discover and access data. Do not share your Dat links
publicly if you do not want them to be accessed.

\subsubsection{Hypercore and Hyperdrive}\label{hypercore-and-hyperdrive}

The Dat storage, content integrity, and networking protocols are
implemented in a module called
\href{https://npmjs.org/hypercore}{Hypercore}. Hypercore is agnostic to
the format of the input data, it operates on any stream of binary data.
For the Dat use case of synchronizing datasets we use a file system
module on top of Hypercore called
\href{https://npmjs.org/hyperdrive}{Hyperdrive}.

Dat has a layered abstraction so that users can use Hypercore directly
to have full control over how they model their data. Hyperdrive works
well when your data can be represented as files on a filesystem, which
is the main use case with Dat.

\subsubsection{Hypercore Registers}\label{hypercore-registers}

Hypercore Registers are the core mechanism used in Dat. They are binary
append-only streams whose contents are cryptographically hashed and
signed and therefore can be verified by anyone with access to the public
key of the writer. They are an implemenation of the concept known as a
register, a digital ledger you can trust

Dat uses two registers, \texttt{content} and \texttt{metadata}. The
\texttt{content} register contains the files in your repository and
\texttt{metadata} contains the metadata about the files including name,
size, last modified time, etc. Dat replicates them both when
synchronizing with another peer.

When files are added to Dat, each file gets split up into some number of
chunks, and the chunks are then arranged into a Merkle tree, which is
used later for version control and replication processed.

\subsection{2.2 Decentralized Mirroring}\label{decentralized-mirroring}

Dat is a peer to peer protocol designed to exchange pieces of a dataset
amongst a swarm of peers. As soon as a peer acquires their first piece
of data in the dataset they can choose to become a partial mirror for
the dataset. If someone else contacts them and needs the piece they
have, they can choose to share it. This can happen simultaneously while
the peer is still downloading the pieces they want from others.

\subsubsection{Source Discovery}\label{source-discovery}

An important aspect of mirroring is source discovery, the techniques
that peers use to find each other. Source discovery means finding the IP
and port of data sources online that have a copy of that data you are
looking for. You can then connect to them and begin exchanging data. By
using source discovery techniques Dat is able to create a network where
data can be discovered even if the original data source disappears.

Source discovery can happen over many kinds of networks, as long as you
can model the following actions:

\begin{itemize}
\tightlist
\item
  \texttt{join(key,\ {[}port{]})} - Begin performing regular lookups on
  an interval for \texttt{key}. Specify \texttt{port} if you want to
  announce that you share \texttt{key} as well.
\item
  \texttt{leave(key,\ {[}port{]})} - Stop looking for \texttt{key}.
  Specify \texttt{port} to stop announcing that you share \texttt{key}
  as well.
\item
  \texttt{foundpeer(key,\ ip,\ port)} - Called when a peer is found by a
  lookup
\end{itemize}

In the Dat implementation we implement the above actions on top of three
types of discovery networks:

\begin{itemize}
\tightlist
\item
  DNS name servers - An Internet standard mechanism for resolving keys
  to addresses
\item
  Multicast DNS - Useful for discovering peers on local networks
\item
  Kademlia Mainline Distributed Hash Table - Less central points of
  failure, increases probability of Dat working even if DNS servers are
  unreachable
\end{itemize}

Additional discovery networks can be implemented as needed. We chose the
above three as a starting point to have a complementary mix of
strategies to increase the probability of source discovery. Additionally
you can specify a Dat via HTTPS link, which runs the Dat protocol in
``single-source'' mode, meaning the above discovery networks are not
used, and instead only that one HTTPS server is used as the only peer.

\subsubsection{Peer Connections}\label{peer-connections}

After the discovery phase, Dat should have a list of potential data
sources to try and contact. Dat uses either TCP, HTTP or
\href{https://en.wikipedia.org/wiki/Micro_Transport_Protocol}{UTP}
(Rossi et al. 2010). UTP uses LEDBAT which is designed to not take up
all available bandwidth on a network (e.g.~so that other people sharing
wifi can still use the Internet), and is still based on UDP so works
with NAT traversal techniques like UDP hole punching. HTTP is support
for compatibility with static file servers and web browser clients. Note
that these are the protocols we support in the reference Dat
implementation, but the Dat protocol itself is transport agnostic.

If an HTTP source is specified Dat will prefer that one over other
sources. Otherwise when Dat gets the IP and port for a potential TCP or
UTP source it tries to connect using both protocols. If one connects
first, Dat aborts the other one. If none connect, Dat will try again
until it decides that source is offline or unavailable and then stops
trying to connect to them. Sources Dat is able to connect to go into a
list of known good sources, so that if the Internet connection goes down
Dat can use that list to reconnect to known good sources again quickly.

If Dat gets a lot of potential sources it picks a handful at random to
try and connect to and keeps the rest around as additional sources to
use later in case it decides it needs more sources.

Once a duplex binary connection to a remote source is open Dat then
layers on the Hypercore protocol, a message based replication protocol
that allows two peers to communicate over a stateless channel to request
and exchange data. You open separate replication channels with many
peers at once which allows clients to parallelize data requests across
the entire pool of peers they have established connections with.

\subsection{2.3 Network Privacy}\label{network-privacy}

On the Web today, with SSL, there is a guarantee that the traffic
between your computer and the server is private. As long as you trust
the server to not leak your logs, attackers who intercept your network
traffic will not be able to read the HTTP traffic exchanged between you
and the server. This is a fairly straightforward model as clients only
have to trust a single server for some domain.

There is an inherent tradeoff in peer to peer systems of source
discovery vs.~user privacy. The more sources you contact and ask for
some data, the more sources you trust to keep what you asked for
private. Our goal is to have Dat be configurable in respect to this
tradeoff to allow application developers to meet their own privacy
guidelines.

It is up to client programs to make design decisions around which
discovery networks they trust. For example if a Dat client decides to
use the BitTorrent DHT to discover peers, and they are searching for a
publicly shared Dat key (e.g.~a key cited publicly in a published
scientific paper) with known contents, then because of the privacy
design of the BitTorrent DHT it becomes public knowledge what key that
client is searching for.

A client could choose to only use discovery networks with certain
privacy guarantees. For example a client could only connect to an
approved list of sources that they trust, similar to SSL. As long as
they trust each source, the encryption built into the Dat network
protocol will prevent the Dat key they are looking for from being
leaked.

\subsection{2.4 Incremental Versioning}\label{incremental-versioning}

Given a stream of binary data, Dat splits the stream into chunks, hashes
each chunk, and arranges the hashes in a specific type of Merkle tree
that allows for certain replication properties.

Dat is also able to fully or partially synchronize streams in a
distributed setting even if the stream is being appended to. This is
accomplished by using the messaging protocol to traverse the Merkle tree
of remote sources and fetch a strategic set of nodes. Due to the low
level message oriented design of the replication protocol different node
traversal strategies can be implemented.

There are two types of versioning performed automatically by Dat.
Metadata is stored in a folder called \texttt{.dat} in the root folder
of a repository, and data is stored as normal files in the root folder.

\subsubsection{Metadata Versioning}\label{metadata-versioning}

Dat tries as much as possible to act as a one-to-one mirror of the state
of a folder and all its contents. When importing files, Dat uses a
sorted depth-first recursion to list all the files in the tree. For each
file it finds, it grabs the filesystem metadata (filename, Stat object,
etc) and checks if there is already an entry for this filename with this
exact metadata already represented in the Dat repository metadata. If
the file with this metadata matches exactly the newest version of the
file metadata stored in Dat, then this file will be skipped (no change).

If the metadata differs from the current existing one (or there are no
entries for this filename at all in the history), then this new metadata
entry will be appended as the new `latest' version for this file in the
append-only SLEEP metadata content register (described below).

\subsubsection{Content Versioning}\label{content-versioning}

In addition to storing a historical record of filesystem metadata, the
content of the files themselves are also capable of being stored in a
version controlled manner. The default storage system used in Dat stores
the files as files. This has the advantage of being very straightforward
for users to understand, but the downside of not storing old versions of
content by default.

In contrast to other version control systems like Git, Dat by default
only stores the current set of checked out files on disk in the
repository folder, not old versions. It does store all previous metadata
for old versions in \texttt{.dat}. Git for example stores all previous
content versions and all previous metadata versions in the \texttt{.git}
folder. Because Dat is designed for larger datasets, if it stored all
previous file versions in \texttt{.dat}, then the \texttt{.dat} folder
could easily fill up the user's hard drive inadvertently. Therefore Dat
has multiple storage modes based on usage.

Hypercore registers include an optional \texttt{data} file that stores
all chunks of data. In Dat, only the \texttt{metadata.data} file is
used, but the \texttt{content.data} file is not used. The default
behavior is to store the current files only as normal files. If you want
to run an `archival' node that keeps all previous versions, you can
configure Dat to use the \texttt{content.data} file instead. For
example, on a shared server with lots of storage you probably want to
store all versions. However on a workstation machine that is only
accessing a subset of one version, the default mode of storing all
metadata plus the current set of downloaded files is acceptable, because
you know the server has the full history.

\subsubsection{Merkle Trees}\label{merkle-trees}

Registers in Dat use a specific method of encoding a Merkle tree where
hashes are positioned by a scheme called binary in-order interval
numbering or just ``bin'' numbering. This is just a specific,
deterministic way of laying out the nodes in a tree. For example, a tree
with 7 nodes will always be arranged like this:

\begin{verbatim}
0
  1
2
    3
4
  5
6
\end{verbatim}

In Dat, the hashes of the chunks of files are always even numbers, at
the wide end of the tree. So the above tree had four original values
that become the even numbers:

\begin{verbatim}
chunk0 -> 0
chunk1 -> 2
chunk2 -> 4
chunk3 -> 6
\end{verbatim}

In the resulting Merkle tree, the even and odd nodes store different
information:

\begin{itemize}
\tightlist
\item
  Evens - List of data hashes {[}chunk0, chunk1, chunk2, \ldots{}{]}
\item
  Odds - List of Merkle hashes (hashes of child even nodes) {[}hash0,
  hash1, hash2, \ldots{}{]}
\end{itemize}

These two lists get interleaved into a single register such that the
indexes (position) in the register are the same as the bin numbers from
the Merkle tree.

All odd hashes are derived by hashing the two child nodes, e.g.~given
hash0 is \texttt{hash(chunk0)} and hash2 is \texttt{hash(chunk1)}, hash1
is \texttt{hash(hash0\ +\ hash2)}.

For example a register with two data entries would look something like
this (pseudocode):

\begin{verbatim}
0. hash(value0)
1. hash(hash(chunk0) + hash(chunk1))
2. hash(value1)
\end{verbatim}

It is possible for the in-order Merkle tree to have multiple roots at
once. A root is defined as a parent node with a full set of child node
slots filled below it.

For example, this tree has 2 roots (1 and 4)

\begin{verbatim}
0
  1
2

4
\end{verbatim}

This tree has one root (3):

\begin{verbatim}
0
  1
2
    3
4
  5
6
\end{verbatim}

This one has one root (1):

\begin{verbatim}
0
  1
2
\end{verbatim}

\subsubsection{Replication Example}\label{replication-example}

This section describes in high level the replication flow of a Dat. Note
that the low level details are available by reading the SLEEP section
below. For the sake of illustrating how this works in practice in a
networked replication scenario, consider a folder with two files:

\begin{verbatim}
bat.jpg
cat.jpg
\end{verbatim}

To send these files to another machine using Dat, you would first add
them to a Dat repository by splitting them into chunks and constructing
SLEEP files representing the chunks and filesystem metadata.

Let's assume \texttt{cat.jpg} produces three chunks, and
\texttt{bat.jpg} produces four chunks, each around 64KB. Dat stores in a
representation called SLEEP, but here we will show a
pseudo-representation for the purposes of illustrating the replication
process. The seven chunks get sorted into a list like this:

\begin{verbatim}
bat-1
bat-2
bat-3
cat-1
cat-2
cat-3
\end{verbatim}

These chunks then each get hashed, and the hashes get arranged into a
Merkle tree (the content register):

\begin{verbatim}
0 - hash(bat-1)
    1 - hash(0 + 2)
2 - hash(bat-2)
        3 - hash(1 + 5)
4 - hash(bat-3)
    5 - hash(4 + 6)
6 - hash(cat-1)
8 - hash(cat-2)
    9 - hash()
10 - hash(cat-3)
\end{verbatim}

Next we calculate the root hashes of our tree, in this case 3 and 9. We
then hash them together, and cryptographically sign the hash. This
signed hash now can be used to verify all nodes in the tree, and the
signature proves it was produced by us, the holder of the private key
for this Dat.

This tree is for the hashes of the contents of the photos. There is also
a second Merkle tree that Dat generates that represents the list of
files and their metadata, and looks something like this (the metadata
register):

\begin{verbatim}
0 - hash({contentRegister: '9e29d624...'})
  1 - hash(0 + 2)
2 - hash({"bat.png", first: 0, last: 3})
4 - hash({"cat.png", first: 3, last: 3})
\end{verbatim}

The first entry in this feed is a special metadata entry that tells Dat
the address of the second feed (the content register). Note that node 3
is not included yet, because 3 is the hash of \texttt{1\ +\ 5}, but 5
does not exist yet, so will be written at a later update.

Now we're ready to send our metadata to the other peer. The first
message is a \texttt{Register} message with the key that was shared for
this Dat. Let's call ourselves Alice and the other peer Bob. Alice sends
Bob a \texttt{Want} message that declares they want all nodes in the
file list (the metadata register). Bob replies with a single
\texttt{Have} message that indicates he has 2 nodes of data. Alice sends
three \texttt{Request} messages, one for each leaf node
(\texttt{0,\ 2,\ 4}). Bob sends back three \texttt{Data} messages. The
first \texttt{Data} message contains the content register key, the hash
of the sibling, in this case node \texttt{2}, the hash of the uncle root
\texttt{4}, as well as a signature for the root hashes (in this case
\texttt{1,\ 4}). Alice verifies the integrity of this first
\texttt{Data} message by hashing the metadata received for the content
register metadata to produce the hash for node \texttt{0}. They then
hash the hash \texttt{0} with the hash \texttt{2} that was included to
reproduce hash \texttt{1}, and hashes their \texttt{1} with the value
for \texttt{4} they received which they can use the signature they
received to verify it was the same data. When the next \texttt{Data}
message is received, a similar process is performed to verify the
content.

Now Alice has the full list of files in the Dat, but decides they only
want to download \texttt{cat.png}. Alice knows they want blocks 3
through 6 from the content register. First Alice sends another
\texttt{Register} message with the content key to open a new replication
channel over the connection. Then Alice sends three \texttt{Request}
messages, one for each of blocks \texttt{4,\ 5,\ 6}. Bob sends back
three \texttt{Data} messages with the data for each block, as well as
the hashes needed to verify the content in a way similar to the process
described above for the metadata feed.

\subsection{3. SLEEP Specification}\label{sleep-specification}

This section is a technical description of the SLEEP format intended for
implementers. SLEEP is the the on-disk format that Dat produces and
uses. It is a set of 9 files that hold all of the metadata needed to
list the contents of a Dat repository and verify the integrity of the
data you receive. SLEEP is designed to work with REST, allowing servers
to be plain HTTP file servers serving the static SLEEP files, meaning
you can implement a Dat protocol client using HTTP with a static HTTP
file server as the backend.

SLEEP files contain metadata about the data inside a Dat repository,
including cryptographic hashes, cryptographic signatures, filenames and
file permissions. The SLEEP format is specifically engineered to allow
efficient access to subsets of the metadat and/or data in the
repository, even on very large repositories, which enables Dat's peer to
peer networking to be fast.

The acronym SLEEP is a slumber related pun on REST and stands for
Syncable Lightweight Event Emitting Persistence. The Event Emitting part
refers to how SLEEP files are append-only in nature, meaning they grow
over time and new updates can be subscribed to as a realtime feed of
events through the Dat protocol.

The SLEEP version described here, used in Dat as of 2017 is SLEEP V2.
SLEEP V1 is documented at http://specs.okfnlabs.org/sleep.

\subsubsection{SLEEP Files}\label{sleep-files}

SLEEP is a set of 9 files that should be stored with the following
names. In Dat, the files are stored in a folder called \texttt{.dat} in
the top level of the repository.

\begin{verbatim}
metadata.key
metadata.signatures
metadata.bitfield
metadata.tree
metadata.data
content.key
content.signatures
content.bitfield
content.tree
\end{verbatim}

The files prefixed with \texttt{content} store metadata about the
primary data in a Dat repository, for example the raw binary contents of
the files. The files prefixed with \texttt{metadata} store metadata
about the files in the repository, for example the filenames, file
sizes, and file permissions. The \texttt{content} and \texttt{metadata}
files are both Hypercore registers, making SLEEP a set of two Hypercore
registers.

\subsubsection{SLEEP File Headers}\label{sleep-file-headers}

The following structured binary format is used for \texttt{signatures},
\texttt{bitfield}, and \texttt{tree} files. The header contains metadata
as well as information needed to decode the rest of the files after the
header. SLEEP files are designed to be easy to append new data to at the
end, easy to read arbitrary byte offsets in the middle, and are
relatively flat, simple files that rely on the filesystem for the heavy
lifting.

SLEEP files are laid out like this:

\begin{verbatim}
<32 byte header>
<fixed-size entry 1>
<fixed-size entry 2>
<fixed-size entry ...>
<fixed-size entry n>
\end{verbatim}

\begin{itemize}
\tightlist
\item
  32 byte header
\item
  4 bytes - magic byte (value varies depending on which file, used to
  quickly identify which file type it is)
\item
  1 byte - version number of the file header protocol, current version
  is 0
\item
  2 byte Uint16BE - entry size, describes how long each entry in the
  file is
\item
  1 byte - length prefix for body
\item
  rest of 32 byte header - string describing key algorithm (in dat
  `ed25519'). length of this string matches the length in the previous
  length prefix field. This string must fit within the 32 byte header
  limitation (24 bytes reserved for string). Unused bytes should be
  filled with zeroes.
\end{itemize}

Possible values in the Dat implementation for the body field are:

\begin{verbatim}
Ed25519
BLAKE2b
\end{verbatim}

To calculate the offset of some entry position, first read the header
and get the entry size, then do
\texttt{32\ +\ entrySize\ *\ entryIndex}. To calculate how many entries
are in a file, you can use the entry size and the filesize on disk and
do \texttt{(fileSize\ -\ 32)\ /\ entrySize}.

As mentioned above, \texttt{signatures}, \texttt{bitfield} and
\texttt{tree} are the three SLEEP files. There are two additional files,
\texttt{key}, and \texttt{data}, which do not contain SLEEP file headers
and store plain serialized data for easy access. \texttt{key} stores the
public key that is described by the \texttt{signatures} file, and
\texttt{data} stores the raw chunk data that the \texttt{tree} file
contains the hashes and metadata for.

\subsubsection{File Descriptions}\label{file-descriptions}

\paragraph{key}\label{key}

The public key used to verify the signatures in the \texttt{signatures}
file. Stored in binary as a single buffer written to disk. To find out
what format of key is stored in this file, read the header of
\texttt{signatures}. In Dat, it's always a ed25519 public key, but other
implementations can specify other key types using a string value in that
header.

\paragraph{tree}\label{tree}

A SLEEP formatted 32 byte header with data entries representing a
serialized merkle tree based on the data in the data storage layer. All
the fixed size nodes written in in-order tree notation. The header
algorithm string for \texttt{tree} files is \texttt{BLAKE2b}. The entry
size is 40 bytes. Entries are formatted like this:

\begin{verbatim}
<32 byte header>
  <4 byte magic string: 0x05025702>
  <1 byte version number: 0>
  <2 byte entry size: 40>
  <1 byte algorithm name length prefix: 7>
  <7 byte algorithm name: BLAKE2b>
  <17 zeroes>
<40 byte entries>
  <32 byte BLAKE2b hash>
  <8 byte Uint64BE children leaf byte length>
\end{verbatim}

The children leaf byte length is the byte size containing the sum byte
length of all leaf nodes in the tree below this node.

This file uses the in-order notation, meaning even entries are leaf
nodes and odd entries are parent nodes (non-leaf).

To prevent pre-image attacks, all hashes start with a one byte type
descriptor:

\begin{verbatim}
0 - LEAF
1 - PARENT
2 - ROOT
\end{verbatim}

To calculate leaf node entries (the hashes of the data entries) we hash
this data:

\begin{verbatim}
BLAKE2b(
  <1 byte type>
    0
  <8 bytes Uint64BE>
    length of entry data
  <entry data>
)
\end{verbatim}

Then we take this 32 byte hash and write it to the tree as 40 bytes like
this:

\begin{verbatim}
<32 bytes>
  BLAKE2b hash
<8 bytes Uint64BE>
  length of data
\end{verbatim}

Note that the Uint64 of length of data is included both in the hashed
data and written at the end of the entry. This is to expose more
metadata to Dat for advanced use cases such as verifying data length in
sparse replication scenarios.

To calculate parent node entries (the hashes of the leaf nodes) we hash
this data:

\begin{verbatim}
BLAKE2b(
  <1 byte>
    1
  <8 bytes Uint64BE>
    left child length + right child length
  <32 bytes>
    left child hash
  <32 bytes>
    right child hash
)
\end{verbatim}

Then we take this 32 byte hash and write it to the tree as 40 bytes like
this:

\begin{verbatim}
<32 bytes>
  BLAKE2b hash
<8 bytes Uint64BE>
  left child length + right child length
\end{verbatim}

The reason the tree entries contain data lengths is to allow for sparse
mode replication. Encoding lengths (and including lengths in all hashes)
means you can verify the merkle subtrees independent of the rest of the
tree, which happens during sparse replication scenarios.

The tree file corresponds directly to the \texttt{data} file.

\paragraph{data}\label{data}

The \texttt{data} file is only included in the SLEEP format for the
\texttt{metadata.*} prefixed files which contains filesystem metadata
and not actual file data. For the \texttt{content.*} files, the data is
stored externally (in Dat it is stored as normal files on the filesystem
and not in a SLEEP file). However you can configure Dat to use a
\texttt{content.data} file if you want and it will still work. If you
want to store the full history of all versions of all files, using the
\texttt{content.data} file would provide that guarantee, but would have
the disadvantage of storing files as chunks merged into one huge file
(not as user friendly).

The \texttt{data} file does not contain a SLEEP file header. It just
contains a bunch of concatenated data entries. Entries are written in
the same order as they appear in the \texttt{tree} file. To read a
\texttt{data} file, first decode the \texttt{tree} file and for every
leaf in the \texttt{tree} file you can calculate a data offset for the
data described by that leaf node in the \texttt{data} file.

\subparagraph{Index Lookup}\label{index-lookup}

For example, if we wanted to seek to a specific entry offset (say entry
42):

\begin{itemize}
\tightlist
\item
  First, read the header of the \texttt{tree} file and get the entry
  size, then do \texttt{32\ +\ entrySize\ *\ 42} to get the raw tree
  index: \texttt{32\ +\ (40\ *\ 42)}
\item
  Since we want the leaf entry (even node in the in-order layout), we
  multiply the entry index by 2: \texttt{32\ +\ (40\ *\ (42\ *\ 2))}
\item
  Read the 40 bytes at that offset in the \texttt{tree} file to get the
  leaf node entry.
\item
  Read the last 8 bytes of the entry to get the length of the data entry
\item
  To calculate the offset of where in the \texttt{data} file your entry
  begins, you need to sum all the lengths of all the earlier entries in
  the tree. The most efficient way to do this is to sum all the previous
  parent node (non-leaf) entry lengths. You can also sum all leaf node
  lengths, but parent nodes contain the sum of their childrens lengths
  so it's more efficient to use parents. During Dat replication, these
  nodes are fetched as part of the Merkle tree verification so you will
  already have them locally. This is a log(N) operation where N is the
  entry index. Entries are also small and therefore easily cacheable.
\item
  Once you get the offset, you use the length you decoded above and read
  N bytes (where N is the decoded length) at the offset in the
  \texttt{data} file. You can verify the data integrity using the 32
  byte hash from the \texttt{tree} entry.
\end{itemize}

\subparagraph{Byte Lookup}\label{byte-lookup}

The above method illustrates how to resolve a chunk position index to a
byte offset. You can also do the reverse operation, resolving a byte
offset to a chunk position index. This is used to stream arbitrary
random access regions of files in sparse replication scenarios.

\begin{itemize}
\tightlist
\item
  First, you start by calculating the current Merkle roots
\item
  Each node in the tree (including these root nodes) stores the
  aggregate file size of all byte sizes of the nodes below it. So the
  roots cumulatively will describe all possible byte ranges for this
  repository.
\item
  Find the root that contains the byte range of the offset you are
  looking for and get the node information for all of that nodes
  children using the Index Lookup method, and recursively repeat this
  step until you find the lowest down child node that describes this
  byte range.
\item
  The chunk described by this child node will contain the byte range you
  are looking for. You can use the \texttt{byteOffset} property in the
  \texttt{Stat} metadata object to seek into the right position in the
  content for the start of this chunk.
\end{itemize}

\subparagraph{Metadata Overhead}\label{metadata-overhead}

Using this scheme, if you write 4GB of data using on average 64KB data
chunks (note: chunks can be variable length and do not need to be the
same size), your tree file will be around 5MB (0.0125\% overhead).

\paragraph{signatures}\label{signatures}

A SLEEP formatted 32 byte header with data entries being 64 byte
signatures.

\begin{verbatim}
<32 byte header>
  <4 byte magic string: 0x05025701>
  <1 byte version number: 0>
  <2 byte entry size: 64>
  <1 byte algorithm name length prefix: 7>
  <7 byte algorithm name: Ed25519>
  <17 zeroes>
<64 byte entries>
  <64 byte Ed25519 signature>
\end{verbatim}

Every time the tree is updated we sign the current roots of the Merkle
tree, and append them to the signatures file. The signatures file starts
with no entries. Each time a new leaf is appended to the \texttt{tree}
file (aka whenever data is added to a Dat), we take all root hashes at
the current state of the Merkle tree and hash and sign them, then append
them as a new entry to the signatures file.

\begin{verbatim}
Ed25519 sign(
  BLAKE2b(
    <1 byte>
      2 // root type
    for (every root node left-to-right) {
      <32 byte root hash>
      <8 byte Uint64BE root tree index>
      <8 byte Uint64BE child byte lengths>
    }
  )
)
\end{verbatim}

The reason we hash all the root nodes is that the BLAKE2b hash above is
only calculable if you have all of the pieces of data required to
generate all the intermediate hashes. This is the crux of Dat's data
integrity guarantees.

\paragraph{bitfield}\label{bitfield}

A SLEEP formatted 32 byte header followed by a series of 3328 byte long
entries.

\begin{verbatim}
<32 byte header>
  <4 byte magic string: 0x05025700>
  <1 byte version number: 0>
  <2 byte entry size: 3328>
  <1 byte algorithm name length: 0>
  <1 byte algorithm name: 0>
  <24 zeroes>
<3328 byte entries> // (2048 + 1024 + 256)
\end{verbatim}

The bitfield describes which pieces of data you have, and which nodes in
the \texttt{tree} file have been written. This file exists as an index
of the \texttt{tree} and \texttt{data} to quickly figure out which
pieces of data you have or are missing. This file can be regenerated if
you delete it, so it is considered a materialized index.

The \texttt{bitfield} file actually contains three bitfields of
different sizes. A bitfield (AKA bitmap) is defined as a set of bits
where each bit (0 or 1) represents if you have or do not have a piece of
data at that bit index. So if there is a dataset of 10 cat pictures, and
you have pictures 1, 3, and 5 but are missing the rest, your bitfield
would look like \texttt{1010100000}.

Each entry contains three objects:

\begin{itemize}
\tightlist
\item
  Data Bitfield (1024 bytes) - 1 bit for each data entry that you
  have synced (1 for every entry in \texttt{data}).
\item
  Tree Bitfield (2048 bytes) - 1 bit for every tree entry (all nodes in
  \texttt{tree})
\item
  Bitfield Index (256 bytes) - This is an index of the Data Bitfield
  that makes it efficient to figure out which pieces of data are missing
  from the Data Bitfield without having to do a linear scan.
\end{itemize}

The Data Bitfield is 1Kb somewhat arbitrarily, but the idea is that
because most filesystems work in 4Kb chunk sizes, we can fit the Data,
Tree and Index in less then 4Kb of data for efficient writes to the
filesystem. The Tree and Index sizes are based on the Data size (the
Tree has twice the entries as the Data, odd and even nodes vs just even
nodes in \texttt{tree}, and Index is always 1/4th the size).

To generate the Index, you pair 2 bytes at a time from the Data
Bitfield, check if all bits in the 2 bytes are the same, and generate 4
bits of Index metadata~for every 2 bytes of Data (hence how 1024 bytes
of Data ends up as 256 bytes of Index).

First you generate a 2 bit tuple for the 2 bytes of Data:

\begin{verbatim}
if (data is all 1's) then [1,1]
if (data is all 0's) then [0,0]
if (data is not all the same) then [1, 0]
\end{verbatim}

The Index itself is an in-order binary tree, not a traditional bitfield.
To generate the tree, you take the tuples you generate above and then
write them into a tree like the following example, where non-leaf nodes
are generated using the above scheme by looking at the results of the
relative even child tuples for each odd parent tuple:

\begin{verbatim}
// for e.g. 16 bytes (8 tuples) of
// sparsely replicated data
0 - [00 00 00 00]
1 -     [10 10 10 10]
2 - [11 11 11 11]
\end{verbatim}

The tuples at entry \texttt{1} above are \texttt{{[}1,0{]}} because the
relative child tuples are not uniform. In the following example, all
non-leaf nodes are \texttt{{[}1,1{]}} because their relative children
are all uniform (\texttt{{[}1,1{]}})

\begin{verbatim}
// for e.g. 32 bytes (16 tuples) of
// fully replicated data (all 1's)
0 - [11 11 11 11]
1 -     [11 11 11 11]
2 - [11 11 11 11]
3 -         [11 11 11 11]
4 - [11 11 11 11]
5 -     [11 11 11 11]
6 - [11 11 11 11]
\end{verbatim}

Using this scheme, to represent 32 bytes of data it takes at most 8
bytes of Index. In this example it compresses nicely as its all
contiguous ones on disk, similarly for an empty bitfield it would be all
zeroes.

If you write 4GB of data using on average 64KB data chunk size, your
bitfield will be at most 32KB.

\paragraph{metadata.data}\label{metadata.data}

This file is used to store content described by the rest of the
\texttt{metadata.*} hypercore SLEEP files. Whereas the
\texttt{content.*} SLEEP files describe the data stored in the actual
data cloned in the Dat repository filesystem, the \texttt{metadata} data
feed is stored inside the \texttt{.dat} folder along with the rest of
the SLEEP files.

The contents of this file is a series of versions of the Dat filesystem
tree. As this is a hypercore data feed, it's just an append only log of
binary data entries. The challenge is representing a tree in a one
dimensional way to make it representable as a Hypercore register. For
example, imagine three files:

\begin{verbatim}
~/dataset $ ls
figures
  graph1.png
  graph2.png
results.csv

1 directory, 3 files
\end{verbatim}

We want to take this structure and map it to a serialized representation
that gets written into an append only log in a way that still allows for
efficient random access by file path.

To do this, we convert the filesystem metadata into entries in a feed
like this:

\begin{verbatim}
{
  "path": "/results.csv",
  children: [[]],
  sequence: 0
}
{
  "path": "/figures/graph1.png",
  children: [[0], []],
  sequence: 1
}
{
  "path": "/figures/graph2",
  children: [[0], [1]],
  sequence: 2
}
\end{verbatim}

\subparagraph{Filename Resolution}\label{filename-resolution}

Each sequence represents adding one of the files to the register, so at
sequence 0 the filesystem state only has a single file,
\texttt{results.csv} in it. At sequence 1, there are only 2 files added
to the register, and at sequence 3 all files are finally added. The
\texttt{children} field represents a shorthand way of declaring which
other files at every level of the directory hierarchy exist alongside
the file being added at that revision. For example at the time of
sequence 1, children is \texttt{{[}{[}0{]},\ {[}{]}{]}}. The first
sub-array, \texttt{{[}0{]}}, represents the first folder in the
\texttt{path}, which is the root folder \texttt{/}. In this case
\texttt{{[}0{]}} means the root folder at this point in time only has a
single file, the file that is the subject of sequence \texttt{0}. The
second subarray is empty \texttt{{[}{]}} because there are no other
existing files in the second folder in the \texttt{path},
\texttt{figures}.

To look up a file by filename, you fetch the latest entry in the log,
then use the \texttt{children} metadata in that entry to look up the
longest common ancestor based on the parent folders of the filename you
are querying. You can then recursively repeat this operation until you
find the \texttt{path} entry you are looking for (or you exhaust all
options which means the file does not exist). This is a
\texttt{O(number\ of\ slashes\ in\ your\ path)} operation.

For example, if you wanted to look up \texttt{/results.csv} given the
above register, you would start by grabbing the metadata at sequence 2.
The longest common ancestor between \texttt{/results.csv} and
\texttt{/figures/graph2} is \texttt{/}. You then grab the corresponding
entry in the children array for \texttt{/}, which in this case is the
first entry, \texttt{{[}0{]}}. You then repeat this with all of the
chilren entries until you find a child that is closer to the entry you
are looking for. In this example, the first entry happens to be the
match we are looking for.

You can also perform lookups relative to a point in time by starting
from a specific sequence number in the register. For example to get the
state of some file relative to an old sequence number, similar to
checking out an old version of a repository in Git.

\subparagraph{Data Serialization}\label{data-serialization}

The format of the \texttt{metadata.data} file is as follows:

\begin{verbatim}
<Header>
<Node 1>
<Node 2>
<Node ...>
<Node N>
\end{verbatim}

Each entry in the file is encoded using Protocol Buffers (Varda 2008).

The first message we write to the file is of a type called Header which
uses this schema:

\begin{verbatim}
message Header {
  required string type = 1;
  optional bytes content = 2;
}
\end{verbatim}

This is used to declare two pieces of metadata used by Dat. It includes
a \texttt{type} string with the value \texttt{hyperdrive} and
\texttt{content} binary value that holds the public key of the content
register that this metadata register represents. When you share a Dat,
the metadata key is the main key that gets used, and the content
register key is linked from here in the metadata.

After the header the file will contain many filesystem \texttt{Node}
entries:

\begin{verbatim}
message Node {
  required string path = 1;
  optional Stat value = 2;
  optional bytes children = 3;
}
\end{verbatim}

The \texttt{Node} object has three fields

\begin{itemize}
\tightlist
\item
  \texttt{path} - the string of the absolute file path of this file.
\item
  \texttt{Stat} - a Stat encoded object representing the file metadata
\item
  \texttt{children} - a compressed list of the sequence numbers as
  described earlier
\end{itemize}

The \texttt{children} value is encoded by starting with the nested array
of sequence numbers, e.g. \texttt{{[}{[}3{]},\ {[}2,\ 1{]}{]}}. You then
sort the individual arrays, in this case resulting in
\texttt{{[}{[}3{]},\ {[}1,\ 2{]}{]}}. You then delta compress each
subarray by storing the difference between each integer. In this case it
would be \texttt{{[}{[}3{]},\ {[}1,\ 1{]}{]}} because \texttt{3} is 3
more than 0, \texttt{1} is 1 more than than 0, and \texttt{2} is 1 more
than \texttt{1}.

When we write these delta compressed subarrays we write them using
variable width integers (varints), using a repeating pattern like this,
one for each array:

\begin{verbatim}
<varint of first subarray element length>
<varint of the first delta in this array>
<varint of the next delta ...>
<varint of the last delta>
\end{verbatim}

This encoding is designed for efficiency as it reduces the filesystem
path metadata down to a series of small integers.

The \texttt{Stat} objects use this encoding:

\begin{verbatim}
message Stat {
  required uint32 mode = 1;
  optional uint32 uid = 2;
  optional uint32 gid = 3;
  optional uint64 size = 4;
  optional uint64 blocks = 5;
  optional uint64 offset = 6;
  optional uint64 byteOffset = 7;
  optional uint64 mtime = 8;
  optional uint64 ctime = 9;
}
\end{verbatim}

These are the field defintions:

\begin{itemize}
\tightlist
\item
  \texttt{mode} - posix file mode bitmask
\item
  \texttt{uid} - posix user id
\item
  \texttt{gid} - posix group id
\item
  \texttt{size} - file size in bytes
\item
  \texttt{blocks} - number of data chunks that make up this file
\item
  \texttt{offset} - the data feed entry index for the first chunk in
  this file
\item
  \texttt{byteOffset} - the data feed file byte offset for the first
  chunk in this file
\item
  \texttt{mtime} - posix modified\_at time
\item
  \texttt{mtime} - posix created\_at time
\end{itemize}

\subsection{4. Dat Network Protocol}\label{dat-network-protocol}

The SLEEP format is designed to allow for sparse replication, meaning
you can efficiently download only the metadata and data required to
resolve a single byte region of a single file, which makes Dat suitable
for a wide variety of streaming, real time and large dataset use cases.

To take advantage of this, Dat includes a network protocol. It is
message based and stateless, making it possible to implement on a
variety of network transport protocols including UDP and TCP. Both
metadata and content registers in SLEEP share the exact same replication
protocol.

Individual messages are encoded using Protocol Buffers and there are ten
message types using the following schema:

\subsubsection{Wire Protocol}\label{wire-protocol}

Over the wire messages are packed in the following lightweight container
format

\begin{verbatim}
<varint - length of rest of message>
  <varint - header>
  <message>
\end{verbatim}

The \texttt{header} value is a single varint that has two pieces of
information, the integer \texttt{type} that declares a 4-bit message
type (used below), and a channel identifier, \texttt{0} for metadata and
\texttt{1} for content.

To generate this varint, you bitshift the 4-bit type integer onto the
end of the channel identifier, e.g.
\texttt{channel\ \textless{}\textless{}\ 4\ \textbar{}\ \textless{}4-bit-type\textgreater{}}.

\subsubsection{Register}\label{register}

Type 0, should be the first message sent on a channel.

\begin{itemize}
\tightlist
\item
  \texttt{discoveryKey} - A BLAKE2b keyed hash of the string `hypercore'
  using the public key of the metadata register as the key.
\item
  \texttt{nonce} - 32 bytes of random binary data, used in our
  encryption scheme
\end{itemize}

\begin{verbatim}
message Register {
  required bytes discoveryKey = 1;
  optional bytes nonce = 2;
}
\end{verbatim}

\subsubsection{Handshake}\label{handshake}

Type 1. Overall connection handshake. Should be sent just after the
register message on the first channel only (metadata).

\begin{itemize}
\tightlist
\item
  \texttt{id} - 32 byte random data used as an identifier for this peer
  on the network, useful for checking if you are connected to yourself
  or another peer more than once
\item
  \texttt{live} - Whether or not you want to operate in live
  (continuous) replication mode or end after the initial sync
\end{itemize}

\begin{verbatim}
message Handshake {
  optional bytes id = 1;
  optional bool live = 2;
}
\end{verbatim}

\subsubsection{Status}\label{status}

Type 2. Message indicating state changes. Used to indicate whether you
are uploading and/or downloading.

Initial state for uploading/downloading is true. If both ends are not
downloading and not live it is safe to consider the stream ended.

\begin{verbatim}
message Status {
  optional bool uploading = 1;
  optional bool downloading = 2;
}
\end{verbatim}

\subsubsection{Have}\label{have}

Type 3. How you tell the other peer what chunks of data you have or
don't have. You should only send Have messages to peers who have
expressed interest in this region with Want messages.

\begin{itemize}
\tightlist
\item
  \texttt{start} - If you only specify \texttt{start}, it means you are
  telling the other side you only have 1 chunk at the position at the
  value in \texttt{start}.
\item
  \texttt{length} - If you specify length, you can describe a range of
  values that you have all of, starting from \texttt{start}.
\item
  \texttt{bitfield} - If you would like to send a range of sparse data
  about haves/don't haves via bitfield, relative to \texttt{start}.
\end{itemize}

\begin{verbatim}
message Have {
  required uint64 start = 1;
  optional uint64 length = 2 [default = 1];
  optional bytes bitfield = 3;
}
\end{verbatim}

When sending bitfields you must run length encode them. The encoded
bitfield is a series of compressed and uncompressed bit sequences. All
sequences start with a header that is a varint.

If the last bit is set in the varint (it is an odd number) then a header
represents a compressed bit sequence.

\begin{verbatim}
compressed-sequence = varint(
  byte-length-of-sequence
  << 2 | bit << 1 | 1
)
\end{verbatim}

If the last bit is \emph{not} set then a header represents an non
compressed sequence

\begin{verbatim}
uncompressed-sequence = varint(
  byte-length-of-bitfield << 1 | 0
) + (bitfield)
\end{verbatim}

\subsubsection{Unhave}\label{unhave}

Type 4. How you communicate that you deleted or removed a chunk you used
to have.

\begin{verbatim}
message Unhave {
  required uint64 start = 1;
  optional uint64 length = 2 [default = 1];
}
\end{verbatim}

\subsubsection{Want}\label{want}

Type 5. How you ask the other peer to subscribe you to Have messages for
a region of chunks. The \texttt{length} value defaults to Infinity or
feed.length (if not live).

\begin{verbatim}
message Want {
  required uint64 start = 1;
  optional uint64 length = 2;
}
\end{verbatim}

\subsubsection{Unwant}\label{unwant}

Type 6. How you ask to unsubscribe from Have messages for a region of
chunks from the other peer. You should only Unwant previously Wanted
regions, but if you do Unwant something that hasn't been Wanted it won't
have any effect. The \texttt{length} value defaults to Infinity or
feed.length (if not live).

\begin{verbatim}
message Unwant {
  required uint64 start = 1;
  optional uint64 length = 2;
}
\end{verbatim}

\subsubsection{Request}\label{request}

Type 7. Request a single chunk of data.

\begin{itemize}
\tightlist
\item
  \texttt{index} - The chunk index for the chunk you want. You should
  only ask for indexes that you have received the Have messages for.
\item
  \texttt{bytes} - You can also optimistically specify a byte offset,
  and in the case the remote is able to resolve the chunk for this byte
  offset depending on their Merkle tree state, they will ignore the
  \texttt{index} and send the chunk that resolves for this byte offset
  instead. But if they cannot resolve the byte request, \texttt{index}
  will be used.
\item
  \texttt{hash} - If you only want the hash of the chunk and not the
  chunk data itself.
\item
  \texttt{nodes} - A 64 bit long bitfield representing which parent
  nodes you have.
\end{itemize}

The \texttt{nodes} bitfield is an optional optimization to reduce the
amount of duplicate nodes exchanged during the replication lifecycle. It
indicates which parents you have or don't have. You have a maximum of 64
parents you can specify. Because \texttt{uint64} in Protocol Buffers is
implemented as a varint, over the wire this does not take up 64 bits in
most cases. The first bit is reserved to signify whether or not you need
a signature in response. The rest of the bits represent whether or not
you have (\texttt{1}) or don't have (\texttt{0}) the information at this
node already. The ordering is determined by walking parent, sibling up
the tree all the way to the root.

\begin{verbatim}
message Request {
  required uint64 index = 1;
  optional uint64 bytes = 2;
  optional bool hash = 3;
  optional uint64 nodes = 4;
}
\end{verbatim}

\subsubsection{Cancel}\label{cancel}

Type 8. Cancel a previous Request message that you haven't received yet.

\begin{verbatim}
message Cancel {
  required uint64 index = 1;
  optional uint64 bytes = 2;
  optional bool hash = 3;
}
\end{verbatim}

\subsubsection{Data}\label{data-1}

Type 9. Sends a single chunk of data to the other peer. You can send it
in response to a Request or unsolicited on its own as a friendly gift.
The data includes all of the Merkle tree parent nodes needed to verify
the hash chain all the way up to the Merkle roots for this chunk.
Because you can produce the direct parents by hashing the chunk, only
the roots and `uncle' hashes are included (the siblings to all of the
parent nodes).

\begin{itemize}
\tightlist
\item
  \texttt{index} - The chunk position for this chunk.
\item
  \texttt{value} - The chunk binary data. Empty if you are sending only
  the hash.
\item
  \texttt{Node.index} - The index for this chunk in in-order notation
\item
  \texttt{Node.hash} - The hash of this chunk
\item
  \texttt{Node.size}- The aggregate chunk size for all children below
  this node (The sum of all chunk sizes of all children)
\item
  \texttt{signature} - If you are sending a root node, all root nodes
  must have the signature included.
\end{itemize}

\begin{verbatim}
message Data {
  required uint64 index = 1;
  optional bytes value = 2;
  repeated Node nodes = 3;
  optional bytes signature = 4;

  message Node {
    required uint64 index = 1;
    required bytes hash = 2;
    required uint64 size = 3;
  }
}
\end{verbatim}

\section{5. Existing Work}\label{existing-work}

Dat is inspired by a number of features from existing systems.

\subsection{Git}\label{git}

Git popularized the idea of a directed acyclic graph (DAG) combined with
a Merkle tree, a way to represent changes to data where each change is
addressed by the secure hash of the change plus all ancestor hashes in a
graph. This provides a way to trust data integrity, as the only way a
specific hash could be derived by another peer is if they have the same
data and change history required to reproduce that hash. This is
important for reproducibility as it lets you trust that a specific git
commit hash refers to a specific source code state.

Decentralized version control tools for source code like Git provide a
protocol for efficiently downloading changes to a set of files, but are
optimized for text files and have issues with large files. Solutions
like Git-LFS solve this by using HTTP to download large files, rather
than the Git protocol. GitHub offers Git-LFS hosting but charges
repository owners for bandwidth on popular files. Building a distributed
distribution layer for files in a Git repository is difficult due to
design of Git Packfiles, which are delta compressed repository states
that do not easily support random access to byte ranges in previous file
versions.

\subsection{BitTorrent}\label{bittorrent}

BitTorrent implements a swarm based file sharing protocol for static
datasets. Data is split into fixed sized chunks, hashed, and then that
hash is used to discover peers that have the same data. An advantage of
using BitTorrent for dataset transfers is that download bandwidth can be
fully saturated. Since the file is split into pieces, and peers can
efficiently discover which pieces each of the peers they are connected
to have, it means one peer can download non-overlapping regions of the
dataset from many peers at the same time in parallel, maximizing network
throughput.

Fixed sized chunking has drawbacks for data that changes (see LBFS
above). BitTorrent assumes all metadata will be transferred up front
which makes it impractical for streaming or updating content. Most
BitTorrent clients divide data into 1024 pieces meaning large datasets
could have a very large chunk size which impacts random access
performance (e.g.~for streaming video).

Another drawback of BitTorrent is due to the way clients advertise and
discover other peers in absence of any protocol level privacy or trust.
From a user privacy standpoint, BitTorrent leaks what users are
accessing or attempting to access, and does not provide the same
browsing privacy functions as systems like SSL.

\subsection{Kademlia Distributed Hash
Table}\label{kademlia-distributed-hash-table}

Kademlia (Maymounkov and Mazieres 2002) is a distributed hash table, a
distributed key/value store that can serve a similar purpose to DNS
servers but has no hard coded server addresses. All clients in Kademlia
are also servers. As long as you know at least one address of another
peer in the network, you can ask them for the key you are trying to find
and they will either have it or give you some other people to talk to
that are more likely to have it.

If you don't have an initial peer to talk to you, most clients use a
bootstrap server that randomly gives you a peer in the network to start
with. If the bootstrap server goes down, the network still functions as
long as other methods can be used to bootstrap new peers (such as
sending them peer addresses through side channels like how .torrent
files include tracker addresses to try in case Kademlia finds no peers).

Kademlia is distinct from previous DHT designs due to its simplicity. It
uses a very simple XOR operation between two keys as its ``distance''
metric to decide which peers are closer to the data being searched for.
On paper it seems like it wouldn't work as it doesn't take into account
things like ping speed or bandwidth. Instead its design is very simple
on purpose to minimize the amount of control/gossip messages and to
minimize the amount of complexity required to implement it. In practice
Kademlia has been extremely successful and is widely deployed as the
``Mainline DHT'' for BitTorrent, with support in all popular BitTorrent
clients today.

Due to the simplicity in the original Kademlia design a number of
attacks such as DDOS and/or sybil have been demonstrated. There are
protocol extensions (BEPs) which in certain cases mitigate the effects
of these attacks, such as BEP 44 which includes a DDOS mitigation
technique. Nonetheless anyone using Kademlia should be aware of the
limitations.

\subsection{Peer to Peer Streaming Peer Protocol
(PPSPP)}\label{peer-to-peer-streaming-peer-protocol-ppspp}

PPSPP
(\href{https://datatracker.ietf.org/doc/rfc7574/?include_text=1}{IETF
RFC 7574}, (Bakker, Petrocco, and Grishchenko 2015)) is a protocol for
live streaming content over a peer to peer network. In it they define a
specific type of Merkle Tree that allows for subsets of the hashes to be
requested by a peer in order to reduce the time-till-playback for end
users. BitTorrent for example transfers all hashes up front, which is
not suitable for live streaming.

Their Merkle trees are ordered using a scheme they call ``bin
numbering'', which is a method for deterministically arranging an
append-only log of leaf nodes into an in-order layout tree where
non-leaf nodes are derived hashes. If you want to verify a specific
node, you only need to request its sibling's hash and all its uncle
hashes. PPSPP is very concerned with reducing round trip time and
time-till-playback by allowing for many kinds of optimizations, such as
to pack as many hashes into datagrams as possible when exchanging tree
information with peers.

Although PPSPP was designed with streaming video in mind, the ability to
request a subset of metadata from a large and/or streaming dataset is
very desirable for many other types of datasets.

\subsection{WebTorrent}\label{webtorrent}

With WebRTC, browsers can now make peer to peer connections directly to
other browsers. BitTorrent uses UDP sockets which aren't available to
browser JavaScript, so can't be used as-is on the Web.

WebTorrent implements the BitTorrent protocol in JavaScript using WebRTC
as the transport. This includes the BitTorrent block exchange protocol
as well as the tracker protocol implemented in a way that can enable
hybrid nodes, talking simultaneously to both BitTorrent and WebTorrent
swarms (if a client is capable of making both UDP sockets as well as
WebRTC sockets, such as Node.js). Trackers are exposed to web clients
over HTTP or WebSockets.

\subsection{InterPlanetary File
System}\label{interplanetary-file-system}

IPFS is a family of application and network protocols that have peer to
peer file sharing and data permanence baked in. IPFS abstracts network
protocols and naming systems to provide an alternative application
delivery platform to today's Web. For example, instead of using HTTP and
DNS directly, in IPFS you would use LibP2P streams and IPNS in order to
gain access to the features of the IPFS platform.

\subsection{Certificate Transparency/Secure
Registers}\label{certificate-transparencysecure-registers}

The UK Government Digital Service have developed the concept of a
register which they define as a digital public ledger you can trust. In
the UK, government registers are beginning to be piloted as a way to
expose essential open data sets in a way where consumers can verify the
data has not been tampered with, and allows the data publishers to
update their data sets over time.

The design of registers was inspired by the infrastructure backing the
Certificate Transparency (Laurie, Langley, and Kasper 2013) project,
initated at Google, which provides a service on top of SSL certificates
that enables service providers to write certificates to a distributed
public ledger. Any client or service provider can verify if a
certificate they received is in the ledger, which protects against so
called ``rogue certificates''.

\section{6. Reference Implementation}\label{reference-implementation}

The connection logic is implemented in a module called
\href{https://www.npmjs.com/package/discovery-swarm}{discovery-swarm}.
This builds on discovery-channel and adds connection establishment,
management and statistics. It provides statistics such as how many
sources are currently connected, how many good and bad behaving sources
have been talked to, and it automatically handles connecting and
reconnecting to sources. UTP support is implemented in the module
\href{https://www.npmjs.com/package/utp-native}{utp-native}.

Our implementation of source discovery is called
\href{https://npmjs.org/discovery-channel}{discovery-channel}. We also
run a \href{https://www.npmjs.com/package/dns-discovery}{custom DNS
server} that Dat clients use (in addition to specifying their own if
they need to), as well as a
\href{https://github.com/bittorrent/bootstrap-dht}{DHT bootstrap}
server. These discovery servers are the only centralized infrastructure
we need for Dat to work over the Internet, but they are redundant,
interchangeable, never see the actual data being shared, and anyone can run
their own and Dat will still work even if they all are unavailable. If
this happens discovery will just be manual (e.g.~manually sharing
IP/ports).

\section{Acknowledgements}\label{acknowledgements}

This work was made possible through grants from the John S. and James L.
Knight and Alfred P. Sloan Foundations.

\section*{References}\label{references}
\addcontentsline{toc}{section}{References}

\hypertarget{refs}{}
\hypertarget{ref-aumasson2013blake2}{}
Aumasson, Jean-Philippe, Samuel Neves, Zooko Wilcox-O'Hearn, and
Christian Winnerlein. 2013. ``BLAKE2: Simpler, Smaller, Fast as Md5.''
In \emph{International Conference on Applied Cryptography and Network
Security}, 119--35. Springer.

\hypertarget{ref-bakker2015peer}{}
Bakker, A, R Petrocco, and V Grishchenko. 2015. ``Peer-to-Peer Streaming
Peer Protocol (Ppspp).''

\hypertarget{ref-bernstein2012high}{}
Bernstein, Daniel J, Niels Duif, Tanja Lange, Peter Schwabe, and Bo-Yin
Yang. 2012. ``High-Speed High-Security Signatures.'' \emph{Journal of
Cryptographic Engineering}. Springer, 1--13.

\hypertarget{ref-laurie2013certificate}{}
Laurie, Ben, Adam Langley, and Emilia Kasper. 2013. ``Certificate
Transparency.''

\hypertarget{ref-maymounkov2002kademlia}{}
Maymounkov, Petar, and David Mazieres. 2002. ``Kademlia: A Peer-to-Peer
Information System Based on the Xor Metric.'' In \emph{International
Workshop on Peer-to-Peer Systems}, 53--65. Springer.

\hypertarget{ref-mykletun2003providing}{}
Mykletun, Einar, Maithili Narasimha, and Gene Tsudik. 2003. ``Providing
Authentication and Integrity in Outsourced Databases Using Merkle Hash
Trees.'' \emph{UCI-SCONCE Technical Report}.

\hypertarget{ref-rossi2010ledbat}{}
Rossi, Dario, Claudio Testa, Silvio Valenti, and Luca Muscariello. 2010.
``LEDBAT: The New Bittorrent Congestion Control Protocol.'' In
\emph{ICCCN}, 1--6.

\hypertarget{ref-varda2008protocol}{}
Varda, Kenton. 2008. ``Protocol Buffers: Google's Data Interchange
Format.'' \emph{Google Open Source Blog, Available at Least as Early as
Jul}.

\end{document}