#
# add_file "TODO"
#
# patch "TODO"
#  from []
#  to [cf0d5dc2493d53cf6ae94759cb0da5639de3e8cb]
#
# patch "dumb.py"
#  from [86cc3fff44c25f7d941346c8b0ab44259d69b20d]
#  to [b520c982593c8711ef22e447b7084ee0a7ea7171]
#
# patch "merkle_dir.py"
#  from [fea4ab58f66ad1998e570b0ed5b0615f2845a068]
#  to [d4e89ce8e256741fd064a9694d06026e341c5b84]
#
========================================================================
--- TODO
+++ TODO	cf0d5dc2493d53cf6ae94759cb0da5639de3e8cb
@@ -0,0 +1,9 @@
+* packet commands -> automate?
+* merkle dir stuff that doesn't require loading entire chunks into
+  memory all the time
+* ftp/sftp/http
+* possibly better rollback stuff?
+  - e.g., truncate when possible
+  - include some info in the lockdir on who has things locked, to aid
+    in detecting staleness of locks
+* file reader needs to do the pause/retry thing on missing files
========================================================================
--- dumb.py	86cc3fff44c25f7d941346c8b0ab44259d69b20d
+++ dumb.py	b520c982593c8711ef22e447b7084ee0a7ea7171
@@ -1,133 +1,9 @@
 import sha
 from sets import Set
 import os
 import os.path
 from cStringIO import StringIO
-# What's on disk:
-# We write all interesting data out to a file DATA
-# It consists of a bunch of "chunks", appended one after another
-# each chunk is a deflated (not gzipped) block
-# after decompression, each such chunk can be fed to 'monotone read'
-# each chunk has a unique id.  it is assumed that the id<->data mapping is
-# immutable; in particular, when transferring, we assume that if we have
-# an id already, then we already have the corresponding data
-# a file INDEX is maintained which contains table-of-contents information
-# for DATA -- each line in INDEX gives metadata for the next chunk in
-# DATA.  this metadata is: the associated id, offset in DATA, number of bytes
-# taken up in DATA.  This allows, for instance, random access into DATA.
-# the INDEX file is used entirely locally; we never look at it over the
-# network.
-# locally, the INDEX file is used to generate the HASHES files
-# the HASHES files implement the merkle trie algorithm
-# we chop off our merkle tree at depth 2; this should give good loading for
-# even very large projects, and is not so large that we pay excessive
-# overhead on smaller projects
-# this means there are two kinds of HASHES files -- the root file
-# ("HASHES_"), and the child files ("HASHES_<2 digits of hex>")
-# both kinds of HASHES_ files are deflated as a whole
-# the child files contain a bunch of lines of the form:
-#   chunk <id> <offset> <length>
-# where <id> begins with the 2 hex digits that this file is responsible
-# for, and <offset> and <length> are the location of the corresponding
-# data in DATA.  These files are sorted lexicographically by id.
-# the root file contains a bunch of lines of the form:
-#   subtree <prefix> <filename> <hash>
-# where <prefix> is the prefix of the tree being referred to (always two
-# digits), <filename> is the child file containing the corresponding
-# information, and <hash> is the sha1 of the string
-# "<id1>\0<id2>\0...<idn>\0", for all ids mentioned in <filename>.
-# This file is always sorted lexicographically by prefix.
-# (FIXME: should we always generate child files for all prefixes, even if
-# some are unused?)
-# finally, there's a file VERSION, which contains the sha1 of the string
-# "<id1>\0<id2>\0...<idn>\0", plus a newline.
-# finally2, there's a file called LENGTH, which contains the size of
-# the file DATA in bytes, and then the size of the file INDEX in bytes.
-
-# sometimes there is a file _rollback, which is a copy of LENGTH made before
-# starting a transaction.  the way this works is that if a transaction gets
-# uncleanly aborted (which we try to make sure doesn't happen, because it is
-# icky), then we use this file to do rollback.  rollback can only be done
-# locally -- the way it works is that we truncate DATA and INDEX to the
-# lengths given in _rollback, replace LENGTH by _rollback, and regenerate all
-# hashes from scratch.
-# actually, SFTP supports truncate (paramiko doesn't expose it, but all you
-# have to do is create a SFTPAttributes object, set its size field, and send a
-# setstat request on it -- see implementation of paramiko's chmod, for
-# similar).  which means that it would be better to not have to regenerate all
-# hash files from scratch.  various options... move them out of the way, don't
-# delete them until everything is committed, perhaps.
-#
-
-# hmm... even if we don't have truncate, we should be able to restore original
-# hash files, at which point, we have random junk in DATA and INDEX but it
-# doesn't matter too much... INDEX will become corrupted, of course.
-
-# How a pull works:
-#   -- pull VERSION, see if it matches ours.  (note down what it says for
-#      later.)
-#   -- if not, pull root hashes file.  compare each line to our hashes.  for
-#      each one that differs, fetch the child file.
-#   -- compare fetched child files to our child files.  For each line that's
-#      new in the fetched files, note down the id/offset/length in a list.
-#   -- after we have processed _all_ child files, use the list to figure out
-#      which parts of the DATA file to fetch.  coalesce adjacent sections at
-#      this stage.  (but keep track of how everything matches back up to ids.)
-#   -- do one or more byte-range fetches to grab the stuff we need
-#   -- append the results to our DATA
-#   -- add the new stuff to INDEX, with the appropriate offsets for our DATA
-#      file
-#   -- rehash
-#   (-- pull VERSION again, see if it's the same as it was at the beginning.
-#      If not, warn the user or just print a message and go back to step 1.)
-# How a push works:
-#   -- lock the remote copy (see below)
-#   -- pull VERSION/pull HASHES_/pull any child hash files we need
-#   -- save HASHES_ and the child hash files so we can update them!
-#   -- before we write anything, replace remote VERSION with a random nonce
-#      (so that reads can reliably detect concurrent updates)
-#   -- for each id we have that are not mentioned in the appropriate child
-#      hash file, pull out the appropriate chunk from our DATA, assemble these
-#      pieces together, and append them to the remote DATA file
-#      also append appropriate lines to the remote INDEX
-#   -- _update_ atomically each child hash file we pulled to include the stuff
-#      we appended (do _not_ just push our version, remote side may have stuff
-#      we don't, and everything may be at different offsets)
-#   -- after child hash files have all been written, update root hash file
-#      similarly
-#   -- finally update VERSION
-
-## FIXME: I guess INDEX should just contain lengths, so that it can be updated
-## when pushing (without having to know the length of the remote DATA file)
-#### No -- still need to update child hash files, which do have to know
-#### offset.  So add a file LENGTH_OF_DATA or something, that we read after we
-#### get our edit lock, and update after we finished appending.
-
-# remote operations:
-#   -- synchronous append
-#   -- atomic update -- can queue multiple, block on all completing
-#   -- fetch file (sync and async)
-#   -- fetch byte ranges (sync, but perhaps should be able to pipeline
-#      fetching multiple different ranges?)
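
(The "coalesce adjacent sections" step in the pull sketch above is the only
mildly fiddly part; below is a minimal illustrative sketch in Python, assuming
each wanted chunk is an (id, offset, length) triple as recorded in INDEX.
The helper name and signature are illustrative, not taken from dumb.py.)

# Sketch: merge byte ranges that are adjacent in DATA, so a pull can issue
# a few large range requests instead of many small ones.
def coalesce_ranges(chunks):
    # chunks: list of (id, offset, length) triples wanted from the remote DATA
    merged = []
    for id, offset, length in sorted(chunks, key=lambda c: c[1]):
        if merged and merged[-1][0] + merged[-1][1] == offset:
            start, total, ids = merged[-1]
            merged[-1] = (start, total + length, ids + [id])
        else:
            merged.append((offset, length, [id]))
    # each entry is (offset, total_length, [ids satisfied by this range]),
    # which keeps track of how the fetched bytes match back up to ids
    return merged
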
-
-# so: "append and then return"
-#     "fetch all of these files and then return"
-#     "atomically replace all of these files and then return"
-#     "fetch this list of byte ranges from a given file and then return"
-#     "query if this file exists and then return" (may just attempt fetching
-#      it and see what happens)
-#     "delete file"
-
-# also write a file called _lock_info or something containing info on who
-# created the lock, to ease in cleaning things up.
-#
-# there is no atomic replace in sftp (probably not ftp either).  can write,
-# delete, rename, to minimize window.  but clients should be prepared to retry
-# if a file fetch fails.
-
-# TODO:
-#   -- cat revision and packet commands -> automate?
-
 
 def do_import(monotone, dir):
     monotone.ensure_db()
     md = MerkleDir(dir)
========================================================================
--- merkle_dir.py	fea4ab58f66ad1998e570b0ed5b0615f2845a068
+++ merkle_dir.py	d4e89ce8e256741fd064a9694d06026e341c5b84
@@ -1,8 +1,84 @@
 import sha
 import os
 import os.path
 import zlib
 
+# What's on disk:
+# We write all interesting data out to a file DATA
+# It consists of a bunch of "chunks", appended one after another
+# each chunk is a raw block of bytes, as passed to add()
+# each chunk has a unique id.  it is assumed that the id<->data mapping is
+# immutable; in particular, when transferring, we assume that if we have
+# an id already, then we already have the corresponding data
+# ids should probably be hashes of something or the other; they don't
+# actually have to be, but we do rely on them "looking like" hex encoded
+# hashes (e.g., first 2 characters are randomly distributed, doesn't
+# contain spaces, etc.)
+# the HASHES files implement the merkle trie algorithm
+# we chop off our merkle tree at depth 2; this should give good loading for
+# even very large projects, and is not so large that we pay excessive
+# overhead on smaller projects
+# this means there are two kinds of HASHES files -- the root file
+# ("HASHES_"), and the child files ("HASHES_<2 digits of hex>")
+# both kinds of HASHES_ files are deflated (not gzipped) as a whole
+# both sorts of files are sorted asciibetically
+# the child files contain a bunch of lines of the form:
+#   chunk <id> <offset> <length>
+# where <id> begins with the 2 hex digits that this file is responsible
+# for, and <offset> and <length> are the location of the corresponding
+# data in DATA.
+# the root file contains a bunch of lines of the form:
+#   subtree <prefix> <filename> <hash>
+# where <prefix> is the prefix of the tree being referred to (always two
+# digits), <filename> is the child file containing the corresponding
+# information, and <hash> is the sha1 of the string
+# "<id1>\0<id2>\0...<idn>\0", for all ids mentioned in <filename>.
+#
+# Finally, there may be a directory "_lock".  This is used for locking; see
+# below.
+#
+# Transaction handling:
+#   Readers pay no attention to transactions or writers.  Writers assume the
+#   full burden of keeping the repo fully consistent at all times.
+#   Writers:
+#     -- first acquire a lock by creating the directory _lock.  This ensures
+#        that only one writer will run at a time.
+#     -- append everything they want to append to DATA.  Until this part of
+#        the file is referenced by HASHES files, it won't be noticed.
+#        Before appending any particular item, they read the child hash file
+#        that would reference it, and make sure that it does not already
+#        exist.
+#        If interrupted at this point, unreferenced garbage may be left in
+#        DATA, but this is harmless.
+#     -- start atomically replacing child hash files.
+#        During this phase, the root hash file will lag behind the child hash
+#        files -- it will describe them as containing less than they actually
+#        do.  This does not cause any problems, because
+#          a) when determining whether an id exists, we always read the child
+#             hash file (as noted in the previous step)
+#          b) the only things that can happen to readers are that they fetch
+#             a file that is _even newer_ than they were expecting (in which
+#             case, they actually wanted it _even more_ than they realized),
+#             or that they skip a file that has been replaced, but was
+#             uninteresting before being replaced.
+#        This does mean that a pull is not guaranteed to fetch everything that
+#        was included in the most recent push; it may instead fetch only some
+#        random subset.  Users of this library must be robust against this
+#        possibility -- even if two items A and B were added at the same time,
+#        a client may receive only A, or only B.
+#
+#        In some situations (FTP, NTFS, ...) it is actually impossible to
+#        atomically replace a file.  In these cases we simply bite the bullet
+#        and have a very small race condition, while each file is being
+#        swapped around.  If readers try to open a file but find it does not
+#        exist, they should try again after a short pause, before giving up.
+#     -- atomically replace the root hash file (subject again to the above
+#        proviso)
+#     -- remove the lockdir
+#   Rollback:
+#     If a connection is interrupted uncleanly, or there is a stale lock, we:
+#     -- check for any missing files left by non-atomic renames
+#     -- remove the lockdir
+
 class HashFile:
     prefix = ""
     values = ()
@@ -45,6 +121,7 @@
         for key, values in self:
             value_txt = " ".join([str(v) for v in values])
             lines.append("%s %s %s" % (self.prefix, key, value_txt))
+        lines.sort()
         return zlib.compress("\n".join(lines))
 
     # yields (key, values)
@@ -206,6 +283,7 @@
         self._ids_to_flush = []
 
     #### Adding new items
+    # can only be called from inside a transaction.
     def add(self, id, data):
         self._need_lock()
         assert None not in (self._data_handle,
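
(The reader-side pause/retry behaviour is still only a TODO item above; a
minimal sketch of the idea, together with the lockdir acquisition described
in the merkle_dir.py comments, might look like the following.  This assumes a
plain local-filesystem backend, and the function names and signatures are
illustrative, not taken from merkle_dir.py.)

import os
import time

# Sketch: a writer takes the lock by creating the _lock directory; mkdir is
# atomic and fails if the directory already exists, so at most one writer
# can hold the lock at a time.
def acquire_lock(merkle_dir, attempts=10, pause=1.0):
    lockdir = os.path.join(merkle_dir, "_lock")
    for i in range(attempts):
        try:
            os.mkdir(lockdir)
            return lockdir
        except OSError:
            # someone else holds the lock (or it is stale); wait and retry
            time.sleep(pause)
    raise RuntimeError("could not lock %s" % merkle_dir)

# Sketch: a reader re-tries opening a file that may be momentarily missing
# while a writer does its non-atomic delete/rename replacement of a hash file.
def open_with_retry(path, attempts=5, pause=1.0):
    for i in range(attempts):
        try:
            return open(path, "rb")
        except IOError:
            if i == attempts - 1:
                raise
            time.sleep(pause)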