[Merge] lp:~mterry/duplicity/static-corruption into lp:duplicity
Michael Terry has proposed merging lp:~mterry/duplicity/static-corruption into lp:duplicity.
Requested reviews:
duplicity-team (duplicity-team)
For more details, see:
https://code.launchpad.net/~mterry/duplicity/static-corruption/+merge/142044
This branch fixes three possible ways a backup could become corrupted. Inspired by bug 1091269.
A) If resuming after a volume that ended in a one-block file, we would skip the first block of the next file.
B) If resuming after a volume that ended in a multi-block file, we would skip the first block of the next file.
C) If resuming after a volume that spanned a multi-block file, we would skip some data inside the file.
A and B happen because, when finding the right place in the source files to restart the backup, the iteration loop mishandled the None block numbers that are used to indicate the end of a file.
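To illustrate, here's a minimal sketch of the restart loop (hypothetical code, not the actual loop in bin/duplicity) showing why a None block number defeats the old comparison:

    # Sketch only: duplicity records the last (index, block) pair that
    # made it into the previous volume; a block number of None marks
    # the final block of a file.
    def spin_to_restart_point(tarblocks, last_index, last_block):
        for index, block in tarblocks:
            if index == last_index:
                # The old test was just `block > last_block`. In Python 2,
                # None compares less than any integer, so if either side
                # is None the test is False, we never break, and the loop
                # eats the first block of the *next* file (cases A and B).
                #
                # Fixed logic, mirroring the patch: if both blocks are
                # None, the file at last_index is complete, so stop now;
                # only compare numerically when last_block is a number.
                if last_block is None and block is None:
                    break
                if (last_block is not None and block is not None
                        and block > last_block):
                    break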
C is the issue described in bug 1091269. Data block sizes would shrink as the difftar file approached the volsize: the standard block size was 64 * 1024 bytes, but blocks written near the end of a difftar could be smaller, to fit. When resuming, duplicity doesn't know the custom block sizes used by the previous run, so it assumes standard-sized blocks. As you can imagine, those don't always line up with what was actually written, so we would leave chunks of data out of the backed-up file.
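Here's the arithmetic with made-up numbers (the 3000-byte final block is hypothetical, just to show the mismatch):

    # Previous run: the volume filled up, so the last data block was
    # shrunk to fit. Three blocks total were written for this file.
    BLOCK = 64 * 1024
    written = [BLOCK, BLOCK, 3000]
    actual_offset = sum(written)           # 134072 bytes really backed up

    # Resuming run: duplicity only knows the block *count*, and assumes
    # every block was the standard size.
    assumed_offset = len(written) * BLOCK  # 196608 bytes skipped in source

    # Everything between the two offsets never makes it into the backup.
    print(assumed_offset - actual_offset)  # 62536 bytes silently lost

The fix is to always read fixed-size blocks (the new get_read_size()), so the offsets computed on resume match what was actually written.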
Tests added for these cases.
This branch is called 'static-corruption' because all these issues occur even when the source data doesn't change. I still think there are some corruption issues when a file changes between duplicity runs; I haven't started looking into those yet, but they're next on my list.
--
https://code.launchpad.net/~mterry/duplicity/static-corruption/+merge/142044
Your team duplicity-team is requested to review the proposed merge of lp:~mterry/duplicity/static-corruption into lp:duplicity.
=== modified file 'bin/duplicity'
--- bin/duplicity 2012-11-21 01:27:35 +0000
+++ bin/duplicity 2013-01-06 18:22:21 +0000
@@ -223,7 +223,13 @@
# Just spin our wheels
while tarblock_iter.next():
if (tarblock_iter.previous_index == last_index):
- if (tarblock_iter.previous_block > last_block):
+ # If both the previous index and this index are done, exit now
+ # before we hit the next index, to prevent skipping its first
+ # block.
+ if not last_block and not tarblock_iter.previous_block:
+ break
+ # Only check block number if last_block is also a number
+ if last_block and tarblock_iter.previous_block > last_block:
break
if tarblock_iter.previous_index > last_index:
log.Warn(_("File %s complete in backup set.\n"
@@ -937,11 +943,10 @@
"""
Copy data from src_iter to file at fn
"""
- block_size = 128 * 1024
file = open(filename, "wb")
while True:
try:
- data = src_iter.next(block_size).data
+ data = src_iter.next().data
except StopIteration:
break
file.write(data)
@@ -989,9 +994,9 @@
def __init__(self, fileobj):
self.fileobj = fileobj
- def next(self, size):
+ def next(self):
try:
- res = Block(self.fileobj.read(size))
+ res = Block(self.fileobj.read(self.get_read_size()))
except Exception:
if hasattr(self.fileobj, 'name'):
name = self.fileobj.name
@@ -1005,6 +1010,9 @@
raise StopIteration
return res
+ def get_read_size(self):
+ return 128 * 1024
+
def get_footer(self):
return ""
=== modified file 'duplicity/diffdir.py'
--- duplicity/diffdir.py 2012-09-28 15:48:21 +0000
+++ duplicity/diffdir.py 2013-01-06 18:22:21 +0000
@@ -481,14 +481,14 @@
filler_data = ""
return TarBlock(index, "%s%s%s" % (headers, file_data, filler_data))
- def process(self, val, size):
+ def process(self, val):
"""
Turn next value of input_iter into a TarBlock
"""
assert not self.process_waiting
XXX # Override in subclass @UndefinedVariable
- def process_continued(self, size):
+ def process_continued(self):
"""
Get more tarblocks
@@ -498,15 +498,15 @@
assert self.process_waiting
XXX # Override in subclass @UndefinedVariable
- def next(self, size = 1024 * 1024):
+ def next(self):
"""
- Return next block, no bigger than size, and update offset
+ Return next block and update offset
"""
if self.process_waiting:
- result = self.process_continued(size)
+ result = self.process_continued()
else:
# Below a StopIteration exception will just be passed upwards
- result = self.process(self.input_iter.next(), size)
+ result = self.process(self.input_iter.next())
block_number = self.process_next_vol_number
self.offset += len(result.data)
self.previous_index = result.index
@@ -517,6 +517,13 @@
self.remember_next = False
return result
+ def get_read_size(self):
+ # read size must always be the same, because if we are restarting a
+ # backup volume where the previous volume ended in a data block, we
+ # have to be able to assume its length in order to continue reading
+ # the file from the right place.
+ return 64 * 1024
+
def get_previous_index(self):
"""
Return index of last tarblock, or None if no previous index
@@ -553,7 +560,7 @@
"""
TarBlockIter that does no file reading
"""
- def process(self, delta_ropath, size):
+ def process(self, delta_ropath):
"""
Get a fake tarblock from delta_ropath
"""
@@ -577,13 +584,9 @@
"""
TarBlockIter that yields blocks of a signature tar from path_iter
"""
- def process(self, path, size):
+ def process(self, path):
"""
Return associated signature TarBlock from path
-
- Here size is just ignored --- let's hope a signature isn't too
- big. Also signatures are stored in multiple volumes so it
- doesn't matter.
"""
ti = path.get_tarinfo()
if path.isreg():
@@ -606,7 +609,7 @@
delta_path_iter, so the delta information has already been
calculated.
"""
- def process(self, delta_ropath, size):
+ def process(self, delta_ropath):
"""
Get a tarblock from delta_ropath
"""
@@ -631,8 +634,7 @@
# Now handle single volume block case
fp = delta_ropath.open("rb")
- # Below the 512 is the usual length of a tar header
- data, last_block = self.get_data_block(fp, size - 512)
+ data, last_block = self.get_data_block(fp)
if stats:
stats.RawDeltaSize += len(data)
if last_block:
@@ -654,11 +656,11 @@
self.process_next_vol_number = 2
return self.tarinfo2tarblock(index, ti, data)
- def get_data_block(self, fp, max_size):
+ def get_data_block(self, fp):
"""
Return pair (next data block, boolean last data block)
"""
- read_size = min(64*1024, max(max_size, 512))
+ read_size = self.get_read_size()
buf = fp.read(read_size)
if len(buf) < read_size:
if fp.close():
@@ -667,7 +669,7 @@
else:
return (buf, False)
- def process_continued(self, size):
+ def process_continued(self):
"""
Return next volume in multivol diff or snapshot
"""
@@ -675,7 +677,7 @@
ropath = self.process_ropath
ti, index = ropath.get_tarinfo(), ropath.index
ti.name = "%s/%d" % (self.process_prefix, self.process_next_vol_number)
- data, last_block = self.get_data_block(self.process_fp, size - 512)
+ data, last_block = self.get_data_block(self.process_fp)
if stats:
stats.RawDeltaSize += len(data)
if last_block:
=== modified file 'duplicity/dup_temp.py'
--- duplicity/dup_temp.py 2012-09-28 15:48:21 +0000
+++ duplicity/dup_temp.py 2013-01-06 18:22:21 +0000
@@ -256,9 +256,9 @@
def __init__(self, src):
self.src = src
self.fp = src.open("rb")
- def next(self, size):
+ def next(self):
try:
- res = Block(self.fp.read(size))
+ res = Block(self.fp.read(self.get_read_size()))
except Exception:
log.FatalError(_("Failed to read %s: %s") %
(self.src.name, sys.exc_info()),
@@ -267,5 +267,7 @@
self.fp.close()
raise StopIteration
return res
+ def get_read_size(self):
+ return 128 * 1024
def get_footer(self):
return ""
=== modified file 'duplicity/gpg.py'
--- duplicity/gpg.py 2013-01-02 14:02:35 +0000
+++ duplicity/gpg.py 2013-01-06 18:22:21 +0000
@@ -319,17 +319,16 @@
def get_current_size():
return os.stat(filename).st_size
- block_size = 128 * 1024 # don't bother requesting blocks smaller, but also don't ask for bigger
target_size = size - 50 * 1024 # fudge factor, compensate for gpg buffering
data_size = target_size - max_footer_size
file = GPGFile(True, path.Path(filename), profile)
at_end_of_blockiter = 0
while True:
bytes_to_go = data_size - get_current_size()
- if bytes_to_go < block_size:
+ if bytes_to_go < block_iter.get_read_size():
break
try:
- data = block_iter.next(min(block_size, bytes_to_go)).data
+ data = block_iter.next().data
except StopIteration:
at_end_of_blockiter = 1
break
@@ -373,16 +372,15 @@
def close(self):
return self.fileobj.close()
- block_size = 64 * 1024
file_counted = FileCounted(open(filename, "wb"))
gzip_file = gzip.GzipFile(None, "wb", 6, file_counted)
at_end_of_blockiter = 0
while True:
bytes_to_go = size - file_counted.byte_count
- if bytes_to_go < block_size:
+ if bytes_to_go < block_iter.get_read_size():
break
try:
- new_block = block_iter.next(min(block_size, bytes_to_go))
+ new_block = block_iter.next()
except StopIteration:
at_end_of_blockiter = 1
break
=== modified file 'testing/tests/restarttest.py'
--- testing/tests/restarttest.py 2012-11-24 19:45:09 +0000
+++ testing/tests/restarttest.py 2013-01-06 18:22:21 +0000
@@ -36,7 +36,6 @@
other_args = ["-v0", "--no-print-statistics"]
#other_args = ["--short-filenames"]
#other_args = ["--ssh-command 'ssh -v'", "--scp-command 'scp -C'"]
-#other_args = ['--no-encryption']
# If this is set to true, after each backup, verify contents
verify = 1
@@ -52,8 +51,9 @@
Test checkpoint/restart using duplicity binary
"""
def setUp(self):
+ self.class_args = []
assert not os.system("tar xzf testfiles.tar.gz > /dev/null 2>&1")
- assert not os.system("rm -rf testfiles/output "
+ assert not os.system("rm -rf testfiles/output testfiles/largefiles "
"testfiles/restore_out testfiles/cache")
assert not os.system("mkdir testfiles/output testfiles/cache")
backend = duplicity.backend.get_backend(backend_url)
@@ -80,6 +80,7 @@
cmd_list.append("--current-time %s" % (current_time,))
if other_args:
cmd_list.extend(other_args)
+ cmd_list.extend(self.class_args)
cmd_list.extend(arglist)
cmd_list.extend(["<", "/dev/null"])
cmdline = " ".join(cmd_list)
@@ -144,11 +145,14 @@
self.verify(dirname,
time = current_time, options = restore_options)
- def make_largefiles(self):
- # create 3 2M files
+ def make_largefiles(self, count=3, size=2):
+ """
+ Makes a number of large files in testfiles/largefiles, each of
+ which is the specified number of megabytes in size.
+ """
assert not os.system("mkdir testfiles/largefiles")
- for n in (1,2,3):
- assert not os.system("dd if=/dev/urandom of=testfiles/largefiles/file%d bs=1024 count=2048 > /dev/null 2>&1" % n)
+ for n in range(count):
+ assert not os.system("dd if=/dev/urandom of=testfiles/largefiles/file%d bs=1024 count=%d > /dev/null 2>&1" % (n + 1, size * 1024))
def check_same(self, filename1, filename2):
"""
@@ -327,6 +331,105 @@
self.backup("inc", "testfiles/largefiles")
self.verify("testfiles/largefiles")
+ def make_fake_second_volume(self, name):
+ """
+ Takes a successful backup and pretends that we interrupted it
+ after two volumes. (We model restarting the second volume because
+ duplicity deletes the last volume it finds, on the grounds that it
+ may not have finished uploading.)
+ """
+ # First, confirm that we have signs of a successful backup
+ self.assertEqual(len(glob.glob("testfiles/output/*.manifest*")), 1)
+ self.assertEqual(len(glob.glob("testfiles/output/*.sigtar*")), 1)
+ self.assertEqual(len(glob.glob("testfiles/cache/%s/*" % name)), 2)
+ self.assertEqual(len(glob.glob(
+ "testfiles/cache/%s/*.manifest*" % name)), 1)
+ self.assertEqual(len(glob.glob(
+ "testfiles/cache/%s/*.sigtar*" % name)), 1)
+ # Alright, everything is in order; fake a second interrupted volume
+ assert not os.system("rm testfiles/output/*.manifest*")
+ assert not os.system("rm testfiles/output/*.sigtar*")
+ assert not os.system("rm -f testfiles/output/*.vol[23456789].*")
+ assert not os.system("rm -f testfiles/output/*.vol1[^.]+.*")
+ self.assertEqual(len(glob.glob("testfiles/output/*.difftar*")), 1)
+ assert not os.system("rm testfiles/cache/%s/*.sigtar*" % name)
+ assert not os.system("cp testfiles/output/*.difftar* "
+ "`ls testfiles/output/*.difftar* | "
+ " sed 's|vol1|vol2|'`")
+ assert not os.system("head -n6 testfiles/cache/%s/*.manifest > "
+ "testfiles/cache/%s/"
+ "`basename testfiles/cache/%s/*.manifest`"
+ ".part" % (name, name, name))
+ assert not os.system("rm testfiles/cache/%s/*.manifest" % name)
+ assert not os.system("""echo 'Volume 2:
+ StartingPath foo
+ EndingPath bar
+ Hash SHA1 sha1' >> testfiles/cache/%s/*.manifest.part""" % name)
+
+ def test_split_after_small(self):
+ """
+ If we restart right after a volume that ended with a small
+ (one-block) file, make sure we restart in the right place.
+ """
+ source = 'testfiles/largefiles'
+ assert not os.system("mkdir -p %s" % source)
+ assert not os.system("echo hello > %s/file1" % source)
+ self.backup("full", source, options=["--name=backup1"])
+ # Fake an interruption
+ self.make_fake_second_volume("backup1")
+ # Add new small file
+ assert not os.system("echo hello > %s/newfile" % source)
+ # 'restart' the backup
+ self.backup("full", source, options=["--name=backup1"])
+ # Confirm we actually resumed the previous backup
+ self.assertEqual(len(os.listdir("testfiles/output")), 4)
+ # Now make sure everything is byte-for-byte the same once restored
+ self.restore()
+ assert not os.system("diff -r %s testfiles/restore_out" % source)
+
+ def test_split_after_large(self):
+ """
+ If we restart right after a volume that ended with a large
+ (multi-block) file, make sure we restart in the right place.
+ """
+ source = 'testfiles/largefiles'
+ self.make_largefiles(count=1, size=1)
+ self.backup("full", source, options=["--name=backup1"])
+ # Fake an interruption
+ self.make_fake_second_volume("backup1")
+ # Add new small file
+ assert not os.system("echo hello > %s/newfile" % source)
+ # 'restart' the backup
+ self.backup("full", source, options=["--name=backup1"])
+ # Confirm we actually resumed the previous backup
+ self.assertEqual(len(os.listdir("testfiles/output")), 4)
+ # Now make sure everything is byte-for-byte the same once restored
+ self.restore()
+ assert not os.system("diff -r %s testfiles/restore_out" % source)
+
+ def test_split_inside_large(self):
+ """
+ If we restart right after a volume that ended inside a large
+ (multi-block) file, make sure we restart in the right place.
+ """
+ source = 'testfiles/largefiles'
+ self.make_largefiles(count=1, size=3)
+ self.backup("full", source, options=["--vols 1", "--name=backup1"])
+ # Fake an interruption
+ self.make_fake_second_volume("backup1")
+ # 'restart' the backup
+ self.backup("full", source, options=["--vols 1", "--name=backup1"])
+ # Now make sure everything is byte-for-byte the same once restored
+ self.restore()
+ assert not os.system("diff -r %s testfiles/restore_out" % source)
+
+
+# Note that this class duplicates all the tests in RestartTest
+class RestartTestWithoutEncryption(RestartTest):
+ def setUp(self):
+ RestartTest.setUp(self)
+ self.class_args.extend(["--no-encryption"])
+
def test_no_write_double_snapshot(self):
"""
Test that restarting a full backup does not write duplicate entries
@@ -337,12 +440,12 @@
self.make_largefiles()
# Start backup
try:
- self.backup("full", "testfiles/largefiles", options = ["--fail 2", "--vols 1", "--no-encryption"])
+ self.backup("full", "testfiles/largefiles", options = ["--fail 2", "--vols 1"])
self.fail()
except CmdError, e:
self.assertEqual(30, e.exit_status)
# Finish it
- self.backup("full", "testfiles/largefiles", options = ["--no-encryption"])
+ self.backup("full", "testfiles/largefiles")
# Now check sigtar
sigtars = glob.glob("testfiles/output/duplicity-full*.sigtar.gz")
self.assertEqual(1, len(sigtars))
@@ -359,7 +462,7 @@
https://launchpad.net/bugs/929067
"""
# Initial normal backup
- self.backup("full", "testfiles/blocktartest", options = ["--no-encryption"])
+ self.backup("full", "testfiles/blocktartest")
# Create an exact clone of the snapshot folder in the sigtar already.
# Permissions and mtime must match.
os.mkdir("testfiles/snapshot", 0755)
@@ -379,7 +482,7 @@
self.assertEqual(0, os.system("rm -r testfiles/cache"))
# Try a follow on incremental (which in buggy versions, would create
# a deleted entry for the base dir)
- self.backup("inc", "testfiles/blocktartest", options = ["--no-encryption"])
+ self.backup("inc", "testfiles/blocktartest")
self.assertEqual(1, len(glob.glob("testfiles/output/duplicity-new*.sigtar.gz")))
# Confirm we can restore it (which in buggy versions, would fail)
self.restore()