>From 0d4e2c213519a2781b98e3a40cb5feec512bc188 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= Date: Thu, 2 Oct 2014 14:07:42 +0100 Subject: [PATCH 1/4] copy: support smaller holes than the copy buffer size Previously cp would not detect runs of NULs that were smaller than the buffer size used for I/O (currently 128KiB). * src/copy.c (copy_reg): Use an independent hole_size, set to st_blksize, to increase the chances of detecting a representable hole, in a run of NULs read from the input. * tests/cp/sparse.sh: Add test cases for various sparse chunk sizes. * NEWS: Mention the improvement. --- NEWS | 3 + src/copy.c | 125 +++++++++++++++++++++++++++++++++++----------------- tests/cp/sparse.sh | 29 ++++++++++++ 3 files changed, 116 insertions(+), 41 deletions(-) diff --git a/NEWS b/NEWS index 1811ae4..7dc2644 100644 --- a/NEWS +++ b/NEWS @@ -30,6 +30,9 @@ GNU coreutils NEWS -*- outline -*- ** Improvements + cp,install,mv will convert smaller runs of NULs in the input to holes, + to reduce allocation in the copy. + mv will try a reflink before falling back to a standard copy, which is more efficient when moving files across BTRFS subvolume boundaries. diff --git a/src/copy.c b/src/copy.c index b7baee4..d32a131 100644 --- a/src/copy.c +++ b/src/copy.c @@ -158,18 +158,18 @@ utimens_symlink (char const *file, struct timespec const *timespec) bytes read. 
*/ static bool sparse_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, - bool make_holes, + size_t hole_size, bool make_holes, char const *src_name, char const *dst_name, uintmax_t max_n_read, off_t *total_n_read, bool *last_write_made_hole) { *last_write_made_hole = false; *total_n_read = 0; + bool make_hole = false; + off_t psize = 0; while (max_n_read) { - bool make_hole = false; - ssize_t n_read = read (src_fd, buf, MIN (max_n_read, buf_size)); if (n_read < 0) { @@ -183,47 +183,88 @@ sparse_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, max_n_read -= n_read; *total_n_read += n_read; - if (make_holes) + /* Loop over the input buffer in chunks of hole_size. */ + size_t csize = make_holes ? hole_size : buf_size; + char *cbuf = buf; + char *pbuf = buf; + + while (n_read) { - /* Sentinel required by is_nul(). */ - buf[n_read] = '\1'; -#ifdef lint - typedef uintptr_t word; - /* Usually, buf[n_read] is not the byte just before a "word" - (aka uintptr_t) boundary. In that case, the word-oriented - test below (*wp++ == 0) would read some uninitialized bytes - after the sentinel. To avoid false-positive reports about - this condition (e.g., from a tool like valgrind), set the - remaining bytes -- to any value. */ - memset (buf + n_read + 1, 0, sizeof (word) - 1); -#endif + bool prev_hole = make_hole; + csize = MIN (csize, n_read); + + if (make_holes && csize) + { + /* Setup sentinel required by is_nul(). */ + typedef uintptr_t word; + word isnul_tmp; + memcpy (&isnul_tmp, cbuf + csize, sizeof (word)); + memset (cbuf + csize, 1, sizeof (word)); + + make_hole = is_nul (cbuf, csize); + + memcpy (cbuf + csize, &isnul_tmp, sizeof (word)); + } + + bool transition = (make_hole != prev_hole) && psize; + bool last_chunk = (n_read == csize && ! make_hole) || ! csize; - if ((make_hole = is_nul (buf, n_read))) + if (transition || last_chunk) { - if (lseek (dest_fd, n_read, SEEK_CUR) < 0) + if (! transition) + psize += csize; + + if (! 
prev_hole) { - error (0, errno, _("cannot lseek %s"), quote (dst_name)); - return false; + if (full_write (dest_fd, pbuf, psize) != psize) + { + error (0, errno, _("error writing %s"), quote (dst_name)); + return false; + } + } + else + { + if (lseek (dest_fd, psize, SEEK_CUR) < 0) + { + error (0, errno, _("cannot lseek %s"), quote (dst_name)); + return false; + } } - } - } - if (!make_hole) - { - size_t n = n_read; - if (full_write (dest_fd, buf, n) != n) + pbuf = cbuf; + psize = csize; + + if (last_chunk) + { + if (transition) + csize = 0; /* Loop again to deal with last chunk. */ + else + psize = 0; /* Reset for next read loop. */ + } + else if (! csize) + n_read = 0; /* Finished processing buffer. */ + } + else /* Coalesce writes/seeks. */ { - error (0, errno, _("error writing %s"), quote (dst_name)); - return false; + if (psize <= OFF_T_MAX - csize) + psize += csize; + else + { + error (0, 0, _("overflow reading %s"), quote (src_name)); + return false; + } } - /* It is tempting to return early here upon a short read from a - regular file. That would save the final read syscall for each - file. Unfortunately that doesn't work for certain files in - /proc with linux kernels from at least 2.6.9 .. 2.6.29. */ + n_read -= csize; + cbuf += csize; } *last_write_made_hole = make_hole; + + /* It's tempting to break early here upon a short read from + a regular file. That would save the final read syscall + for each file. Unfortunately that doesn't work for + certain files in /proc or /sys with linux kernels. */ } return true; @@ -290,7 +331,8 @@ write_zeros (int fd, off_t n_bytes) return false. 
*/ static bool extent_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, - off_t src_total_size, enum Sparse_type sparse_mode, + size_t hole_size, off_t src_total_size, + enum Sparse_type sparse_mode, char const *src_name, char const *dst_name, bool *require_normal_copy) { @@ -331,7 +373,7 @@ extent_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, { off_t ext_start; off_t ext_len; - off_t hole_size; + off_t ext_hole_size; if (i < scan.ei_count) { @@ -345,11 +387,11 @@ extent_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, ext_len = 0; } - hole_size = ext_start - last_ext_start - last_ext_len; + ext_hole_size = ext_start - last_ext_start - last_ext_len; wrote_hole_at_eof = false; - if (hole_size) + if (ext_hole_size) { if (lseek (src_fd, ext_start, SEEK_SET) < 0) { @@ -374,9 +416,9 @@ extent_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, /* When not inducing holes and when there is a hole between the end of the previous extent and the beginning of the current one, write zeros to the destination file. */ - off_t nzeros = hole_size; + off_t nzeros = ext_hole_size; if (empty_extent) - nzeros = MIN (src_total_size - dest_pos, hole_size); + nzeros = MIN (src_total_size - dest_pos, ext_hole_size); if (! write_zeros (dest_fd, nzeros)) { @@ -409,7 +451,7 @@ extent_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, empty_extent = false; last_ext_len = ext_len; - if ( ! sparse_copy (src_fd, dest_fd, buf, buf_size, + if ( ! 
sparse_copy (src_fd, dest_fd, buf, buf_size, hole_size, sparse_mode == SPARSE_ALWAYS, src_name, dst_name, ext_len, &n_read, &wrote_hole_at_eof)) @@ -1105,6 +1147,7 @@ copy_reg (char const *src_name, char const *dst_name, size_t buf_alignment = lcm (getpagesize (), sizeof (word)); size_t buf_alignment_slop = sizeof (word) + buf_alignment - 1; size_t buf_size = io_blksize (sb); + size_t hole_size = ST_BLKSIZE (sb); fdadvise (source_desc, 0, 0, FADVISE_SEQUENTIAL); @@ -1164,7 +1207,7 @@ copy_reg (char const *src_name, char const *dst_name, standard copy only if the initial extent scan fails. If the '--sparse=never' option is specified, write all data but use any extents to read more efficiently. */ - if (extent_copy (source_desc, dest_desc, buf, buf_size, + if (extent_copy (source_desc, dest_desc, buf, buf_size, hole_size, src_open_sb.st_size, S_ISREG (sb.st_mode) ? x->sparse_mode : SPARSE_NEVER, src_name, dst_name, &normal_copy_required)) @@ -1179,7 +1222,7 @@ copy_reg (char const *src_name, char const *dst_name, off_t n_read; bool wrote_hole_at_eof; - if ( ! sparse_copy (source_desc, dest_desc, buf, buf_size, + if ( ! 
sparse_copy (source_desc, dest_desc, buf, buf_size, hole_size, make_holes, src_name, dst_name, UINTMAX_MAX, &n_read, &wrote_hole_at_eof) diff --git a/tests/cp/sparse.sh b/tests/cp/sparse.sh index d6cc4c4..29a7d42 100755 --- a/tests/cp/sparse.sh +++ b/tests/cp/sparse.sh @@ -37,4 +37,33 @@ test $(stat --printf %b copy) -le $(stat --printf %b sparse) || fail=1 cp --sparse=always --reflink sparse copy && fail=1 cp --sparse=never --reflink sparse copy && fail=1 + +# Ensure we handle sparse/non-sparse transitions correctly +maxn=128 # how many $hole_size chunks per file +hole_size=$(stat -c %o copy) +dd if=/dev/zero bs=$hole_size count=$maxn of=zeros || framework_failure_ +tr '\0' 'U' < zeros > nonzero || framework_failure_ + +for n in 1 2 3 4 32 $maxn; do + parts=$(expr $maxn / $n) + + rm -f sparse.in + + # Generate non sparse file for copying with alternating + # hole/data patterns of size n * $hole_size + pattern=$(printf "%s\n%s" nonzero zeros) + for i in $(yes "$pattern" | head -n$parts); do + dd iflag=fullblock if=$i of=sparse.in conv=notrunc oflag=append \ + bs=$hole_size count=$n status=none || framework_failure_ + done + + cp --sparse=always sparse.in sparse.out || fail=1 # non sparse input + cp --sparse=always sparse.out sparse.out2 || fail=1 # sparse input + + cmp sparse.in sparse.out || fail=1 + cmp sparse.in sparse.out2 || fail=1 + + ls -lsh sparse.* +done + Exit $fail -- 1.7.7.6 >From 69c5570bb2ce8bdc650e19f0dbf0a1764afba9bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= Date: Mon, 6 Oct 2014 10:19:58 +0100 Subject: [PATCH 2/4] cp: avoid speculative preallocation with --sparse=always With --sparse=always use fallocate(...PUNCH_HOLE...) to avoid any permanent allocation due to speculative preallocation employed by file systems such as XFS. * m4/jm-macros.m4: Check for <linux/falloc.h> and fallocate(). * src/copy.c (punch_hole): A new function to try and punch a hole at the specified offset if supported.
(sparse_copy): Call punch_hole() after requesting a hole. (extent_copy): Likewise. * NEWS: Mention the improvement. --- NEWS | 2 +- m4/jm-macros.m4 | 2 + src/copy.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 61 insertions(+), 10 deletions(-) diff --git a/NEWS b/NEWS index 7dc2644..bb2a1f3 100644 --- a/NEWS +++ b/NEWS @@ -31,7 +31,7 @@ GNU coreutils NEWS -*- outline -*- ** Improvements cp,install,mv will convert smaller runs of NULs in the input to holes, - to reduce allocation in the copy. + and cp --sparse=always avoids speculative preallocation on XFS for example. mv will try a reflink before falling back to a standard copy, which is more efficient when moving files across BTRFS subvolume boundaries. diff --git a/m4/jm-macros.m4 b/m4/jm-macros.m4 index a96ecab..07b9085 100644 --- a/m4/jm-macros.m4 +++ b/m4/jm-macros.m4 @@ -78,6 +78,7 @@ AC_DEFUN([coreutils_MACROS], AC_CHECK_FUNCS_ONCE([ endgrent endpwent + fallocate fchown fchmod ftruncate @@ -189,6 +190,7 @@ AC_DEFUN([gl_CHECK_ALL_HEADERS], [ AC_CHECK_HEADERS_ONCE([ hurd.h + linux/falloc.h paths.h priv.h stropts.h diff --git a/src/copy.c b/src/copy.c index d32a131..03b62a5 100644 --- a/src/copy.c +++ b/src/copy.c @@ -70,6 +70,10 @@ # include "verror.h" #endif +#if HAVE_LINUX_FALLOC_H +# include <linux/falloc.h> +#endif + #ifndef HAVE_FCHOWN # define HAVE_FCHOWN false # define fchown(fd, uid, gid) (-1) @@ -145,6 +149,26 @@ utimens_symlink (char const *file, struct timespec const *timespec) return err; } +/* Attempt to punch a hole to avoid any permanent + speculative preallocation on file systems such as XFS. + Return values as per fallocate(2) except ENOSYS etc. are ignored.
*/ + +static int +punch_hole (int fd, off_t offset, off_t length) +{ + int ret = 0; +#if HAVE_FALLOCATE +# if defined FALLOC_FL_PUNCH_HOLE && defined FALLOC_FL_KEEP_SIZE + ret = fallocate (fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + offset, length); + if (ret < 0 + && (errno == EOPNOTSUPP || errno == ENOTSUP || errno == ENOSYS)) + ret = 0; +# endif +#endif + return ret; +} + /* Copy the regular file open on SRC_FD/SRC_NAME to DST_FD/DST_NAME, honoring the MAKE_HOLES setting and using the BUF_SIZE-byte buffer BUF for temporary storage. Copy no more than MAX_N_READ bytes. @@ -158,7 +182,7 @@ utimens_symlink (char const *file, struct timespec const *timespec) bytes read. */ static bool sparse_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, - size_t hole_size, bool make_holes, + size_t hole_size, bool punch_holes, char const *src_name, char const *dst_name, uintmax_t max_n_read, off_t *total_n_read, bool *last_write_made_hole) @@ -184,7 +208,7 @@ sparse_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, *total_n_read += n_read; /* Loop over the input buffer in chunks of hole_size. */ - size_t csize = make_holes ? hole_size : buf_size; + size_t csize = hole_size ? hole_size : buf_size; char *cbuf = buf; char *pbuf = buf; @@ -193,7 +217,7 @@ sparse_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, bool prev_hole = make_hole; csize = MIN (csize, n_read); - if (make_holes && csize) + if (hole_size && csize) { /* Setup sentinel required by is_nul(). 
*/ typedef uintptr_t word; @@ -224,11 +248,20 @@ sparse_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, } else { - if (lseek (dest_fd, psize, SEEK_CUR) < 0) + off_t file_end = lseek (dest_fd, psize, SEEK_CUR); + if (file_end < 0) { error (0, errno, _("cannot lseek %s"), quote (dst_name)); return false; } + + if (punch_holes + && punch_hole (dest_fd, file_end - psize, psize) < 0) + { + error (0, errno, _("error deallocating %s"), + quote (dst_name)); + return false; + } } pbuf = cbuf; @@ -409,6 +442,14 @@ extent_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, error (0, errno, _("cannot lseek %s"), quote (dst_name)); goto fail; } + if (sparse_mode == SPARSE_ALWAYS + && punch_hole (dest_fd, ext_start - ext_hole_size, + ext_hole_size) < 0) + { + error (0, errno, _("error deallocating %s"), + quote (dst_name)); + goto fail; + } wrote_hole_at_eof = true; } else @@ -451,9 +492,9 @@ extent_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, empty_extent = false; last_ext_len = ext_len; - if ( ! sparse_copy (src_fd, dest_fd, buf, buf_size, hole_size, - sparse_mode == SPARSE_ALWAYS, - src_name, dst_name, ext_len, &n_read, + if ( ! sparse_copy (src_fd, dest_fd, buf, buf_size, + sparse_mode == SPARSE_ALWAYS ? hole_size: 0, + true, src_name, dst_name, ext_len, &n_read, &wrote_hole_at_eof)) goto fail; @@ -495,6 +536,13 @@ extent_copy (int src_fd, int dest_fd, char *buf, size_t buf_size, return false; } + if (sparse_mode == SPARSE_ALWAYS && dest_pos < src_total_size + && punch_hole (dest_fd, dest_pos, src_total_size - dest_pos) < 0) + { + error (0, errno, _("error deallocating %s"), quote (dst_name)); + return false; + } + return true; } @@ -1222,8 +1270,9 @@ copy_reg (char const *src_name, char const *dst_name, off_t n_read; bool wrote_hole_at_eof; - if ( ! sparse_copy (source_desc, dest_desc, buf, buf_size, hole_size, - make_holes, src_name, dst_name, + if ( ! sparse_copy (source_desc, dest_desc, buf, buf_size, + make_holes ? 
hole_size : 0, + x->sparse_mode == SPARSE_ALWAYS, src_name, dst_name, UINTMAX_MAX, &n_read, &wrote_hole_at_eof) || (wrote_hole_at_eof -- 1.7.7.6 >From 15d6b67f4e67bb6ed42144bb2151af5c3b110acb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= Date: Mon, 6 Oct 2014 11:02:34 +0100 Subject: [PATCH 3/4] cp: read sparse files more efficiently with non regular destination * src/copy.c (copy_reg): Use fiemap to read sparse files, even if the output is not to a regular file. * NEWS: Mention the improvement. --- NEWS | 3 +++ src/copy.c | 5 ++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/NEWS b/NEWS index bb2a1f3..17fa76d 100644 --- a/NEWS +++ b/NEWS @@ -33,6 +33,9 @@ GNU coreutils NEWS -*- outline -*- cp,install,mv will convert smaller runs of NULs in the input to holes, and cp --sparse=always avoids speculative preallocation on XFS for example. + cp will read sparse files more efficiently when the destination is a + non regular file. For example when copying a disk image to a device node. + mv will try a reflink before falling back to a standard copy, which is more efficient when moving files across BTRFS subvolume boundaries. diff --git a/src/copy.c b/src/copy.c index 03b62a5..8a065a8 100644 --- a/src/copy.c +++ b/src/copy.c @@ -1201,7 +1201,7 @@ copy_reg (char const *src_name, char const *dst_name, /* Deal with sparse files. */ bool make_holes = false; - bool sparse_src = false; + bool sparse_src = is_probably_sparse (&src_open_sb); if (S_ISREG (sb.st_mode)) { @@ -1214,7 +1214,6 @@ copy_reg (char const *src_name, char const *dst_name, blocks. If the file has fewer blocks than would normally be needed for a file of its size, then at least one of the blocks in the file is a hole. */ - sparse_src = is_probably_sparse (&src_open_sb); if (x->sparse_mode == SPARSE_AUTO && sparse_src) make_holes = true; } @@ -1257,7 +1256,7 @@ copy_reg (char const *src_name, char const *dst_name, any extents to read more efficiently.
*/ if (extent_copy (source_desc, dest_desc, buf, buf_size, hole_size, src_open_sb.st_size, - S_ISREG (sb.st_mode) ? x->sparse_mode : SPARSE_NEVER, + make_holes ? x->sparse_mode : SPARSE_NEVER, src_name, dst_name, &normal_copy_required)) goto preserve_metadata; -- 1.7.7.6 >From d0b3100828551fbffb3f3d95fea2c81e3d55e1a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= Date: Tue, 7 Oct 2014 19:48:53 +0100 Subject: [PATCH 4/4] copy: avoid an extraneous error when reporting errors * src/copy.c (copy_reg): If sparse_copy() failed, then an erroneous error about failing to extend the file would be reported. --- src/copy.c | 17 ++++++++++------- 1 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/copy.c b/src/copy.c index 8a065a8..a2d8dec 100644 --- a/src/copy.c +++ b/src/copy.c @@ -1269,13 +1269,16 @@ copy_reg (char const *src_name, char const *dst_name, off_t n_read; bool wrote_hole_at_eof; - if ( ! sparse_copy (source_desc, dest_desc, buf, buf_size, - make_holes ? hole_size : 0, - x->sparse_mode == SPARSE_ALWAYS, src_name, dst_name, - UINTMAX_MAX, &n_read, - &wrote_hole_at_eof) - || (wrote_hole_at_eof - && ftruncate (dest_desc, n_read) < 0)) + if (! sparse_copy (source_desc, dest_desc, buf, buf_size, + make_holes ? hole_size : 0, + x->sparse_mode == SPARSE_ALWAYS, src_name, dst_name, + UINTMAX_MAX, &n_read, + &wrote_hole_at_eof)) + { + return_val = false; + goto close_src_and_dst_desc; + } + else if (wrote_hole_at_eof && ftruncate (dest_desc, n_read) < 0) { error (0, errno, _("failed to extend %s"), quote (dst_name)); return_val = false; -- 1.7.7.6