Re: Patch that adds a new test to find

bug-findutils
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: Patch that adds a new test to find

From:	James Youngman
Subject:	Re: Patch that adds a new test to find
Date:	Mon, 5 Apr 2010 11:39:37 +0100
On Sun, Apr 4, 2010 at 7:38 PM, address@hidden <address@hidden> wrote:
> Hi,
>
> I have written a small patch (against findutils 4.4.2) that
> adds a switch to find (namely, -samecontent) in order to find
> duplicates of a file.  In other words, the command
>
>  find -samecontent FILENAME
>
> is now roughly equivalent to
>
>  find -type f -exec cmp -s FILENAME '{}' \; -print
>
> except that there is no need for find to fork a new process
> for every file to check.  I have implemented the feature by
> incorporating code from GNU cmp.  Of course, my patch is still
> in an experimental state, hence I would greatly appreciate any
> suggestions or help.

First, thanks for contributing.

I have some reservations about the idea itself, because unlike all the
other tests, it actually reads the contents of files.   However, in my
comments below I'll restrict my comments to questions of how the
implementation works, because I hope you will find that more useful.


diff -ur findutils-4.4.2/find/defs.h findutils-4.4.2-rreale/find/defs.h


It's generally better to prepare patches to find using the git
repository, because the latest table verison is in general a long way
behind the development tree.   See
http://savannah.gnu.org/git/?group=findutils for more information.


--- findutils-4.4.2/find/defs.h 2009-05-16 17:17:01.000000000 +0200
+++ findutils-4.4.2-rreale/find/defs.h  2010-04-04 19:11:38.000000000 +0200
@@ -65,6 +65,8 @@
 #include "buildcmd.h"
 #include "quotearg.h"

+#define LARGE_BLOCK_SIZE 4096
+
 /* These days we will assume ANSI/ISO C protootypes work on our compiler. */
 #define PARAMS(Args) Args

@@ -167,6 +169,15 @@
   int   fd;
 };

+struct samecontent_args
+{
+  struct stat st;
+  char buf[LARGE_BLOCK_SIZE];
+  size_t size;
+  boolean buf_read;
+  char *filename;
+};
+
 struct size_val
 {
   enum comparison_type kind;
@@ -313,6 +324,7 @@
     struct time_val reftime;   /* newer newerXY anewer cnewer mtime
atime ctime mmin amin cmin */
     struct perm_val perm;      /* perm */
     struct samefile_file_id samefileid; /* samefile */
+    struct samecontent_args samecontentargs; /* samecontent */
     mode_t type;               /* type */
     struct format_val printf_vec; /* printf fprintf fprint ls fls
print0 fprint0 print */
   } args;


So, we allocate a buffer of size LARGE_BLOCK_SIZE for every predicate?
   It seems wasteful to do that, since we don't need to do any I/O
until we're looking at a file.


@@ -459,6 +471,7 @@
 PREDICATEFUNCTION pred_user;
 PREDICATEFUNCTION pred_writable;
 PREDICATEFUNCTION pred_xtype;
+PREDICATEFUNCTION pred_samecontent;



@@ -535,7 +548,7 @@
   /* If true, -depth was EXPLICITLY set (as opposed to having been turned
    * on by -delete, for example).
    */
-   boolean explicit_depth;
+  boolean explicit_depth;

   /* If >=0, don't descend more than this many levels of subdirectories. */
   int maxdepth;
diff -ur findutils-4.4.2/find/parser.c findutils-4.4.2-rreale/find/parser.c
--- findutils-4.4.2/find/parser.c       2009-05-16 17:17:01.000000000 +0200
+++ findutils-4.4.2-rreale/find/parser.c        2010-04-04 19:35:50.000000000 
+0200
@@ -154,6 +154,7 @@
 static boolean parse_noignore_race PARAMS((const struct
parser_table*, char *argv[], int *arg_ptr));
 static boolean parse_warn          PARAMS((const struct
parser_table*, char *argv[], int *arg_ptr));
 static boolean parse_xtype         PARAMS((const struct
parser_table*, char *argv[], int *arg_ptr));
+static boolean parse_samecontent         PARAMS((const struct
parser_table*, char *argv[], int *arg_ptr));
 static boolean parse_quit          PARAMS((const struct
parser_table*, char *argv[], int *arg_ptr));

 boolean parse_print             PARAMS((const struct parser_table*,
char *argv[], int *arg_ptr));
@@ -321,6 +322,7 @@
   {ARG_TEST,       "writable",               parse_accesscheck,
pred_writable}, /* GNU, 4.3.0+ */
   PARSE_OPTION     ("xdev",                  xdev), /* POSIX */
   PARSE_TEST       ("xtype",                 xtype),        /* GNU */
+  PARSE_TEST       ("samecontent",           samecontent),
 #ifdef UNIMPLEMENTED_UNIX
   /* It's pretty ugly for find to know about archive formats.
      Plus what it could do with cpio archives is very limited.
@@ -2585,6 +2587,42 @@
 {
   return insert_type (argv, arg_ptr, entry, pred_xtype);
 }
+
+static boolean
+parse_samecontent (const struct parser_table* entry, char **argv, int *arg_ptr)
+{
+  struct predicate *our_pred;
+  const char *filename;
+  struct stat st;
+
+  set_stat_placeholders(&st);
+
+  if (collect_arg(argv, arg_ptr, &filename))
+    {
+      if (0 != (options.xstat)(filename, &st))
+       {
+         fatal_file_error(filename);
+       }
+    }
+  else
+    {
+      return false;
+    }
+
+  our_pred = insert_primary (entry);
+  memcpy (&our_pred->args.samecontentargs.st, &st, sizeof (struct stat));
+  our_pred->args.samecontentargs.filename = xmalloc (strlen (filename) + 1);
+  strcpy (our_pred->args.samecontentargs.filename, filename);
+  our_pred->args.samecontentargs.buf_read = false;
+
+#if 0
+  our_pred->args.samefileid.fd  = fd;
+  our_pred->need_type = false;
+  our_pred->need_stat = true;
+#endif
+  our_pred->est_success_rate = 0.01f;

I think much fewer than 1% of files are identical, so this could
usefully be lower.

+  return true;
+}
 
 static boolean
 insert_type (char **argv, int *arg_ptr,
diff -ur findutils-4.4.2/find/pred.c findutils-4.4.2-rreale/find/pred.c
--- findutils-4.4.2/find/pred.c 2009-05-16 17:17:01.000000000 +0200
+++ findutils-4.4.2-rreale/find/pred.c  2010-04-04 19:36:41.000000000 +0200
@@ -20,7 +20,24 @@
 #include "defs.h"

 #include <fnmatch.h>
+
 #include <signal.h>
+#ifndef SA_RESTART
+# ifdef SA_INTERRUPT /* e.g. SunOS 4.1.x */
+#  define SA_RESTART SA_INTERRUPT
+# else
+#  define SA_RESTART 0
+# endif
+#endif
+
+#if HAVE_UNISTD_H
+# include <unistd.h>
+#endif
+
+#if HAVE_INTTYPES_H
+# include <inttypes.h>
+#endif
+
 #include <math.h>
 #include <pwd.h>
 #include <grp.h>
@@ -33,6 +50,7 @@
 #include <locale.h>
 #include <openat.h>
 #include <ctype.h>
+#include <limits.h>
 #include "xalloc.h"
 #include "dirname.h"
 #include "human.h"
@@ -47,6 +65,12 @@
 #include "dircallback.h"
 #include "error.h"
 #include "verify.h"
+#include "intprops.h"
+
+/* Type used for fast comparison of several bytes at a time.  */
+#ifndef word
+# define word uintmax_t
+#endif

 #if ENABLE_NLS
 # include <libintl.h>
@@ -230,6 +254,7 @@
   {pred_user, "user    "},
   {pred_writable, "writable "},
   {pred_xtype, "xtype   "},
+  {pred_samecontent, "samecontent "},
   {0, "none    "}
 };
 #endif
@@ -1844,6 +1869,294 @@
    */
   return (pred_type (pathname, &sbuf, pred_ptr));
 }
+
+/* Buffer primitives for comparison operations.  Adapted from lib/cmpbuf.c
+   in GNU diffutils 2.9.
+
+   Copyright (C) 1993, 1995, 1998, 2001-2002, 2006, 2009-2010 Free Software
+   Foundation, Inc.  */
+
+/*#include "cmpbuf.h"*/
+
+#ifndef PTRDIFF_MAX
+# define PTRDIFF_MAX TYPE_MAXIMUM (ptrdiff_t)
+#endif
+#ifndef SIZE_MAX
+# define SIZE_MAX TYPE_MAXIMUM (size_t)
+#endif
+#ifndef SSIZE_MAX
+# define SSIZE_MAX TYPE_MAXIMUM (ssize_t)
+#endif

gnulib should provide all of these types anyway.

+
+#undef MIN
+#define MIN(a, b) ((a) <= (b) ? (a) : (b))
+
+/* Read NBYTES bytes from descriptor FD into BUF.
+   NBYTES must not be SIZE_MAX.
+   Return the number of characters successfully read.
+   On error, return SIZE_MAX, setting errno.
+   The number returned is always NBYTES unless end-of-file or error.  */
+
+size_t
+block_read (int fd, char *buf, size_t nbytes)
+{
+  char *bp = buf;
+  char const *buflim = buf + nbytes;
+  size_t readlim = MIN (SSIZE_MAX, SIZE_MAX);
+
+  do
+    {
+      size_t bytes_remaining = buflim - bp;
+      size_t bytes_to_read = MIN (bytes_remaining, readlim);
+      ssize_t nread = read (fd, bp, bytes_to_read);
+      if (nread <= 0)
+       {
+         if (nread == 0)
+           break;
+
+         /* Accommodate Tru64 5.1, which can't read more than INT_MAX
+            bytes at a time.  They call that a 64-bit OS?  */
+         if (errno == EINVAL && INT_MAX < bytes_to_read)
+           {
+             readlim = INT_MAX;
+             continue;
+           }
+
+         /* This is needed for programs that have signal handlers on
+            older hosts without SA_RESTART.  It also accommodates
+            ancient AIX hosts that set errno to EINTR after uncaught
+            SIGCONT.  See <news:address@hidden>
+            (1993-04-22).  */
+         if (! SA_RESTART && errno == EINTR)
+           continue;
+
+         return SIZE_MAX;
+       }
+      bp += nread;
+    }
+  while (bp < buflim);
+
+  return bp - buf;
+}
+
+/* Compare two files byte by byte.  Adapted from src/cmp.c in
+   GNU diffutils 2.9.
+
+   Copyright (C) 1990-1996, 1998, 2001-2002, 2004, 2006-2007, 2009-2010 Free
+   Software Foundation, Inc.  */
+
+static boolean cmp (int fd0, int fd1, struct predicate *pred_ptr);
+static size_t block_compare (word const *, word const *);
+
+/* Number of bytes to compare.  */
+static uintmax_t bytes = UINTMAX_MAX;
+
+/* Compare the two files already open on `file_desc[0]' and `file_desc[1]',
+   using `buffer[0]' and `buffer[1]'.  */
+
+static boolean
+cmp (int fd0, int fd1, struct predicate *pred_ptr)
+{
+  off_t byte_number = 1;       /* Byte number (1...) of difference. */
+  uintmax_t remaining = bytes; /* Remaining number of bytes to compare.  */
+  size_t read0, read1;         /* Number of bytes read from each file. */
+  size_t first_diff;           /* Offset (0...) in buffers of 1st diff. */
+  char *buffer0, *buffer1;
+  boolean first_block = true;
+  int differing = 0;
+  int f;
+  int offset_width = 0;
+  size_t buf_size = LARGE_BLOCK_SIZE;
+
+  /* Allocate word-aligned buffers, with space for sentinels at the end.  */
+
+  buffer0 = (char *) xmalloc (buf_size);
+  buffer1 = (char *) xmalloc (buf_size);
+
+  do
+    {
+      size_t bytes_to_read = buf_size;
+
+      if (remaining != UINTMAX_MAX)
+       {
+         if (remaining < bytes_to_read)
+           bytes_to_read = remaining;
+         remaining -= bytes_to_read;
+       }
+
+      if (first_block)
+       {
+          read0 = pred_ptr->args.samecontentargs.size;
+         buffer0 = pred_ptr->args.samecontentargs.buf;

Doesn't this leak the memory block correspondingn to the previous
value of buffer0?

+         first_block = false;
+       }
+      else
+        {
+          read0 = block_read (fd0, buffer0, bytes_to_read);
+#if 0
+      if (read0 == SIZE_MAX)
+       error (EXIT_TROUBLE, errno, "%s", file[0]);
+#endif
+       }
+
+      read1 = block_read (fd1, buffer1, bytes_to_read);
+#if 0
+      if (read1 == SIZE_MAX)
+       error (EXIT_TROUBLE, errno, "%s", file[1]);
+#endif
+
+      if (read0 != read1)
+       return false;
+
+      /* Insert sentinels for the block compare.  */
+
+      buffer0[read0] = ~buffer1[read0];
+      buffer1[read1] = ~buffer0[read1];

This will fail if read0 is SIZE_MAX, or read1 is SIZE_MAX.

+
+      first_diff = block_compare ((word *) buffer0, (word *) buffer1);
+
+      byte_number += first_diff;
+
+      if (first_diff < read0)
+       return false;
+    }
+  while (differing <= 0 && read0 == buf_size);
+
+  return differing == 0 ? true : false;
+}
+
+/* Compare two blocks of memory P0 and P1 until they differ.
+   If the blocks are not guaranteed to be different, put sentinels at the ends
+   of the blocks before calling this function.
+
+   Return the offset of the first byte that differs.  */
+
+static size_t
+block_compare (word const *p0, word const *p1)
+{
+  word const *l0, *l1;
+  char const *c0, *c1;
+
+  /* Find the rough position of the first difference by reading words,
+     not bytes.  */
+
+  for (l0 = p0, l1 = p1;  *l0 == *l1;  l0++, l1++)
+    continue;
+
+  /* Find the exact differing position (endianness independent).  */
+
+  for (c0 = (char const *) l0, c1 = (char const *) l1;
+       *c0 == *c1;
+       c0++, c1++)
+    continue;
+
+  return c0 - (char const *) p0;
+}
+
+/* Following macros adapted from src/system.h in GNU diffutils 2.9.
+
+   Copyright (C) 1988-1989, 1992-1995, 1998, 2001-2002, 2004, 2006, 2009-2010
+   Free Software Foundation, Inc.  */
+
+/* Do struct stat *S, *T describe the same special file?  */
+#if HAVE_STRUCT_STAT_ST_RDEV && defined S_ISBLK && defined S_ISCHR
+# define same_special_file(s, t) \
+    (((S_ISBLK ((s)->st_mode) && S_ISBLK ((t)->st_mode)) \
+      || (S_ISCHR ((s)->st_mode) && S_ISCHR ((t)->st_mode))) \
+     && (s)->st_rdev == (t)->st_rdev)
+#else
+# define same_special_file(s, t) 0
+#endif

I almost always avoid copying code from the header portion of a file
into the middle of another file, it makes it hard to do housekeeping
work on includes, declarations, typesefs, etc.    In fact much of this
file comparison code would be better placed into a separate module
(pred.c is already enormous).


+
+/* Do struct stat *S, *T describe the same file?  Answer -1 if unknown.  */
+#define same_file(s, t) \
+   ((((s)->st_ino == (t)->st_ino) && ((s)->st_dev == (t)->st_dev)) \
+     || same_special_file (s, t))

We already have pred_samefile, it's better to reuse that code I think.
  If there are cases it should detect but does not, then surely it
should be fixed too.


+
+/* Do struct stat *S, *T have the same file attributes?
+
+   POSIX says that two files are identical if st_ino and st_dev are
+   the same, but many file systems incorrectly assign the same (device,
+   inode) pair to two distinct files, including:
+
+   - GNU/Linux NFS servers that export all local file systems as a
+     single NFS file system, if a local device number (st_dev) exceeds
+     255, or if a local inode number (st_ino) exceeds 16777215.
+
+   - Network Appliance NFS servers in snapshot directories; see
+     Network Appliance bug #195.
+
+   - ClearCase MVFS; see bug id ATRia04618.
+
+   Check whether two files that purport to be the same have the same
+   attributes, to work around instances of this common bug.  Do not
+   inspect all attributes, only attributes useful in checking for this
+   bug.
+
+   It's possible for two distinct files on a buggy file system to have
+   the same attributes, but it's not worth slowing down all
+   implementations (or complicating the configuration) to cater to
+   these rare cases in buggy implementations.  */
+
+#define same_file_attributes(s, t) \
+   ((s)->st_mode == (t)->st_mode \
+     && (s)->st_nlink == (t)->st_nlink \
+     && (s)->st_uid == (t)->st_uid \
+     && (s)->st_gid == (t)->st_gid \
+     && (s)->st_size == (t)->st_size \
+     && (s)->st_mtime == (t)->st_mtime \
+     && (s)->st_ctime == (t)->st_ctime)
+
+boolean
+pred_samecontent (const char *pathname, struct stat *stat_buf, struct
predicate *pred_ptr)
+{
+  struct stat *st = &pred_ptr->args.samecontentargs.st;
+  size_t words_per_buffer;
+  int fd0;
+  int fd1;
+  boolean exit_status = false;
+  char *filename = pred_ptr->args.samecontentargs.filename;
+  (void) pathname;
+
+  /* If the files are links to the same inode and have the same file position,
+     they are identical.  */
+
+  if (0 < same_file (stat_buf, st) && same_file_attributes (stat_buf, st))
+    return true;

I think this check is spurious.   I would expect -samecontent to check
only the data in the file, not the metadata (except the length).  As a
specific example I would expect -samecontent to return true for two
files which are identical apart from the fact that they have different
owners.

+
+  /* If both input descriptors are associated with plain files,
+     conclude that the files differ if they have different sizes.  */
+
+  if (S_ISREG (stat_buf->st_mode) && S_ISREG (st->st_mode))
+    {
+      off_t s0 = stat_buf->st_size;
+      off_t s1 = st->st_size;
+      if (s0 != s1)
+       return false;
+    }
+
+  fd0 = open (filename, O_RDONLY | O_BINARY, 0);
+  fd1 = open (pathname, O_RDONLY | O_BINARY, 0);
+
+  if (!pred_ptr->args.samecontentargs.buf_read)
+    {
+      size_t read;
+      read = block_read (fd0, pred_ptr->args.samecontentargs.buf,
LARGE_BLOCK_SIZE);

Won't this make the predicate always fail?   It reads a bock from the
reference file before beginning comparison with the target file.   So
the comparison is mis-aligned I think.


+#if 0
+      if (read == SIZE_MAX)
+       error (EXIT_TROUBLE, errno, "%s", file[0]);
+#endif
+      pred_ptr->args.samecontentargs.size = read;
+      pred_ptr->args.samecontentargs.buf_read = true;
+    }
+
+  exit_status = cmp (fd0, fd1, pred_ptr);
+
+  close (fd0);
+  close (fd1);
+
+  return exit_status;
+}
 
 /*  1) fork to get a child; parent remembers the child pid
     2) child execs the command requested


So there are a few comments on the patch above, but I also have a few
comments about some things that could usefully have been included but
aren't in there:
1. An update to find.texi explaining the new feature
2. An update to find.1 describing it
3. A regression test case for the test suite.


I hope you find the feedback above helpful; I'm certainly glad to find
a volunteer willing to work on findutils.   Having said this, I'm not
certain that making such a check this way is really better than using
"-exec cmp".   Considering after all that this is a substantial amount
of extra code.   However, someone else from the list may well have an
opinion different to my own.

Thanks again for contributing!
James.
[Prev in Thread]
Current Thread
[Next in Thread]
Patch that adds a new test to find, address@hidden, 2010/04/05
- Re: Patch that adds a new test to find, James Youngman <=
- Re: Patch that adds a new test to find, address@hidden, 2010/04/05
  - Re: Patch that adds a new test to find, James Youngman, 2010/04/05
  - Re: Patch that adds a new test to find, Eric Blake, 2010/04/05
- Re: Patch that adds a new test to find, address@hidden, 2010/04/08
- Re: Patch that adds a new test to find, address@hidden, 2010/04/08
Prev by Date: Patch that adds a new test to find
Next by Date: [PATCH] frcode: avoid link failure due to multiple program_name definitions
Previous by thread: Patch that adds a new test to find
Next by thread: Re: Patch that adds a new test to find
Index(es):
- Date
- Thread