Easier to read diff -u output

bug-gnu-utils

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Easier to read diff -u output

From:	Dan Hipschman
Subject:	Easier to read diff -u output
Date:	Tue, 14 Nov 2006 17:22:35 -0800
User-agent:	Mutt/1.5.9i

Hi,

I've been working on a patch to diff to make the output of diff -u more
readable in some cases.  Here is a small example of what I mean:

$ ./diff -u ~/foo/{a,b}
--- /home/dsh/foo/a     2006-11-09 12:08:21.000000000 -0800
+++ /home/dsh/foo/b     2006-11-09 12:08:25.000000000 -0800
@@ -1,22 +1,17 @@
 #include <stdio.h>

 static void
-print_hello(void)
+print_hello_stdout(void)
 {
-        const char *hello = "Hello there!\n";
-        int i;
-
-        for (i = 0; i < 10; ++i)
+        if (stdout)
         {
-                if (hello[i] == '\0')
-                        break;
-                putchar(hello[i]);
+                fprintf(stdout, "Hello\n");
         }
 }

 int
 main(void)
 {
-        print_hello();
+        print_hello_stdout();
         return 0;
 }

$ ./diff --more-readable -u ~/foo/{a,b}
--- /home/dsh/foo/a     2006-11-09 12:08:21.000000000 -0800
+++ /home/dsh/foo/b     2006-11-09 12:08:25.000000000 -0800
@@ -1,22 +1,17 @@
 #include <stdio.h>

 static void
-print_hello(void)
-{
-        const char *hello = "Hello there!\n";
-        int i;
-
-        for (i = 0; i < 10; ++i)
-        {
-                if (hello[i] == '\0')
-                        break;
-                putchar(hello[i]);
+print_hello_stdout(void)
+{
+        if (stdout)
+        {
+                fprintf(stdout, "Hello\n");
         }
 }

 int
 main(void)
 {
-        print_hello();
+        print_hello_stdout();
         return 0;
 }

I've got a larger and much more realistic example posted at
http://linux.ucla.edu/~dsh/patches/diff-more-readable.html

Anyway, the patch to add this new option, --more-readable, is appended
to this message.  The comment near the top of the patch explains roughly
what it does.  It works pretty well as is, but it still might be a
little conservative.  I've got some ideas for making it even better.
For example, we might have a list of regular expressions matching the
lines we're allowed to absorb, and the list could be specific to the
type of file being diffed.  That is, we might only absorb lines matching

^[[:space:]]*$
^[[:space:]]*{[[:space:]]*$
^[[:space:]]*}[[:space:]]*$

in C source files, etc.  I've spoken with Paul and he has the more
elegant idea of counting how many times each line occurs in the input
and only matching lines that occur multiple times.  I could also make
the parameters change_block_min and island_max command line options, but
in that case --more-readable isn't a very good name.  Maybe --absorb=M,N
makes more sense.  I hope this gets accepted, but comments are welcome,
too.

Thanks,
Dan

Index: analyze.c
===================================================================
RCS file: /sources/diffutils/diffutils/src/analyze.c,v
retrieving revision 1.24
diff -p -u -r1.24 analyze.c
--- analyze.c   13 Mar 2006 19:11:17 -0000      1.24
+++ analyze.c   15 Nov 2006 00:28:07 -0000
@@ -570,6 +570,83 @@ discard_confusing_lines (struct file_dat
   free (equiv_count[0]);
 }
 
+/* Look for small islands of unchanged lines amidst blocks of inserts
+   and deletes.  In input with repetitive structure, like source code,
+   these islands are often lines that were deleted as part of a block
+   of logical changes, and then inserted as another block.  E.g., this
+   happens a lot when editing C code.  You may remove some function
+   definition entirely and then later add an unrelated function in the
+   same part of the file, and diff will match the opening and closing
+   curly braces as common lines.  In diff -u output, this can cause
+   unrelated blocks of inserts and deletes to be intermingled, making
+   it harder to understand the logical changes to the file, as opposed
+   to the minimal set of physical changes.  Absorb those islands to
+   make diff -u output easier to read.  Note that this sacrifices the
+   optimality of the LCS algorithm.  */
+
+static void
+absorb_islands (struct file_data filevec[])
+{
+  static lin const change_block_min = 15;
+  static lin const island_max = 2;
+  char *c0 = filevec[0].changed;
+  char *c1 = filevec[1].changed;
+  char *block0 = NULL, *block1 = NULL;
+  char *island0, *island1;
+  char *const c0_end = c0 + filevec[0].buffered_lines;
+  char *const c1_end = c1 + filevec[1].buffered_lines;
+
+  while (c0 < c0_end)
+    {
+      /* Look for the start of a set of changes to both files
+         such that the changes are preceded by a common line.  */
+
+      for ( ; c0 < c0_end && c1 < c1_end && (!*c0 | !*c1); ++c0, ++c1)
+        {
+          while (*c0)
+            ++c0;
+          while (*c1)
+            ++c1;
+        }
+
+      if (!block0)
+        block0 = c0, block1 = c1;
+
+      /* Ignore the changes. */
+
+      while (*c0)
+        ++c0;
+      while (*c1)
+        ++c1;
+
+      /* Find the end of the island, if it is one.  */
+
+      island0 = c0;
+      island1 = c1;
+
+      while (c0 < c0_end && !*++c0 & !*++c1)
+        continue;
+
+      /* Test whether this is an island we can absorb.  If so, don't
+         absorb it yet, wait until we find the last island in the
+         current block of changes and absorb them all at once.  If
+         this isn't an island we can absorb, then it marks the end of
+         a block of changes, so now absorb all the islands we found.  */
+
+      if (c0 == c0_end || (!*c0 | !*c1)
+          || island_max < c0 - island0 || island_max < c1 - island1)
+        {
+          if (change_block_min <= (island0 - block0) + (island1 - block1))
+            {
+              memset (block0, 1, island0 - block0);
+              memset (block1, 1, island1 - block1);
+            }
+
+          block0 = block1 = NULL;
+        }
+    }
+}
+
 /* Adjust inserts/deletes of identical lines to join changes
    as much as possible.
 
@@ -907,6 +984,8 @@ diff_2_files (struct comparison *cmp)
       /* Modify the results slightly to make them prettier
         in cases where that can validly be done.  */
 
+      if (more_readable && output_style == OUTPUT_UNIFIED)
+        absorb_islands (cmp->file);
       shift_boundaries (cmp->file);
 
       /* Get the results of comparison in the form of a chain
Index: diff.c
===================================================================
RCS file: /sources/diffutils/diffutils/src/diff.c,v
retrieving revision 1.43
diff -p -u -r1.43 diff.c
--- diff.c      5 Jan 2006 07:23:55 -0000       1.43
+++ diff.c      15 Nov 2006 00:28:07 -0000
@@ -109,6 +109,7 @@ enum
   INHIBIT_HUNK_MERGE_OPTION,
   LEFT_COLUMN_OPTION,
   LINE_FORMAT_OPTION,
+  MORE_READABLE,
   NO_IGNORE_FILE_NAME_CASE_OPTION,
   NORMAL_OPTION,
   SDIFF_MERGE_ASSIST_OPTION,
@@ -172,6 +173,7 @@ static struct option const longopts[] =
   {"left-column", 0, 0, LEFT_COLUMN_OPTION},
   {"line-format", 1, 0, LINE_FORMAT_OPTION},
   {"minimal", 0, 0, 'd'},
+  {"more-readable", 0, 0, MORE_READABLE},
   {"new-file", 0, 0, 'N'},
   {"new-group-format", 1, 0, NEW_GROUP_FORMAT_OPTION},
   {"new-line-format", 1, 0, NEW_LINE_FORMAT_OPTION},
@@ -551,6 +553,10 @@ main (int argc, char **argv)
            specify_value (&line_format[i], optarg, "--line-format");
          break;
 
+        case MORE_READABLE:
+          more_readable = true;
+          break;
+
        case NO_IGNORE_FILE_NAME_CASE_OPTION:
          ignore_file_name_case = false;
          break;
@@ -901,6 +907,7 @@ static char const * const option_help_ms
  N_("--to-file=FILE2  Compare all operands to FILE2.  FILE2 can be a directory."),
   "",
   N_("--horizon-lines=NUM  Keep NUM lines of the common prefix and suffix."),
+  N_("--more-readable  Increase output size to make diff -u more readable."),
   N_("-d  --minimal  Try hard to find a smaller set of changes."),
  N_("--speed-large-files  Assume large files and many scattered small changes."),
   "",
Index: diff.h
===================================================================
RCS file: /sources/diffutils/diffutils/src/diff.h,v
retrieving revision 1.27
diff -p -u -r1.27 diff.h
--- diff.h      5 Jan 2006 07:23:55 -0000       1.27
+++ diff.h      15 Nov 2006 00:28:07 -0000
@@ -198,6 +198,9 @@ XTERN struct exclude *excluded;
    slower) but will find a guaranteed minimal set of changes.  */
 XTERN bool minimal;
 
+/* Discard some common lines to make the output of diff -u more readable.  */
+XTERN bool more_readable;
+
 /* Name of program the user invoked (for error messages).  */
 XTERN char *program_name;

[Prev in Thread]

Current Thread

[Next in Thread]

Easier to read diff -u output, Dan Hipschman <=
- Re: Easier to read diff -u output, Paul Eggert, 2006/11/15
- RE: Easier to read diff -u output, Bruce Korb, 2006/11/15
  - Re: Easier to read diff -u output, Dan Hipschman, 2006/11/15
    - Re: Easier to read diff -u output, Bruce Korb, 2006/11/16
    - Re: Easier to read diff -u output, Dan Hipschman, 2006/11/16

Prev by Date: Re: diffutils 2.8.1 ISO C90 compliance patch
Next by Date: Re: diffutils 2.8.1 ISO C90 compliance patch
Previous by thread: patch 2.5.4 strncasecmp
Next by thread: Re: Easier to read diff -u output
Index(es):
- Date
- Thread