sdiff Enhancement

bug-gnu-utils

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

sdiff Enhancement

From:	Grant Stevens
Subject:	sdiff Enhancement
Date:	Mon, 26 Nov 2007 17:45:31 -0800 (PST)

Hello!

A project I worked recently required me to build an enhancement to sdiff. I'd 
like to contribute the enhancement to GNU diffutils. Paul Eggert assisted me 
with the legal paperwork and suggested I submit the enhancement to this address.

An abstract and diff -u output are attached. Can I add anything else to help 
the process along?

Thanks and best wishes ... Grant

       
---------------------------------
Never miss a thing.   Make Yahoo your homepage.

Enhancement to sdiff

A project I worked recently required that I "sdiff" two files, but I needed a
line-matchup that would align similar-but-not-identical lines. This note
describes the requirement.

Suppose we sdiff these two files:
$ cat file1
line1
close but not exact
another not exact
last line
$ cat file2
line1
line2
close but inexact
line3
line4
another inexact
last line
$ diff -W 43 -y file1 file2
line1                   line1
close but not exact  |  line2
another not exact    |  close but inexact
                     >  line3
                     >  line4
                     >  another inexact
last line               last line

Notice that identical lines are paired up nicely, but lines with any
differences are paired up without regard to their similarities.
I needed a tighter line-matching that would detect
similar-but-not-identical lines and associate them. I built the enhancement
to diffutils-2.8.1, using the previously unused -Y option for this feature.
The resulting output looks like this:
$ diff -W 43 -Y file1 file2 # (-Y = -y option, enhanced for tighter matching)
line1                   line1
                     >  line2
close but not exact  |  close but inexact
                     >  line3
                     >  line4
another not exact    |  another inexact
last line               last line

Notice that the similar lines are shown together.

diff -ur original/src/diff.c work/src/diff.c
--- original/src/diff.c 2002-03-24 02:35:28.000000000 -0500
+++ work/src/diff.c     2007-05-30 21:06:27.000000000 -0400
@@ -139,7 +139,7 @@
 }
 
 static char const shortopts[] =
-"0123456789abBcC:dD:eEfF:hHiI:lL:nNpPqrsS:tTuU:vwW:x:X:y";
+"0123456789abBcC:dD:eEfF:hHiI:lL:nNpPqrsS:tTuU:vwW:x:X:yY";
 
 /* Values for long options that do not have single-letter equivalents.  */
 enum
@@ -229,6 +229,7 @@
   {"show-c-function", 0, 0, 'p'},
   {"show-function-line", 1, 0, 'F'},
   {"side-by-side", 0, 0, 'y'},
+  {"side-by-side-tight-match", 0, 0, 'Y'},
   {"speed-large-files", 0, 0, 'H'},
   {"starting-file", 1, 0, 'S'},
   {"strip-trailing-cr", 0, 0, STRIP_TRAILING_CR_OPTION},
@@ -492,6 +493,11 @@
          specify_style (OUTPUT_SDIFF);
          break;
 
+       case 'Y':
+         specify_style (OUTPUT_SDIFF);
+          sdiff_tighter_match = 1;
+         break;
+
        case 'W':
          numval = strtoumax (optarg, &numend, 10);
          if (! (0 < numval && numval <= INT_MAX) || *numend)
@@ -854,7 +860,8 @@
   N_("-e  --ed  Output an ed script."),
   N_("--normal  Output a normal diff."),
   N_("-n  --rcs  Output an RCS format diff."),
-  N_("-y  --side-by-side  Output in two columns.\n\
+  N_("-y  --side-by-side  Output in two columns."),
+  N_("-Y  --side-by-side-tight-match  Output in two columns.\n\
   -W NUM  --width=NUM  Output at most NUM (default 130) print columns.\n\
   --left-column  Output only the left column of common lines.\n\
   --suppress-common-lines  Do not output common lines."),
diff -ur original/src/diff.h work/src/diff.h
--- original/src/diff.h 2002-03-11 16:24:42.000000000 -0500
+++ work/src/diff.h     2007-05-30 21:06:20.000000000 -0400
@@ -179,6 +179,9 @@
 XTERN unsigned int sdiff_half_width;
 XTERN unsigned int sdiff_column2_offset;
 
+/* Tell OUTPUT_SDIFF to use a tighter line-matching algorithm.  */
+XTERN bool sdiff_tighter_match;
+
 /* String containing all the command options diff received,
    with spaces between and at the beginning but none at the end.
    If there were no options given, this string is empty.  */
diff -ur original/src/side.c work/src/side.c
--- original/src/side.c 2002-02-07 13:17:04.000000000 -0500
+++ work/src/side.c     2007-05-31 16:17:05.000000000 -0400
@@ -22,8 +22,33 @@
 
 #include "diff.h"
 
+struct line_matchup
+{
+  struct line_matchup * next;
+  lin left;
+  lin right;
+};
+
 static void print_sdiff_common_lines (lin, lin);
 static void print_sdiff_hunk (struct change *);
+static struct line_matchup * find_best_match(
+  char const* const*, lin, lin,
+  char const* const*, lin, lin);
+static struct line_matchup * find_best_match1(
+  char const* const*, lin, lin,
+  char const* const*, lin, lin);
+static void free_line_matchup (struct line_matchup *);
+static void insert_matchup (
+  struct line_matchup * *,
+  struct line_matchup * *,
+  lin, lin, lin);
+static lin figure_score (
+  char const* const* leftlines,
+  char const* const* rightlines,
+  struct line_matchup * line_matchup);
+static lin matching_chars (
+  char const*, char const*,
+  char const*, char const*);
 
 /* Next line number to be printed in the two input files.  */
 static lin next0, next1;
@@ -238,6 +263,7 @@
 {
   lin first0, last0, first1, last1;
   register lin i, j;
+  struct line_matchup * line_matchup, * current_line_pair;
 
   /* Determine range of line numbers involved in each file.  */
   enum changes changes =
@@ -255,29 +281,373 @@
       fprintf (outfile, "c%ld,%ld\n", len0, len1);
     }
 
-  /* Print ``xxx  |  xxx '' lines */
-  if (changes == CHANGED)
+  line_matchup =
+    find_best_match (
+     files[0].linbuf, first0, last0,
+     files[1].linbuf, first1, last1);
+
+  for ( current_line_pair = line_matchup ;
+        current_line_pair ;
+        current_line_pair = current_line_pair->next )
+    {
+      if ( current_line_pair->left == -1 ) /* no line from left file */
+        {
+       print_1sdiff_line (
+          0, '>',
+          &files[1].linbuf[current_line_pair->right]);
+        next1 = current_line_pair->right + 1;
+        }
+      else if ( current_line_pair->right == -1 ) /* no line from right file */
+        {
+       print_1sdiff_line (
+          &files[0].linbuf[current_line_pair->left],
+          '<', 0);
+        next0 = current_line_pair->left + 1;
+        }
+      else /* lines from both files */
+        {
+       print_1sdiff_line (
+          &files[0].linbuf[current_line_pair->left],
+          '|',
+          &files[1].linbuf[current_line_pair->right]);
+        next0 = first0 = current_line_pair->left + 1;
+        next1 = first1 = current_line_pair->right + 1;
+        }
+    }
+
+  /* free the memory malloc'd by find_best_match() */
+  free_line_matchup (line_matchup);
+}
+
+/* We have a group of lines from the left file and a matching group from the
+   right file. Find lines with the most similarities, and pair them up.
+*/
+static struct line_matchup *
+find_best_match (
+  char const* const* leftlines,  const lin l_start, const lin l_end,
+  char const* const* rightlines, const lin r_start, const lin r_end)
+{
+  int incr, l_must_match, r_must_match, prior_left, prior_right;
+  struct line_matchup * best_match, * current_match;
+
+  best_match =
+    find_best_match1 (
+      leftlines,  l_start, l_end,
+      rightlines, r_start, r_end);
+
+  if ( sdiff_tighter_match )
+    {
+    /* At this point best_match has a record for every matched pair of lines. 
*/
+    /* Here we insert records for the unmatched lines. */
+    for ( current_match = best_match,
+          l_must_match = prior_left  = l_start,
+          r_must_match = prior_right = r_start ;
+          l_must_match <= l_end
+       || r_must_match <= r_end ;
+          /* see increments throughout loop */ )
+      {
+      if ( current_match )
+        {
+        prior_left  = MAX (prior_left,  current_match->left);
+        prior_right = MAX (prior_right, current_match->right);
+        }
+      if ( l_must_match < prior_left )
+        {
+        insert_matchup (&best_match, &current_match,
+          l_must_match, -1,
+          0 /* 0 = insert before current */);
+        ++ l_must_match;
+        }
+      else if ( r_must_match < prior_right )
+        {
+        insert_matchup (&best_match, &current_match,
+          -1, r_must_match,
+          0 /* 0 = insert before current */);
+        ++ r_must_match;
+        }
+      else if ( prior_left  == l_must_match
+             && prior_right == r_must_match
+             && l_start <= l_end
+             && r_start <= r_end )
+        {
+        ++ l_must_match;
+        ++ r_must_match;
+        }
+      else if ( l_must_match >= prior_left
+             && l_must_match <= l_end
+             && l_start <= l_end )
+        {
+        insert_matchup (&best_match, &current_match,
+          l_must_match, -1,
+          1 /* 1 = insert after current */);
+        ++ l_must_match;
+        }
+      else if ( r_must_match >= prior_right
+             && r_must_match <= r_end
+             && r_start <= r_end )
+        {
+        insert_matchup (&best_match, &current_match,
+          -1, r_must_match,
+          1 /* 1 = insert after current */);
+        ++ r_must_match;
+        }
+      else
+        break;
+      if ( current_match && current_match->next )
+        current_match = current_match->next;
+      }
+    }
+
+  return (best_match);
+}
+
+static void
+insert_matchup (
+  struct line_matchup * * first_matchup,
+  struct line_matchup * * current_matchup,
+  lin left, lin right,
+  lin insert_after /* 1 = insert new after current, 0 = insert before */)
+{
+  struct line_matchup * new_record =
+    (struct line_matchup *) xmalloc (sizeof (*new_record));
+  if ( ! * first_matchup ) /* insert first record in empty list */
+    {
+    * first_matchup = * current_matchup = new_record;
+    (*current_matchup)->left = left;
+    (*current_matchup)->right = right;
+    (*current_matchup)->next = (struct line_matchup *) 0;
+    }
+  else if ( insert_after ) /* insert new record after current_matchup */
+    {
+    new_record->left = left;
+    new_record->right = right;
+    new_record->next = (*current_matchup)->next;
+    (*current_matchup)->next = new_record;
+    }
+  else /* insert new record before current_matchup */
+    {
+    /* We can't actually insert the new record before the current, since we */
+    /* don't want to walk through the linked list to find the previous */
+    /* record to point its ->next to the new record. So to achieve the same */
+    /* result, we 1) insert the new record after the current, 2) populate */
+    /* the new record with the current record's values, and 3) populate the */
+    /* current record with the new values. */
+    new_record->left  = (*current_matchup)->left;
+    new_record->right = (*current_matchup)->right;
+    new_record->next  = (*current_matchup)->next;
+    (*current_matchup)->left = left;
+    (*current_matchup)->right = right;
+    (*current_matchup)->next = new_record;
+    }
+}
+
+static struct line_matchup *
+find_best_match1 (
+  char const* const* leftlines,  const lin l_start, const lin l_end,
+  char const* const* rightlines, const lin r_start, const lin r_end)
+{
+  int incr, max;
+  struct line_matchup * best_match, * current_match;
+  long best_score, current_score;
+
+  if ( sdiff_tighter_match )
     {
-      for (i = first0, j = first1;  i <= last0 && j <= last1;  i++, j++)
-       print_1sdiff_line (&files[0].linbuf[i], '|', &files[1].linbuf[j]);
-      changes = (i <= last0 ? OLD : 0) + (j <= last1 ? NEW : 0);
-      next0 = first0 = i;
-      next1 = first1 = j;
+    /* If we have searched past the end of either group of lines, no further */
+    /* search is possible. */
+    if ( l_start > l_end
+      || r_start > r_end )
+          return ((struct line_matchup *) 0);
+
+    best_match = (struct line_matchup *) xmalloc (sizeof (*best_match));
+    best_match->left  = l_start;
+    best_match->right = r_start;
+    best_match->next =
+      find_best_match1 (
+        leftlines,  l_start + 1, l_end,
+        rightlines, r_start + 1, r_end);
+    best_score = figure_score (leftlines, rightlines, best_match);
+  
+    current_match = (struct line_matchup *) xmalloc (sizeof (*current_match));
+  
+    max = MAX (l_end - l_start, r_end - r_start);
+    for ( incr = 1 ; incr <= max ; ++ incr )
+      {
+      int i = l_start + incr;
+      if ( i <= l_end )
+          {
+          current_match->left  = i;
+          current_match->right = r_start;
+          current_match->next =
+            find_best_match1 (
+              leftlines,  i + 1,       l_end,
+              rightlines, r_start + 1, r_end);
+          current_score = figure_score (leftlines, rightlines, current_match);
+          if ( current_score > best_score )
+                  {
+                  free_line_matchup (best_match);
+                  best_match = current_match;
+                  best_score = current_score;
+                  current_match = (struct line_matchup *) xmalloc (sizeof 
(*current_match));
+                  }
+          }
+      i = r_start + incr;
+      if ( i <= r_end )
+          {
+          current_match->left  = l_start;
+          current_match->right = i;
+          current_match->next =
+            find_best_match1 (
+              leftlines,  l_start + 1, l_end,
+              rightlines, i + 1,       r_end);
+          current_score = figure_score (leftlines, rightlines, current_match);
+          if ( current_score > best_score )
+                  {
+                  free_line_matchup (best_match);
+                  best_match = current_match;
+                  best_score = current_score;
+                  current_match = (struct line_matchup *) xmalloc (sizeof 
(*current_match));
+                  }
+          }
+      }
+    free_line_matchup (current_match);
     }
+  else /* sdiff_tighter_match was not selected */
+    {
+    /* If we have searched past the end of both groups of lines, no further */
+    /* search is possible. */
+    if ( l_start > l_end
+      && r_start > r_end )
+          return ((struct line_matchup *) 0);
+
+    /* This implementation just matches up lines in order, with no scoring. */
+    /* It is essentially the same as the legacy sdiff algorithm. */
+    best_match = (struct line_matchup *) xmalloc (sizeof (*best_match));
+    best_match->left  = l_start <= l_end ? l_start : -1;
+    best_match->right = r_start <= r_end ? r_start : -1;
+    best_match->next =
+      find_best_match1 (
+        leftlines,  l_start <= l_end ? l_start + 1 : l_start, l_end,
+        rightlines, r_start <= r_end ? r_start + 1 : r_start, r_end);
+    }
+
+  return (best_match);
+}
 
-  /* Print ``     >  xxx '' lines */
-  if (changes & NEW)
+static void
+free_line_matchup (struct line_matchup * line_matchup)
+{
+  struct line_matchup * current_line_pair;
+  /* free the memory malloc'd by find_best_match() */
+  for ( current_line_pair = line_matchup ;
+        current_line_pair ;
+        /* increment routine in loop */ )
     {
-      for (j = first1; j <= last1; ++j)
-       print_1sdiff_line (0, '>', &files[1].linbuf[j]);
-      next1 = j;
+      struct line_matchup * previous_line_pair = current_line_pair;
+      current_line_pair = current_line_pair->next; /* loop increment */
+      free ((char *) previous_line_pair);
     }
+}
 
-  /* Print ``xxx  <     '' lines */
-  if (changes & OLD)
+static lin
+figure_score (
+  char const* const* leftlines,
+  char const* const* rightlines,
+  struct line_matchup * line_matchup)
+{
+  int score = 0;
+  struct line_matchup * current_line_pair;
+  for ( current_line_pair = line_matchup ;
+        current_line_pair ;
+        current_line_pair = current_line_pair->next )
+    {
+    if ( current_line_pair->left  >= 0
+      && current_line_pair->right >= 0 )
+        score += matching_chars (
+          leftlines[current_line_pair->left], (char const*) 0,
+          rightlines[current_line_pair->right], (char const*) 0);
+    }
+  return (score);
+}
+
+static lin
+matching_chars (
+  char const* str1, char const* end1,
+  char const* str2, char const* end2)
+{
+  char const * s1, * s2, * e1, * e2, * c1, * c2;
+  lin best_middle_score, current_middle_score;
+
+  /* Find matching chars at the beginning of the strings. */
+  for ( s1 = str1, s2 = str2 ;
+        *s1 == *s2 && *s1 != '\0' && *s1 != '\n' ;
+        ++ s1, ++ s2 )
+    { }
+  /* If the caller didn't supply end of string values, we find them. */
+  if ( end1 == (char const*) 0
+    || end2 == (char const*) 0 )
+      {
+      for ( end1 = s1 ; * end1 && * end1 != '\n' ; ++ end1 )
+        { } /* find end of str1 */
+      for ( end2 = s2 ; * end2 && * end2 != '\n' ; ++ end2 )
+        { } /* find end of str2 */
+      }
+  /* Find matching chars at the end of the strings */
+  e1 = end1;
+  e2 = end2;
+  while ( e1 > s1
+       && e2 > s2
+       && * (e1 - 1) == * (e2 - 1) )
+        {
+        -- e1; /* walk backward thru strings until */
+        -- e2; /* we find a mismatch char */
+        }
+  /* If the middle section has zero characters, we return the matching chars */
+  /* from the beginning and end sections. */
+  if ( e1 == s1
+    || e2 == s2 )
+      return (
+        (s1 - str1) /* matching chars at beginning of strings */
+        + (end1 - e1)); /* matching chars at end of strings */
+
+  best_middle_score = 0;
+  for ( c1 = s1 ; c1 < e1 ; ++ c1 )
+    {
+    /* If we've already found *c1 earlier in s1, this later occurrence can't */
+    /* result in a better match, so we ignore it. */
+    char const* c = strchr (s1, *c1);
+    if ( c1 && c1 < s1 )
+      continue;
+    /* If *c1 does not occur in s2, it's no match; skip it. */
+    c = strchr (s2, *c1);
+    if ( ! c || c >= e2 )
+      continue;
+    /* We found a match; score it. */
+    current_middle_score =
+      1 /* for the char at c1 that matched. */
+      + matching_chars (c1 + 1, e1, c + 1, e2);
+    if ( current_middle_score > best_middle_score )
+      best_middle_score = current_middle_score;
+    }
+  for ( c2 = s2 + 1 ; c2 < e2 ; ++ c2 )
     {
-      for (i = first0; i <= last0; ++i)
-       print_1sdiff_line (&files[0].linbuf[i], '<', 0);
-      next0 = i;
+    /* If we've already found *c2 earlier in s2, this later occurrence can't */
+    /* result in a better match, so we ignore it. */
+    char const* c = strchr (s2, *c2);
+    if ( c2 && c2 < s2 )
+      continue;
+    /* If *c2 does not occur in s1, it's no match; skip it. */
+    c = strchr (s1, *c2);
+    if ( ! c || c >= e1 )
+      continue;
+    /* We found a match; score it. */
+    current_middle_score =
+      1 /* for the char at c2 that matched. */
+      + matching_chars (c + 1, e1, c2 + 1, e2);
+    if ( current_middle_score > best_middle_score )
+      best_middle_score = current_middle_score;
     }
+  return (
+    (s1 - str1) /* matching chars at beginning of strings */
+    + best_middle_score /* best match in middle of strings */
+    + (end1 - e1)); /* matching chars at end of strings */
 }

[Prev in Thread]

Current Thread

[Next in Thread]

sdiff Enhancement, Grant Stevens <=
- Re: sdiff Enhancement, Bruno Haible, 2007/11/28

Prev by Date: Re: gettext "plural forms" in Turkish
Next by Date: Re: gperf output licensing
Previous by thread: Subtle problem with gettext.info
Next by thread: Re: sdiff Enhancement
Index(es):
- Date
- Thread