striconveh, striconveha: keeping track of byte correspondences

bug-gnulib
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
striconveh, striconveha: keeping track of byte correspondences

From:	Bruno Haible
Subject:	striconveh, striconveha: keeping track of byte correspondences
Date:	Tue, 23 Jan 2007 02:16:35 +0100 (MET)
User-agent:	KMail/1.5.4
Hi,

The iconv routines allow converting strings, but lack an important
piece of functionality: the ability to transport information from the
original string to the converted string or back, such as word breaks,
line breaking opportunities, or "wdiff" results.

This patch makes it possible. I add an optional 'offsets' argument to the
general conversion functions in the modules 'striconveh', 'striconveha'.
No need to make 'striconv' more complicated - users of 'striconv' can switch
to 'striconveh' very easily.

2007-01-22  Bruno Haible  <address@hidden>

        * lib/striconveh.h (mem_cd_iconveh, mem_iconveh): Add 'offsets'
        argument.
        * lib/striconveh.c (iconv_carefully_1): New function.
        (mem_cd_iconveh_internal, mem_cd_iconveh, mem_iconveh): Add 'offsets'
        argument.
        (str_cd_iconveh): Update.
        * lib/striconveha.h (mem_iconveha): Add 'offsets' argument.
        * lib/striconveha.c (mem_iconveha): Add 'offsets' argument.
        * tests/test-striconveh.c (MAGIC): New macro.
        (new_offsets): New function.
        (main): Test call with and without offsets.

*** lib/striconveh.h    21 Jan 2007 22:58:01 -0000      1.4
--- lib/striconveh.h    23 Jan 2007 01:03:24 -0000
***************
*** 47,52 ****
--- 47,56 ----
     (iconv_t)(-1) if FROM_CODESET is UTF-8).
     CD2 is the conversion descriptor from UTF-8 to TO_CODESET (or (iconv_t)(-1)
     if TO_CODESET is UTF-8).
+    If OFFSETS is not NULL, it should point to an array of SRCLEN integers; this
+    array is filled with offsets into the result, i.e. the character starting
+    at SRC[i] corresponds to the character starting at (*RESULTP)[OFFSETS[i]],
+    and other offsets are set to (size_t)(-1).
     *RESULTP and *LENGTH should initially be a scratch buffer and its size,
     or *RESULTP can initially be NULL.
     May erase the contents of the memory at *RESULTP.
***************
*** 58,63 ****
--- 62,68 ----
         mem_cd_iconveh (const char *src, size_t srclen,
                       iconv_t cd, iconv_t cd1, iconv_t cd2,
                       enum iconv_ilseq_handler handler,
+                      size_t *offsets,
                       char **resultp, size_t *lengthp);
  
  /* Convert an entire string from one encoding to another, using iconv.
***************
*** 81,86 ****
--- 86,95 ----
  
  /* Convert an entire string from one encoding to another, using iconv.
     The original string is at [SRC,...,SRC+SRCLEN-1].
+    If OFFSETS is not NULL, it should point to an array of SRCLEN integers; this
+    array is filled with offsets into the result, i.e. the character starting
+    at SRC[i] corresponds to the character starting at (*RESULTP)[OFFSETS[i]],
+    and other offsets are set to (size_t)(-1).
     *RESULTP and *LENGTH should initially be a scratch buffer and its size,
     or *RESULTP can initially be NULL.
     May erase the contents of the memory at *RESULTP.
***************
*** 92,97 ****
--- 101,107 ----
         mem_iconveh (const char *src, size_t srclen,
                    const char *from_codeset, const char *to_codeset,
                    enum iconv_ilseq_handler handler,
+                   size_t *offsets,
                    char **resultp, size_t *lengthp);
  
  /* Convert an entire string from one encoding to another, using iconv.
*** lib/striconveh.c    21 Jan 2007 22:59:19 -0000      1.5
--- lib/striconveh.c    23 Jan 2007 01:03:25 -0000
***************
*** 119,129 ****
--- 119,186 ----
        iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
  # endif
  
+ /* iconv_carefully_1 is like iconv_carefully, except that it stops after
+    converting one character.  */
+ static size_t
+ iconv_carefully_1 (iconv_t cd,
+                  const char **inbuf, size_t *inbytesleft,
+                  char **outbuf, size_t *outbytesleft,
+                  bool *incremented)
+ {
+   const char *inptr = *inbuf;
+   const char *inptr_end = inptr + *inbytesleft;
+   char *outptr = *outbuf;
+   size_t outsize = *outbytesleft;
+   const char *inptr_before = inptr;
+   size_t res = (size_t)(-1);
+   size_t insize;
+ 
+   for (insize = 1; inptr + insize <= inptr_end; insize++)
+     {
+       res = iconv (cd,
+                  (ICONV_CONST char **) &inptr, &insize,
+                  &outptr, &outsize);
+       if (!(res == (size_t)(-1) && errno == EINVAL))
+       break;
+       /* We expect that no input bytes have been consumed so far.  */
+       if (inptr != inptr_before)
+       abort ();
+     }
+ 
+   *inbuf = inptr;
+   *inbytesleft = inptr_end - inptr;
+ # if !defined _LIBICONV_VERSION && !defined __GLIBC__
+   /* Irix iconv() inserts a NUL byte if it cannot convert.
+      NetBSD iconv() inserts a question mark if it cannot convert.
+      Only GNU libiconv and GNU libc are known to prefer to fail rather
+      than doing a lossy conversion.  */
+   if (res != (size_t)(-1) && res > 0)
+     {
+       /* iconv() has already incremented INPTR.  We cannot go back to a
+        previous INPTR, otherwise the state inside CD would become invalid,
+        if FROM_CODESET is a stateful encoding.  So, tell the caller that
+        *INBUF has already been incremented.  */
+       *incremented = (inptr > inptr_before);
+       errno = EILSEQ;
+       return (size_t)(-1);
+     }
+ # endif
+ 
+   if (res != (size_t)(-1))
+     {
+       *outbuf = outptr;
+       *outbytesleft = outsize;
+     }
+   *incremented = false;
+   return res;
+ }
+ 
  static int
  mem_cd_iconveh_internal (const char *src, size_t srclen,
                         iconv_t cd, iconv_t cd1, iconv_t cd2,
                         enum iconv_ilseq_handler handler,
                         size_t extra_alloc,
+                        size_t *offsets,
                         char **resultp, size_t *lengthp)
  {
    /* When a conversion error occurs, we cannot start using CD1 and CD2 at
***************
*** 141,146 ****
--- 198,204 ----
    char *result;
    size_t allocated;
    size_t length;
+   size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
  
    if (*lengthp >= sizeof (tmpbuf))
      {
***************
*** 153,158 ****
--- 211,226 ----
        allocated = sizeof (tmpbuf);
      }
    result = initial_result;
+ 
+   if (offsets != NULL)
+     {
+       size_t i;
+ 
+       for (i = 0; i < srclen; i++)
+       offsets[i] = (size_t)(-1);
+ 
+       last_length = (size_t)(-1);
+     }
    length = 0;
  
    /* First, try a direct conversion, and see whether a conversion error
***************
*** 176,191 ****
        size_t res;
        bool grow;
  
!       /* Use iconv_carefully instead of iconv here, because:
!          - If TO_CODESET is UTF-8, we can do the error handling in this loop,
!            no need for a second loop,
!          - With iconv() implementations other than GNU libiconv and GNU libc,
!            if we use iconv() in a big swoop, checking for an E2BIG return,
!            we lose the number of irreversible conversions.  */
!       res = iconv_carefully (cd,
!                              &inptr, &insize,
!                              &outptr, &outsize,
!                              &incremented);
  
        length = outptr - result;
        grow = (length + extra_alloc > allocated / 2);
--- 244,272 ----
        size_t res;
        bool grow;
  
!       if (offsets != NULL)
!         {
!           if (length != last_length) /* ensure that offset[] be increasing */
!             {
!               offsets[inptr - src] = length;
!               last_length = length;
!             }
!           res = iconv_carefully_1 (cd,
!                                    &inptr, &insize,
!                                    &outptr, &outsize,
!                                    &incremented);
!         }
!       else
!         /* Use iconv_carefully instead of iconv here, because:
!            - If TO_CODESET is UTF-8, we can do the error handling in this
!              loop, no need for a second loop,
!            - With iconv() implementations other than GNU libiconv and GNU
!              libc, if we use iconv() in a big swoop, checking for an E2BIG
!              return, we lose the number of irreversible conversions.  */
!         res = iconv_carefully (cd,
!                                &inptr, &insize,
!                                &outptr, &outsize,
!                                &incremented);
  
        length = outptr - result;
        grow = (length + extra_alloc > allocated / 2);
***************
*** 332,337 ****
--- 413,427 ----
    /* The direct conversion failed, handler != iconveh_error,
       and cd2 != (iconv_t)(-1).
       Use a conversion through UTF-8.  */
+   if (offsets != NULL)
+     {
+       size_t i;
+ 
+       for (i = 0; i < srclen; i++)
+       offsets[i] = (size_t)(-1);
+ 
+       last_length = (size_t)(-1);
+     }
    length = 0;
    {
  # define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
***************
*** 362,372 ****
        /* Conversion step 1: from FROM_CODESET to UTF-8.  */
        if (in1size > 0)
          {
            if (cd1 != (iconv_t)(-1))
!             res1 = iconv_carefully (cd1,
!                                     (ICONV_CONST char **) &in1ptr, &in1size,
!                                     &out1ptr, &out1size,
!                                     &incremented1);
            else
              {
                /* FROM_CODESET is UTF-8.  */
--- 452,476 ----
        /* Conversion step 1: from FROM_CODESET to UTF-8.  */
        if (in1size > 0)
          {
+           if (offsets != NULL
+               && length != last_length) /* ensure that offset[] be increasing */
+             {
+               offsets[in1ptr - src] = length;
+               last_length = length;
+             }
            if (cd1 != (iconv_t)(-1))
!             {
!               if (offsets != NULL)
!                 res1 = iconv_carefully_1 (cd1,
!                                           &in1ptr, &in1size,
!                                           &out1ptr, &out1size,
!                                           &incremented1);
!               else
!                 res1 = iconv_carefully (cd1,
!                                         &in1ptr, &in1size,
!                                         &out1ptr, &out1size,
!                                         &incremented1);
!             }
            else
              {
                /* FROM_CODESET is UTF-8.  */
***************
*** 418,424 ****
                    out1ptr += m;
                    out1size -= m;
                  }
!               while (in1size > 0);
              }
          }
        else if (do_final_flush1)
--- 522,528 ----
                    out1ptr += m;
                    out1size -= m;
                  }
!               while (offsets == NULL && in1size > 0);
              }
          }
        else if (do_final_flush1)
***************
*** 469,475 ****
        errno1 = errno;
        utf8len = out1ptr - utf8buf;
  
!       if (in1size == 0
            || utf8len > utf8bufsize / 2
            || (res1 == (size_t)(-1) && errno1 == E2BIG))
          {
--- 573,580 ----
        errno1 = errno;
        utf8len = out1ptr - utf8buf;
  
!       if (offsets != NULL
!           || in1size == 0
            || utf8len > utf8bufsize / 2
            || (res1 == (size_t)(-1) && errno1 == E2BIG))
          {
***************
*** 726,735 ****
  mem_cd_iconveh (const char *src, size_t srclen,
                iconv_t cd, iconv_t cd1, iconv_t cd2,
                enum iconv_ilseq_handler handler,
                char **resultp, size_t *lengthp)
  {
    return mem_cd_iconveh_internal (src, srclen, cd, cd1, cd2, handler, 0,
!                                 resultp, lengthp);
  }
  
  char *
--- 831,841 ----
  mem_cd_iconveh (const char *src, size_t srclen,
                iconv_t cd, iconv_t cd1, iconv_t cd2,
                enum iconv_ilseq_handler handler,
+               size_t *offsets,
                char **resultp, size_t *lengthp)
  {
    return mem_cd_iconveh_internal (src, srclen, cd, cd1, cd2, handler, 0,
!                                 offsets, resultp, lengthp);
  }
  
  char *
***************
*** 744,750 ****
    char *result = NULL;
    size_t length = 0;
    int retval = mem_cd_iconveh_internal (src, strlen (src),
!                                       cd, cd1, cd2, handler, 1,
                                        &result, &length);
  
    if (retval < 0)
--- 850,856 ----
    char *result = NULL;
    size_t length = 0;
    int retval = mem_cd_iconveh_internal (src, strlen (src),
!                                       cd, cd1, cd2, handler, 1, NULL,
                                        &result, &length);
  
    if (retval < 0)
***************
*** 770,775 ****
--- 876,882 ----
  mem_iconveh (const char *src, size_t srclen,
             const char *from_codeset, const char *to_codeset,
             enum iconv_ilseq_handler handler,
+            size_t *offsets,
             char **resultp, size_t *lengthp)
  {
    if (srclen == 0)
***************
*** 778,784 ****
        *lengthp = 0;
        return 0;
      }
!   else if (c_strcasecmp (from_codeset, to_codeset) == 0)
      {
        char *result;
  
--- 885,891 ----
        *lengthp = 0;
        return 0;
      }
!   else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
      {
        char *result;
  
***************
*** 854,861 ****
  
        result = *resultp;
        length = *lengthp;
!       retval =
!       mem_cd_iconveh (src, srclen, cd, cd1, cd2, handler, &result, &length);
  
        if (retval < 0)
        {
--- 961,968 ----
  
        result = *resultp;
        length = *lengthp;
!       retval = mem_cd_iconveh (src, srclen, cd, cd1, cd2, handler, offsets,
!                              &result, &length);
  
        if (retval < 0)
        {
*** lib/striconveha.h   21 Jan 2007 22:59:19 -0000      1.1
--- lib/striconveha.h   23 Jan 2007 01:03:25 -0000
***************
*** 30,35 ****
--- 30,39 ----
  /* Convert an entire string from one encoding to another, using iconv.
     The original string is at [SRC,...,SRC+SRCLEN-1].
     The "from" encoding can also be a name defined for autodetection.
+    If OFFSETS is not NULL, it should point to an array of SRCLEN integers; this
+    array is filled with offsets into the result, i.e. the character starting
+    at SRC[i] corresponds to the character starting at (*RESULTP)[OFFSETS[i]],
+    and other offsets are set to (size_t)(-1).
     *RESULTP and *LENGTH should initially be a scratch buffer and its size,
     or *RESULTP can initially be NULL.
     May erase the contents of the memory at *RESULTP.
***************
*** 41,46 ****
--- 45,51 ----
         mem_iconveha (const char *src, size_t srclen,
                     const char *from_codeset, const char *to_codeset,
                     enum iconv_ilseq_handler handler,
+                    size_t *offsets,
                     char **resultp, size_t *lengthp);
  
  /* Convert an entire string from one encoding to another, using iconv.
*** lib/striconveha.c   21 Jan 2007 22:59:19 -0000      1.1
--- lib/striconveha.c   23 Jan 2007 01:03:25 -0000
***************
*** 147,156 ****
  mem_iconveha (const char *src, size_t srclen,
              const char *from_codeset, const char *to_codeset,
              enum iconv_ilseq_handler handler,
              char **resultp, size_t *lengthp)
  {
    int retval = mem_iconveh (src, srclen, from_codeset, to_codeset, handler,
!                           resultp, lengthp);
    if (retval >= 0 || errno != EINVAL)
      return retval;
    else
--- 147,157 ----
  mem_iconveha (const char *src, size_t srclen,
              const char *from_codeset, const char *to_codeset,
              enum iconv_ilseq_handler handler,
+             size_t *offsets,
              char **resultp, size_t *lengthp)
  {
    int retval = mem_iconveh (src, srclen, from_codeset, to_codeset, handler,
!                           offsets, resultp, lengthp);
    if (retval >= 0 || errno != EINVAL)
      return retval;
    else
***************
*** 168,174 ****
              {
                retval = mem_iconveha (src, srclen,
                                       from_codeset, to_codeset, handler,
!                                      resultp, lengthp);
                if (!(retval < 0 && errno == EILSEQ))
                  return retval;
                encodings++;
--- 169,175 ----
              {
                retval = mem_iconveha (src, srclen,
                                       from_codeset, to_codeset, handler,
!                                      offsets, resultp, lengthp);
                if (!(retval < 0 && errno == EILSEQ))
                  return retval;
                encodings++;
[Prev in Thread]
Current Thread
[Next in Thread]
striconveh, striconveha: keeping track of byte correspondences, Bruno Haible <=
Prev by Date: Re: Use "$(MKDIR_P) sys", not race-prone "test -d sys || mkdir sys".
Next by Date: Re: Use "$(MKDIR_P) sys", not race-prone "test -d sys || mkdir sys".
Previous by thread: gettimeofday module license
Next by thread: Re: an autoconf expert challenge
Index(es):
- Date
- Thread