[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
striconveh, striconveha: keeping track of byte correspondences
From: |
Bruno Haible |
Subject: |
striconveh, striconveha: keeping track of byte correspondences |
Date: |
Tue, 23 Jan 2007 02:16:35 +0100 (MET) |
User-agent: |
KMail/1.5.4 |
Hi,
The iconv routines can convert strings, but they lack an important piece of
functionality: the ability to carry information from the original
string over to the converted string or back, such as word breaks, line breaking
opportunities, or "wdiff" results.
This patch makes it possible. I add an optional 'offsets' argument to the
general conversion functions in the modules 'striconveh', 'striconveha'.
No need to make 'striconv' more complicated - users of 'striconv' can switch
to 'striconveh' very easily.
2007-01-22 Bruno Haible <address@hidden>
* lib/striconveh.h (mem_cd_iconveh, mem_iconveh): Add 'offsets'
argument.
* lib/striconveh.c (iconv_carefully_1): New function.
(mem_cd_iconveh_internal, mem_cd_iconveh, mem_iconveh): Add 'offsets'
argument.
(str_cd_iconveh): Update.
* lib/striconveha.h (mem_iconveha): Add 'offsets' argument.
* lib/striconveha.c (mem_iconveha): Add 'offsets' argument.
* tests/test-striconveh.c (MAGIC): New macro.
(new_offsets): New function.
(main): Test call with and without offsets.
*** lib/striconveh.h 21 Jan 2007 22:58:01 -0000 1.4
--- lib/striconveh.h 23 Jan 2007 01:03:24 -0000
***************
*** 47,52 ****
--- 47,56 ----
(iconv_t)(-1) if FROM_CODESET is UTF-8).
CD2 is the conversion descriptor from UTF-8 to TO_CODESET (or (iconv_t)(-1)
if TO_CODESET is UTF-8).
+ If OFFSETS is not NULL, it should point to an array of SRCLEN integers; this
+ array is filled with offsets into the result, i.e. the character starting
+ at SRC[i] corresponds to the character starting at (*RESULTP)[OFFSETS[i]],
+ and other offsets are set to (size_t)(-1).
*RESULTP and *LENGTH should initially be a scratch buffer and its size,
or *RESULTP can initially be NULL.
May erase the contents of the memory at *RESULTP.
***************
*** 58,63 ****
--- 62,68 ----
mem_cd_iconveh (const char *src, size_t srclen,
iconv_t cd, iconv_t cd1, iconv_t cd2,
enum iconv_ilseq_handler handler,
+ size_t *offsets,
char **resultp, size_t *lengthp);
/* Convert an entire string from one encoding to another, using iconv.
***************
*** 81,86 ****
--- 86,95 ----
/* Convert an entire string from one encoding to another, using iconv.
The original string is at [SRC,...,SRC+SRCLEN-1].
+ If OFFSETS is not NULL, it should point to an array of SRCLEN integers; this
+ array is filled with offsets into the result, i.e. the character starting
+ at SRC[i] corresponds to the character starting at (*RESULTP)[OFFSETS[i]],
+ and other offsets are set to (size_t)(-1).
*RESULTP and *LENGTH should initially be a scratch buffer and its size,
or *RESULTP can initially be NULL.
May erase the contents of the memory at *RESULTP.
***************
*** 92,97 ****
--- 101,107 ----
mem_iconveh (const char *src, size_t srclen,
const char *from_codeset, const char *to_codeset,
enum iconv_ilseq_handler handler,
+ size_t *offsets,
char **resultp, size_t *lengthp);
/* Convert an entire string from one encoding to another, using iconv.
*** lib/striconveh.c 21 Jan 2007 22:59:19 -0000 1.5
--- lib/striconveh.c 23 Jan 2007 01:03:25 -0000
***************
*** 119,129 ****
--- 119,186 ----
iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf,
outbytesleft))
# endif
+ /* iconv_carefully_1 is like iconv_carefully, except that it stops after
+ converting one character. */
+ static size_t
+ iconv_carefully_1 (iconv_t cd,
+ const char **inbuf, size_t *inbytesleft,
+ char **outbuf, size_t *outbytesleft,
+ bool *incremented)
+ {
+ const char *inptr = *inbuf;
+ const char *inptr_end = inptr + *inbytesleft;
+ char *outptr = *outbuf;
+ size_t outsize = *outbytesleft;
+ const char *inptr_before = inptr;
+ size_t res = (size_t)(-1);
+ size_t insize;
+
+ for (insize = 1; inptr + insize <= inptr_end; insize++)
+ {
+ res = iconv (cd,
+ (ICONV_CONST char **) &inptr, &insize,
+ &outptr, &outsize);
+ if (!(res == (size_t)(-1) && errno == EINVAL))
+ break;
+ /* We expect that no input bytes have been consumed so far. */
+ if (inptr != inptr_before)
+ abort ();
+ }
+
+ *inbuf = inptr;
+ *inbytesleft = inptr_end - inptr;
+ # if !defined _LIBICONV_VERSION && !defined __GLIBC__
+ /* Irix iconv() inserts a NUL byte if it cannot convert.
+ NetBSD iconv() inserts a question mark if it cannot convert.
+ Only GNU libiconv and GNU libc are known to prefer to fail rather
+ than doing a lossy conversion. */
+ if (res != (size_t)(-1) && res > 0)
+ {
+ /* iconv() has already incremented INPTR. We cannot go back to a
+ previous INPTR, otherwise the state inside CD would become invalid,
+ if FROM_CODESET is a stateful encoding. So, tell the caller that
+ *INBUF has already been incremented. */
+ *incremented = (inptr > inptr_before);
+ errno = EILSEQ;
+ return (size_t)(-1);
+ }
+ # endif
+
+ if (res != (size_t)(-1))
+ {
+ *outbuf = outptr;
+ *outbytesleft = outsize;
+ }
+ *incremented = false;
+ return res;
+ }
+
static int
mem_cd_iconveh_internal (const char *src, size_t srclen,
iconv_t cd, iconv_t cd1, iconv_t cd2,
enum iconv_ilseq_handler handler,
size_t extra_alloc,
+ size_t *offsets,
char **resultp, size_t *lengthp)
{
/* When a conversion error occurs, we cannot start using CD1 and CD2 at
***************
*** 141,146 ****
--- 198,204 ----
char *result;
size_t allocated;
size_t length;
+ size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
if (*lengthp >= sizeof (tmpbuf))
{
***************
*** 153,158 ****
--- 211,226 ----
allocated = sizeof (tmpbuf);
}
result = initial_result;
+
+ if (offsets != NULL)
+ {
+ size_t i;
+
+ for (i = 0; i < srclen; i++)
+ offsets[i] = (size_t)(-1);
+
+ last_length = (size_t)(-1);
+ }
length = 0;
/* First, try a direct conversion, and see whether a conversion error
***************
*** 176,191 ****
size_t res;
bool grow;
! /* Use iconv_carefully instead of iconv here, because:
! - If TO_CODESET is UTF-8, we can do the error handling in this loop,
! no need for a second loop,
! - With iconv() implementations other than GNU libiconv and GNU libc,
! if we use iconv() in a big swoop, checking for an E2BIG return,
! we lose the number of irreversible conversions. */
! res = iconv_carefully (cd,
! &inptr, &insize,
! &outptr, &outsize,
! &incremented);
length = outptr - result;
grow = (length + extra_alloc > allocated / 2);
--- 244,272 ----
size_t res;
bool grow;
! if (offsets != NULL)
! {
! if (length != last_length) /* ensure that offset[] be increasing */
! {
! offsets[inptr - src] = length;
! last_length = length;
! }
! res = iconv_carefully_1 (cd,
! &inptr, &insize,
! &outptr, &outsize,
! &incremented);
! }
! else
! /* Use iconv_carefully instead of iconv here, because:
! - If TO_CODESET is UTF-8, we can do the error handling in this
! loop, no need for a second loop,
! - With iconv() implementations other than GNU libiconv and GNU
! libc, if we use iconv() in a big swoop, checking for an E2BIG
! return, we lose the number of irreversible conversions. */
! res = iconv_carefully (cd,
! &inptr, &insize,
! &outptr, &outsize,
! &incremented);
length = outptr - result;
grow = (length + extra_alloc > allocated / 2);
***************
*** 332,337 ****
--- 413,427 ----
/* The direct conversion failed, handler != iconveh_error,
and cd2 != (iconv_t)(-1).
Use a conversion through UTF-8. */
+ if (offsets != NULL)
+ {
+ size_t i;
+
+ for (i = 0; i < srclen; i++)
+ offsets[i] = (size_t)(-1);
+
+ last_length = (size_t)(-1);
+ }
length = 0;
{
# define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
***************
*** 362,372 ****
/* Conversion step 1: from FROM_CODESET to UTF-8. */
if (in1size > 0)
{
if (cd1 != (iconv_t)(-1))
! res1 = iconv_carefully (cd1,
! (ICONV_CONST char **) &in1ptr, &in1size,
! &out1ptr, &out1size,
! &incremented1);
else
{
/* FROM_CODESET is UTF-8. */
--- 452,476 ----
/* Conversion step 1: from FROM_CODESET to UTF-8. */
if (in1size > 0)
{
+ if (offsets != NULL
+ && length != last_length) /* ensure that offset[] be increasing */
+ {
+ offsets[in1ptr - src] = length;
+ last_length = length;
+ }
if (cd1 != (iconv_t)(-1))
! {
! if (offsets != NULL)
! res1 = iconv_carefully_1 (cd1,
! &in1ptr, &in1size,
! &out1ptr, &out1size,
! &incremented1);
! else
! res1 = iconv_carefully (cd1,
! &in1ptr, &in1size,
! &out1ptr, &out1size,
! &incremented1);
! }
else
{
/* FROM_CODESET is UTF-8. */
***************
*** 418,424 ****
out1ptr += m;
out1size -= m;
}
! while (in1size > 0);
}
}
else if (do_final_flush1)
--- 522,528 ----
out1ptr += m;
out1size -= m;
}
! while (offsets == NULL && in1size > 0);
}
}
else if (do_final_flush1)
***************
*** 469,475 ****
errno1 = errno;
utf8len = out1ptr - utf8buf;
! if (in1size == 0
|| utf8len > utf8bufsize / 2
|| (res1 == (size_t)(-1) && errno1 == E2BIG))
{
--- 573,580 ----
errno1 = errno;
utf8len = out1ptr - utf8buf;
! if (offsets != NULL
! || in1size == 0
|| utf8len > utf8bufsize / 2
|| (res1 == (size_t)(-1) && errno1 == E2BIG))
{
***************
*** 726,735 ****
mem_cd_iconveh (const char *src, size_t srclen,
iconv_t cd, iconv_t cd1, iconv_t cd2,
enum iconv_ilseq_handler handler,
char **resultp, size_t *lengthp)
{
return mem_cd_iconveh_internal (src, srclen, cd, cd1, cd2, handler, 0,
! resultp, lengthp);
}
char *
--- 831,841 ----
mem_cd_iconveh (const char *src, size_t srclen,
iconv_t cd, iconv_t cd1, iconv_t cd2,
enum iconv_ilseq_handler handler,
+ size_t *offsets,
char **resultp, size_t *lengthp)
{
return mem_cd_iconveh_internal (src, srclen, cd, cd1, cd2, handler, 0,
! offsets, resultp, lengthp);
}
char *
***************
*** 744,750 ****
char *result = NULL;
size_t length = 0;
int retval = mem_cd_iconveh_internal (src, strlen (src),
! cd, cd1, cd2, handler, 1,
&result, &length);
if (retval < 0)
--- 850,856 ----
char *result = NULL;
size_t length = 0;
int retval = mem_cd_iconveh_internal (src, strlen (src),
! cd, cd1, cd2, handler, 1, NULL,
&result, &length);
if (retval < 0)
***************
*** 770,775 ****
--- 876,882 ----
mem_iconveh (const char *src, size_t srclen,
const char *from_codeset, const char *to_codeset,
enum iconv_ilseq_handler handler,
+ size_t *offsets,
char **resultp, size_t *lengthp)
{
if (srclen == 0)
***************
*** 778,784 ****
*lengthp = 0;
return 0;
}
! else if (c_strcasecmp (from_codeset, to_codeset) == 0)
{
char *result;
--- 885,891 ----
*lengthp = 0;
return 0;
}
! else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
{
char *result;
***************
*** 854,861 ****
result = *resultp;
length = *lengthp;
! retval =
! mem_cd_iconveh (src, srclen, cd, cd1, cd2, handler, &result, &length);
if (retval < 0)
{
--- 961,968 ----
result = *resultp;
length = *lengthp;
! retval = mem_cd_iconveh (src, srclen, cd, cd1, cd2, handler, offsets,
! &result, &length);
if (retval < 0)
{
*** lib/striconveha.h 21 Jan 2007 22:59:19 -0000 1.1
--- lib/striconveha.h 23 Jan 2007 01:03:25 -0000
***************
*** 30,35 ****
--- 30,39 ----
/* Convert an entire string from one encoding to another, using iconv.
The original string is at [SRC,...,SRC+SRCLEN-1].
The "from" encoding can also be a name defined for autodetection.
+ If OFFSETS is not NULL, it should point to an array of SRCLEN integers; this
+ array is filled with offsets into the result, i.e. the character starting
+ at SRC[i] corresponds to the character starting at (*RESULTP)[OFFSETS[i]],
+ and other offsets are set to (size_t)(-1).
*RESULTP and *LENGTH should initially be a scratch buffer and its size,
or *RESULTP can initially be NULL.
May erase the contents of the memory at *RESULTP.
***************
*** 41,46 ****
--- 45,51 ----
mem_iconveha (const char *src, size_t srclen,
const char *from_codeset, const char *to_codeset,
enum iconv_ilseq_handler handler,
+ size_t *offsets,
char **resultp, size_t *lengthp);
/* Convert an entire string from one encoding to another, using iconv.
*** lib/striconveha.c 21 Jan 2007 22:59:19 -0000 1.1
--- lib/striconveha.c 23 Jan 2007 01:03:25 -0000
***************
*** 147,156 ****
mem_iconveha (const char *src, size_t srclen,
const char *from_codeset, const char *to_codeset,
enum iconv_ilseq_handler handler,
char **resultp, size_t *lengthp)
{
int retval = mem_iconveh (src, srclen, from_codeset, to_codeset, handler,
! resultp, lengthp);
if (retval >= 0 || errno != EINVAL)
return retval;
else
--- 147,157 ----
mem_iconveha (const char *src, size_t srclen,
const char *from_codeset, const char *to_codeset,
enum iconv_ilseq_handler handler,
+ size_t *offsets,
char **resultp, size_t *lengthp)
{
int retval = mem_iconveh (src, srclen, from_codeset, to_codeset, handler,
! offsets, resultp, lengthp);
if (retval >= 0 || errno != EINVAL)
return retval;
else
***************
*** 168,174 ****
{
retval = mem_iconveha (src, srclen,
from_codeset, to_codeset, handler,
! resultp, lengthp);
if (!(retval < 0 && errno == EILSEQ))
return retval;
encodings++;
--- 169,175 ----
{
retval = mem_iconveha (src, srclen,
from_codeset, to_codeset, handler,
! offsets, resultp, lengthp);
if (!(retval < 0 && errno == EILSEQ))
return retval;
encodings++;
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- striconveh, striconveha: keeping track of byte correspondences,
Bruno Haible <=