Re: Feature request - base64 Filename Safe Alphabet

bug-coreutils

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: Feature request - base64 Filename Safe Alphabet

From:	Bo Borgerson
Subject:	Re: Feature request - base64 Filename Safe Alphabet
Date:	Mon, 05 May 2008 11:36:33 -0400
User-agent:	Thunderbird 2.0.0.12 (X11/20080227)

Simon Josefsson wrote:
> Your patch is rather difficult to read for me, since I'm not that
> familiar with the coreutils changes, and more importantly: to be applied
> to gnulib, I need a patch against gnulib.

Hi Simon,

Thanks for looking at this.

> Would you mind creating a patchset that applies to the gnulib git
> repository?

Not at all.

It wasn't very easy to read as a single revision, so I did it in two
steps.  The first step is pure addition: New functions and a definition
of the decode context structure.  The second step is still not the most
legible diff, but it should be a little easier to get your bearings in.

> I suspect your patch do things the way I suggested in the post to the
> gnulib list some time ago, which is nice.

Yes, I think so, at least in terms of interface.

Thanks again,

Bo

From 3a9bdc6228eba0645bb482f88502bdf19aff609f Mon Sep 17 00:00:00 2001
From: Bo Borgerson <address@hidden>
Date: Mon, 5 May 2008 10:54:31 -0400
Subject: [PATCH] A coreutils compatible base64 - part 1

* lib/base64.c (get_4): Get four non-newline characters from the input buffer.
Use the context structure's buffer to create a contiguous block if necessary.
Currently unused.
(decode_4): Helper function to be used by base64_decode_ctx.  Currently unused.
(base64_decode_ctx_init): Initialize a decode context structure.
* lib/base64.h (struct base64_decode_context): To be used by base64_decode_ctx.

Signed-off-by: Bo Borgerson <address@hidden>
---
 lib/base64.c |  135 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/base64.h |    8 +++
 2 files changed, 143 insertions(+), 0 deletions(-)

diff --git a/lib/base64.c b/lib/base64.c
index f237cd6..40ae640 100644
--- a/lib/base64.c
+++ b/lib/base64.c
@@ -300,6 +300,141 @@ isbase64 (char ch)
   return uchar_in_range (to_uchar (ch)) && 0 <= b64[to_uchar (ch)];
 }
 
+/* Initialize decode-context buffer, CTX.  */
+void
+base64_decode_ctx_init (struct base64_decode_context *ctx)
+{
+  ctx->i = 0;
+}
+
+/* If CTX->i is 0 or 4, there are four or more bytes in [*IN..IN_END), and
+   none of those four is a newline, then return *IN.  Otherwise, copy up to
+   4 - CTX->i non-newline bytes from that range into CTX->buf, starting at
+   index CTX->i and setting CTX->i to reflect the number of bytes copied,
+   and return CTX->buf.  In either case, advance *IN to point to the byte
+   after the last one processed, and set *N_NON_NEWLINE to the number of
+   verified non-newline bytes accessible through the returned pointer.  */
+static inline char *
+get_4 (struct base64_decode_context *ctx,
+       char const *restrict *in, char const *restrict in_end,
+       size_t *n_non_newline)
+{
+  if (ctx->i == 4)
+    ctx->i = 0;
+
+  if (ctx->i == 0)
+    {
+      char const *t = *in;
+      if (4 <= in_end - *in && memchr (t, '\n', 4) == NULL)
+       {
+         /* This is the common case: no newline.  */
+         *in += 4;
+         *n_non_newline = 4;
+         return (char *) t;
+       }
+    }
+
+  {
+    /* Copy non-newline bytes into BUF.  */
+    char const *p = *in;
+    while (p < in_end)
+      {
+       char c = *p++;
+       if (c != '\n')
+         {
+           ctx->buf[ctx->i++] = c;
+           if (ctx->i == 4)
+             break;
+         }
+      }
+
+    *in = p;
+    *n_non_newline = ctx->i;
+    return ctx->buf;
+  }
+}
+
+#define return_false                           \
+  do                                           \
+    {                                          \
+      *outp = out;                             \
+      return false;                            \
+    }                                          \
+  while (false)
+
+/* Decode up to four bytes of base64-encoded data, IN, of length INLEN
+   into the output buffer, *OUT, of size *OUTLEN bytes.  Return true if
+   decoding is successful, false otherwise.  If *OUTLEN is too small,
+   as many bytes as possible are written to *OUT.  On return, advance
+   *OUT to point to the byte after the last one written, and decrement
+   *OUTLEN to reflect the number of bytes remaining in *OUT.  */
+static inline bool
+decode_4 (char const *restrict in, size_t inlen,
+         char *restrict *outp, size_t *outleft)
+{
+  char *out = *outp;
+  if (inlen < 2)
+    return false;
+
+  if (!isbase64 (in[0]) || !isbase64 (in[1]))
+    return false;
+
+  if (*outleft)
+    {
+      *out++ = ((b64[to_uchar (in[0])] << 2)
+               | (b64[to_uchar (in[1])] >> 4));
+      --*outleft;
+    }
+
+  if (inlen == 2)
+    return_false;
+
+  if (in[2] == '=')
+    {
+      if (inlen != 4)
+       return_false;
+
+      if (in[3] != '=')
+       return_false;
+    }
+  else
+    {
+      if (!isbase64 (in[2]))
+       return_false;
+
+      if (*outleft)
+       {
+         *out++ = (((b64[to_uchar (in[1])] << 4) & 0xf0)
+                   | (b64[to_uchar (in[2])] >> 2));
+         --*outleft;
+       }
+
+      if (inlen == 3)
+       return_false;
+
+      if (in[3] == '=')
+       {
+         if (inlen != 4)
+           return_false;
+       }
+      else
+       {
+         if (!isbase64 (in[3]))
+           return_false;
+
+         if (*outleft)
+           {
+             *out++ = (((b64[to_uchar (in[2])] << 6) & 0xc0)
+                       | b64[to_uchar (in[3])]);
+             --*outleft;
+           }
+       }
+    }
+
+  *outp = out;
+  return true;
+}
+
 /* Decode base64 encoded input array IN of length INLEN to output
    array OUT that can hold *OUTLEN bytes.  Return true if decoding was
    successful, i.e. if the input was valid base64 data, false
diff --git a/lib/base64.h b/lib/base64.h
index 6bb9a97..2fc4d34 100644
--- a/lib/base64.h
+++ b/lib/base64.h
@@ -29,6 +29,12 @@
    integer >= n/k, i.e., the ceiling of n/k.  */
 # define BASE64_LENGTH(inlen) ((((inlen) + 2) / 3) * 4)
 
+struct base64_decode_context
+{
+  unsigned int i;
+  char buf[4];
+};
+
 extern bool isbase64 (char ch);
 
 extern void base64_encode (const char *restrict in, size_t inlen,
@@ -36,6 +42,8 @@ extern void base64_encode (const char *restrict in, size_t inlen,
 
 extern size_t base64_encode_alloc (const char *in, size_t inlen, char **out);
 
+extern void base64_decode_ctx_init (struct base64_decode_context *ctx);
+
 extern bool base64_decode (const char *restrict in, size_t inlen,
                           char *restrict out, size_t *outlen);
 
-- 
1.5.4.3


From e91e0a1aeb5e4daa4f5a82fd48732511d8b0808e Mon Sep 17 00:00:00 2001
From: Bo Borgerson <address@hidden>
Date: Mon, 5 May 2008 11:09:10 -0400
Subject: [PATCH] A coreutils compatible base64 - part 2

* lib/base64.c (base64_decode_ctx): If a decode context structure was passed in,
use it to ignore newlines.  If a context structure was _not_ passed in, continue
to treat newlines as garbage (this is the historical behavior).  Formerly
base64_decode.
(base64_decode_alloc_ctx): Formerly base64_decode_alloc.  Now takes a decode
context structure.
* lib/base64.h (base64_decode): Macro for four-argument calls.
(base64_decode_alloc): Likewise.

Signed-off-by: Bo Borgerson <address@hidden>
---
 lib/base64.c |  152 ++++++++++++++++++++++++++++++++--------------------------
 lib/base64.h |   16 +++++--
 2 files changed, 96 insertions(+), 72 deletions(-)

diff --git a/lib/base64.c b/lib/base64.c
index 40ae640..8aff430 100644
--- a/lib/base64.c
+++ b/lib/base64.c
@@ -52,6 +52,8 @@
 /* Get UCHAR_MAX. */
 #include <limits.h>
 
+#include <string.h>
+
 /* C89 compliant way to cast 'char' to 'unsigned char'. */
 static inline unsigned char
 to_uchar (char ch)
@@ -435,89 +437,102 @@ decode_4 (char const *restrict in, size_t inlen,
   return true;
 }
 
-/* Decode base64 encoded input array IN of length INLEN to output
-   array OUT that can hold *OUTLEN bytes.  Return true if decoding was
-   successful, i.e. if the input was valid base64 data, false
-   otherwise.  If *OUTLEN is too small, as many bytes as possible will
-   be written to OUT.  On return, *OUTLEN holds the length of decoded
-   bytes in OUT.  Note that as soon as any non-alphabet characters are
-   encountered, decoding is stopped and false is returned.  This means
-   that, when applicable, you must remove any line terminators that is
-   part of the data stream before calling this function.  */
+/* Decode base64-encoded input array IN of length INLEN to output array
+   OUT that can hold *OUTLEN bytes.  The input data may be interspersed
+   with newlines.  Return true if decoding was successful, i.e. if the
+   input was valid base64 data, false otherwise.  If *OUTLEN is too
+   small, as many bytes as possible will be written to OUT.  On return,
+   *OUTLEN holds the length of decoded bytes in OUT.  Note that as soon
+   as any non-alphabet, non-newline character is encountered, decoding
+   is stopped and false is returned.  If INLEN is zero, then process
+   only whatever data is stored in CTX.
+
+   Initially, CTX must have been initialized via base64_decode_ctx_init.
+   Subsequent calls to this function must reuse whatever state is recorded
+   in that buffer.  It is necessary for when a quadruple of base64 input
+   bytes spans two input buffers.
+
+   If CTX is NULL then newlines are treated as garbage and the input
+   buffer is processed as a unit.  */
+
 bool
-base64_decode (const char *restrict in, size_t inlen,
-              char *restrict out, size_t *outlen)
+base64_decode_ctx (struct base64_decode_context *ctx,
+                  const char *restrict in, size_t inlen,
+                  char *restrict out, size_t *outlen)
 {
   size_t outleft = *outlen;
+  bool ignore_newlines = ctx != NULL;
+  bool flush_ctx = false;
+  unsigned int ctx_i = 0;
 
-  while (inlen >= 2)
+  if (ignore_newlines)
     {
-      if (!isbase64 (in[0]) || !isbase64 (in[1]))
-       break;
+      ctx_i = ctx->i;
+      flush_ctx = inlen == 0;
+    }
+
 
-      if (outleft)
+  while (true)
+    {
+      size_t outleft_save = outleft;
+      if (ctx_i == 0 && !flush_ctx)
        {
-         *out++ = ((b64[to_uchar (in[0])] << 2)
-                   | (b64[to_uchar (in[1])] >> 4));
-         outleft--;
+         while (true)
+           {
+             /* Save a copy of outleft, in case we need to re-parse this
+                block of four bytes.  */
+             outleft_save = outleft;
+             if (!decode_4 (in, inlen, &out, &outleft))
+               break;
+
+             in += 4;
+             inlen -= 4;
+           }
        }
 
-      if (inlen == 2)
+      if (inlen == 0 && !flush_ctx)
        break;
 
-      if (in[2] == '=')
+      /* Handle the common case of 72-byte wrapped lines.
+        This also handles any other multiple-of-4-byte wrapping.  */
+      if (inlen && *in == '\n' && ignore_newlines)
        {
-         if (inlen != 4)
-           break;
-
-         if (in[3] != '=')
-           break;
-
+         ++in;
+         --inlen;
+         continue;
        }
-      else
-       {
-         if (!isbase64 (in[2]))
-           break;
 
-         if (outleft)
-           {
-             *out++ = (((b64[to_uchar (in[1])] << 4) & 0xf0)
-                       | (b64[to_uchar (in[2])] >> 2));
-             outleft--;
-           }
+      /* Restore OUT and OUTLEFT.  */
+      out -= outleft_save - outleft;
+      outleft = outleft_save;
 
-         if (inlen == 3)
+      {
+       char const *in_end = in + inlen;
+       char const *non_nl;
+
+       if (ignore_newlines)
+         non_nl = get_4 (ctx, &in, in_end, &inlen);
+       else
+         non_nl = in;  /* Might have nl in this case. */
+
+       /* If the input is empty or consists solely of newlines (0 non-newlines),
+          then we're done.  Likewise if there are fewer than 4 bytes when not
+          flushing context and not treating newlines as garbage.  */
+       if (inlen == 0 || (inlen < 4 && !flush_ctx && ignore_newlines))
+         {
+           inlen = 0;
            break;
+         }
+       if (!decode_4 (non_nl, inlen, &out, &outleft))
+         break;
 
-         if (in[3] == '=')
-           {
-             if (inlen != 4)
-               break;
-           }
-         else
-           {
-             if (!isbase64 (in[3]))
-               break;
-
-             if (outleft)
-               {
-                 *out++ = (((b64[to_uchar (in[2])] << 6) & 0xc0)
-                           | b64[to_uchar (in[3])]);
-                 outleft--;
-               }
-           }
-       }
-
-      in += 4;
-      inlen -= 4;
+       inlen = in_end - in;
+      }
     }
 
   *outlen -= outleft;
 
-  if (inlen != 0)
-    return false;
-
-  return true;
+  return inlen == 0;
 }
 
 /* Allocate an output buffer in *OUT, and decode the base64 encoded
@@ -532,12 +547,13 @@ base64_decode (const char *restrict in, size_t inlen,
    input was invalid, in which case *OUT is NULL and *OUTLEN is
    undefined. */
 bool
-base64_decode_alloc (const char *in, size_t inlen, char **out,
-                    size_t *outlen)
+base64_decode_alloc_ctx (struct base64_decode_context *ctx,
+                        const char *in, size_t inlen, char **out,
+                        size_t *outlen)
 {
-  /* This may allocate a few bytes too much, depending on input,
-     but it's not worth the extra CPU time to compute the exact amount.
-     The exact amount is 3 * inlen / 4, minus 1 if the input ends
+  /* This may allocate a few bytes too many, depending on input,
+     but it's not worth the extra CPU time to compute the exact size.
+     The exact size is 3 * inlen / 4, minus 1 if the input ends
      with "=" and minus another 1 if the input ends with "==".
      Dividing before multiplying avoids the possibility of overflow.  */
   size_t needlen = 3 * (inlen / 4) + 2;
@@ -546,7 +562,7 @@ base64_decode_alloc (const char *in, size_t inlen, char **out,
   if (!*out)
     return true;
 
-  if (!base64_decode (in, inlen, *out, &needlen))
+  if (!base64_decode_ctx (ctx, in, inlen, *out, &needlen))
     {
       free (*out);
       *out = NULL;
diff --git a/lib/base64.h b/lib/base64.h
index 2fc4d34..9c31651 100644
--- a/lib/base64.h
+++ b/lib/base64.h
@@ -44,10 +44,18 @@ extern size_t base64_encode_alloc (const char *in, size_t inlen, char **out);
 
 extern void base64_decode_ctx_init (struct base64_decode_context *ctx);
 
-extern bool base64_decode (const char *restrict in, size_t inlen,
-                          char *restrict out, size_t *outlen);
+extern bool base64_decode_ctx (struct base64_decode_context *ctx,
+                              const char *restrict in, size_t inlen,
+                              char *restrict out, size_t *outlen);
 
-extern bool base64_decode_alloc (const char *in, size_t inlen,
-                                char **out, size_t *outlen);
+extern bool base64_decode_alloc_ctx (struct base64_decode_context *ctx,
+                                    const char *in, size_t inlen,
+                                    char **out, size_t *outlen);
+
+#define base64_decode(in, inlen, out, outlen) \
+       base64_decode_ctx (NULL, in, inlen, out, outlen)
+
+#define base64_decode_alloc(in, inlen, out, outlen) \
+       base64_decode_alloc_ctx (NULL, in, inlen, out, outlen)
 
 #endif /* BASE64_H */
-- 
1.5.4.3

[Prev in Thread]

Current Thread

[Next in Thread]

Re: Feature request - base64 Filename Safe Alphabet, Jim Meyering, 2008/05/05
- Re: Feature request - base64 Filename Safe Alphabet, Bo Borgerson, 2008/05/05
  - Re: Feature request - base64 Filename Safe Alphabet, Simon Josefsson, 2008/05/05
    - Re: Feature request - base64 Filename Safe Alphabet, Bo Borgerson <=
  - Re: Feature request - base64 Filename Safe Alphabet, Jim Meyering, 2008/05/05

Prev by Date: Re: Feature request - base64 Filename Safe Alphabet
Next by Date: Re: Feature request - base64 Filename Safe Alphabet
Previous by thread: Re: Feature request - base64 Filename Safe Alphabet
Next by thread: Re: Feature request - base64 Filename Safe Alphabet
Index(es):
- Date
- Thread