guile-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH] Implement efficient 'scm_unget_bytes' and 'unget-bytevector'


From: Mark H Weaver
Subject: [PATCH] Implement efficient 'scm_unget_bytes' and 'unget-bytevector'
Date: Sat, 06 Apr 2013 03:47:45 -0400
User-agent: Gnus/5.13 (Gnus v5.13) Emacs/24.3 (gnu/linux)

Mark H Weaver <address@hidden> writes:

> I discovered that 'scm_unget_byte' is kind of dumb.  It puts the bytes
> at the beginning of the pushback buffer instead of the end.  This means
> that every time you unget a byte, it has to shift up the existing
> contents of the buffer, so ungetting N bytes takes O(N^2) time.
>
> This patch implements a function 'scm_unget_bytes' that enables large
> buffers to be unread efficiently.  It keeps the bytes at the end of the
> buffer instead of the beginning, but it can cope if some external code
> manipulates the pushback buffer by hand and puts the bytes at the
> beginning.

Here's an improved patch that also exports 'unget-bytevector' from
(ice-9 binary-ports).  I've used it to unget 15 megabytes, and it was
quite fast.  Unfortunately, I'm at a bit of a loss of where to document
it in the manual.

Comments and suggestions solicited.

      Mark


>From 00c36fdd2e4d94a37fa416e3bc8436f66bba612a Mon Sep 17 00:00:00 2001
From: Mark H Weaver <address@hidden>
Date: Sat, 6 Apr 2013 01:42:45 -0400
Subject: [PATCH] Implement efficient 'scm_unget_bytes' and
 'unget-bytevector'.

* libguile/ports.c (scm_i_unget_bytes): New static function.
  (scm_unget_bytes): New API function.
  (scm_unget_byte): Rewrite to simply call 'scm_i_unget_bytes'.
  (scm_ungetc, scm_peek_char, looking_at_bytes): Use 'scm_i_unget_bytes'.

* libguile/ports.h: Add prototype for 'scm_unget_bytes'.

* libguile/fports.c (scm_setvbuf): Use 'scm_unget_bytes'.

* libguile/r6rs-ports.c (scm_unget_bytevector): New procedure.

* module/ice-9/binary-ports.scm (unget-bytevector): New export.

* test-suite/tests/ports.test ("unget-bytevector"): Add test.
---
 libguile/fports.c             |    3 +-
 libguile/ports.c              |  130 ++++++++++++++++++++++++-----------------
 libguile/ports.h              |    1 +
 libguile/r6rs-ports.c         |   43 ++++++++++++++
 module/ice-9/binary-ports.scm |    1 +
 test-suite/tests/ports.test   |   21 ++++++-
 6 files changed, 143 insertions(+), 56 deletions(-)

diff --git a/libguile/fports.c b/libguile/fports.c
index f6c3c92..ffe4334 100644
--- a/libguile/fports.c
+++ b/libguile/fports.c
@@ -225,8 +225,7 @@ SCM_DEFINE (scm_setvbuf, "setvbuf", 2, 1, 0,
 
   if (ndrained > 0)
     /* Put DRAINED back to PORT.  */
-    while (ndrained-- > 0)
-      scm_unget_byte (drained[ndrained], port);
+    scm_unget_bytes ((unsigned char *) drained, ndrained, port);
 
   return SCM_UNSPECIFIED;
 }
diff --git a/libguile/ports.c b/libguile/ports.c
index 47dc165..9068c5c 100644
--- a/libguile/ports.c
+++ b/libguile/ports.c
@@ -1789,52 +1789,25 @@ scm_end_input (SCM port)
 
 
 
-void 
-scm_unget_byte (int c, SCM port)
-#define FUNC_NAME "scm_unget_byte"
+static void
+scm_i_unget_bytes (const unsigned char *buf, size_t len, SCM port)
+#define FUNC_NAME "scm_unget_bytes"
 {
   scm_t_port *pt = SCM_PTAB_ENTRY (port);
+  size_t old_len, new_len;
 
   scm_i_clear_pending_eof (port);
-  if (pt->read_buf == pt->putback_buf)
-    /* already using the put-back buffer.  */
-    {
-      /* enlarge putback_buf if necessary.  */
-      if (pt->read_end == pt->read_buf + pt->read_buf_size
-         && pt->read_buf == pt->read_pos)
-       {
-         size_t new_size = pt->read_buf_size * 2;
-         unsigned char *tmp = (unsigned char *)
-           scm_gc_realloc (pt->putback_buf, pt->read_buf_size, new_size,
-                           "putback buffer");
-
-         pt->read_pos = pt->read_buf = pt->putback_buf = tmp;
-         pt->read_end = pt->read_buf + pt->read_buf_size;
-         pt->read_buf_size = pt->putback_buf_size = new_size;
-       }
 
-      /* shift any existing bytes to buffer + 1.  */
-      if (pt->read_pos == pt->read_end)
-       pt->read_end = pt->read_buf + 1;
-      else if (pt->read_pos != pt->read_buf + 1)
-       {
-         int count = pt->read_end - pt->read_pos;
-
-         memmove (pt->read_buf + 1, pt->read_pos, count);
-         pt->read_end = pt->read_buf + 1 + count;
-       }
-
-      pt->read_pos = pt->read_buf;
-    }
-  else
+  if (pt->read_buf != pt->putback_buf)
     /* switch to the put-back buffer.  */
     {
       if (pt->putback_buf == NULL)
        {
+          pt->putback_buf_size = (len > SCM_INITIAL_PUTBACK_BUF_SIZE
+                                  ? len : SCM_INITIAL_PUTBACK_BUF_SIZE);
          pt->putback_buf
            = (unsigned char *) scm_gc_malloc_pointerless
-           (SCM_INITIAL_PUTBACK_BUF_SIZE, "putback buffer");
-         pt->putback_buf_size = SCM_INITIAL_PUTBACK_BUF_SIZE;
+           (pt->putback_buf_size, "putback buffer");
        }
 
       pt->saved_read_buf = pt->read_buf;
@@ -1842,12 +1815,59 @@ scm_unget_byte (int c, SCM port)
       pt->saved_read_end = pt->read_end;
       pt->saved_read_buf_size = pt->read_buf_size;
 
-      pt->read_pos = pt->read_buf = pt->putback_buf;
-      pt->read_end = pt->read_buf + 1;
+      /* Put read_pos at the end of the buffer, so that ungets will not
+         have to shift the buffer contents each time.  */
+      pt->read_buf = pt->putback_buf;
+      pt->read_pos = pt->read_end = pt->putback_buf + pt->putback_buf_size;
       pt->read_buf_size = pt->putback_buf_size;
     }
 
-  *pt->read_buf = c;
+  old_len = pt->read_end - pt->read_pos;
+  new_len = old_len + len;
+
+  if (new_len > pt->read_buf_size)
+    /* The putback buffer needs to be enlarged.  */
+    {
+      size_t new_buf_size;
+      unsigned char *new_buf, *new_end, *new_pos;
+
+      new_buf_size = pt->read_buf_size * 2;
+      if (new_buf_size < new_len)
+        new_buf_size = new_len;
+
+      new_buf = (unsigned char *)
+        scm_gc_malloc_pointerless (new_buf_size, "putback buffer");
+
+      /* Put the bytes at the end of the buffer, so that future
+         ungets won't need to shift the buffer.  */
+      new_end = new_buf + new_buf_size;
+      new_pos = new_end - old_len;
+      memcpy (new_pos, pt->read_pos, old_len);
+
+      pt->read_buf = pt->putback_buf = new_buf;
+      pt->read_pos = new_pos;
+      pt->read_end = new_end;
+      pt->read_buf_size = pt->putback_buf_size = new_buf_size;
+    }
+  else if (pt->read_buf + len < pt->read_pos)
+    /* If needed, shift the existing buffer contents up.
+       This should not happen unless some external code
+       manipulates the putback buffer pointers.  */
+    {
+      unsigned char *new_end = pt->read_buf + pt->read_buf_size;
+      unsigned char *new_pos = new_end - old_len;
+
+      memmove (new_pos, pt->read_pos, old_len);
+      pt->read_pos = new_pos;
+      pt->read_end = new_end;
+    }
+
+  /* Move read_pos back and copy the bytes there.  */
+  pt->read_pos -= len;
+  memcpy (pt->read_buf + (pt->read_pos - pt->read_buf), buf, len);
+
+  if (pt->rw_active == SCM_PORT_WRITE)
+    scm_flush (port);
 
   if (pt->rw_random)
     pt->rw_active = SCM_PORT_READ;
@@ -1855,6 +1875,21 @@ scm_unget_byte (int c, SCM port)
 #undef FUNC_NAME
 
 void
+scm_unget_bytes (const unsigned char *buf, size_t len, SCM port)
+{
+  scm_i_unget_bytes (buf, len, port);
+}
+
+void
+scm_unget_byte (int c, SCM port)
+{
+  unsigned char byte;
+
+  byte = c;
+  scm_i_unget_bytes (&byte, 1, port);
+}
+
+void
 scm_ungetc (scm_t_wchar c, SCM port)
 #define FUNC_NAME "scm_ungetc"
 {
@@ -1863,7 +1898,6 @@ scm_ungetc (scm_t_wchar c, SCM port)
   char result_buf[10];
   const char *encoding;
   size_t len;
-  int i;
 
   if (pt->encoding != NULL)
     encoding = pt->encoding;
@@ -1881,8 +1915,7 @@ scm_ungetc (scm_t_wchar c, SCM port)
                        "conversion to port encoding failed",
                        SCM_BOOL_F, SCM_MAKE_CHAR (c));
 
-  for (i = len - 1; i >= 0; i--)
-    scm_unget_byte (result[i], port);
+  scm_i_unget_bytes ((unsigned char *) result, len, port);
 
   if (SCM_UNLIKELY (result != result_buf))
     free (result);
@@ -1941,7 +1974,7 @@ SCM_DEFINE (scm_peek_char, "peek-char", 0, 1, 0,
   SCM result;
   scm_t_wchar c;
   char bytes[SCM_MBCHAR_BUF_SIZE];
-  long column, line, i;
+  long column, line;
   size_t len;
 
   if (SCM_UNBNDP (port))
@@ -1953,8 +1986,7 @@ SCM_DEFINE (scm_peek_char, "peek-char", 0, 1, 0,
 
   err = get_codepoint (port, &c, bytes, &len);
 
-  for (i = len - 1; i >= 0; i--)
-    scm_unget_byte (bytes[i], port);
+  scm_i_unget_bytes ((unsigned char *) bytes, len, port);
 
   SCM_COL (port) = column;
   SCM_LINUM (port) = line;
@@ -2336,7 +2368,6 @@ static int
 looking_at_bytes (SCM port, const unsigned char *bytes, int len)
 {
   scm_t_port *pt = SCM_PTAB_ENTRY (port);
-  int result;
   int i = 0;
 
   while (i < len && scm_peek_byte_or_eof (port) == bytes[i])
@@ -2344,13 +2375,8 @@ looking_at_bytes (SCM port, const unsigned char *bytes, 
int len)
       pt->read_pos++;
       i++;
     }
-
-  result = (i == len);
-
-  while (i > 0)
-    scm_unget_byte (bytes[--i], port);
-
-  return result;
+  scm_i_unget_bytes (bytes, i, port);
+  return (i == len);
 }
 
 static const unsigned char scm_utf8_bom[3]    = {0xEF, 0xBB, 0xBF};
diff --git a/libguile/ports.h b/libguile/ports.h
index ca5bf2f..39317f8 100644
--- a/libguile/ports.h
+++ b/libguile/ports.h
@@ -302,6 +302,7 @@ SCM_INTERNAL void scm_lfwrite_substr (SCM str, size_t 
start, size_t end,
 SCM_API void scm_flush (SCM port);
 SCM_API void scm_end_input (SCM port);
 SCM_API int scm_fill_input (SCM port);
+SCM_API void scm_unget_bytes (const unsigned char *buf, size_t len, SCM port);
 SCM_API void scm_unget_byte (int c, SCM port);
 SCM_API void scm_ungetc (scm_t_wchar c, SCM port);
 SCM_API void scm_ungets (const char *s, int n, SCM port);
diff --git a/libguile/r6rs-ports.c b/libguile/r6rs-ports.c
index 48f9f26..aa3c935 100644
--- a/libguile/r6rs-ports.c
+++ b/libguile/r6rs-ports.c
@@ -714,6 +714,49 @@ SCM_DEFINE (scm_put_bytevector, "put-bytevector", 2, 2, 0,
 }
 #undef FUNC_NAME
 
+SCM_DEFINE (scm_unget_bytevector, "unget-bytevector", 2, 2, 0,
+           (SCM port, SCM bv, SCM start, SCM count),
+           "Unget the contents of @var{bv} to @var{port}, optionally "
+           "starting at index @var{start} and limiting to @var{count} "
+           "octets.")
+#define FUNC_NAME s_scm_unget_bytevector
+{
+  char *c_bv;
+  unsigned c_start, c_count, c_len;
+
+  SCM_VALIDATE_BINARY_INPUT_PORT (1, port);
+  SCM_VALIDATE_BYTEVECTOR (2, bv);
+
+  c_len = SCM_BYTEVECTOR_LENGTH (bv);
+  c_bv = (char *) SCM_BYTEVECTOR_CONTENTS (bv);
+
+  if (!scm_is_eq (start, SCM_UNDEFINED))
+    {
+      c_start = scm_to_uint (start);
+
+      if (!scm_is_eq (count, SCM_UNDEFINED))
+       {
+         c_count = scm_to_uint (count);
+         if (SCM_UNLIKELY (c_start + c_count > c_len))
+           scm_out_of_range (FUNC_NAME, count);
+       }
+      else
+       {
+         if (SCM_UNLIKELY (c_start >= c_len))
+           scm_out_of_range (FUNC_NAME, start);
+         else
+           c_count = c_len - c_start;
+       }
+    }
+  else
+    c_start = 0, c_count = c_len;
+
+  scm_unget_bytes ((unsigned char *) c_bv + c_start, c_count, port);
+
+  return SCM_UNSPECIFIED;
+}
+#undef FUNC_NAME
+
 
 
 /* Bytevector output port ("bop" for short).  */
diff --git a/module/ice-9/binary-ports.scm b/module/ice-9/binary-ports.scm
index c07900b..cd7d155 100644
--- a/module/ice-9/binary-ports.scm
+++ b/module/ice-9/binary-ports.scm
@@ -40,6 +40,7 @@
             get-string-n!
             put-u8
             put-bytevector
+            unget-bytevector
             open-bytevector-output-port
             make-custom-binary-output-port))
 
diff --git a/test-suite/tests/ports.test b/test-suite/tests/ports.test
index 0dbd3b2..7d16399 100644
--- a/test-suite/tests/ports.test
+++ b/test-suite/tests/ports.test
@@ -24,8 +24,12 @@
   #:use-module (ice-9 popen)
   #:use-module (ice-9 rdelim)
   #:use-module (rnrs bytevectors)
-  #:use-module ((rnrs io ports) #:select (open-bytevector-input-port
-                                          open-bytevector-output-port)))
+  #:use-module ((ice-9 binary-ports) #:select (open-bytevector-input-port
+                                               open-bytevector-output-port
+                                               put-bytevector
+                                               get-bytevector-n
+                                               get-bytevector-all
+                                               unget-bytevector)))
 
 (define (display-line . args)
   (for-each display args)
@@ -1236,6 +1240,19 @@
 
 
 
+(pass-if-equal "unget-bytevector"
+    #vu8(10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 200 201 202 203
+            1 2 3 4 251 253 254 255)
+  (let ((port (open-bytevector-input-port #vu8(1 2 3 4 251 253 254 255))))
+    (unget-bytevector port #vu8(200 201 202 203))
+    (unget-bytevector port #vu8(20 21 22 23 24))
+    (unget-bytevector port #vu8(10 11 12 13 14 15 16 17 18 19) 4)
+    (unget-bytevector port #vu8(10 11 12 13 14 15 16 17 18 19) 2 2)
+    (unget-bytevector port #vu8(10 11))
+    (get-bytevector-all port)))
+
+
+
 (with-test-prefix "unicode byte-order marks (BOMs)"
 
   (define (bv-read-test* encoding bv proc)
-- 
1.7.10.4


reply via email to

[Prev in Thread] Current Thread [Next in Thread]