guile-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: byte-order marks


From: Andy Wingo
Subject: Re: byte-order marks
Date: Wed, 30 Jan 2013 10:20:27 +0100
User-agent: Gnus/5.13 (Gnus v5.13) Emacs/24.2 (gnu/linux)

Hi,

On Tue 29 Jan 2013 18:09, Mark H Weaver <address@hidden> writes:

> Having slept on this, I think I agree that 'open-input-file' should
> auto-consume BOMs.

Patch attached.

> However, 'open-input-file' should not auto-detect the encoding by
> default,

The ball is in your court now :)

> and should only consume BOMs that match the specified encoding.

What do you mean by "specified encoding"?

> 'scm_i_scan_for_file_encoding' should look for (but not consume) BOMs as
> a last resort, but only if no coding declaration is found.

I removed the BOM handling from this routine entirely.

>> But I have another patch that fixes the (sxml simple) problem, so I'm
>> also OK with punting on this issue for now.
>
> IMO, BOMs should probably also be consumed by (sxml simple), but again
> only if the BOM is already in the previously specified encoding.

I will punt on this one.


From 5512fe4f93e4e583ab538ae02dd98e5825252dc9 Mon Sep 17 00:00:00 2001
From: Andy Wingo <address@hidden>
Date: Wed, 30 Jan 2013 10:17:25 +0100
Subject: [PATCH] detect and consume byte-order marks for textual ports

* libguile/ports.h:
* libguile/ports.c (scm_consume_byte_order_mark): New procedure.

* libguile/fports.c (scm_open_file): Call consume-byte-order-mark if we
  are opening a file in "r" mode.

* libguile/read.c (scm_i_scan_for_encoding): Don't do anything about
  byte-order marks.

* libguile/load.c (scm_primitive_load): Add a note about the duplicate
  encoding scan.

* test-suite/tests/filesys.test: Add tests for UTF-8, UTF-16BE, and
  UTF-16LE BOM handling.
---
 libguile/fports.c             |   35 +++++++++--------
 libguile/load.c               |    3 ++
 libguile/ports.c              |   85 ++++++++++++++++++++++++++++++++++++++++-
 libguile/ports.h              |    3 +-
 libguile/read.c               |   14 +------
 test-suite/tests/filesys.test |   59 +++++++++++++++++++++++++++-
 6 files changed, 169 insertions(+), 30 deletions(-)

diff --git a/libguile/fports.c b/libguile/fports.c
index 10cf671..fbc0530 100644
--- a/libguile/fports.c
+++ b/libguile/fports.c
@@ -1,5 +1,5 @@
 /* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
- *   2004, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
+ *   2004, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public License
@@ -399,7 +399,7 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0,
 #define FUNC_NAME s_scm_open_file
 {
   SCM port;
-  int fdes, flags = 0, use_encoding = 1;
+  int fdes, flags = 0, scan_for_encoding = 0, consume_bom = 0, binary = 0;
   unsigned int retries;
   char *file, *md, *ptr;
 
@@ -415,6 +415,8 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0,
     {
     case 'r':
       flags |= O_RDONLY;
+      consume_bom = 1;
+      scan_for_encoding = 1;
       break;
     case 'w':
       flags |= O_WRONLY | O_CREAT | O_TRUNC;
@@ -432,9 +434,12 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0,
        {
        case '+':
          flags = (flags & ~(O_RDONLY | O_WRONLY)) | O_RDWR;
+          consume_bom = 0;
          break;
        case 'b':
-         use_encoding = 0;
+         scan_for_encoding = 0;
+          consume_bom = 0;
+          binary = 1;
 #if defined (O_BINARY)
          flags |= O_BINARY;
 #endif
@@ -473,21 +478,21 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0,
   port = scm_i_fdes_to_port (fdes, scm_i_mode_bits (mode),
                              fport_canonicalize_filename (filename));
 
-  if (use_encoding)
-    {
-      /* If this file has a coding declaration, use that as the port
-        encoding.  */
-      if (SCM_INPUT_PORT_P (port))
-       {
-         char *enc = scm_i_scan_for_encoding (port);
-         if (enc != NULL)
-           scm_i_set_port_encoding_x (port, enc);
-       }
-    }
-  else
+  if (consume_bom) 
+    scm_consume_byte_order_mark (port);
+
+  if (binary)
     /* If this is a binary file, use the binary-friendly ISO-8859-1
        encoding.  */
     scm_i_set_port_encoding_x (port, NULL);
+  else if (scan_for_encoding)
+    /* If this is an input port and the file has a coding declaration,
+       use that as the port encoding.  */
+    {
+      char *enc = scm_i_scan_for_encoding (port);
+      if (enc != NULL)
+        scm_i_set_port_encoding_x (port, enc);
+    }
 
   scm_dynwind_end ();
 
diff --git a/libguile/load.c b/libguile/load.c
index 84b6705..476461c 100644
--- a/libguile/load.c
+++ b/libguile/load.c
@@ -106,6 +106,9 @@ SCM_DEFINE (scm_primitive_load, "primitive-load", 1, 0, 0,
     scm_dynwind_begin (SCM_F_DYNWIND_REWINDABLE);
     scm_i_dynwind_current_load_port (port);
 
+    /* FIXME: For better or for worse, scm_open_file already scans the
+       file for an encoding.  This scans again; necessary for this
+       logic, but unnecessary overall.  */
     encoding = scm_i_scan_for_encoding (port);
     if (encoding)
       scm_i_set_port_encoding_x (port, encoding);
diff --git a/libguile/ports.c b/libguile/ports.c
index 55808e2..9b1be9b 100644
--- a/libguile/ports.c
+++ b/libguile/ports.c
@@ -1,5 +1,5 @@
 /* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004,
- *   2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
+ *   2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public License
@@ -2153,6 +2153,89 @@ SCM_DEFINE (scm_set_port_filename_x, "set-port-filename!", 2, 0, 0,
 }
 #undef FUNC_NAME
 
+SCM_DEFINE (scm_consume_byte_order_mark, "consume-byte-order-mark", 1, 0, 0,
+            (SCM port),
+            "Peek ahead in @var{port} for a byte-order mark (\\uFEFF) encoded\n"
+            "in UTF-8 or in UTF-16.  If found, consume the byte-order mark\n"
+            "and set the port to the indicated encoding.\n"
+            "\n"
+            "As a special case, if the port's encoding is already UTF-16LE\n"
+            "or UTF-16BE (as opposed to UTF-16), we consider that the user\n"
+            "has already asked for an explicit byte order.  In this case no\n"
+            "scan is performed, and the byte-order mark (if any) is left in\n"
+            "the port.\n"
+            "\n"
+            "Return @code{#t} if a byte-order mark was consumed, and\n"
+            "@code{#f} otherwise.")
+#define FUNC_NAME s_scm_consume_byte_order_mark
+{
+  scm_t_port *pt;
+  const char *enc;
+
+  SCM_VALIDATE_PORT (1, port);
+
+  pt = SCM_PTAB_ENTRY (port);
+  enc = pt->encoding;
+
+  if (enc && strcasecmp (enc, "UTF-16BE") == 0)
+    return SCM_BOOL_F;
+
+  if (enc && strcasecmp (enc, "UTF-16LE") == 0)
+    return SCM_BOOL_F;
+
+  switch (scm_peek_byte_or_eof (port))
+    {
+    case 0xEF:
+      scm_get_byte_or_eof (port);
+      switch (scm_peek_byte_or_eof (port))
+        {
+        case 0xBB:
+          scm_get_byte_or_eof (port);
+          switch (scm_peek_byte_or_eof (port))
+            {
+            case 0xBF:
+              scm_get_byte_or_eof (port);
+              scm_i_set_port_encoding_x (port, "UTF-8");
+              return SCM_BOOL_T;
+            default:
+              scm_unget_byte (0xBB, port);
+              scm_unget_byte (0xEF, port);
+              return SCM_BOOL_F;
+            }
+        default:
+          scm_unget_byte (0xEF, port);
+          return SCM_BOOL_F;
+        }
+    case 0xFE:
+      scm_get_byte_or_eof (port);
+      switch (scm_peek_byte_or_eof (port))
+        {
+        case 0xFF:
+          scm_get_byte_or_eof (port);
+          scm_i_set_port_encoding_x (port, "UTF-16BE");
+          return SCM_BOOL_T;
+        default:
+          scm_unget_byte (0xFE, port);
+          return SCM_BOOL_F;
+        }
+    case 0xFF:
+      scm_get_byte_or_eof (port);
+      switch (scm_peek_byte_or_eof (port))
+        {
+        case 0xFE:
+          scm_get_byte_or_eof (port);
+          scm_i_set_port_encoding_x (port, "UTF-16LE");
+          return SCM_BOOL_T;
+        default:
+          scm_unget_byte (0xFF, port);
+          return SCM_BOOL_F;
+        }
+    default:
+      return SCM_BOOL_F;
+    }
+}
+#undef FUNC_NAME
+
 /* A fluid specifying the default encoding for newly created ports.  If it is
    a string, that is the encoding.  If it is #f, it is in the "native"
    (Latin-1) encoding.  */
diff --git a/libguile/ports.h b/libguile/ports.h
index d4d59b7..2f32345 100644
--- a/libguile/ports.h
+++ b/libguile/ports.h
@@ -4,7 +4,7 @@
 #define SCM_PORTS_H
 
 /* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004,
- *   2006, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
+ *   2006, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public License
@@ -303,6 +303,7 @@ SCM_API SCM scm_port_column (SCM port);
 SCM_API SCM scm_set_port_column_x (SCM port, SCM line);
 SCM_API SCM scm_port_filename (SCM port);
 SCM_API SCM scm_set_port_filename_x (SCM port, SCM filename);
+SCM_API SCM scm_consume_byte_order_mark (SCM port);
 SCM_INTERNAL const char *scm_i_default_port_encoding (void);
 SCM_INTERNAL void scm_i_set_default_port_encoding (const char *);
 SCM_INTERNAL void scm_i_set_port_encoding_x (SCM port, const char *str);
diff --git a/libguile/read.c b/libguile/read.c
index 222891b..a8f7744 100644
--- a/libguile/read.c
+++ b/libguile/read.c
@@ -1,5 +1,5 @@
 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
- *   2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
+ *   2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
  * 
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public License
@@ -1985,7 +1985,6 @@ scm_i_scan_for_encoding (SCM port)
   char header[SCM_ENCODING_SEARCH_SIZE+1];
   size_t bytes_read, encoding_length, i;
   char *encoding = NULL;
-  int utf8_bom = 0;
   char *pos, *encoding_start;
   int in_comment;
 
@@ -2027,13 +2026,9 @@ scm_i_scan_for_encoding (SCM port)
 
       bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
       header[bytes_read] = '\0';
-      scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
+      scm_seek (port, scm_from_int (-bytes_read), scm_from_int (SEEK_CUR));
     }
 
-  if (bytes_read > 3 
-      && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
-    utf8_bom = 1;
-
   /* search past "coding[:=]" */
   pos = header;
   while (1)
@@ -2102,11 +2097,6 @@ scm_i_scan_for_encoding (SCM port)
     /* This wasn't in a comment */
     return NULL;
 
-  if (utf8_bom && strcmp(encoding, "UTF-8"))
-    scm_misc_error (NULL,
-                   "the port input declares the encoding ~s but is encoded as UTF-8",
-                   scm_list_1 (scm_from_locale_string (encoding)));
-
   return encoding;
 }
 
diff --git a/test-suite/tests/filesys.test b/test-suite/tests/filesys.test
index a6bfb6e..8bd974d 100644
--- a/test-suite/tests/filesys.test
+++ b/test-suite/tests/filesys.test
@@ -1,6 +1,6 @@
 ;;;; filesys.test --- test file system functions -*- scheme -*-
 ;;;; 
-;;;; Copyright (C) 2004, 2006 Free Software Foundation, Inc.
+;;;; Copyright (C) 2004, 2006, 2013 Free Software Foundation, Inc.
 ;;;; 
 ;;;; This library is free software; you can redistribute it and/or
 ;;;; modify it under the terms of the GNU Lesser General Public
@@ -17,6 +17,8 @@
 ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 
 (define-module (test-suite test-filesys)
+  #:use-module (ice-9 rdelim)
+  #:use-module (ice-9 binary-ports)
   #:use-module (test-suite lib)
   #:use-module (test-suite guile-test))
 
@@ -127,3 +129,58 @@
 
 (delete-file (test-file))
 (delete-file (test-symlink))
+
+(let ((s "\ufeffHello, world!"))
+  (define* (test-encoding encoding #:optional (ambient "ISO-8859-1"))
+    (with-fluids ((%default-port-encoding ambient))
+      (let* ((bytes (catch 'misc-error
+                      (lambda ()
+                        (call-with-values open-bytevector-output-port
+                          (lambda (port get-bytevector)
+                            (set-port-encoding! port encoding)
+                            (display s port)
+                            (get-bytevector))))
+                      (lambda args
+                        (throw 'unresolved))))
+             (name (string-copy "myfile-XXXXXX"))
+             (port (mkstemp! name)))
+        (put-bytevector port bytes)
+        (close-port port)
+        (let ((contents (call-with-input-file name read-string)))
+          (delete-file name)
+          contents))))
+
+  (pass-if "UTF-8"
+    (equal? (test-encoding "UTF-8")
+            "Hello, world!"))
+
+  (pass-if "UTF-16BE"
+    (equal? (test-encoding "UTF-16BE")
+            "Hello, world!"))
+
+  (pass-if "UTF-16LE"
+    (equal? (test-encoding "UTF-16LE")
+            "Hello, world!"))
+
+  (pass-if "UTF-8 (ambient)"
+    (equal? (test-encoding "UTF-8" "UTF-8")
+            "Hello, world!"))
+
+  (pass-if "UTF-8 (UTF-16 ambient)"
+    (equal? (test-encoding "UTF-8" "UTF-16")
+            "Hello, world!"))
+
+  ;; Unicode 6.2 section 16.8:
+  ;;
+  ;; For compatibility with versions of the Unicode Standard prior to
+  ;; Version 3.2, the code point U+FEFF has the word-joining semantics
+  ;; of zero width no-break space when it is not used as a BOM.  [...]
+  ;;
+  ;; Where the byte order is explicitly specified, such as in UTF-16BE
+  ;; or UTF-16LE, then all U+FEFF characters -- even at the very
+  ;; beginning of the text -- are to be interpreted as zero width
+  ;; no-break spaces.
+  ;;
+  (pass-if "UTF-16LE (ambient)"
+    (equal? (test-encoding "UTF-16LE" "UTF-16LE")
+            "\ufeffHello, world!")))
-- 
1.7.10.4

-- 
http://wingolog.org/

reply via email to

[Prev in Thread] Current Thread [Next in Thread]