guile-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

byte-order marks


From: Andy Wingo
Subject: byte-order marks
Date: Mon, 28 Jan 2013 22:42:09 +0100
User-agent: Gnus/5.13 (Gnus v5.13) Emacs/24.2 (gnu/linux)

What do people think about this attached patch?

Andy

>From 831c3418941f2d643f91e3076ef9458f700a2c59 Mon Sep 17 00:00:00 2001
From: Andy Wingo <address@hidden>
Date: Mon, 28 Jan 2013 22:41:34 +0100
Subject: [PATCH] detect and consume byte-order marks for textual ports

* libguile/read.c (scm_i_scan_for_encoding): If we see a BOM, use it in
  preference to any "coding" declaration, and consume it.  This only
  happens in textual mode.

* libguile/load.c (scm_primitive_load): Add a note about the duplicate
  encoding scan.

* test-suite/tests/filesys.test: Add tests for UTF-8, UTF-16BE, and
  UTF-16LE BOM handling.
---
 libguile/load.c               |    4 ++++
 libguile/read.c               |   39 +++++++++++++++++++++++++++------------
 test-suite/tests/filesys.test |   34 +++++++++++++++++++++++++++++++++-
 3 files changed, 64 insertions(+), 13 deletions(-)

diff --git a/libguile/load.c b/libguile/load.c
index 84b6705..b5e430e 100644
--- a/libguile/load.c
+++ b/libguile/load.c
@@ -106,6 +106,10 @@ SCM_DEFINE (scm_primitive_load, "primitive-load", 1, 0, 0,
     scm_dynwind_begin (SCM_F_DYNWIND_REWINDABLE);
     scm_i_dynwind_current_load_port (port);
 
+    /* FIXME: For better or for worse, scm_open_file already scans the
+       file for an encoding.  This scans again; necessary for this
+       logic, but unnecessary overall.  As scanning for an encoding
+       consumes a BOM, this might mean we miss a BOM.  */
     encoding = scm_i_scan_for_encoding (port);
     if (encoding)
       scm_i_set_port_encoding_x (port, encoding);
diff --git a/libguile/read.c b/libguile/read.c
index 222891b..1a7462f 100644
--- a/libguile/read.c
+++ b/libguile/read.c
@@ -1,5 +1,5 @@
 /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
- *   2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
+ *   2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
  * 
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public License
@@ -1975,9 +1975,10 @@ scm_get_hash_procedure (int c)
 
 #define SCM_ENCODING_SEARCH_SIZE (500)
 
-/* Search the first few hundred characters of a file for an Emacs-like coding
-   declaration.  Returns either NULL or a string whose storage has been
-   allocated with `scm_gc_malloc ()'.  */
+/* Search the first few hundred characters of a file for an Emacs-like
+   coding declaration.  Returns either NULL or a string whose storage
+   has been allocated with `scm_gc_malloc ()'.  If a BOM is present, it
+   is consumed and used in preference to any coding declaration.  */
 char *
 scm_i_scan_for_encoding (SCM port)
 {
@@ -1985,7 +1986,6 @@ scm_i_scan_for_encoding (SCM port)
   char header[SCM_ENCODING_SEARCH_SIZE+1];
   size_t bytes_read, encoding_length, i;
   char *encoding = NULL;
-  int utf8_bom = 0;
   char *pos, *encoding_start;
   int in_comment;
 
@@ -2030,9 +2030,26 @@ scm_i_scan_for_encoding (SCM port)
       scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
     }
 
-  if (bytes_read > 3 
+  /* If there is a byte-order mark, consume it, and use its
+     encoding.  */
+  if (bytes_read >= 3
       && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
-    utf8_bom = 1;
+    {
+      pt->read_pos += 3;
+      return "UTF-8";
+    }
+  else if (bytes_read >= 2
+           && header[0] == '\xfe' && header[1] == '\xff')
+    {
+      pt->read_pos += 2;
+      return "UTF-16BE";
+    }
+  else if (bytes_read >= 2
+           && header[0] == '\xff' && header[1] == '\xfe')
+    {
+      pt->read_pos += 2;
+      return "UTF-16LE";
+    }
 
   /* search past "coding[:=]" */
   pos = header;
@@ -2102,11 +2119,6 @@ scm_i_scan_for_encoding (SCM port)
     /* This wasn't in a comment */
     return NULL;
 
-  if (utf8_bom && strcmp(encoding, "UTF-8"))
-    scm_misc_error (NULL,
-                   "the port input declares the encoding ~s but is encoded as 
UTF-8",
-                   scm_list_1 (scm_from_locale_string (encoding)));
-
   return encoding;
 }
 
@@ -2117,6 +2129,9 @@ SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
             "The coding declaration is of the form\n"
             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
             "\n"
+            "If a UTF-8 or UTF-16 BOM is present, it is consumed, and used 
in\n"
+            "preference to any coding declaration.\n"
+            "\n"
             "Returns a string containing the character encoding of the file\n"
             "if a declaration was found, or @code{#f} otherwise.\n")
 #define FUNC_NAME s_scm_file_encoding
diff --git a/test-suite/tests/filesys.test b/test-suite/tests/filesys.test
index a6bfb6e..ecbb3f1 100644
--- a/test-suite/tests/filesys.test
+++ b/test-suite/tests/filesys.test
@@ -1,6 +1,6 @@
 ;;;; filesys.test --- test file system functions -*- scheme -*-
 ;;;; 
-;;;; Copyright (C) 2004, 2006 Free Software Foundation, Inc.
+;;;; Copyright (C) 2004, 2006, 2013 Free Software Foundation, Inc.
 ;;;; 
 ;;;; This library is free software; you can redistribute it and/or
 ;;;; modify it under the terms of the GNU Lesser General Public
@@ -17,6 +17,8 @@
 ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 
USA
 
 (define-module (test-suite test-filesys)
+  #:use-module (ice-9 rdelim)
+  #:use-module (ice-9 binary-ports)
   #:use-module (test-suite lib)
   #:use-module (test-suite guile-test))
 
@@ -127,3 +129,33 @@
 
 (delete-file (test-file))
 (delete-file (test-symlink))
+
+(let ((s "\ufeffHello, world!"))
+  (define (test-encoding encoding)
+    (with-fluids ((%default-port-encoding "ISO-8859-1"))
+      (let* ((bytes (catch 'misc-error
+                      (lambda ()
+                        (call-with-values open-bytevector-output-port
+                          (lambda (port get-bytevector)
+                            (set-port-encoding! port encoding)
+                            (display s port)
+                            (get-bytevector))))
+                      (lambda args
+                        (throw 'unresolved))))
+             (name (string-copy "myfile-XXXXXX"))
+             (port (mkstemp! name)))
+        (put-bytevector port bytes)
+        (close-port port)
+        (let ((contents (call-with-input-file name read-string)))
+          (delete-file name)
+          (equal? contents
+                  (substring s 1))))))
+
+  (pass-if "UTF-8"
+    (test-encoding "UTF-8"))
+
+  (pass-if "UTF-16BE"
+    (test-encoding "UTF-16BE"))
+
+  (pass-if "UTF-16LE"
+    (test-encoding "UTF-16LE")))
-- 
1.7.10.4

-- 
http://wingolog.org/

reply via email to

[Prev in Thread] Current Thread [Next in Thread]