emacs-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

master f3f1947 1/3: Fix Hexl handling of coding-systems with BOM


From: Eli Zaretskii
Subject: master f3f1947 1/3: Fix Hexl handling of coding-systems with BOM
Date: Tue, 11 May 2021 09:37:58 -0400 (EDT)

branch: master
commit f3f1947e5b5beeef9c004cfa2bf591dc0c0331b8
Author: Eli Zaretskii <eliz@gnu.org>
Commit: Eli Zaretskii <eliz@gnu.org>

    Fix Hexl handling of coding-systems with BOM
    
    * lisp/international/mule-cmds.el (encode-coding-char): If
    CODING-SYSTEM produces BOM, remove the BOM bytes from the produced
    byte sequence.  (Bug#48324)
    
    * lisp/hexl.el (hexl-mode): Use bufferpos-to-filepos to convert
    point to offset into the original file.
    (hexl-mode-exit, hexl-maybe-dehexlify-buffer): Use
    filepos-to-bufferpos to restore point in the original buffer.
    (hexl-mode, hexl-insert-multibyte-char)
    (hexl-self-insert-command, hexl-insert-hex-char)
    (hexl-insert-decimal-char, hexl-insert-octal-char)
    (hexl-find-file): Enhance the doc strings, mainly explaining the
    complications of inserting multibyte characters.
    (hexl-insert-multibyte-char): Don't treat CH as unibyte if the
    coding-system isn't ASCII-compatible.  Don't treat null bytes as
    multibyte.
---
 lisp/hexl.el                    | 136 +++++++++++++++++++++++++++++-----------
 lisp/international/mule-cmds.el |  28 +++++++--
 2 files changed, 120 insertions(+), 44 deletions(-)

diff --git a/lisp/hexl.el b/lisp/hexl.el
index 85c3a53..8bfc1fb 100644
--- a/lisp/hexl.el
+++ b/lisp/hexl.el
@@ -303,22 +303,30 @@ also supported.
 
 There are several ways to change text in hexl mode:
 
-ASCII characters (character between space (0x20) and tilde (0x7E)) are
-bound to self-insert so you can simply type the character and it will
-insert itself (actually overstrike) into the buffer.
+Self-inserting characters are bound to `hexl-self-insert-command' so you
+can simply type the character and it will insert itself (actually
+overstrike) into the buffer.  However, inserting non-ASCII characters
+requires caution: the buffer's coding-system should correspond to
+the encoding on disk, and multibyte characters should be inserted
+with cursor on the first byte of a multibyte sequence whose length
+is identical to the length of the multibyte sequence to be inserted,
+otherwise this could produce invalid multibyte sequences.  Non-ASCII
+characters in ISO-2022 encodings should preferably be inserted byte by
+byte, to avoid problems caused by the designation sequences before
+the actual characters.
 
 \\[hexl-quoted-insert] followed by another keystroke allows you to insert the key even if
 it isn't bound to self-insert.  An octal number can be supplied in place
 of another key to insert the octal number's ASCII representation.
 
-\\[hexl-insert-hex-char] will insert a given hexadecimal value (if it is between 0 and 0xFF)
-into the buffer at the current point.
+\\[hexl-insert-hex-char] will insert a given hexadecimal value
+into the buffer at the current address.
 
-\\[hexl-insert-octal-char] will insert a given octal value (if it is between 0 and 0377)
-into the buffer at the current point.
+\\[hexl-insert-octal-char] will insert a given octal value
+into the buffer at the current address.
 
-\\[hexl-insert-decimal-char] will insert a given decimal value (if it is between 0 and 255)
-into the buffer at the current point.
+\\[hexl-insert-decimal-char] will insert a given decimal value
+into the buffer at the current address.
 
 \\[hexl-mode-exit] will exit `hexl-mode'.
 
@@ -332,26 +340,16 @@ You can use \\[hexl-find-file] to visit a file in Hexl mode.
   (unless (eq major-mode 'hexl-mode)
     (let ((modified (buffer-modified-p))
          (inhibit-read-only t)
-         (original-point (- (point) (point-min))))
-      (and (eobp) (not (bobp))
-          (setq original-point (1- original-point)))
+          (point-offset (bufferpos-to-filepos (point) 'exact)))
       ;; If `hexl-mode' is invoked with an argument the buffer is assumed to
       ;; be in hexl format.
       (when (memq arg '(1 nil))
-       ;; If the buffer's EOL type is -dos, we need to account for
-       ;; extra CR characters added when hexlify-buffer writes the
-       ;; buffer to a file.
-        ;; FIXME: This doesn't take into account multibyte coding systems.
-       (when (eq (coding-system-eol-type buffer-file-coding-system) 1)
-          (setq original-point (+ (count-lines (point-min) (point))
-                                 original-point))
-         (or (bolp) (setq original-point (1- original-point))))
         (hexlify-buffer)
         (restore-buffer-modified-p modified))
       (setq hexl-max-address
             (+ (* (/ (1- (buffer-size)) (hexl-line-displen)) 16) 15))
       (condition-case nil
-         (hexl-goto-address original-point)
+         (hexl-goto-address point-offset)
        (error nil)))
 
     (let ((max-address hexl-max-address))
@@ -440,7 +438,8 @@ You can use \\[hexl-find-file] to visit a file in Hexl mode.
 (defun hexl-find-file (filename)
   "Edit file FILENAME as a binary file in hex dump format.
 Switch to a buffer visiting file FILENAME, creating one if none exists,
-and edit the file in `hexl-mode'."
+and edit the file in `hexl-mode'.  The buffer's coding-system will be
+no-conversion, unlike if you visit it normally and then invoke `hexl-mode'."
   (interactive
    (list
     (let ((completion-ignored-extensions nil))
@@ -478,17 +477,11 @@ With arg, don't unhexlify buffer."
   (if (or (eq arg 1) (not arg))
       (let ((modified (buffer-modified-p))
            (inhibit-read-only t)
-           (original-point (1+ (hexl-current-address))))
+            (point-offset (hexl-current-address)))
        (dehexlify-buffer)
        (remove-hook 'write-contents-functions #'hexl-save-buffer t)
        (restore-buffer-modified-p modified)
-       (goto-char original-point)
-       ;; Maybe adjust point for the removed CR characters.
-       (when (eq (coding-system-eol-type buffer-file-coding-system) 1)
-         (setq original-point (- original-point
-                                 (count-lines (point-min) (point))))
-         (or (bobp) (setq original-point (1+ original-point))))
-       (goto-char original-point)))
+       (goto-char (filepos-to-bufferpos point-offset 'exact))))
 
   (remove-hook 'change-major-mode-hook #'hexl-maybe-dehexlify-buffer t)
   (major-mode-restore))
@@ -499,11 +492,11 @@ Ask the user for confirmation."
   (if (y-or-n-p "Convert contents back to binary format? ")
       (let ((modified (buffer-modified-p))
            (inhibit-read-only t)
-           (original-point (1+ (hexl-current-address))))
+            (point-offset (hexl-current-address)))
        (dehexlify-buffer)
        (remove-hook 'write-contents-functions #'hexl-save-buffer t)
        (restore-buffer-modified-p modified)
-       (goto-char original-point))))
+       (goto-char (filepos-to-bufferpos point-offset 'exact)))))
 
 (defun hexl-current-address (&optional validate)
   "Return current hexl-address."
@@ -879,14 +872,27 @@ This discards the buffer's undo information."
   "Insert a possibly multibyte character CH NUM times.
 
 Non-ASCII characters are first encoded with `buffer-file-coding-system',
-and their encoded form is inserted byte by byte."
+and their encoded form is inserted byte by byte.  Note that if the
+hexl buffer was produced by `hexl-find-file', its coding-system
+is no-conversion.
+
+Inserting non-ASCII characters requires caution: the buffer's
+coding-system should correspond to the encoding on disk, and
+multibyte characters should be inserted with cursor on the first
+byte of a multibyte sequence whose length is identical to the
+length of the multibyte sequence to be inserted, otherwise this
+could produce invalid multibyte sequences.  Non-ASCII characters
+in ISO-2022 encodings should preferably be inserted byte by byte, to
+avoid problems caused by the designation sequences before the
+actual characters."
   (let ((charset (char-charset ch))
        (coding (if (or (null buffer-file-coding-system)
                        ;; coding-system-type equals t means undecided.
                        (eq (coding-system-type buffer-file-coding-system) t))
                    (default-value 'buffer-file-coding-system)
                  buffer-file-coding-system)))
-    (cond ((and (> ch 0) (< ch 256))
+    (cond ((and (>= ch 0) (< ch 256)
+                (coding-system-get coding :ascii-compatible-p))
           (hexl-insert-char ch num))
          ((eq charset 'unknown)
           (error
@@ -924,7 +930,19 @@ and their encoded form is inserted byte by byte."
 Interactively, with a numeric argument, insert this character that many times.
 
 Non-ASCII characters are first encoded with `buffer-file-coding-system',
-and their encoded form is inserted byte by byte."
+and their encoded form is inserted byte by byte.  Note that if the
+hexl buffer was produced by `hexl-find-file', its coding-system
+is no-conversion.
+
+Inserting non-ASCII characters requires caution: the buffer's
+coding-system should correspond to the encoding on disk, and
+multibyte characters should be inserted with cursor on the first
+byte of a multibyte sequence whose length is identical to the
+length of the multibyte sequence to be inserted, otherwise this
+could produce invalid multibyte sequences.  Non-ASCII characters
+in ISO-2022 encodings should preferably be inserted byte by byte, to
+avoid problems caused by the designation sequences before the
+actual characters."
   (interactive "p")
   (hexl-insert-multibyte-char last-command-event arg))
 
@@ -964,7 +982,21 @@ CH must be a unibyte character whose value is between 0 and 255."
 ;; hex conversion
 
 (defun hexl-insert-hex-char (arg)
-  "Insert a character given by its hexadecimal code ARG times at point."
+  "Insert a character given by its hexadecimal code ARG times at point.
+
+Values above 0xFF are treated as multibyte characters, and first encoded
+using `buffer-file-coding-system'.  Note that if the hexl buffer was
+produced by `hexl-find-file', its coding-system is no-conversion.
+
+Inserting non-ASCII characters requires caution: the buffer's
+coding-system should correspond to the encoding on disk, and
+multibyte characters should be inserted with cursor on the first
+byte of a multibyte sequence whose length is identical to the
+length of the multibyte sequence to be inserted, otherwise this
+could produce invalid multibyte sequences.  Non-ASCII characters
+in ISO-2022 encodings should preferably be inserted byte by byte, to
+avoid problems caused by the designation sequences before the
+actual characters."
   (interactive "p")
   (let ((num (hexl-hex-string-to-integer (read-string "Hex number: "))))
     (if (< num 0)
@@ -997,7 +1029,21 @@ Embedded whitespace, dashes, and periods in the string are ignored."
       (setq arg (- arg 1)))))
 
 (defun hexl-insert-decimal-char (arg)
-  "Insert a character given by its decimal code ARG times at point."
+  "Insert a character given by its decimal code ARG times at point.
+
+Values above 255 are treated as multibyte characters, and first encoded
+using `buffer-file-coding-system'.  Note that if the hexl buffer was
+produced by `hexl-find-file', its coding-system is no-conversion.
+
+Inserting non-ASCII characters requires caution: the buffer's
+coding-system should correspond to the encoding on disk, and
+multibyte characters should be inserted with cursor on the first
+byte of a multibyte sequence whose length is identical to the
+length of the multibyte sequence to be inserted, otherwise this
+could produce invalid multibyte sequences.  Non-ASCII characters
+in ISO-2022 encodings should preferably be inserted byte by byte, to
+avoid problems caused by the designation sequences before the
+actual characters."
   (interactive "p")
   (let ((num (string-to-number (read-string "Decimal Number: "))))
     (if (< num 0)
@@ -1005,7 +1051,21 @@ Embedded whitespace, dashes, and periods in the string are ignored."
       (hexl-insert-multibyte-char num arg))))
 
 (defun hexl-insert-octal-char (arg)
-  "Insert a character given by its octal code ARG times at point."
+  "Insert a character given by its octal code ARG times at point.
+
+Values above \\377 are treated as multibyte characters, and first encoded
+using `buffer-file-coding-system'.  Note that if the hexl buffer was
+produced by `hexl-find-file', its coding-system is no-conversion.
+
+Inserting non-ASCII characters requires caution: the buffer's
+coding-system should correspond to the encoding on disk, and
+multibyte characters should be inserted with cursor on the first
+byte of a multibyte sequence whose length is identical to the
+length of the multibyte sequence to be inserted, otherwise this
+could produce invalid multibyte sequences.  Non-ASCII characters
+in ISO-2022 encodings should preferably be inserted byte by byte, to
+avoid problems caused by the designation sequences before the
+actual characters."
   (interactive "p")
   (let ((num (hexl-octal-string-to-integer (read-string "Octal Number: "))))
     (if (< num 0)
diff --git a/lisp/international/mule-cmds.el b/lisp/international/mule-cmds.el
index b99db46..7f8d98b 100644
--- a/lisp/international/mule-cmds.el
+++ b/lisp/international/mule-cmds.el
@@ -2963,18 +2963,22 @@ STR should be a unibyte string."
    str " "))
 
 (defun encode-coding-char (char coding-system &optional charset)
-  "Encode CHAR by CODING-SYSTEM and return the resulting string.
+  "Encode CHAR by CODING-SYSTEM and return the resulting string of bytes.
 If CODING-SYSTEM can't safely encode CHAR, return nil.
 The 3rd optional argument CHARSET, if non-nil, is a charset preferred
 on encoding."
   (let* ((str1 (string char))
         (str2 (string char char))
         (found (find-coding-systems-string str1))
-       enc1 enc2 i1 i2)
-    (if (eq (car-safe found) 'undecided) ;Aka (not (multibyte-string-p str1))
-        ;; `char' is ASCII.
+         (bom-p (coding-system-get coding-system :bom))
+        enc1 enc2 i0 i1 i2)
+    ;; If CHAR is ASCII and CODING-SYSTEM doesn't prepend a BOM, just
+    ;; encode CHAR.
+    (if (and (eq (car-safe found) 'undecided)
+             (null bom-p))
        (encode-coding-string str1 coding-system)
-      (when (memq (coding-system-base coding-system) found)
+      (when (or (eq (car-safe found) 'undecided)
+                (memq (coding-system-base coding-system) found))
        ;; We must find the encoded string of CHAR.  But, just encoding
        ;; CHAR will put extra control sequences (usually to designate
        ;; ASCII charset) at the tail if type of CODING is ISO 2022.
@@ -2995,7 +2999,19 @@ on encoding."
        ;; Now (substring enc1 i1) and (substring enc2 i2) are the same,
        ;; and they are the extra control sequences at the tail to
        ;; exclude.
-       (substring enc2 0 i2)))))
+
+        ;; We also need to exclude the leading 2 or 3 bytes if they
+        ;; come from a BOM.
+        (setq i0
+              (if bom-p
+                  (cond
+                   ((eq (coding-system-type coding-system) 'utf-8)
+                    3)
+                   ((eq (coding-system-type coding-system) 'utf-16)
+                    2)
+                   (t 0))
+                0))
+       (substring enc2 i0 i2)))))
 
 ;; Backwards compatibility.  These might be better with :init-value t,
 ;; but that breaks loadup.



reply via email to

[Prev in Thread] Current Thread [Next in Thread]