emacs-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Emacs-diffs] emacs-25 5c59970 2/3: * lisp/character-fold.el: Add suppor


From: Artur Malabarba
Subject: [Emacs-diffs] emacs-25 5c59970 2/3: * lisp/character-fold.el: Add support for multi-char matches
Date: Sat, 28 Nov 2015 16:22:15 +0000

branch: emacs-25
commit 5c5997002b0b0aded744d5828158243fd546b3ec
Author: Artur Malabarba <address@hidden>
Commit: Artur Malabarba <address@hidden>

    * lisp/character-fold.el: Add support for multi-char matches
    
    (character-fold-table): Now has an extra-slot. This is a second
    char-table that holds multi-character matches.  See docstring for
    details.
    (character-fold-to-regexp): Can build branching regexps when a
    character's entry the extra slot of `character-fold-table' matches the
    characters that succeed it.
---
 lisp/character-fold.el                 |  173 +++++++++++++++++++++-----------
 test/automated/character-fold-tests.el |   45 +++++++--
 2 files changed, 149 insertions(+), 69 deletions(-)

diff --git a/lisp/character-fold.el b/lisp/character-fold.el
index 749d113..0086345 100644
--- a/lisp/character-fold.el
+++ b/lisp/character-fold.el
@@ -22,11 +22,15 @@
 
 ;;; Code:
 
+(eval-and-compile (put 'character-fold-table 'char-table-extra-slots 1))
 
 (defconst character-fold-table
   (eval-when-compile
     (let ((equiv (make-char-table 'character-fold-table))
+          (equiv-multi (make-char-table 'character-fold-table))
           (table (unicode-property-table-internal 'decomposition)))
+      (set-char-table-extra-slot equiv 0 equiv-multi)
+
       ;; Ensure the table is populated.
       (let ((func (char-table-extra-slot table 1)))
         (map-char-table (lambda (char v)
@@ -36,63 +40,61 @@
 
       ;; Compile a list of all complex characters that each simple
       ;; character should match.
+      ;; In summary this loop does 3 things:
+      ;; - A complex character might be allowed to match its decomp.
+      ;; - The decomp is allowed to match the complex character.
+      ;; - A single char of the decomp might be allowed to match the
+      ;;   character.
+      ;; Some examples in the comments below.
       (map-char-table
        (lambda (char decomp)
          (when (consp decomp)
-           (if (symbolp (car decomp))
-               ;; Discard a possible formatting tag.
-               (setq decomp (cdr decomp))
-             ;; If there's no formatting tag, ensure that char matches
-             ;; its decomp exactly.  This is because we want 'ä' to
-             ;; match 'ä', but we don't want '¹' to match '1'.
-             (aset equiv char
-                   (cons (apply #'string decomp)
-                         (aref equiv char))))
-           ;; Finally, figure out whether char has a simpler
-           ;; equivalent (char-aux). If so, ensure that char-aux
-           ;; matches char and maybe its decomp too.
-
            ;; Skip trivial cases like ?a decomposing to (?a).
-           (unless (or (and (eq char (car decomp))
-                            (not  (cdr decomp))))
-             (let ((dec-aux decomp)
-                   (fold-decomp t)
-                   char-aux found)
-               (while (and dec-aux (not found))
-                 (setq char-aux (pop dec-aux))
-                 ;; Is char-aux a number or letter, per unicode standard?
-                 (setq found (memq (get-char-code-property char-aux 
'general-category)
-                                   '(Lu Ll Lt Lm Lo Nd Nl No))))
-               (if found
-                   ;; Check if the decomp has more than one letter,
-                   ;; because then we don't want the first letter to
-                   ;; match the decomposition.  This is because we
-                   ;; want 'f' to match 'ff' but not 'ff'.
-                   (dolist (char-aux dec-aux)
-                     (when (and fold-decomp
-                                (memq (get-char-code-property char-aux 
'general-category)
-                                      '(Lu Ll Lt Lm Lo Nd Nl No)))
-                       (setq fold-decomp nil)))
-                 ;; If there's no number or letter on the
-                 ;; decomp, take the first character in it.
-                 (setq found (car-safe decomp)))
-               ;; Finally, we only fold multi-char decomp if at
-               ;; least one of the chars is non-spacing (combining).
-               (when fold-decomp
-                 (setq fold-decomp nil)
-                 (dolist (char-aux decomp)
-                   (when (and (not fold-decomp)
-                              (> (get-char-code-property char-aux 
'canonical-combining-class) 0))
-                     (setq fold-decomp t))))
-               ;; Add char to the list of characters that char-aux can
-               ;; represent. Also possibly add its decomp, so we can
-               ;; match multi-char representations like (format "a%c" 769)
-               (when (and found (not (eq char char-aux)))
-                 (let ((chars (cons (char-to-string char) (aref equiv 
char-aux))))
-                   (aset equiv char-aux
-                         (if fold-decomp
-                             (cons (apply #'string decomp) chars)
-                           chars))))))))
+           (unless (and (not (cdr decomp))
+                        (eq char (car decomp)))
+             (if (symbolp (car decomp))
+                 ;; Discard a possible formatting tag.
+                 (setq decomp (cdr decomp))
+               ;; If there's no formatting tag, ensure that char matches
+               ;; its decomp exactly.  This is because we want 'ä' to
+               ;; match 'ä', but we don't want '¹' to match '1'.
+               (aset equiv char
+                     (cons (apply #'string decomp)
+                           (aref equiv char))))
+
+             ;; Allow the entire decomp to match char.  If decomp has
+             ;; multiple characters, this is done by adding an entry
+             ;; to the alist of the first character in decomp.  This
+             ;; allows 'ff' to match 'ff', 'ä' to match 'ä', and '1' to
+             ;; match '¹'.
+             (let ((make-decomp-match-char
+                    (lambda (decomp char)
+                      (if (cdr decomp)
+                          (aset equiv-multi (car decomp)
+                                (cons (cons (apply #'string (cdr decomp))
+                                            (regexp-quote (string char)))
+                                      (aref equiv-multi (car decomp))))
+                        (aset equiv (car decomp)
+                              (cons (char-to-string char)
+                                    (aref equiv (car decomp))))))))
+               (funcall make-decomp-match-char decomp char)
+               ;; Do it again, without the non-spacing characters.
+               ;; This allows 'a' to match 'ä'.
+               (let ((simpler-decomp nil)
+                     (found-one nil))
+                 (dolist (c decomp)
+                   (if (> (get-char-code-property c 
'canonical-combining-class) 0)
+                       (setq found-one t)
+                     (push c simpler-decomp)))
+                 (when (and simpler-decomp found-one)
+                   (funcall make-decomp-match-char simpler-decomp char)
+                   ;; Finally, if the decomp only had one spacing
+                   ;; character, we allow this character to match the
+                   ;; decomp.  This is to let 'a' match 'ä'.
+                   (unless (cdr simpler-decomp)
+                     (aset equiv (car simpler-decomp)
+                           (cons (apply #'string decomp)
+                                 (aref equiv (car simpler-decomp)))))))))))
        table)
 
       ;; Add some manual entries.
@@ -112,7 +114,27 @@
              (aset equiv char re))))
        equiv)
       equiv))
-  "Used for folding characters of the same group during search.")
+  "Used for folding characters of the same group during search.
+This is a char-table with the `character-fold-table' subtype.
+
+Let us refer to the character in question by char-x.
+Each entry is either nil (meaning char-x only matches literally)
+or a regexp.  This regexp should match anything that char-x can
+match by itself \(including char-x).  For instance, the default
+regexp for the ?+ character is \"[+⁺₊﬩﹢+]\".
+
+This table also has one extra slot which is also a char-table.
+Each entry in the extra slot is an alist used for multi-character
+matching (which may be nil).  The elements of the alist should
+have the form (SUFFIX . OTHER-REGEXP).  If the characters after
+char-x are equal to SUFFIX, then this combination of char-x +
+SUFFIX is allowed to match OTHER-REGEXP.  This is in addition to
+char-x being allowed to match REGEXP.
+For instance, the default alist for ?f includes:
+    \((\"fl\" . \"ffl\") (\"fi\" . \"ffi\")
+     (\"i\" . \"fi\") (\"f\" . \"ff\"))
+
+Exceptionally for the space character (32), ALIST is ignored.")
 
 (defun character-fold--make-space-string (n)
   "Return a string that matches N spaces."
@@ -122,13 +144,17 @@
                  (make-list n (or (aref character-fold-table ?\s) " ")))))
 
 ;;;###autoload
-(defun character-fold-to-regexp (string &optional _lax)
+(defun character-fold-to-regexp (string &optional _lax from)
   "Return a regexp matching anything that character-folds into STRING.
 Any character in STRING that has an entry in
 `character-fold-table' is replaced with that entry (which is a
-regexp) and other characters are `regexp-quote'd."
+regexp) and other characters are `regexp-quote'd.
+
+FROM is for internal use.  It specifies an index in the STRING
+from which to start."
   (let ((spaces 0)
-        (i 0)
+        (multi-char-table (char-table-extra-slot character-fold-table 0))
+        (i (or from 0))
         (end (length string))
         (out nil))
     ;; When the user types a space, we want to match the table entry
@@ -145,9 +171,36 @@ regexp) and other characters are `regexp-quote'd."
         (c (when (> spaces 0)
              (push (character-fold--make-space-string spaces) out)
              (setq spaces 0))
-           (push (or (aref character-fold-table c)
-                     (regexp-quote (string c)))
-                 out)))
+           (let ((regexp (or (aref character-fold-table c)
+                             (regexp-quote (string c))))
+                 ;; Long string.  The regexp would probably be too long.
+                 (alist (unless (> end 60)
+                          (aref multi-char-table c))))
+             (push (let ((alist-out '("\\)")))
+                     (pcase-dolist (`(,suffix . ,out-regexp) alist)
+                       (let ((len-suf (length suffix)))
+                         (when (eq (compare-strings suffix 0 nil
+                                                    string (1+ i) (+ i 1 
len-suf)
+                                                    nil)
+                                   t)
+                           ;; FIXME: If N suffixes match, we "branch"
+                           ;; out into N+1 executions for the rest of
+                           ;; the string.  This involves redundant
+                           ;; work and makes a huge regexp.
+                           (push (concat "\\|" out-regexp
+                                         (character-fold-to-regexp
+                                          string nil (+ i 1 len-suf)))
+                                 alist-out))))
+                     ;; If no suffixes matched, just go on.
+                     (if (not (cdr alist-out))
+                         regexp
+                       ;; Otherwise, add a branch for the
+                       ;; no-suffix case, and stop the loop here.
+                       (prog1 (apply #'concat "\\(?:" regexp
+                                     (character-fold-to-regexp string nil (1+ 
i))
+                                     alist-out)
+                         (setq i end))))
+                   out))))
       (setq i (1+ i)))
     (when (> spaces 0)
       (push (character-fold--make-space-string spaces) out))
diff --git a/test/automated/character-fold-tests.el 
b/test/automated/character-fold-tests.el
index 40f0aec..aa2ee96 100644
--- a/test/automated/character-fold-tests.el
+++ b/test/automated/character-fold-tests.el
@@ -43,7 +43,7 @@
       (character-fold--test-search-with-contents w w))))
 
 (ert-deftest character-fold--test-lax-whitespace ()
-  (dotimes (n 100)
+  (dotimes (n 50)
     (let ((w1 (character-fold--random-word n))
           (w2 (character-fold--random-word n))
           (search-spaces-regexp "\\s-+"))
@@ -52,17 +52,44 @@
        (concat w1 " " w2))
       (character-fold--test-search-with-contents
        (concat w1 "\s\n\s\t\f\t\n\r\t" w2)
-       (concat w1 (make-string 90 ?\s) w2)))))
+       (concat w1 (make-string 10 ?\s) w2)))))
+
+(defun character-fold--test-match-exactly (string &rest strings-to-match)
+  (let ((re (concat "\\`" (character-fold-to-regexp string) "\\'")))
+    (dolist (it strings-to-match)
+      (should (string-match re it)))))
+
+(ert-deftest character-fold--test-some-defaults ()
+  (dolist (it '(("ffl" . "ffl") ("ffi" . "ffi")
+                ("fi" . "fi") ("ff" . "ff")
+                ("ä" . "ä")))
+    (character-fold--test-search-with-contents (cdr it) (car it))
+    (let ((multi (char-table-extra-slot character-fold-table 0))
+          (character-fold-table (make-char-table 'character-fold-table)))
+      (set-char-table-extra-slot character-fold-table 0 multi)
+      (character-fold--test-match-exactly (car it) (cdr it)))))
 
 (ert-deftest character-fold--test-fold-to-regexp ()
-  (let ((character-fold-table (make-char-table 'character-fold-table)))
-    (aset character-fold-table ?a "abc")
-    (aset character-fold-table ?1 "123")
+  (let ((character-fold-table (make-char-table 'character-fold-table))
+        (multi  (make-char-table 'character-fold-table)))
+    (set-char-table-extra-slot character-fold-table 0 multi)
+    (aset character-fold-table ?a "xx")
+    (aset character-fold-table ?1 "44")
     (aset character-fold-table ?\s "-!-")
-    (should (equal (character-fold-to-regexp "a1a1")
-                   "abc123abc123"))
-    (should (equal (character-fold-to-regexp "a1  a 1")
-                   "abc123\\(?:  \\|-!--!-\\)abc\\(?: \\|-!-\\)123"))))
+    (character-fold--test-match-exactly "a1a1" "xx44xx44")
+    (character-fold--test-match-exactly "a1  a 1" "xx44-!--!-xx-!-44")
+    (aset multi ?a '(("1" . "99")
+                     ("2" . "88")
+                     ("12" . "77")))
+    (character-fold--test-match-exactly "a" "xx")
+    (character-fold--test-match-exactly "a1" "xx44" "99")
+    (character-fold--test-match-exactly "a12" "77" "xx442" "992")
+    (character-fold--test-match-exactly "a2" "88")
+    (aset multi ?1 '(("2" . "yy")))
+    (character-fold--test-match-exactly "a1" "xx44" "99")
+    (character-fold--test-match-exactly "a12" "77" "xx442" "992")
+    (character-fold--test-match-exactly "a12" "xxyy")))
+
 
 (provide 'character-fold-tests)
 ;;; character-fold-tests.el ends here



reply via email to

[Prev in Thread] Current Thread [Next in Thread]