emacs-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Emacs-diffs] master da75804: rx: fix `or' ordering by adding argument t


From: Mattias Engdegård
Subject: [Emacs-diffs] master da75804: rx: fix `or' ordering by adding argument to regexp-opt
Date: Sat, 2 Mar 2019 09:36:10 -0500 (EST)

branch: master
commit da758046da74e33273265cd2e72a8aa1a0c9c7e3
Author: Mattias Engdegård <address@hidden>
Commit: Mattias Engdegård <address@hidden>

    rx: fix `or' ordering by adding argument to regexp-opt
    
    The rx `or' form may reorder its arguments in an unpredictable way,
    contrary to user expectation, since it sometimes uses `regexp-opt'.
    Add a NOREORDER option to `regexp-opt' for preventing it from
    producing a reordered regexp (Bug#34641).
    
    * doc/lispref/searching.texi (Regular Expression Functions):
    * etc/NEWS (Lisp Changes in Emacs 27.1):
    Describe the new regexp-opt NOREORDER argument.
    * lisp/emacs-lisp/regexp-opt.el (regexp-opt): Add NOREORDER.
    Make no attempt at regexp improvement if the set of strings contains
    a prefix of another string.
    (regexp-opt--contains-prefix): New.
    * lisp/emacs-lisp/rx.el (rx-or): Call regexp-opt with NOREORDER.
    * test/lisp/emacs-lisp/rx-tests.el: Test rx `or' form match order.
---
 doc/lispref/searching.texi       | 13 ++++++++++---
 etc/NEWS                         |  7 +++++++
 lisp/emacs-lisp/regexp-opt.el    | 38 ++++++++++++++++++++++++++++++++++----
 lisp/emacs-lisp/rx.el            |  2 +-
 test/lisp/emacs-lisp/rx-tests.el | 13 +++++++++++++
 5 files changed, 65 insertions(+), 8 deletions(-)

diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi
index cfbd244..fb7f484 100644
--- a/doc/lispref/searching.texi
+++ b/doc/lispref/searching.texi
@@ -950,7 +950,7 @@ whitespace:
 @end defun
 
 @cindex optimize regexp
address@hidden regexp-opt strings &optional paren
address@hidden regexp-opt strings &optional paren noreorder
 This function returns an efficient regular expression that will match
 any of the strings in the list @var{strings}.  This is useful when you
 need to make matching or searching as fast as possible---for example,
@@ -985,8 +985,15 @@ if it is necessary to ensure that a postfix operator 
appended to
 it will apply to the whole expression.
 @end table
 
-The resulting regexp of @code{regexp-opt} is equivalent to but usually
-more efficient than that of a simplified version:
+The optional argument @var{noreorder}, if @code{nil} or omitted,
+allows the returned regexp to match the strings in any order.  If
address@hidden, the match is guaranteed to be performed in the order
+given, as if the strings were made into a regexp by joining them with
+the @samp{\|} operator.
+
+Up to reordering, the resulting regexp of @code{regexp-opt} is
+equivalent to but usually more efficient than that of a simplified
+version:
 
 @example
 (defun simplified-regexp-opt (strings &optional paren)
diff --git a/etc/NEWS b/etc/NEWS
index 29ed7ab..7c95988 100644
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -1642,6 +1642,13 @@ MS-Windows.
 ** New module environment function 'process_input' to process user
 input while module code is running.
 
++++
+** The function 'regexp-opt' accepts an additional optional argument.
+By default, the regexp returned by 'regexp-opt' may match the strings
+in any order.  If the new third argument is non-nil, the match is
+guaranteed to be performed in the order given, as if the strings were
+made into a regexp by joining them with '\|'.
+
 
 * Changes in Emacs 27.1 on Non-Free Operating Systems
 
diff --git a/lisp/emacs-lisp/regexp-opt.el b/lisp/emacs-lisp/regexp-opt.el
index 63786c1..d0c5f2d 100644
--- a/lisp/emacs-lisp/regexp-opt.el
+++ b/lisp/emacs-lisp/regexp-opt.el
@@ -84,7 +84,7 @@
 ;;; Code:
 
 ;;;###autoload
-(defun regexp-opt (strings &optional paren)
+(defun regexp-opt (strings &optional paren noreorder)
   "Return a regexp to match a string in the list STRINGS.
 Each string should be unique in STRINGS and should not contain
 any regexps, quoted or not.  Optional PAREN specifies how the
@@ -111,8 +111,14 @@ nil
     necessary to ensure that a postfix operator appended to it will
     apply to the whole expression.
 
-The resulting regexp is equivalent to but usually more efficient
-than that of a simplified version:
+The optional argument NOREORDER, if nil or omitted, allows the
+returned regexp to match the strings in any order.  If non-nil,
+the match is guaranteed to be performed in the order given, as if
+the strings were made into a regexp by joining them with the
+`\\|' operator.
+
+Up to reordering, the resulting regexp is equivalent to but
+usually more efficient than that of a simplified version:
 
  (defun simplified-regexp-opt (strings &optional paren)
    (let ((parens
@@ -133,7 +139,15 @@ than that of a simplified version:
           (open (cond ((stringp paren) paren) (paren "\\(")))
           (sorted-strings (delete-dups
                            (sort (copy-sequence strings) 'string-lessp)))
-          (re (regexp-opt-group sorted-strings (or open t) (not open))))
+          (re
+            ;; If NOREORDER is non-nil and the list contains a prefix
+            ;; of another string, we give up all attempts at optimisation.
+            ;; There is plenty of room for improvement (Bug#34641).
+            (if (and noreorder (regexp-opt--contains-prefix sorted-strings))
+                (concat (or open "\\(?:")
+                        (mapconcat #'regexp-quote strings "\\|")
+                        "\\)")
+              (regexp-opt-group sorted-strings (or open t) (not open)))))
       (cond ((eq paren 'words)
             (concat "\\<" re "\\>"))
            ((eq paren 'symbols)
@@ -313,6 +327,22 @@ CHARS should be a list of characters."
           (concat "[" dash caret "]"))
       (concat "[" bracket charset caret dash "]"))))
 
+
+(defun regexp-opt--contains-prefix (strings)
+  "Whether STRINGS contains a proper prefix of one of its other elements.
+STRINGS must be a list of sorted strings without duplicates."
+  (let ((s strings))
+    ;; In a lexicographically sorted list, a string always immediately
+    ;; succeeds one of its prefixes.
+    (while (and (cdr s)
+                (not (string-equal
+                      (car s)
+                      (substring (cadr s) 0 (min (length (car s))
+                                                 (length (cadr s)))))))
+      (setq s (cdr s)))
+    (cdr s)))
+
+
 (provide 'regexp-opt)
 
 ;;; regexp-opt.el ends here
diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el
index 715cd60..ca756ef 100644
--- a/lisp/emacs-lisp/rx.el
+++ b/lisp/emacs-lisp/rx.el
@@ -393,7 +393,7 @@ FORM is of the form `(and FORM1 ...)'."
   (rx-group-if
    (if (memq nil (mapcar 'stringp (cdr form)))
        (mapconcat (lambda (x) (rx-form x '|)) (cdr form) "\\|")
-     (regexp-opt (cdr form)))
+     (regexp-opt (cdr form) nil t))
    (and (memq rx-parent '(: * t)) rx-parent)))
 
 
diff --git a/test/lisp/emacs-lisp/rx-tests.el b/test/lisp/emacs-lisp/rx-tests.el
index e14feda..fa3d9b0 100644
--- a/test/lisp/emacs-lisp/rx-tests.el
+++ b/test/lisp/emacs-lisp/rx-tests.el
@@ -92,5 +92,18 @@
                          (*? "e") (+? "f") (\?? "g") (?? "h"))))
                  "a*b+c?d?e*?f+?g??h??")))
 
+(ert-deftest rx-or ()
+  ;; Test or-pattern reordering (Bug#34641).
+  (let ((s "abc"))
+    (should (equal (and (string-match (rx (or "abc" "ab" "a")) s)
+                        (match-string 0 s))
+                   "abc"))
+    (should (equal (and (string-match (rx (or "ab" "abc" "a")) s)
+                        (match-string 0 s))
+                   "ab"))
+    (should (equal (and (string-match (rx (or "a" "ab" "abc")) s)
+                        (match-string 0 s))
+                   "a"))))
+
 (provide 'rx-tests)
 ;; rx-tests.el ends here.



reply via email to

[Prev in Thread] Current Thread [Next in Thread]