emacs-elpa-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[elpa] externals/pyim 995b412: 优化 cregexp-build 相关功能


From: ELPA Syncer
Subject: [elpa] externals/pyim 995b412: 优化 cregexp-build 相关功能
Date: Thu, 9 Dec 2021 00:57:31 -0500 (EST)

branch: externals/pyim
commit 995b412ad30f9273bab46e89d38f042776b3a672
Author: Feng Shu <tumashu@163.com>
Commit: Feng Shu <tumashu@163.com>

    优化 cregexp-build 相关功能
    
        * tests/pyim-tests.el (pyim-test-pyim-cregexp): test char-level > 3.
    
        * pyim-cregexp.el (pyim-cregexp-char-level-num): Added.
        (pyim-cregexp-build): Simplify.
        (pyim-cregexp-build-1, pyim-cregexp-build:quanpin): Use pyim-cregexp-char-level-num.
---
 pyim-cregexp.el     | 71 +++++++++++++++++++++++++++--------------------------
 tests/pyim-tests.el |  6 +++++
 2 files changed, 42 insertions(+), 35 deletions(-)

diff --git a/pyim-cregexp.el b/pyim-cregexp.el
index b79e779b..6d4e85c 100644
--- a/pyim-cregexp.el
+++ b/pyim-cregexp.el
@@ -43,35 +43,40 @@
 将使用这个 scheme."
   :type 'symbol)
 
+(defun pyim-cregexp-char-level-num (num)
+  "根据 NUM 返回一个有效的常用汉字级别。"
+  (if (numberp num)
+      (max (min num 4) 1)
+    4))
+
 (defun pyim-cregexp-build (string &optional char-level-num)
   "根据 STRING 构建一个中文 regexp, 用于 \"拼音搜索汉字\".
 
 比如:\"nihao\" -> \"[你呢...][好号...] \\| nihao\"
 
-注意事项:这个函数生成的 regexp 只支持常用的汉字(大概8000左右),
-生僻汉字是不支持的,因为添加生僻字后这会导致生成的 regexp 长度超
-出 Emacs 可处理范围。"
+注意事项:如果生成的 regexp 太长,Emacs 无法处理,那么,这个命令
+会抛弃一些不常用的汉字,重新生成,知道生成一个 Emacs 可以处理的
+regexp, 所以搜索单字的时候一般可以搜到生僻字,但搜索句子的时候,
+就无法搜索生僻字了。"
   ;; NOTE: (rx-to-string "") will return "\\(?:\\)",
   ;; While I want (pyim-cregexp-build "") return just "".
   (if (equal string "")
       string
-    (let* ((char-level-num (or char-level-num 3))
-           (rx-string
-            (if (= char-level-num 0)
-                string
-              (ignore-errors
-                (rx-to-string
-                 (pyim-cregexp-build-from-rx
-                  (lambda (x)
-                    (if (stringp x)
-                        (xr (pyim-cregexp-build-1 x char-level-num))
-                      x))
-                  (xr string)))))))
-      (if (and rx-string (stringp rx-string))
-          (if (pyim-cregexp-valid-p rx-string)
-              rx-string
-            (pyim-cregexp-build string (- char-level-num 1)))
-        string))))
+    (let ((num (pyim-cregexp-char-level-num char-level-num))
+          rx-string)
+      (while (not (pyim-cregexp-valid-p rx-string))
+        (setq rx-string
+              (or (ignore-errors
+                    (rx-to-string
+                     (pyim-cregexp-build-from-rx
+                      (lambda (x)
+                        (if (stringp x)
+                            (xr (pyim-cregexp-build-1 x num))
+                          x))
+                      (xr string))))
+                  string))
+        (setq num (1- num)))
+      rx-string)))
 
 (defun pyim-cregexp-valid-p (cregexp)
   "Return t when cregexp is a valid regexp."
@@ -94,19 +99,15 @@
     (_ (funcall fn rx-form))))
 
 (defun pyim-cregexp-build-1 (str &optional char-level-num)
-  (let* ((scheme-name (pyim-scheme-name))
+  (let* ((num (pyim-cregexp-char-level-num char-level-num))
+         (scheme-name (pyim-scheme-name))
          (class (pyim-scheme-get-option scheme-name :class))
          (code-prefix (pyim-scheme-get-option scheme-name :code-prefix))
          (sep "#####&&&&#####")
          (lst (remove "" (split-string
                           (replace-regexp-in-string
                            "\\([a-z]+'*\\)" (concat sep "\\1" sep) str)
-                          sep)))
-         (char-level-num
-          (cond
-           ((and char-level-num (> char-level-num 3)) 3)
-           ((and char-level-num (< char-level-num 1)) 1)
-           (t char-level-num))))
+                          sep))))
     ;; 确保 pyim 词库加载
     (pyim-dcache-init-variables)
     ;; pyim 暂时只支持全拼和双拼搜索
@@ -124,7 +125,7 @@
                   (lambda (imobj)
                     (if (eq class 'xingma)
                         (pyim-cregexp-build:xingma imobj nil nil nil code-prefix)
-                      (pyim-cregexp-build:quanpin imobj nil nil nil char-level-num)))
+                      (pyim-cregexp-build:quanpin imobj nil nil nil num)))
                   imobjs))
                 (regexp
                  (when regexp-list
@@ -143,10 +144,10 @@
 (defun pyim-cregexp-build:quanpin (imobj &optional match-beginning
                                          first-equal all-equal char-level-num)
   "从 IMOBJ 创建一个搜索中文的 regexp."
-  (let* ((imobj
-          (mapcar (lambda (x)
-                    (concat (nth 0 x) (nth 1 x)))
-                  imobj))
+  (let* ((num (pyim-cregexp-char-level-num char-level-num))
+         (imobj (mapcar (lambda (x)
+                          (concat (nth 0 x) (nth 1 x)))
+                        imobj))
          (cchar-list
           (let ((n 0) results)
             (dolist (py imobj)
@@ -154,11 +155,11 @@
                       (or all-equal
                           (and first-equal (= n 0))))
                      (cchars
-                      ;; 只取常用字,不常用的汉字忽略,防止生成的
-                      ;; regexp 太长而无法搜索
                       (mapconcat (lambda (x)
                                    (mapconcat #'identity
-                                              (cl-subseq (split-string x "|") 0 char-level-num)
+                                              (let* ((list (split-string x "|"))
+                                                     (length (length list)))
+                                                (cl-subseq list 0 (min num length)))
                                               ""))
                                  (pyim-pymap-py2cchar-get py equal-match nil t) "")))
                 (push cchars results))
diff --git a/tests/pyim-tests.el b/tests/pyim-tests.el
index 60880ee..2618d0d 100644
--- a/tests/pyim-tests.el
+++ b/tests/pyim-tests.el
@@ -311,6 +311,12 @@
     (should (string-match-p regexp "你好"))
     (should (string-match-p regexp "哈哈你好吗")))
 
+  (let ((regexp (pyim-cregexp-build "beng")))
+    (should (string-match-p regexp "痭"))
+    (should (string-match-p regexp "泵"))
+    (should (string-match-p regexp "堋"))
+    (should (string-match-p regexp "洴")))
+
   (let ((regexp (pyim-cregexp-build "ni.*ma")))
     (should (string-match-p regexp "nihaoma"))
     (should (string-match-p regexp "nima"))



reply via email to

[Prev in Thread] Current Thread [Next in Thread]