[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[elpa] externals/pyim 990bd9e678 1/2: use partition search algorithm in dregcache engine
From: |
ELPA Syncer |
Subject: |
[elpa] externals/pyim 990bd9e678 1/2: use partition search algorithm in dregcache engine |
Date: |
Mon, 30 May 2022 15:57:49 -0400 (EDT) |
branch: externals/pyim
commit 990bd9e6784617cf8f5705bf2bd70067bd088942
Author: Chen Bin <chenbin.sh@gmail.com>
Commit: Chen Bin <chenbin.sh@gmail.com>
use partition search algorithm in dregcache engine
---
pyim-dregcache.el | 91 ++++++++++++++++++++++-------------------------------
tests/pyim-tests.el | 41 ++++++++++++++++++++++--
2 files changed, 77 insertions(+), 55 deletions(-)
diff --git a/pyim-dregcache.el b/pyim-dregcache.el
index 14c90b16d5..c63f4f03b7 100644
--- a/pyim-dregcache.el
+++ b/pyim-dregcache.el
@@ -39,9 +39,6 @@
(require 'subr-x)
(require 'pyim-dcache)
-(defvar pyim-dregcache-partition-minimum-size 32
- "小于这个数值(单位为M)的词典不需要用分区算法加速.可节约一半的内存.")
-
(defvar pyim-dregcache-cache nil)
(defvar pyim-dregcache-icode2word nil)
(defvar pyim-dregcache-iword2count nil)
@@ -59,6 +56,11 @@
(plist-get x :file)))
`(,@pyim-dicts ,@pyim-extra-dicts)))
(dicts-md5 (pyim-dcache-create-files-md5 dict-files)))
+ (when pyim-debug
+ (message "pyim-dregcache-update: pyim-dicts=%s pyim-extra-dicts=%s dict-files=%s"
+ pyim-dicts
+ pyim-extra-dicts
+ dict-files))
(pyim-dregcache-update-code2word dict-files dicts-md5 force))))
(defun pyim-dregcache-variable-file (variable)
@@ -115,42 +117,26 @@
(defun pyim-dregcache-create-cache-content (raw-content)
"将 RAW-CONTENT 划分成可以更高效搜索的缓冲区."
- (let* (rlt)
- (cond
- ;; 小词库不用划分"子搜索区域".
- ;; `pyim-dregcache-partition-minimum-size'定义了小词库的最大值
- ((< (length raw-content) (* pyim-dregcache-partition-minimum-size 1024 1024))
- (setq rlt (list :content raw-content)))
- (t
- (let* ((chars "bcdefghjklmnopqrstwxyz")
- pattern
- (i 0)
- dict-splited
- (content-segments '())
- (start 0)
- end)
- ;; 将字典缓存划分成多个"子搜索区域"
- (while (< i (length chars))
- (setq pattern (concat "^" (string (elt chars i))))
- (setq end (string-match pattern raw-content start))
- (when end
- (setq content-segments
- `(,@content-segments ,(substring-no-properties raw-content start end)))
- (setq dict-splited t)
- ;; 将搜索起始点前移
- (setq start end))
- (setq i (1+ i)))
-
- (cond
- ;; attach segments
- (dict-splited
- ;; 将剩余的附后
- (setq content-segments
- `(,@content-segments ,(substring-no-properties raw-content start end)))
- (setq rlt (list :content content-segments)))
- (t
- (setq rlt (list :content raw-content)))))))
- rlt))
+ (let ((chars "bcdefghjklmnopqrstwxyz")
+ (i 0)
+ content-segments
+ (start (string-match "^a" raw-content))
+ chunk
+ end)
+ ;; 将字典缓存划分成多个"子搜索区域"
+ (while (< i (length chars))
+ (when (setq end (string-match (string ?^ (elt chars i))
+ raw-content
+ start))
+ (setq chunk (substring-no-properties raw-content start end))
+ (push chunk content-segments)
+ (setq start end))
+ (setq i (1+ i)))
+
+ ;; last chunk
+ (setq chunk (substring-no-properties raw-content end (length raw-content)))
+ (push chunk content-segments)
+ (list :content (nreverse content-segments))))
(defun pyim-dregcache-load-dictionary-file (dict-file)
"READ from DICT-FILE."
@@ -244,19 +230,17 @@ DICT-FILES 是词库文件列表. DICTS-MD5 是词库的MD5校验码.
(let* ((rlt (plist-get file-info :content))
idx
(ch (elt code 0)))
- (when (listp rlt)
- (cond
- ((<= ch ?h)
- (setq idx (- ch ?a)))
- ((<= ch ?t)
- ;; 'i' could not be first character of pinyin code
- (setq idx (- ch ?a 1)))
- (t
- ;; 'i', 'u', 'v' could not be first character of pinyin code
- (setq idx (- ch ?a 3))))
- ;; fetch segment using the first character of pinyin code
- (setq rlt (nth idx rlt)))
- rlt))
+ (cond
+ ((< ch ?i)
+ (setq idx (- ch ?a)))
+ ((< ch ?u)
+ ;; 'i' could not be first character of pinyin code
+ (setq idx (- ch ?a 1)))
+ (t
+ ;; 'i', 'u', 'v' could not be first character of pinyin code
+ (setq idx (- ch ?a 3))))
+ ;; fetch segment using the first character of pinyin code
+ (nth idx rlt)))
(defun pyim-dregcache-get-1 (content code)
(let ((case-fold-search t)
@@ -293,6 +277,7 @@ DICT-FILES 是词库文件列表. DICTS-MD5 是词库的MD5校验码.
nil)
(t (let ((dict-files (pyim-dregcache-all-dict-files))
result)
+
(when pyim-debug (message "pyim-dregcache-get is called. code=%s" code))
(when dict-files
(dolist (file dict-files)
@@ -466,7 +451,7 @@ update-icode2word 目前只要是用于更新型码输入法的 code-prefix, 所
(dolist (file dict-files)
(let* ((file-info (lax-plist-get pyim-dregcache-cache file))
(contents (lax-plist-get file-info :content)))
- (dolist (content (if (listp contents) contents (list contents)))
+ (dolist (content contents)
(setq code (pyim-dregcache-search-word-code-1 word content))
(when code (throw 'result (list code))))))))))
diff --git a/tests/pyim-tests.el b/tests/pyim-tests.el
index 3fe4b68708..b4f07ad6ac 100644
--- a/tests/pyim-tests.el
+++ b/tests/pyim-tests.el
@@ -967,23 +967,60 @@ yin-xing 因行
(ert-deftest pyim-tests-pyim-dregcache-backend ()
(let ((pyim-dcache-backend 'pyim-dregcache)
- words)
- (should (eq (length pyim-dregcache-cache) 0))
+ words
+ file-info
+ content)
+ (setq pyim-dregcache-cache nil)
+
;; load dictionary
(pyim-dcache-update t)
;; cache is filled
(should (> (length pyim-dregcache-cache) 0))
+ ;; get first dictionary cache
+ (setq file-info (lax-plist-get pyim-dregcache-cache
+ (car (pyim-dregcache-all-dict-files))))
+
+ (setq content (plist-get file-info :content))
+ (let ((i 0)
+ (chars "abcdefghjklmnopqrstwxyz"))
+ (should (eq (length content) (length chars)))
+ (while (< i (length chars))
+ (should (eq (elt chars i) (elt (nth i content) 0)))
+ (setq i (1+ i))))
+ (should (string= (pyim-dregcache-get-content "ai" file-info)
+ (pyim-dregcache-get-content "a" file-info)))
+ (should (string= (pyim-dregcache-get-content "ba" file-info)
+ (pyim-dregcache-get-content "b" file-info)))
+ (should (string= (pyim-dregcache-get-content "ze" file-info)
+ (pyim-dregcache-get-content "z" file-info)))
;; test dregcache api
+ (setq words (pyim-dcache-get "a"))
+ (should (eq (length words) 16))
+ (should (string= (nth 0 words) "阿"))
+
+ (setq words (pyim-dcache-get "za-cao"))
+ (should (eq (length words) 1))
+ (should (string= (nth 0 words) "杂草"))
+
+ (setq words (pyim-dcache-get "ba-shi-tian-huan-you-di-qiu"))
+ (should (eq (length words) 1))
+ (should (string= (nth 0 words) "八十天环游地球"))
+
(setq words (pyim-dcache-get "zun-bei"))
(should (eq (length words) 1))
(should (string= (nth 0 words) "尊卑"))
(setq words (pyim-dcache-get "zun"))
(should (string= (nth 0 words) "尊"))
+
+ (let* ((gc-cons-threshold most-positive-fixnum))
+ (message "search by code \"zun-yi\" takes %s seconds" (benchmark-run-compiled 1 (pyim-dcache-get "zun-yi"))))
+
;; `pyim-dregcache-get' calls `pyim-pymap-py2cchar-get' before return result
(should (eq (length words) 51))))
+
(ert-deftest pyim-tests-pyim-cloudim ()
(with-temp-buffer
(insert "HTTP/1.1 200 OK
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [elpa] externals/pyim 990bd9e678 1/2: use partition search algorithm in dregcache engine,
ELPA Syncer <=