[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[elpa] externals/pyim 5289127408 3/3: pyim-cstring-to-pinyin: use dcache
From: ELPA Syncer
Subject: [elpa] externals/pyim 5289127408 3/3: pyim-cstring-to-pinyin: use dcache when possible.
Date: Mon, 16 Jan 2023 02:58:10 -0500 (EST)
branch: externals/pyim
commit 5289127408140f6fec442e61b8e3c3134a448ccb
Author: Feng Shu <tumashu@163.com>
Commit: Feng Shu <tumashu@163.com>
pyim-cstring-to-pinyin: use dcache when possible.
---
pyim-cstring.el | 49 +++++++++++++++++++++++++++++++++----------------
pyim-dhashcache.el | 50 ++++++++++++++++++++++++++++++++++----------------
2 files changed, 67 insertions(+), 32 deletions(-)
diff --git a/pyim-cstring.el b/pyim-cstring.el
index a0cb3b2c5c..5f8c27cb85 100644
--- a/pyim-cstring.el
+++ b/pyim-cstring.el
@@ -109,22 +109,10 @@ t, 遇到多音字时,只使用第一个拼音,其它拼音忽略。
BUG: 当 STRING 中包含其它标点符号,并且设置 SEPERATER 时,结果会
包含多余的连接符:比如: \"你=好\" --> \"ni-=-hao\""
(if (not (pyim-string-match-p "\\cc" string))
- (if return-list
- (list string)
- string)
- (let* ((string-parts (pyim-cstring--partition string t))
- (pinyins-list
- ;; ("Hello" "银" "行") -> (("Hello") ("yin") ("hang" "xing"))
- (mapcar (lambda (str)
- (if (pyim-string-match-p "\\cc" str)
- (pyim-pymap-cchar2py-get str)
- (list str)))
- string-parts))
- ;; 通过排列组合的方式, 重排 pinyins-list。
- ;; 比如:(("Hello") ("yin") ("hang")) -> (("Hello" "yin" "hang"))
- (pinyins-list (pyim-permutate-list
- (pyim-cstring--adjust-duoyinzi
- string-parts pinyins-list)))
+ (if return-list (list string) string)
+ (let* ((pinyins-list
+ (or (pyim-cstring-to-pinyin--from-dcache string)
+ (pyim-cstring-to-pinyin--from-pymap string)))
(list (mapcar (lambda (x)
(mapconcat (lambda (str)
(if shou-zi-mu
@@ -139,6 +127,35 @@ BUG: 当 STRING 中包含其它标点符号,并且设置 SEPERATER 时,结
list
(string-join list " ")))))
+(defun pyim-cstring-to-pinyin--from-dcache (cstring)
+  "Look up the pinyin of CSTRING in the dcache.
+
+CSTRING is split with `pyim-cstring--partition'.  Every part that
+contains a Chinese character is looked up in the word2code dcache,
+and the first pinyin code found is split on \"-\" into syllables;
+every other part is kept verbatim.
+
+Return a one-element list holding the flattened list of strings,
+or nil when at least one Chinese part has no pinyin code in the
+dcache, so that the caller can fall back to another lookup."
+  (let* ((string-parts (pyim-cstring--partition cstring))
+         (pinyins-list
+          (mapcar
+           (lambda (str)
+             (if (pyim-string-match-p "\\cc" str)
+                 ;; The word2code table may also hold codes of
+                 ;; shape-based schemes (wubi, cangjie, ...), which are
+                 ;; recognizable by the "/" of their code-prefix.  Skip
+                 ;; them: blindly taking the first code could return
+                 ;; something like "wubi/aaaa" as if it were a pinyin.
+                 (when-let ((code (car (delq
+                                        nil
+                                        (mapcar
+                                         (lambda (c)
+                                           (unless (pyim-string-match-p "/" c)
+                                             c))
+                                         (pyim-dcache-get str '(word2code)))))))
+                   (split-string code "-"))
+               (list str)))
+           string-parts)))
+    ;; A nil element means that one lookup failed; report failure for
+    ;; the whole string instead of returning a partial result.
+    (unless (member nil pinyins-list)
+      (list (apply #'append pinyins-list)))))
+
+(defun pyim-cstring-to-pinyin--from-pymap (cstring)
+  "Look up the pinyin of CSTRING with the tools provided by pymap.
+
+CSTRING is partitioned with `pyim-cstring--partition'; each part
+containing a Chinese character is mapped to its candidate pinyins
+with `pyim-pymap-cchar2py-get', other parts are kept verbatim.  The
+candidates are adjusted with `pyim-cstring--adjust-duoyinzi' and
+then combined with `pyim-permutate-list', whose value is returned."
+  (let ((parts (pyim-cstring--partition cstring t))
+        candidates)
+    ;; Collect the candidates of every part, for example:
+    ;; ("Hello" "银" "行") -> (("Hello") ("yin") ("hang" "xing"))
+    (dolist (part parts)
+      (push (if (pyim-string-match-p "\\cc" part)
+                (pyim-pymap-cchar2py-get part)
+              (list part))
+            candidates))
+    ;; Rearrange the candidates by permutation, for example:
+    ;; (("Hello") ("yin") ("hang")) -> (("Hello" "yin" "hang"))
+    (pyim-permutate-list
+     (pyim-cstring--adjust-duoyinzi parts (nreverse candidates)))))
+
(defun pyim-cstring--adjust-duoyinzi (string-parts pinyins-list)
"根据 STRING-PARTS 对 PINYINS-LIST 进行校正。
diff --git a/pyim-dhashcache.el b/pyim-dhashcache.el
index 7f4b463bba..bd216ab42e 100644
--- a/pyim-dhashcache.el
+++ b/pyim-dhashcache.el
@@ -40,6 +40,7 @@
(require 'pyim-dcache)
(require 'pyim-dict)
(require 'pyim-scheme)
+(require 'pyim-pymap)
(require 'sort)
(defvar pyim-dhashcache--count-types
@@ -426,25 +427,42 @@ DCACHE 是一个 code -> words 的 hashtable.
(let ((hashtable (make-hash-table :size 1000000 :test #'equal)))
(maphash
(lambda (code words)
- ;; 这里主要考虑五笔仓颉等形码输入法,也就是 code-prefix 中包含 "/" 的输
- ;; 入法,全拼输入法反查功能主要使用 pymap 实现,不使用这个表。
- (when (pyim-string-match-p "/" code)
+ (if (pyim-string-match-p "/" code)
+ ;; 这里主要考虑五笔仓颉等形码输入法,也就是 code-prefix 中包含 "/"
+ ;; 的输入法,
+ (dolist (word words)
+ (let ((value (gethash word hashtable))
+ ;; NOTE: 这里使用 `cl-copy-seq', 可以让保存的文件内容类似:
+ ;;
+ ;; "呵" ("he" "a")
+ ;;
+ ;; 而不是:
+ ;;
+ ;; "呵" (#9="he" #2#)
+ ;;
+ (code (cl-copy-seq code)))
+ (puthash word
+ (if value
+ `(,code ,@value)
+ (list code))
+ hashtable)))
+ ;; 使用拼音输入法时,构建词条到拼音的哈希表非常消耗内存,在这里只处理
+ ;; 包含多音字的词条(2-4个字),测试发现,生成的哈希表也不小,大约是
+ ;; code2word 的 1/4.
+ ;;
+ ;; 除了包含多音字的 2-4 字词条,其余词条的拼音反查功能主要使用 pymap
+ ;; 实现,不使用这个表。
(dolist (word words)
(let ((value (gethash word hashtable))
- ;; NOTE: 这里使用 `cl-copy-seq', 可以让保存的文件内容类似:
- ;;
- ;; "呵" ("he" "a")
- ;;
- ;; 而不是:
- ;;
- ;; "呵" (#9="he" #2#)
- ;;
(code (cl-copy-seq code)))
- (puthash word
- (if value
- `(,code ,@value)
- (list code))
- hashtable)))))
+ (when (and (> (length word) 1)
+ (< (length word) 5)
+ (pyim-pymap-duoyinzi-include-p word))
+ (puthash word
+ (if value
+ `(,code ,@value)
+ (list code))
+ hashtable))))))
dcache)
(pyim-dcache-save-value-to-file hashtable file))))