[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[elpa] externals/doc-toc 559e7bbf3d 13/84: Implement extract with tesseract ocr
From: |
ELPA Syncer |
Subject: |
[elpa] externals/doc-toc 559e7bbf3d 13/84: Implement extract with tesseract ocr |
Date: |
Mon, 26 Sep 2022 13:58:34 -0400 (EDT) |
branch: externals/doc-toc
commit 559e7bbf3d785133b4b26519f7b97af35cc7c0a3
Author: Daniel Nicolai <dalanicolai@gmail.com>
Commit: Daniel Nicolai <dalanicolai@gmail.com>
Implement extract with tesseract ocr
---
toc-mode.el | 39 ++++++++++++++++++++++++++++++++++-----
1 file changed, 34 insertions(+), 5 deletions(-)
diff --git a/toc-mode.el b/toc-mode.el
index 09947d9e2e..4d17187e0a 100644
--- a/toc-mode.el
+++ b/toc-mode.el
@@ -50,6 +50,13 @@ For DJVU the old DJVU file is replaced by default"
(forward-line 1))
)
+(defun toc-cleanup-dots-ocr ()
+  "Collapse OCR'd dot/dash leaders before trailing page numbers, buffer-wide."
+  (interactive)
+  (goto-char (point-min)) ; not `beginning-of-buffer': that command also pushes the mark
+  (while (re-search-forward "\\([0-9\\. \\-]*\\)\\( [0-9]* *\\)$" nil t)
+    (replace-match " \\2"))) ; keep only the trailing " <number>" group, prefixed by one space
+
(defun toc-cleanup-lines-contents-string (&optional arg)
(interactive "nEnter line number of entry 'Contents': ")
(when (called-interactively-p 'any)
@@ -79,13 +86,19 @@ For DJVU the old DJVU file is replaced by default"
(re-search-forward "[^0-9]\\s-*$" nil t)
(join-line 1))
-(defun toc-cleanup (startpage)
+(defun toc-join-next-overindexed-index ()
+  "Search forward for a line start of digits/dots followed by \". \"."
+  (interactive) ; NOTE(review): despite the name, nothing is joined here -- confirm intent
+  (re-search-forward "^[0-9\\.]*\\. " nil t))
+(defun toc-cleanup (startpage &optional arg)
(interactive)
(beginning-of-buffer)
(when (search-forward "contents" nil t)
(replace-match (format "Contents %s" startpage)))
(toc-cleanup-lines-contents-string)
- (toc-cleanup-dots)
+ (if arg
+ (toc-cleanup-dots-ocr)
+ (toc-cleanup-dots))
;; (toc-cleanup-lines-roman-string)
(toc-cleanup-blank-lines)
(toc-join-next-unnumbered-lines)
@@ -123,7 +136,6 @@ For DJVU the old DJVU file is replaced by default"
;;; toc extract
(defun document-extract-pages-text (startpage endpage)
- (interactive "nEnter start-pagenumber for extraction: \nnEnter
end-pagenumber for extraction: ")
(let* ((source-buffer (current-buffer))
(ext (url-file-extension (buffer-file-name (current-buffer))))
(shell-command (cond ((string= ".pdf" ext) "pdftotext -f %s -l %s
-layout %s -")
@@ -142,8 +154,7 @@ For DJVU the old DJVU file is replaced by default"
;; (kill-whole-line)
))
-
-(defun toc-extract-pages (startpage endpage &optional arg)
+(defun toc-extract-pages (startpage endpage arg)
"Extract text and cleanup text from table of contents.
Use with the universal argument (C-u) omits cleanup to get the unprocessed
text."
(interactive "nEnter start-pagenumber for extraction: \nnEnter
end-pagenumber for extraction: \nP")
@@ -151,6 +162,23 @@ Use with the universal argument (C-u) omits cleanup to get the unprocessed text.
(unless arg
(toc-cleanup startpage)))
+(defun toc-extract-pages-ocr (startpage endpage arg)
+  "OCR pages STARTPAGE..ENDPAGE with tesseract; prefix ARG skips cleanup."
+  (interactive "nEnter start-pagenumber for extraction: \nnEnter end-pagenumber for extraction: \nP")
+  (let ((source-buffer (current-buffer))
+        (ext (url-file-extension (buffer-file-name (current-buffer))))
+        (buffer (file-name-sans-extension (buffer-name))))
+    (dolist (page (number-sequence startpage endpage)) ; STARTPAGE stays intact for `toc-cleanup'
+      (let ((file (cond ((string= ".pdf" ext) (make-temp-file "pageimage" nil (number-to-string page) (pdf-cache-get-image page 600)))
+                        ((string= ".djvu" ext) (djvu-goto-page page) (make-temp-file "pageimage" nil (number-to-string page) (image-property djvu-doc-image :data))))))
+        (unwind-protect ; tesseract runs synchronously, so the page image can be removed afterwards
+            (call-process "tesseract" nil (list buffer nil) nil file "stdout" "--psm" "6")
+          (when file (delete-file file)))))
+    (switch-to-buffer buffer)
+    (toc-cleanup-mode) ;; required before setting local variable
+    (setq-local doc-buffer source-buffer)
+    (unless arg (toc-cleanup startpage t))))
+
(defun toc-extract-outline ()
(interactive)
(let* ((source-buffer (current-buffer))
@@ -458,6 +486,7 @@ Use with the universal argument (C-u) omits cleanup to get the unprocessed text.
;; (print (mapcar #'(lambda (region) (pdf-info-gettext page region))
(pdf-info-line-regions regions)))))
;; (mapcar '(lambda (region) (pdf-info-gettext page region)) regions)))
+
(provide 'toc-mode)
;;; document-outliner.el ends here
- [elpa] externals/doc-toc a0018b8a63 71/84: Add replace fields in toc-tabular-mode feature, (continued)
- [elpa] externals/doc-toc a0018b8a63 71/84: Add replace fields in toc-tabular-mode feature, ELPA Syncer, 2022/09/26
- [elpa] externals/doc-toc f2ea7db226 68/84: Fix all checkdoc warnings, ELPA Syncer, 2022/09/26
- [elpa] externals/doc-toc 487177a81b 73/84: Update package description, ELPA Syncer, 2022/09/26
- [elpa] externals/doc-toc 36fe728f38 72/84: Make tesseract psm code configurable via universal arg, ELPA Syncer, 2022/09/26
- [elpa] externals/doc-toc d5629c7165 75/84: Update djvu ocr to djvu3.el (create ppm instead of svg), ELPA Syncer, 2022/09/26
- [elpa] externals/doc-toc a259f4cc65 77/84: Correct docstring position, ELPA Syncer, 2022/09/26
- [elpa] externals/doc-toc 0671d11654 78/84: Merge pull request #8 from syohex/docstring, ELPA Syncer, 2022/09/26
- [elpa] externals/doc-toc cf6b12ca35 79/84: Fix #10: prevent pdf-filename become nil, ELPA Syncer, 2022/09/26
- [elpa] externals/doc-toc b4bb748aa3 84/84: Fix README (remove repeated section), ELPA Syncer, 2022/09/26
- [elpa] externals/doc-toc 0c9a7a484f 09/84: Update README with link to djvulibre website, ELPA Syncer, 2022/09/26
- [elpa] externals/doc-toc 559e7bbf3d 13/84: Implement extract with tesseract ocr,
ELPA Syncer <=
- [elpa] externals/doc-toc 97c0ce5c40 19/84: Implement from tabular jump to/scroll page for djvu, ELPA Syncer, 2022/09/26
- [elpa] externals/doc-toc 2b6045b0a3 22/84: minor additions i.e. autoload comment and docstrings, ELPA Syncer, 2022/09/26
- [elpa] externals/doc-toc dda3491299 34/84: Fix djvu parse tablist to djvused algorithm, ELPA Syncer, 2022/09/26
- [elpa] externals/doc-toc e84c00c923 45/84: Fix/remove ask for path before add TOC djvu, ELPA Syncer, 2022/09/26
- [elpa] externals/doc-toc eb8c6a0c00 56/84: Fix seq and cl compiler warnings, ELPA Syncer, 2022/09/26
- [elpa] externals/doc-toc 05b6d034aa 59/84: Fix djvu/pdf hard dependency (github issue #3), ELPA Syncer, 2022/09/26
- [elpa] externals/doc-toc 8d4817a543 62/84: Fix org-noter conflict (github #4), ELPA Syncer, 2022/09/26
- [elpa] externals/doc-toc a14dbb4c25 67/84: Fix beginning-of-buffer compilation warning, ELPA Syncer, 2022/09/26
- [elpa] externals/doc-toc 102e14199c 65/84: pdf.tocgen replace original pdf, and show message on fail, ELPA Syncer, 2022/09/26
- [elpa] externals/doc-toc 4c9ce0f54d 76/84: Fix: eval-when-compile pdf-tools, ELPA Syncer, 2022/09/26