emacs-elpa-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[elpa] externals/guess-language 30e23aa 001/101: Added initial proof of concept.


From: Stefan Monnier
Subject: [elpa] externals/guess-language 30e23aa 001/101: Added initial proof of concept.
Date: Sat, 23 Feb 2019 10:34:31 -0500 (EST)

branch: externals/guess-language
commit 30e23aaaac776393e65a944131316fa87c9c90ad
Author: Titus von der Malsburg <address@hidden>
Commit: Titus von der Malsburg <address@hidden>

    Added initial proof of concept.
---
 guess-language.el |  92 +++++++++++++++++
 trigrams/de       | 300 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 trigrams/en       | 300 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 692 insertions(+)

diff --git a/guess-language.el b/guess-language.el
new file mode 100644
index 0000000..304a375
--- /dev/null
+++ b/guess-language.el
@@ -0,0 +1,92 @@
+;;; guess-language.el --- Automatically detect human language
+
+;; Author: Titus von der Malsburg <address@hidden>
+;; Maintainer: Titus von der Malsburg <address@hidden>
+;; Version: 2.0.0
+;; Package-Requires: ((cl-lib "0.5"))
+
+;; This program is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation, either version 3 of the License, or
+;; (at your option) any later version.
+
+;; This program is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+;;; Commentary:
+
+;; Just a proof of concept at this time.  Only supports English and
+;; German but can easily be extended to handle other languages.
+
+;; The detection algorithm is based on counts of character
+;; trigrams.  The trigrams are copied from guess_language.py
+;; (https://github.com/kent37/guess-language).
+
+(require 'cl-lib)
+
+(defvar guess-language-languages '(en de)
+  "List of language symbols that can be detected.
+Each symbol names a trigram file that `guess-language-load-trigrams'
+reads.")
+
+(defvar guess-language-regexps nil
+  "Alist mapping language symbols to trigram regexps.
+Set by `guess-language-compile-regexps'.")
+
+(defun guess-language-load-trigrams ()
+  "Read the trigram lists for all `guess-language-languages'.
+Return an alist whose elements have the form (LANG . TRIGRAMS),
+where TRIGRAMS is the list of non-empty lines of the file for LANG.
+
+Each file is looked up as trigrams/LANG next to the installed
+guess-language library.  If that file is not readable, the name
+LANG is resolved against `default-directory' instead."
+  (let* ((library (locate-library "guess-language"))
+         (trigram-dir (expand-file-name
+                       "trigrams"
+                       (if library
+                           (file-name-directory library)
+                         default-directory))))
+    (cl-loop
+     for lang in guess-language-languages
+     for name = (symbol-name lang)
+     for bundled = (expand-file-name name trigram-dir)
+     ;; Fall back to the bare name so that setups which run from
+     ;; inside the trigram directory keep working.
+     for file = (if (file-readable-p bundled) bundled name)
+     for trigrams = (with-temp-buffer
+                      (insert-file-contents file)
+                      (split-string (buffer-string) "\n" t))
+     collect (cons lang trigrams))))
+
+(defun guess-language-compile-regexps ()
+  "Build one regexp per language from its trigram list.
+Store the result in `guess-language-regexps' as an alist of
+\(LANG . REGEXP) and return it.  REGEXP matches any one of the
+trigrams loaded by `guess-language-load-trigrams' for LANG."
+  (setq guess-language-regexps
+        (cl-loop
+         for (lang . trigrams) in (guess-language-load-trigrams)
+         ;; Quote each trigram so that characters such as "." or "*"
+         ;; are matched literally.  A plain alternation is enough;
+         ;; no capture groups are needed for counting matches.
+         collect (cons lang (mapconcat #'regexp-quote trigrams "\\|")))))
+
+(defun guess-language (beginning end)
+  "Guess the language of the text between BEGINNING and END.
+Count, for every entry of `guess-language-regexps', how many
+trigram matches occur in the region, print the resulting tally,
+and return the symbol of the language with the most matches.
+When several languages tie, the one listed first wins.  Return
+nil if no language regexps are available."
+  ;; Compile the regexps on first use instead of failing when
+  ;; `guess-language-compile-regexps' has not been called yet.
+  (unless (bound-and-true-p guess-language-regexps)
+    (guess-language-compile-regexps))
+  (let ((tally (cl-loop
+                for (lang . regexp) in guess-language-regexps
+                collect (cons lang (how-many regexp beginning end))))
+        best)
+    (print tally)
+    ;; Plain loop instead of dash's `--max-by', which this file never
+    ;; requires.  Strict `>' keeps the earliest entry on ties.
+    (dolist (entry tally)
+      (when (or (null best) (> (cdr entry) (cdr best)))
+        (setq best entry)))
+    (car best)))
+
+(defun guess-language-buffer ()
+  "Guess the language of the whole accessible buffer and print it.
+Return the guessed language symbol."
+  (interactive)
+  (let ((language (guess-language (point-min) (point-max))))
+    (print language)))
+
+(defun guess-language-paragraph ()
+  "Guess the language of the paragraph around point and print it.
+The paragraph bounds are located with `backward-paragraph' and
+`forward-paragraph', each inside `save-excursion'.  Return the
+guessed language symbol."
+  (interactive)
+  (let* ((start (save-excursion (backward-paragraph) (point)))
+         (stop (save-excursion (forward-paragraph) (point)))
+         (language (guess-language start stop)))
+    (print language)))
+
+(defun guess-language-region ()
+  "Guess the language of the text in the region and print it.
+Return the guessed language symbol."
+  (interactive)
+  (let ((start (region-beginning))
+        (stop (region-end)))
+    (print (guess-language start stop))))
+
+(defun guess-language-autoset ()
+  "Detect the language of the current paragraph and adapt to it.
+For English and German, switch the ispell dictionary and, when
+`typo-change-language' is available, the typo language.  Then
+run `flyspell-region' on the paragraph."
+  (interactive)
+  (let ((settings (pcase (guess-language-paragraph)
+                    ('en '("en" . "English"))
+                    ('de '("de" . "German")))))
+    (when settings
+      (ispell-change-dictionary (car settings))
+      ;; typo.el is an optional third-party package that this file
+      ;; never loads, so only call into it when it is present.
+      (when (fboundp 'typo-change-language)
+        (typo-change-language (cdr settings)))))
+  (flyspell-region (save-excursion (backward-paragraph) (point))
+                   (save-excursion (forward-paragraph) (point))))
+
+(provide 'guess-language)
+
+;; Local Variables:
+;; byte-compile-warnings: (not cl-functions obsolete)
+;; coding: utf-8
+;; indent-tabs-mode: nil
+;; End:
+
+;;; guess-language.el ends here
diff --git a/trigrams/de b/trigrams/de
new file mode 100644
index 0000000..ebbeca8
--- /dev/null
+++ b/trigrams/de
@@ -0,0 +1,300 @@
+en 
+er 
+ de
+der
+ie 
+ di
+die
+sch
+ein
+che
+ich
+den
+in 
+te 
+ch 
+ ei
+ung
+n d
+nd 
+ be
+ver
+es 
+ zu
+eit
+gen
+und
+ un
+ au
+ in
+cht
+it 
+ten
+ da
+ent
+ ve
+and
+ ge
+ine
+ mi
+r d
+hen
+ng 
+nde
+ vo
+e d
+ber
+men
+ei 
+mit
+ st
+ter
+ren
+t d
+ er
+ere
+n s
+ste
+ se
+e s
+ht 
+des
+ist
+ne 
+auf
+e a
+isc
+on 
+rte
+ re
+ we
+ges
+uch
+ fü
+ so
+bei
+e e
+nen
+r s
+ach
+für
+ier
+par
+ür 
+ ha
+as 
+ert
+ an
+ pa
+ sa
+ sp
+ wi
+for
+tag
+zu 
+das
+rei
+he 
+hre
+nte
+sen
+vor
+ sc
+ech
+etz
+hei
+lan
+n a
+pd 
+st 
+sta
+ese
+lic
+ ab
+ si
+gte
+ wa
+iti
+kei
+n e
+nge
+sei
+tra
+zen
+ im
+ la
+art
+im 
+lle
+n w
+rde
+rec
+set
+str
+tei
+tte
+ ni
+e p
+ehe
+ers
+g d
+nic
+von
+ al
+ pr
+an 
+aus
+erf
+r e
+tze
+tür
+uf 
+ag 
+als
+ar 
+chs
+end
+ge 
+ige
+ion
+ls 
+n m
+ngs
+nis
+nt 
+ord
+s s
+sse
+ tü
+ahl
+e b
+ede
+em 
+len
+n i
+orm
+pro
+rke
+run
+s d
+wah
+wer
+ürk
+ me
+age
+att
+ell
+est
+hat
+n b
+oll
+raf
+s a
+tsc
+ es
+ fo
+ gr
+ ja
+abe
+auc
+ben
+e n
+ege
+lie
+n u
+r v
+re 
+rit
+sag
+ am
+agt
+ahr
+bra
+de 
+erd
+her
+ite
+le 
+n p
+n v
+or 
+rbe
+rt 
+sic
+wie
+übe
+ is
+ üb
+cha
+chi
+e f
+e m
+eri
+ied
+mme
+ner
+r a
+sti
+t a
+t s
+tis
+ ko
+arb
+ds 
+gan
+n z
+r f
+r w
+ran
+se 
+t i
+wei
+wir
+ br
+ np
+am 
+bes
+d d
+deu
+e g
+e k
+efo
+et 
+eut
+fen
+hse
+lte
+n r
+npd
+r b
+rhe
+t w
+tz 
+ fr
+ ih
+ ke
+ ma
+ame
+ang
+d s
+eil
+el 
+era
+erh
+h d
+i d
+kan
+n f
+n l
+nts
+och
+rag
+rd 
+spd
+spr
+tio
+ ar
+ en
+ ka
+ark
+ass
diff --git a/trigrams/en b/trigrams/en
new file mode 100644
index 0000000..9d9139d
--- /dev/null
+++ b/trigrams/en
@@ -0,0 +1,300 @@
+ th
+the
+he 
+ed 
+ to
+ in
+er 
+ing
+ng 
+ an
+nd 
+ of
+and
+to 
+of 
+ co
+at 
+on 
+in 
+ a 
+d t
+ he
+e t
+ion
+es 
+ re
+re 
+hat
+ sa
+ st
+ ha
+her
+tha
+tio
+or 
+ ''
+en 
+ wh
+e s
+ent
+n t
+s a
+as 
+for
+is 
+t t
+ be
+ld 
+e a
+rs 
+ wa
+ut 
+ve 
+ll 
+al 
+ ma
+e i
+ fo
+'s 
+an 
+est
+ hi
+ mo
+ se
+ pr
+s t
+ate
+st 
+ter
+ere
+ted
+nt 
+ver
+d a
+ wi
+se 
+e c
+ect
+ns 
+ on
+ly 
+tol
+ey 
+r t
+ ca
+ati
+ts 
+all
+ no
+his
+s o
+ers
+con
+e o
+ear
+f t
+e w
+was
+ons
+sta
+'' 
+sti
+n a
+sto
+t h
+ we
+id 
+th 
+ it
+ce 
+ di
+ave
+d h
+cou
+pro
+ad 
+oll
+ry 
+d s
+e m
+ so
+ill
+cti
+te 
+tor
+eve
+g t
+it 
+ ch
+ de
+hav
+oul
+ty 
+uld
+use
+ al
+are
+ch 
+me 
+out
+ove
+wit
+ys 
+chi
+t a
+ith
+oth
+ ab
+ te
+ wo
+s s
+res
+t w
+tin
+e b
+e h
+nce
+t s
+y t
+e p
+ele
+hin
+s i
+nte
+ li
+le 
+ do
+aid
+hey
+ne 
+s w
+ as
+ fr
+ tr
+end
+sai
+ el
+ ne
+ su
+'t 
+ay 
+hou
+ive
+lec
+n't
+ ye
+but
+d o
+o t
+y o
+ ho
+ me
+be 
+cal
+e e
+had
+ple
+ at
+ bu
+ la
+d b
+s h
+say
+t i
+ ar
+e f
+ght
+hil
+igh
+int
+not
+ren
+ is
+ pa
+ sh
+ays
+com
+n s
+r a
+rin
+y a
+ un
+n c
+om 
+thi
+ mi
+by 
+d i
+e d
+e n
+t o
+ by
+e r
+eri
+old
+ome
+whe
+yea
+ gr
+ar 
+ity
+mpl
+oun
+one
+ow 
+r s
+s f
+tat
+ ba
+ vo
+bou
+sam
+tim
+vot
+abo
+ant
+ds 
+ial
+ine
+man
+men
+ or
+ po
+amp
+can
+der
+e l
+les
+ny 
+ot 
+rec
+tes
+tho
+ica
+ild
+ir 
+nde
+ose
+ous
+pre
+ste
+era
+per
+r o
+red
+rie
+ bo
+ le
+ali
+ars
+ore
+ric
+s m
+str
+ fa
+ess
+ie 
+ist
+lat
+uri



reply via email to

[Prev in Thread] Current Thread [Next in Thread]