emacs-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

master ce63f91025: Add textsec functions for verifying email addresses


From: Lars Ingebrigtsen
Subject: master ce63f91025: Add textsec functions for verifying email addresses
Date: Tue, 18 Jan 2022 07:20:12 -0500 (EST)

branch: master
commit ce63f9102545fa50abbe08a4083b332a9101c243
Author: Lars Ingebrigtsen <larsi@gnus.org>
Commit: Lars Ingebrigtsen <larsi@gnus.org>

    Add textsec functions for verifying email addresses
    
    * lisp/international/characters.el (bidi-control-characters):
    Rename from glyphless--bidi-control-characters for use in textsec,
    and add LRM/RLM/ALM.
    (update-glyphless-char-display): Adjust the code.
    
    * lisp/international/textsec.el (textsec-local-address-suspicious-p)
    (textsec-name-suspicious-p, textsec-suspicious-nonspacing-p)
    (textsec-email-suspicious-p): New functions.
---
 lisp/international/characters.el         | 13 +++---
 lisp/international/textsec.el            | 77 ++++++++++++++++++++++++++++++++
 test/lisp/international/textsec-tests.el | 39 ++++++++++++++++
 3 files changed, 124 insertions(+), 5 deletions(-)

diff --git a/lisp/international/characters.el b/lisp/international/characters.el
index 3ff280f480..ce23e995c1 100644
--- a/lisp/international/characters.el
+++ b/lisp/international/characters.el
@@ -1526,8 +1526,11 @@ Setup `char-width-table' appropriate for non-CJK 
language environment."
 
 ;; We can't use the \N{name} things here, because this file is used
 ;; too early in the build process.
-(defvar glyphless--bidi-control-characters
-  '(#x202a                          ; ?\N{left-to-right embedding}
+(defvar bidi-control-characters
+  '(#x200e                           ; ?\N{left-to-right mark}
+    #x200f                           ; ?\N{right-to-left mark}
+    #x061c                           ; ?\N{arabic letter mark}
+    #x202a                          ; ?\N{left-to-right embedding}
     #x202b                          ; ?\N{right-to-left embedding}
     #x202d                          ; ?\N{left-to-right override}
     #x202e                          ; ?\N{right-to-left override}
@@ -1535,7 +1538,8 @@ Setup `char-width-table' appropriate for non-CJK language 
environment."
     #x2067                          ; ?\N{right-to-left isolate}
     #x2068                          ; ?\N{first strong isolate}
     #x202c                          ; ?\N{pop directional formatting}
-    #x2069))                         ; ?\N{pop directional isolate})
+    #x2069)                          ; ?\N{pop directional isolate}
+  "List of bidirectional control characters.")
 
 (defun update-glyphless-char-display (&optional variable value)
   "Make the setting of `glyphless-char-display-control' take effect.
@@ -1582,8 +1586,7 @@ option `glyphless-char-display'."
                                   (or (aref char-acronym-table from)
                                       "UNK")))
                           (when (or (eq target 'format-control)
-                                    (memq from
-                                          glyphless--bidi-control-characters))
+                                    (memq from bidi-control-characters))
                             (set-char-table-range glyphless-char-display
                                                   from this-method)))
                         (setq from (1+ from))))))
diff --git a/lisp/international/textsec.el b/lisp/international/textsec.el
index d0d435ed7d..55e4ce9d86 100644
--- a/lisp/international/textsec.el
+++ b/lisp/international/textsec.el
@@ -28,6 +28,7 @@
 (require 'ucs-normalize)
 (require 'idna-mapping)
 (require 'puny)
+(require 'mail-parse)
 
 (defvar textsec--char-scripts nil)
 
@@ -225,6 +226,9 @@ STRING isn't a single script string."
        (textsec-single-script-p string2)))
 
 (defun textsec-domain-suspicious-p (domain)
+  "Say whether DOMAIN looks suspicious.
+If it isn't, nil is returned.  If it is, a string explaining the
+problem is returned."
   (catch 'found
     (seq-do
      (lambda (char)
@@ -236,6 +240,79 @@ STRING isn't a single script string."
       (throw 'found "%s is not highly restrictive"))
     nil))
 
+(defun textsec-local-address-suspicious-p (local)
+  "Say whether LOCAL looks suspicious.
+LOCAL is the bit before \"@\" in an email address.
+
+If it suspicious, nil is returned.  If it is, a string explaining
+the problem is returned."
+  (cond
+   ((not (equal local (ucs-normalize-NFKC-string local)))
+    (format "`%s' is not in normalized format `%s'"
+            local (ucs-normalize-NFKC-string local)))
+   ((textsec-mixed-numbers-p local)
+    (format "`%s' contains numbers from different number systems" local))
+   ((eq (textsec-restriction-level local) 'unrestricted)
+    (format "`%s' isn't restrictive enough" local))
+   ((string-match-p "\\`\\.\\|\\.\\'\\|\\.\\." local)
+    (format "`%s' contains invalid dots" local))))
+
+(defun textsec-name-suspicious-p (name)
+  "Say whether NAME looks suspicious.
+NAME is (for instance) the free-text name from an email address.
+
+If it suspicious, nil is returned.  If it is, a string explaining
+the problem is returned."
+  (cond
+   ((not (equal name (ucs-normalize-NFC-string name)))
+    (format "`%s' is not in normalized format `%s'"
+            name (ucs-normalize-NFC-string name)))
+   ((seq-find (lambda (char)
+                (and (member char bidi-control-characters)
+                     (not (member char
+                                  '( ?\N{left-to-right mark}
+                                     ?\N{right-to-left mark}
+                                     ?\N{arabic letter mark})))))
+              name)
+    (format "The string contains bidirectional control characters"))
+   ((textsec-suspicious-nonspacing-p name))))
+
+(defun textsec-suspicious-nonspacing-p (string)
+  "Say whether STRING has a suspicious use of nonspacing characters.
+If it suspicious, nil is returned.  If it is, a string explaining
+the problem is returned."
+  (let ((prev nil)
+        (nonspace-count 0))
+    (catch 'found
+      (seq-do
+       (lambda (char)
+         (let ((nonspacing
+                (memq (get-char-code-property char 'general-category)
+                      '(Cf Cc Mn))))
+           (when (and nonspacing
+                      (equal char prev))
+             (throw 'found "Two identical nonspacing characters in a row"))
+           (setq nonspace-count (if nonspacing
+                                    (1+ nonspace-count)
+                                  0))
+           (when (> nonspace-count 4)
+             (throw 'found
+                    "Excessive number of nonspacing characters in a row"))
+           (setq prev char)))
+       string)
+      nil)))
+
+(defun textsec-email-suspicious-p (email)
+  "Say whether EMAIL looks suspicious.
+If it isn't, nil is returned.  If it is, a string explaining the
+problem is returned."
+  (pcase-let* ((`(,address . ,name) (mail-header-parse-address email t))
+               (`(,local ,domain) (split-string address "@")))
+    (or
+     (textsec-domain-suspicious-p domain)
+     (textsec-local-address-suspicious-p local)
+     (textsec-name-suspicious-p name))))
+
 (provide 'textsec)
 
 ;;; textsec.el ends here
diff --git a/test/lisp/international/textsec-tests.el 
b/test/lisp/international/textsec-tests.el
index c946d85069..aeb8bc7283 100644
--- a/test/lisp/international/textsec-tests.el
+++ b/test/lisp/international/textsec-tests.el
@@ -115,4 +115,43 @@
   (should-not (textsec-domain-suspicious-p "foo.org"))
   (should (textsec-domain-suspicious-p "f\N{LEFT-TO-RIGHT ISOLATE}oo.org")))
 
+(ert-deftest test-suspicious-local ()
+  (should-not (textsec-local-address-suspicious-p "larsi"))
+  (should (textsec-local-address-suspicious-p ".larsi"))
+  (should (textsec-local-address-suspicious-p "larsi."))
+  (should-not (textsec-local-address-suspicious-p "la.rsi"))
+  (should (textsec-local-address-suspicious-p "lar..si"))
+
+  (should-not (textsec-local-address-suspicious-p "LÅRSI"))
+  (should (textsec-local-address-suspicious-p "LÅRSI"))
+
+  (should (textsec-local-address-suspicious-p "larsi8৪")))
+
+(ert-deftest test-suspicious-name ()
+  (should-not (textsec-name-suspicious-p "Lars Ingebrigtsen"))
+  (should (textsec-name-suspicious-p "LÅRS INGEBRIGTSEN"))
+  (should-not (textsec-name-suspicious-p "LÅRS INGEBRIGTSEN"))
+
+  (should (textsec-name-suspicious-p
+           "Lars Ingebrigtsen\N{LEFT-TO-RIGHT ISOLATE}"))
+  (should-not (textsec-name-suspicious-p
+               "Lars Ingebrigtsen\N{LEFT-TO-RIGHT MARK}"))
+
+  (should (textsec-name-suspicious-p
+           "\N{LEFT-TO-RIGHT MARK}\N{LEFT-TO-RIGHT MARK}Lars Ingebrigtsen"))
+  (should-not (textsec-name-suspicious-p
+               "\N{LEFT-TO-RIGHT MARK}\N{RIGHT-TO-LEFT MARK}Lars 
Ingebrigtsen"))
+  (should (textsec-name-suspicious-p
+               "\N{LEFT-TO-RIGHT MARK}\N{RIGHT-TO-LEFT MARK}\N{LEFT-TO-RIGHT 
MARK}\N{RIGHT-TO-LEFT MARK}\N{LEFT-TO-RIGHT MARK}Lars Ingebrigtsen")))
+
+(ert-deftest test-suspicious-email ()
+  (should-not
+   (textsec-email-suspicious-p "Lars Ingebrigtsen <larsi@gnus.org>"))
+  (should
+   (textsec-email-suspicious-p "LÅrs Ingebrigtsen <larsi@gnus.org>"))
+  (should
+   (textsec-email-suspicious-p "Lars Ingebrigtsen <.larsi@gnus.org>"))
+  (should
+   (textsec-email-suspicious-p "Lars Ingebrigtsen <larsi@gn\N{LEFT-TO-RIGHT 
ISOLATE}us.org>")))
+
 ;;; textsec-tests.el ends here



reply via email to

[Prev in Thread] Current Thread [Next in Thread]