emacs-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

master 53bea87: Make downcasing unibyte strings in Turkish less wrong


From: Lars Ingebrigtsen
Subject: master 53bea87: Make downcasing unibyte strings in Turkish less wrong
Date: Tue, 19 Oct 2021 14:36:59 -0400 (EDT)

branch: master
commit 53bea8796d52d90d09c29780070442d59e1883b7
Author: Lars Ingebrigtsen <larsi@gnus.org>
Commit: Lars Ingebrigtsen <larsi@gnus.org>

    Make downcasing unibyte strings in Turkish less wrong
    
    * src/casefiddle.c (ascii_casify_character): New function.
    (do_casify_unibyte_string): Use it to make downcasing tr_TR.UTF-8
    "I" less wrong.
    (Fdowncase): Mention caveats.
    (Fupcase):
    (Fcapitalize):
    (Fupcase_initials): Refer to details in `downcase'.
    (syms_of_casefiddle): Define more symbols.
---
 etc/NEWS         | 12 ++++++++++++
 src/casefiddle.c | 48 +++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/etc/NEWS b/etc/NEWS
index 9f0a4ac..98c710a 100644
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -179,6 +179,18 @@ Emacs buffers, like indentation and the like.  The new ert function
 * Incompatible Lisp Changes in Emacs 29.1
 
 ---
+** 'downcase' details have changed slightly.
+In certain locales, changing the case of an ASCII-range character may
+turn it into a multibyte character, most notably with "I" in Turkish
+(the lowercase is "ı", 0x0131).  Previously, 'downcase' on a unibyte
+string was buggy, and would mistakenly just return the lower byte of
+this, 0x31 (the digit "1").  'downcase' on a unibyte string has now
+been changed to downcase such characters as if they were ASCII.  To
+get proper locale-dependent downcasing, the string has to be converted
+to multibyte first.  (This goes for the other case-changing functions,
+too.)
+
+---
 ** 'def' indentation changes.
 In 'emacs-lisp-mode', forms with a symbol with a name that start with
 "def" have been automatically indented as if they were 'defun'-like
diff --git a/src/casefiddle.c b/src/casefiddle.c
index a7a2541..e41ada8 100644
--- a/src/casefiddle.c
+++ b/src/casefiddle.c
@@ -297,6 +297,16 @@ do_casify_multibyte_string (struct casing_context *ctx, Lisp_Object obj)
   return obj;
 }
 
+static int
+ascii_casify_character (bool downcase, int c)
+{
+  Lisp_Object cased = CHAR_TABLE_REF (downcase?
+                                     uniprop_table (Qlowercase) :
+                                     uniprop_table (Quppercase),
+                                     c);
+  return FIXNATP (cased) ? XFIXNAT (cased) : c;
+}
+
 static Lisp_Object
 do_casify_unibyte_string (struct casing_context *ctx, Lisp_Object obj)
 {
@@ -310,11 +320,12 @@ do_casify_unibyte_string (struct casing_context *ctx, Lisp_Object obj)
       cased = case_single_character (ctx, ch);
       if (ch == cased)
        continue;
-      cased = make_char_unibyte (cased);
-      /* If the char can't be converted to a valid byte, just don't
-        change it.  */
-      if (SINGLE_BYTE_CHAR_P (cased))
-       SSET (obj, i, cased);
+      /* If down/upcasing changed an ASCII character into a non-ASCII
+        character (this can happen in some locales, like the Turkish
+        "I"), downcase using the ASCII char table.  */
+      if (ASCII_CHAR_P (ch) && !SINGLE_BYTE_CHAR_P (cased))
+       cased = ascii_casify_character (ctx->flag == CASE_DOWN, ch);
+      SSET (obj, i, make_char_unibyte (cased));
     }
   return obj;
 }
@@ -339,10 +350,13 @@ casify_object (enum case_action flag, Lisp_Object obj)
 
 DEFUN ("upcase", Fupcase, Supcase, 1, 1, 0,
        doc: /* Convert argument to upper case and return that.
-The argument may be a character or string.  The result has the same type.
+The argument may be a character or string.  The result has the same
+type.  (See `downcase' for further details about the type.)
+
 The argument object is not altered--the value is a copy.  If argument
 is a character, characters which map to multiple code points when
 cased, e.g. ﬁ, are returned unchanged.
+
 See also `capitalize', `downcase' and `upcase-initials'.  */)
   (Lisp_Object obj)
 {
@@ -351,7 +365,15 @@ See also `capitalize', `downcase' and `upcase-initials'.  */)
 
 DEFUN ("downcase", Fdowncase, Sdowncase, 1, 1, 0,
        doc: /* Convert argument to lower case and return that.
-The argument may be a character or string.  The result has the same type.
+The argument may be a character or string.  The result has the same type,
+including the multibyteness of the string.
+
+This means that if this function is called with a unibyte string
+argument, and downcasing it would turn it into a multibyte string
+(according to the current locale), the downcasing is done using ASCII
+\"C\" rules instead.  To accurately downcase according to the current
+locale, the string must be converted into multibyte first.
+
 The argument object is not altered--the value is a copy.  */)
   (Lisp_Object obj)
 {
@@ -362,7 +384,10 @@ DEFUN ("capitalize", Fcapitalize, Scapitalize, 1, 1, 0,
        doc: /* Convert argument to capitalized form and return that.
 This means that each word's first character is converted to either
 title case or upper case, and the rest to lower case.
-The argument may be a character or string.  The result has the same type.
+
+The argument may be a character or string.  The result has the same
+type.  (See `downcase' for further details about the type.)
+
 The argument object is not altered--the value is a copy.  If argument
 is a character, characters which map to multiple code points when
 cased, e.g. ﬁ, are returned unchanged.  */)
@@ -377,7 +402,10 @@ DEFUN ("upcase-initials", Fupcase_initials, Supcase_initials, 1, 1, 0,
        doc: /* Convert the initial of each word in the argument to upper case.
 This means that each word's first character is converted to either
 title case or upper case, and the rest are left unchanged.
-The argument may be a character or string.  The result has the same type.
+
+The argument may be a character or string.  The result has the same
+type.  (See `downcase' for further details about the type.)
+
 The argument object is not altered--the value is a copy.  If argument
 is a character, characters which map to multiple code points when
 cased, e.g. ﬁ, are returned unchanged.  */)
@@ -651,6 +679,8 @@ syms_of_casefiddle (void)
   DEFSYM (Qbounds, "bounds");
   DEFSYM (Qidentity, "identity");
   DEFSYM (Qtitlecase, "titlecase");
+  DEFSYM (Qlowercase, "lowercase");
+  DEFSYM (Quppercase, "uppercase");
   DEFSYM (Qspecial_uppercase, "special-uppercase");
   DEFSYM (Qspecial_lowercase, "special-lowercase");
   DEFSYM (Qspecial_titlecase, "special-titlecase");



reply via email to

[Prev in Thread] Current Thread [Next in Thread]