guile-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Text collation


From: Ludovic Courtès
Subject: Text collation
Date: Tue, 19 Sep 2006 11:23:03 +0200
User-agent: Gnus/5.110006 (No Gnus v0.6) Emacs/21.4 (gnu/linux)

Hi,

Guile doesn't provide any locale-dependent string comparison primitive.
`string<?' et al., from SRFI-13 (also found in R5.91RS), are required to
be locale-independent:

  These procedures are the lexicographic extensions to strings of the
  corresponding orderings on characters. For example, string< is the
  lexicographic ordering on strings induced by the ordering char<? on
  characters.

  [...]

  Comparison is simply done on individual code-points of the
  string. True text collation is not handled by this SRFI.

R5.91RS, about `char<?' et al., adds:

  These procedures impose a total ordering on the set of characters
  according to their scalar values.

Thus, for proper locale-dependent "text collation", we need a separate
API.  The patch below provides preliminary support for this.  Here is a
sample session showing how the API can be used:

  guile> (define l (make-locale LC_COLLATE_MASK "fr_FR"))
  guile> l
  #<locale 300a74d0>
  guile> (string-locale<? "été" "hiver") ;; using the "C" locale
  #f
  guile> (string-locale<? "été" "hiver" l)
  #t
  guile> (char-locale<? #\é #\h)         ;; using the "C" locale
  #f
  guile> (char-locale<? #\é #\h l)
  #t
  guile> (setlocale LC_COLLATE "fr_FR")
  "fr_FR"
  guile> (string-locale<? "été" "hiver") ;; using the "fr_FR" locale
  #t

The cool thing is that it's a "clean" API in that the locale settings
can be passed explicitly as a third parameter.  The (potentially)
controversial part is that this wraps a non-standard GNU extension.

I don't think this should be seen as a problem, first because this GNU
extension can be emulated on non-GNU platforms by controlling
invocations of `setlocale' (via a mutex) and by using that to create
"critical locale sections" within the `-locale<?' functions:

  /* Save the current locale settings and install LOCALE_SETTINGS.  */
  scm_being_locked_locale_section (locale_settings);

  result = strcoll (c_s1, c_s2);
  /* ... */

  /* Restore the previous locale settings.  */
  scm_end_locked_locale_section ();

Emulation of the locale category mask that is passed to `make-locale'
may turn out to be quite inelegant, but it can be implemented in terms
of `setlocale ()' (as many calls as bits set in the mask).

Second, it is likely that this or a similar API will be adopted by OSes
and may eventually be standardized.  The C++ standard already includes a
similar locale API (ISO C++ 14882, see the `locale_classes.h' header
that ships with GCC).

The GNU C library also provides `strcasecmp_l ()' which would be nice to
have but since this would be hard or even impossible to "emulate" on
other platforms, it may be preferable not to include it at the moment.

If all this sounds like a reasonable plan to you, then I can implement
locked locale sections (for non-GNU platforms), augment the docs, and
eventually merge it in both 1.8 and HEAD.

Thanks,
Ludovic.

PS: I would be happier if this was part of an `(ice-9 i18n)' module but
    since it's already too late for `gettext', `bindtextdomain' and
    friends to go there, maybe we should just keep using the root
    module...

--- orig/configure.in
+++ mod/configure.in
@@ -599,8 +599,9 @@
 #   stat64 - SuS largefile stuff, not on old systems
 #   sysconf - not on old systems
 #   _NSGetEnviron - Darwin specific
+#   strcoll_l, newlocale - GNU extensions (glibc)
 #
-AC_CHECK_FUNCS([DINFINITY DQNAN ctermid fesetround ftime fchown getcwd geteuid 
gettimeofday gmtime_r ioctl lstat mkdir mknod nice readdir_r readlink rename 
rmdir select setegid seteuid setlocale setpgid setsid sigaction siginterrupt 
stat64 strftime strptime symlink sync sysconf tcgetpgrp tcsetpgrp times uname 
waitpid strdup system usleep atexit on_exit chown link fcntl ttyname getpwent 
getgrent kill getppid getpgrp fork setitimer getitimer strchr strcmp index 
bcopy memcpy rindex unsetenv _NSGetEnviron])
+AC_CHECK_FUNCS([DINFINITY DQNAN ctermid fesetround ftime fchown getcwd geteuid 
gettimeofday gmtime_r ioctl lstat mkdir mknod nice readdir_r readlink rename 
rmdir select setegid seteuid setlocale setpgid setsid sigaction siginterrupt 
stat64 strftime strptime symlink sync sysconf tcgetpgrp tcsetpgrp times uname 
waitpid strdup system usleep atexit on_exit chown link fcntl ttyname getpwent 
getgrent kill getppid getpgrp fork setitimer getitimer strchr strcmp index 
bcopy memcpy rindex unsetenv _NSGetEnviron strcoll strcoll_l newlocale])
 
 # Reasons for testing:
 #   netdb.h - not in mingw


--- orig/libguile/i18n.c
+++ mod/libguile/i18n.c
@@ -15,6 +15,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#define _GNU_SOURCE /* Ask for glibc's `newlocale' API */
 
 #if HAVE_CONFIG_H
 # include <config.h>
@@ -24,10 +25,13 @@
 #include "libguile/feature.h"
 #include "libguile/i18n.h"
 #include "libguile/strings.h"
+#include "libguile/chars.h"
 #include "libguile/dynwind.h"
+#include "libguile/validate.h"
 
 #include "gettext.h"
 #include <locale.h>
+#include <string.h> /* `strcoll ()' */
 
 
 int
@@ -312,10 +316,206 @@
 }
 #undef FUNC_NAME
 
-void 
+
+/* Locale objects, string and character collation.  */
+
+SCM_SMOB (scm_tc16_locale_smob_type, "locale", 0);
+
+SCM_SMOB_FREE (scm_tc16_locale_smob_type, smob_locale_free, locale)
+{
+#ifdef __GNU_LIBRARY__
+  freelocale ((locale_t)SCM_SMOB_DATA (locale));
+#endif
+
+  return 0;
+}
+
+#ifndef __GNU_LIBRARY__
+
+/* Provide the locale category masks as found in glibc (copied from
+   <locale.h> as found in glibc 2.3.6).  */
+
+# define LC_CTYPE_MASK         (1 << LC_CTYPE)
+# define LC_NUMERIC_MASK       (1 << LC_NUMERIC)
+# define LC_TIME_MASK          (1 << LC_TIME)
+# define LC_COLLATE_MASK       (1 << LC_COLLATE)
+# define LC_MONETARY_MASK      (1 << LC_MONETARY)
+# define LC_MESSAGES_MASK      (1 << LC_MESSAGES)
+# define LC_PAPER_MASK         (1 << LC_PAPER)
+# define LC_NAME_MASK          (1 << LC_NAME)
+# define LC_ADDRESS_MASK       (1 << LC_ADDRESS)
+# define LC_TELEPHONE_MASK     (1 << LC_TELEPHONE)
+# define LC_MEASUREMENT_MASK   (1 << LC_MEASUREMENT)
+# define LC_IDENTIFICATION_MASK        (1 << LC_IDENTIFICATION)
+# define LC_ALL_MASK           (LC_CTYPE_MASK \
+                                | LC_NUMERIC_MASK \
+                                | LC_TIME_MASK \
+                                | LC_COLLATE_MASK \
+                                | LC_MONETARY_MASK \
+                                | LC_MESSAGES_MASK \
+                                | LC_PAPER_MASK \
+                                | LC_NAME_MASK \
+                                | LC_ADDRESS_MASK \
+                                | LC_TELEPHONE_MASK \
+                                | LC_MEASUREMENT_MASK \
+                                | LC_IDENTIFICATION_MASK \
+                                )
+
+#endif
+
+
+SCM_DEFINE (scm_make_locale, "make-locale", 2, 1, 0,
+           (SCM category_mask, SCM locale_name, SCM base_locale),
+           "Return a reference to a data structure representing a set of "
+           "locale datasets.  Unlike for the @var{category} parameter for "
+           "@code{setlocale}, the @var{category_mask} parameter here uses "
+           "a single bit for each category, made by OR'ing together "
+           "@code{LC_*_MASK} bits.")
+#define FUNC_NAME s_scm_make_locale
+{
+#ifdef __GNU_LIBRARY__
+  SCM locale;
+  int c_category_mask;
+  char *c_locale_name;
+  locale_t c_base_locale, c_locale;
+
+  SCM_VALIDATE_INT_COPY (1, category_mask, c_category_mask);
+  SCM_VALIDATE_STRING (2, locale_name);
+  c_locale_name = scm_to_locale_string (locale_name);
+
+  if (base_locale != SCM_UNDEFINED)
+    {
+      SCM_VALIDATE_SMOB (3, base_locale, locale_smob_type);
+      c_base_locale = (locale_t)SCM_SMOB_DATA (base_locale);
+    }
+  else
+    c_base_locale = NULL;
+
+  c_locale = newlocale (c_category_mask, c_locale_name, c_base_locale);
+  free (c_base_locale);
+
+  if (!c_locale)
+    locale = SCM_BOOL_F;
+  else
+    SCM_NEWSMOB (locale, scm_tc16_locale_smob_type, c_locale);
+
+  return locale;
+#else
+  /* FIXME: Handle this situation, for instance:
+     SCM_RETURN_NEWSMOB (scm_tc16_locale_smob_type,
+                         scm_list_3 (category_mask, locale_name,
+                                    base_locale));  */
+  return SCM_BOOL_F;
+#endif
+}
+#undef FUNC_NAME
+
+
+/* Compare null-terminated strings C_S1 and C_S2 according to LOCALE.  Return
+   an integer whose sign is the same as the difference between C_S1 and
+   C_S2.  */
+static inline int
+compare_strings (const char *c_s1, const char *c_s2, SCM locale,
+                const char *func_name)
+#define FUNC_NAME func_name
+{
+#ifdef __GNU_LIBRARY__
+  locale_t c_locale;
+#endif
+  int result;
+
+#ifdef __GNU_LIBRARY__
+  if (locale != SCM_UNDEFINED)
+    {
+      SCM_VALIDATE_SMOB (3, locale, locale_smob_type);
+      c_locale = (locale_t)SCM_SMOB_DATA (locale);
+    }
+  else
+    c_locale = NULL;
+
+  if (c_locale)
+    result = strcoll_l (c_s1, c_s2, c_locale);
+  else
+#endif
+
+#if HAVE_STRCOLL
+    result = strcoll (c_s1, c_s2);
+#else
+    result = strcmp (c_s1, c_s2);
+#endif
+
+  return result;
+}
+#undef FUNC_NAME
+
+
+SCM_DEFINE (scm_string_locale_lt, "string-locale<?", 2, 1, 0,
+           (SCM s1, SCM s2, SCM locale),
+           "Compare strings @var{s1} and @var{s2} in a locale-dependent way."
+           "If @var{locale} is provided, it should be locale object (as "
+           "returned by @code{make-locale}) and will be used to perform the "
+           "comparison; otherwise, the current system locale is used.")
+#define FUNC_NAME s_scm_string_locale_lt
+{
+  int result;
+  const char *c_s1, *c_s2;
+
+  SCM_VALIDATE_STRING (1, s1);
+  SCM_VALIDATE_STRING (2, s2);
+
+  c_s1 = scm_i_string_chars (s1);
+  c_s2 = scm_i_string_chars (s2);
+
+  result = compare_strings (c_s1, c_s2, locale, FUNC_NAME);
+
+  scm_remember_upto_here_2 (s1, s2);
+
+  return scm_from_bool (result < 0);
+}
+#undef FUNC_NAME
+
+/* XXX: Wrap GNU's `strcasecmp_l ()'?  */
+
+
+SCM_DEFINE (scm_char_locale_lt, "char-locale<?", 2, 1, 0,
+           (SCM c1, SCM c2, SCM locale),
+           "Return true if character @var{c1} is lower than @var{c2} "
+           "according to @var{locale} or to the current locale.")
+#define FUNC_NAME s_scm_char_locale_lt
+{
+  char c_c1[2], c_c2[2];
+
+  SCM_VALIDATE_CHAR (1, c1);
+  SCM_VALIDATE_CHAR (2, c2);
+
+  c_c1[0] = (char)SCM_CHAR (c1); c_c1[1] = '\0';
+  c_c2[0] = (char)SCM_CHAR (c2); c_c2[1] = '\0';
+
+  return scm_from_bool (compare_strings (c_c1, c_c2, locale, FUNC_NAME) < 0);
+}
+#undef FUNC_NAME
+
+
+
+void
 scm_init_i18n ()
 {
   scm_add_feature ("i18n");
+
+  scm_c_define ("LC_CTYPE_MASK", SCM_I_MAKINUM (LC_CTYPE_MASK));
+  scm_c_define ("LC_NUMERIC_MASK", SCM_I_MAKINUM (LC_NUMERIC_MASK));
+  scm_c_define ("LC_TIME_MASK", SCM_I_MAKINUM (LC_TIME_MASK));
+  scm_c_define ("LC_COLLATE_MASK", SCM_I_MAKINUM (LC_COLLATE_MASK));
+  scm_c_define ("LC_MONETARY_MASK", SCM_I_MAKINUM (LC_MONETARY_MASK));
+  scm_c_define ("LC_MESSAGES_MASK", SCM_I_MAKINUM (LC_MESSAGES_MASK));
+  scm_c_define ("LC_PAPER_MASK", SCM_I_MAKINUM (LC_PAPER_MASK));
+  scm_c_define ("LC_NAME_MASK", SCM_I_MAKINUM (LC_NAME_MASK));
+  scm_c_define ("LC_ADDRESS_MASK", SCM_I_MAKINUM (LC_ADDRESS_MASK));
+  scm_c_define ("LC_TELEPHONE_MASK", SCM_I_MAKINUM (LC_TELEPHONE_MASK));
+  scm_c_define ("LC_MEASUREMENT_MASK", SCM_I_MAKINUM (LC_MEASUREMENT_MASK));
+  scm_c_define ("LC_IDENTIFICATION_MASK", SCM_I_MAKINUM 
(LC_IDENTIFICATION_MASK));
+  scm_c_define ("LC_ALL_MASK", SCM_I_MAKINUM (LC_ALL_MASK));
+
 #include "libguile/i18n.x"
 }
 


--- orig/libguile/i18n.h
+++ mod/libguile/i18n.h
@@ -30,6 +30,10 @@
 
 SCM_API int scm_i_to_lc_category (SCM category, int allow_lc_all);
 
+SCM_API SCM scm_make_locale (SCM category_mask, SCM locale_name, SCM 
base_locale);
+SCM_API SCM scm_string_locale_lt (SCM s1, SCM s2, SCM locale);
+SCM_API SCM scm_char_locale_lt (SCM c1, SCM c2, SCM locale);
+
 SCM_API void scm_init_i18n (void);
 
 #endif  /* SCM_I18N_H */




reply via email to

[Prev in Thread] Current Thread [Next in Thread]