gawk-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[gawk-diffs] [SCM] gawk branch, master, updated. gawk-4.1.0-1972-gb02f580


From: Arnold Robbins
Subject: [gawk-diffs] [SCM] gawk branch, master, updated. gawk-4.1.0-1972-gb02f580
Date: Thu, 1 Sep 2016 17:46:38 +0000 (UTC)

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "gawk".

The branch, master has been updated
       via  b02f580f06996bd88f741f9c7330aff79216a169 (commit)
      from  af43bad53b2f05ba0d4403a59433f587a1e32b22 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.sv.gnu.org/cgit/gawk.git/commit/?id=b02f580f06996bd88f741f9c7330aff79216a169

commit b02f580f06996bd88f741f9c7330aff79216a169
Author: Arnold D. Robbins <address@hidden>
Date:   Thu Sep 1 20:46:12 2016 +0300

    Merge multithreaded dfa into gawk.

diff --git a/ChangeLog b/ChangeLog
index 023fa88..d3fcbdd 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,4 +1,20 @@
-2016-08-29         Aharon Robbins       <address@hidden>
+2016-09-01         Arnold D. Robbins     <address@hidden>
+
+       Merge grep's now thread-safe dfa. Wheee.
+
+       * dfa.h, dfa.c: Sync with grep.
+       * localeinfo.h, localeinfo.c, verify.h: New files.
+       * Makefile.am (base_sources): Adjust.
+       * awk.h (using_utf8): Declare new function.
+       * node.c (str2wstr): Use using_utf8 instead of now-gone dfa function.
+       * re.c: Include "localeinfo.h".
+       (localeinfo): New static variable.
+       (make_regexp): Adjust call to dfa_syntax.
+       (resetup): Call init_localeinfo on localeinfo. Remove call to
+       now-gone function dfa_init.
+       (using_utf8): New function.
+
+2016-08-29         Arnold D. Robbins     <address@hidden>
 
        * configure.ac (fwrite_unlocked): Check for it.
        * awk.h (fwrite): Define to fwrite_unlocked if we have it.
diff --git a/Makefile.am b/Makefile.am
index dce6501..9acae0b 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -109,6 +109,8 @@ base_sources = \
        gettext.h \
        int_array.c \
        interpret.h \
+       localeinfo.c \
+       localeinfo.h \
        io.c \
        mbsupport.h \
        main.c \
@@ -126,6 +128,7 @@ base_sources = \
        replace.c \
        str_array.c \
        symbol.c \
+       verify.h \
        version.c \
        xalloc.h
 
diff --git a/Makefile.in b/Makefile.in
index 036361c..f103a42 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -143,10 +143,11 @@ am__objects_1 = array.$(OBJEXT) awkgram.$(OBJEXT) builtin.$(OBJEXT) \
        dfa.$(OBJEXT) eval.$(OBJEXT) ext.$(OBJEXT) field.$(OBJEXT) \
        floatcomp.$(OBJEXT) gawkapi.$(OBJEXT) gawkmisc.$(OBJEXT) \
        getopt.$(OBJEXT) getopt1.$(OBJEXT) int_array.$(OBJEXT) \
-       io.$(OBJEXT) main.$(OBJEXT) mpfr.$(OBJEXT) msg.$(OBJEXT) \
-       node.$(OBJEXT) profile.$(OBJEXT) random.$(OBJEXT) re.$(OBJEXT) \
-       regex.$(OBJEXT) replace.$(OBJEXT) str_array.$(OBJEXT) \
-       symbol.$(OBJEXT) version.$(OBJEXT)
+       localeinfo.$(OBJEXT) io.$(OBJEXT) main.$(OBJEXT) \
+       mpfr.$(OBJEXT) msg.$(OBJEXT) node.$(OBJEXT) profile.$(OBJEXT) \
+       random.$(OBJEXT) re.$(OBJEXT) regex.$(OBJEXT) \
+       replace.$(OBJEXT) str_array.$(OBJEXT) symbol.$(OBJEXT) \
+       version.$(OBJEXT)
 am_gawk_OBJECTS = $(am__objects_1)
 gawk_OBJECTS = $(am_gawk_OBJECTS)
 gawk_LDADD = $(LDADD)
@@ -518,6 +519,8 @@ base_sources = \
        gettext.h \
        int_array.c \
        interpret.h \
+       localeinfo.c \
+       localeinfo.h \
        io.c \
        mbsupport.h \
        main.c \
@@ -535,6 +538,7 @@ base_sources = \
        replace.c \
        str_array.c \
        symbol.c \
+       verify.h \
        version.c \
        xalloc.h
 
@@ -681,6 +685,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @address@hidden/$(DEPDIR)/address@hidden@
 @AMDEP_TRUE@@am__include@ @address@hidden/$(DEPDIR)/address@hidden@
 @AMDEP_TRUE@@am__include@ @address@hidden/$(DEPDIR)/address@hidden@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/localeinfo.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @address@hidden/$(DEPDIR)/address@hidden@
 @AMDEP_TRUE@@am__include@ @address@hidden/$(DEPDIR)/address@hidden@
 @AMDEP_TRUE@@am__include@ @address@hidden/$(DEPDIR)/address@hidden@
diff --git a/awk.h b/awk.h
index edd9cb9..2c40163 100644
--- a/awk.h
+++ b/awk.h
@@ -1656,6 +1656,7 @@ extern void resyntax(int syntax);
 extern void resetup(void);
 extern int reisstring(const char *text, size_t len, Regexp *re, const char *buf);
 extern int get_numbase(const char *str, bool use_locale);
+extern bool using_utf8(void);
 
 /* symbol.c */
 extern void load_symbols();
diff --git a/dfa.c b/dfa.c
index 85cb46a..fad03e4 100644
--- a/dfa.c
+++ b/dfa.c
@@ -69,6 +69,8 @@
 
 #include "dfa.h"
 
+#include "localeinfo.h"
+
 #ifdef GAWK
 static int
 is_blank (int c)
@@ -445,14 +447,9 @@ struct dfa
   size_t nregexps;              /* Count of parallel regexps being built
                                    with dfaparse.  */
   bool fast;                   /* The DFA is fast.  */
-  bool multibyte;              /* MB_CUR_MAX > 1.  */
   token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales.  */
   mbstate_t mbs;               /* Multibyte conversion state.  */
 
-  /* dfaexec implementation.  */
-  char *(*dfaexec) (struct dfa *, char const *, char *,
-                    bool, size_t *, bool *);
-
   /* The following are valid only if MB_CUR_MAX > 1.  */
 
   /* The value of multibyte_prop[i] is defined by following rule.
@@ -538,6 +535,21 @@ struct dfa
   state_num **mb_trans;      /* Transition tables for states with ANYCHAR.  */
   state_num mb_trcount;         /* Number of transition tables for states with
                                    ANYCHAR that have actually been built.  */
+
+  /* Information derived from the locale.  This is at the end so that
+     a quick memset need not clear it specially.  */
+
+  /* dfaexec implementation.  */
+  char *(*dfaexec) (struct dfa *, char const *, char *,
+                    bool, size_t *, bool *);
+
+  /* The locale is simple, like the C locale.  These locales can be
+     processed more efficiently, e.g., the relationship between lower-
+     and upper-case letters is 1-1.  */
+  bool simple_locale;
+
+  /* Other cached information derived from the locale.  */
+  struct localeinfo localeinfo;
 };
 
 /* Some macros for user access to dfa internals.  */
@@ -551,13 +563,8 @@ struct dfa
 
 static void regexp (struct dfa *dfa);
 
-/* A table indexed by byte values that contains the corresponding wide
-   character (if any) for that byte.  WEOF means the byte is not a
-   valid single-byte character.  */
-static wint_t mbrtowc_cache[NOTCHAR];
-
 /* Store into *PWC the result of converting the leading bytes of the
-   multibyte buffer S of length N bytes, using the mbrtowc_cache in *D
+   multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
    and updating the conversion state in *D.  On conversion error,
    convert just a single byte, to WEOF.  Return the number of bytes
    converted.
@@ -566,7 +573,7 @@ static wint_t mbrtowc_cache[NOTCHAR];
 
    * PWC points to wint_t, not to wchar_t.
    * The last arg is a dfa *D instead of merely a multibyte conversion
-     state D->mbs.  D also contains an mbrtowc_cache for speed.
+     state D->mbs.
    * N must be at least 1.
    * S[N - 1] must be a sentinel byte.
    * Shift encodings are not supported.
@@ -577,7 +584,7 @@ static size_t
 mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d)
 {
   unsigned char uc = s[0];
-  wint_t wc = mbrtowc_cache[uc];
+  wint_t wc = d->localeinfo.sbctowc[uc];
 
   if (wc == WEOF)
     {
@@ -754,7 +761,7 @@ maybe_realloc (void *ptr, size_t nitems, size_t *nalloc, size_t itemsize)
 
 /* In DFA D, find the index of charclass S, or allocate a new one.  */
 static size_t
-dfa_charclass_index (struct dfa *d, charclass const s)
+charclass_index (struct dfa *d, charclass const s)
 {
   size_t i;
 
@@ -769,9 +776,9 @@ dfa_charclass_index (struct dfa *d, charclass const s)
 }
 
 static bool
-unibyte_word_constituent (unsigned char c)
+unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
 {
-  return mbrtowc_cache[c] != WEOF && (isalnum (c) || (c) == '_');
+  return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
 }
 
 static int
@@ -779,68 +786,11 @@ char_context (struct dfa const *dfa, unsigned char c)
 {
   if (c == dfa->syntax.eolbyte)
     return CTX_NEWLINE;
-  if (unibyte_word_constituent (c))
+  if (unibyte_word_constituent (dfa, c))
     return CTX_LETTER;
   return CTX_NONE;
 }
 
-/* UTF-8 encoding allows some optimizations that we can't otherwise
-   assume in a multibyte encoding.  */
-static bool using_utf8;
-
-bool
-dfa_using_utf8 (void)
-{
-  return using_utf8;
-}
-
-static void
-init_mbrtowc_cache (void)
-{
-  int i;
-  for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
-    {
-      char c = i;
-      unsigned char uc = i;
-      mbstate_t s = { 0 };
-      wchar_t wc;
-      mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF;
-    }
-}
-
-/* Entry point to set syntax options.  */
-void
-dfasyntax (struct dfa *dfa, reg_syntax_t bits, bool fold, unsigned char eol)
-{
-  int i;
-  dfa->syntax.syntax_bits_set = true;
-  dfa->syntax.syntax_bits = bits;
-  dfa->syntax.case_fold = fold;
-  dfa->syntax.eolbyte = eol;
-
-  for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
-    {
-      unsigned char uc = i;
-
-      /* Use mbrtowc_cache to calculate sbit.  */
-      dfa->syntax.sbit[uc] = char_context (dfa, uc);
-      switch (dfa->syntax.sbit[uc])
-        {
-        case CTX_LETTER:
-          setbit (uc, dfa->syntax.letters);
-          break;
-        case CTX_NEWLINE:
-          setbit (uc, dfa->syntax.newline);
-          break;
-        }
-
-      /* POSIX requires that the five bytes in "\n\r./" (including the
-         terminating NUL) cannot occur inside a multibyte character.  */
-      dfa->syntax.never_trail[uc] = (using_utf8 ? (uc & 0xc0) != 0x80
-                                     : strchr ("\n\r./", uc) != NULL);
-    }
-}
-
 /* Set a bit in the charclass for the given wchar_t.  Do nothing if WC
    is represented by a multi-byte sequence.  Even for MB_CUR_MAX == 1,
    this may happen when folding case in weird Turkish locales where
@@ -869,30 +819,10 @@ setbit_case_fold_c (int b, charclass c)
       setbit (i, c);
 }
 
-static void check_utf8 (void)
-{
-  wchar_t wc;
-  mbstate_t mbs = { 0 };
-  using_utf8 = mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
-}
-
-static bool unibyte_c;
-
-static void check_unibyte_c (void)
-{
-  char const *locale = setlocale (LC_ALL, NULL);
-  unibyte_c = (!locale
-               || STREQ (locale, "C")
-               || STREQ (locale, "POSIX"));
-}
-
-/* The current locale is known to be a unibyte locale
-   without multicharacter collating sequences and where range
-   comparisons simply use the native encoding.  These locales can be
-   processed more efficiently.  */
+/* Return true if the locale is compatible with the C locale.  */
 
 static bool
-using_simple_locale (struct dfa const *dfa)
+using_simple_locale (bool multibyte)
 {
   /* The native character set is known to be compatible with
      the C locale.  The following test isn't perfect, but it's good
@@ -910,7 +840,15 @@ using_simple_locale (struct dfa const *dfa)
      && '}' == 125 && '~' == 126)
   };
 
-  return (native_c_charset & !dfa->multibyte) | unibyte_c;
+  if (native_c_charset && !multibyte)
+    return true;
+  else
+    {
+      /* Treat C and POSIX locales as being compatible.  Also, treat
+         errors as compatible, as these are invariably from stubs.  */
+      char const *loc = setlocale (LC_ALL, NULL);
+      return !loc || STREQ (loc, "C") || STREQ (loc, "POSIX");
+    }
 }
 
 /* Fetch the next lexical input character.  Set C (of type int) to the
@@ -946,53 +884,6 @@ using_simple_locale (struct dfa const *dfa)
 # define MIN(a,b) ((a) < (b) ? (a) : (b))
 #endif
 
-/* The set of wchar_t values C such that there's a useful locale
-   somewhere where C != towupper (C) && C != towlower (towupper (C)).
-   For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
-   towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
-   towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU).  */
-static short const lonesome_lower[] =
-  {
-    0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
-    0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
-
-    /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
-       counterpart in locales predating Unicode 4.0.0 (April 2003).  */
-    0x03F2,
-
-    0x03F5, 0x1E9B, 0x1FBE,
-  };
-
-/* Maximum number of characters that can be the case-folded
-   counterparts of a single character, not counting the character
-   itself.  This is 1 for towupper, 1 for towlower, and 1 for each
-   entry in LONESOME_LOWER.  */
-enum
-{ CASE_FOLDED_BUFSIZE = 2 + sizeof lonesome_lower / sizeof *lonesome_lower };
-
-/* Find the characters equal to C after case-folding, other than C
-   itself, and store them into FOLDED.  Return the number of characters
-   stored.  */
-static unsigned int
-case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
-{
-  unsigned int i;
-  unsigned int n = 0;
-  wint_t uc = towupper (c);
-  wint_t lc = towlower (uc);
-  if (uc != c)
-    folded[n++] = uc;
-  if (lc != uc && lc != c && towupper (lc) == uc)
-    folded[n++] = lc;
-  for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
-    {
-      wint_t li = lonesome_lower[i];
-      if (li != lc && li != uc && li != c && towupper (li) == uc)
-        folded[n++] = li;
-    }
-  return n;
-}
-
 typedef int predicate (int);
 
 /* The following list maps the names of the Posix named character classes
@@ -1061,7 +952,7 @@ parse_bracket_exp (struct dfa *dfa)
   size_t chars_al;
 
   chars_al = 0;
-  if (dfa->multibyte)
+  if (dfa->localeinfo.multibyte)
     {
       dfa->mbcsets = maybe_realloc (dfa->mbcsets, dfa->nmbcsets,
                                     &dfa->mbcsets_alloc,
@@ -1084,7 +975,7 @@ parse_bracket_exp (struct dfa *dfa)
     {
       FETCH_WC (dfa, c, wc, _("unbalanced ["));
       invert = true;
-      known_bracket_exp = using_simple_locale (dfa);
+      known_bracket_exp = dfa->simple_locale;
     }
   else
     invert = false;
@@ -1139,7 +1030,7 @@ parse_bracket_exp (struct dfa *dfa)
                   if (!pred)
                     dfaerror (_("invalid character class"));
 
-                  if (dfa->multibyte && !pred->single_byte_only)
+                  if (dfa->localeinfo.multibyte && !pred->single_byte_only)
                     known_bracket_exp = false;
                   else
                     for (c2 = 0; c2 < NOTCHAR; ++c2)
@@ -1199,9 +1090,9 @@ parse_bracket_exp (struct dfa *dfa)
               /* Treat [x-y] as a range if x != y.  */
               if (wc != wc2 || wc == WEOF)
                 {
-                  if (dfa->multibyte)
+                  if (dfa->localeinfo.multibyte)
                     known_bracket_exp = false;
-                  else if (using_simple_locale (dfa))
+                  else if (dfa->simple_locale)
                     {
                       int ci;
                       for (ci = c; ci <= c2; ci++)
@@ -1228,7 +1119,7 @@ parse_bracket_exp (struct dfa *dfa)
 
       colon_warning_state |= (c == ':') ? 2 : 4;
 
-      if (!dfa->multibyte)
+      if (!dfa->localeinfo.multibyte)
         {
           if (dfa->syntax.case_fold)
             setbit_case_fold_c (c, ccl);
@@ -1265,22 +1156,22 @@ parse_bracket_exp (struct dfa *dfa)
   if (! known_bracket_exp)
     return BACKREF;
 
-  if (dfa->multibyte)
+  if (dfa->localeinfo.multibyte)
     {
       work_mbc->invert = invert;
-      work_mbc->cset = emptyset (ccl) ? -1 : dfa_charclass_index (dfa, ccl);
+      work_mbc->cset = emptyset (ccl) ? -1 : charclass_index (dfa, ccl);
       return MBCSET;
     }
 
   if (invert)
     {
-      assert (!dfa->multibyte);
+      assert (!dfa->localeinfo.multibyte);
       notset (ccl);
       if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
         clrbit ('\n', ccl);
     }
 
-  return CSET + dfa_charclass_index (dfa, ccl);
+  return CSET + charclass_index (dfa, ccl);
 }
 
 struct lexptr
@@ -1535,7 +1426,7 @@ lex (struct dfa *dfa)
         case '.':
           if (backslash)
             goto normal_char;
-          if (dfa->multibyte)
+          if (dfa->localeinfo.multibyte)
             {
               /* In multibyte environment period must match with a single
                  character not a byte.  So we use ANYCHAR.  */
@@ -1549,13 +1440,13 @@ lex (struct dfa *dfa)
           if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
             clrbit ('\0', ccl);
           dfa->lex.laststart = false;
-          return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+          return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
 
         case 's':
         case 'S':
           if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
             goto normal_char;
-          if (!dfa->multibyte)
+          if (!dfa->localeinfo.multibyte)
             {
               zeroset (ccl);
               for (c2 = 0; c2 < NOTCHAR; ++c2)
@@ -1564,7 +1455,7 @@ lex (struct dfa *dfa)
               if (c == 'S')
                 notset (ccl);
               dfa->lex.laststart = false;
-              return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+              return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
             }
 
           /* FIXME: see if optimizing this, as is done with ANYCHAR and
@@ -1588,16 +1479,16 @@ lex (struct dfa *dfa)
           if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
             goto normal_char;
 
-          if (!dfa->multibyte)
+          if (!dfa->localeinfo.multibyte)
             {
               zeroset (ccl);
               for (c2 = 0; c2 < NOTCHAR; ++c2)
-                if (unibyte_word_constituent (c2))
+                if (unibyte_word_constituent (dfa, c2))
                   setbit (c2, ccl);
               if (c == 'W')
                 notset (ccl);
               dfa->lex.laststart = false;
-              return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+              return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
             }
 
           /* FIXME: see if optimizing this, as is done with ANYCHAR and
@@ -1627,14 +1518,14 @@ lex (struct dfa *dfa)
           dfa->lex.laststart = false;
           /* For multibyte character sets, folding is done in atom.  Always
              return WCHAR.  */
-          if (dfa->multibyte)
+          if (dfa->localeinfo.multibyte)
             return dfa->lex.lasttok = WCHAR;
 
           if (dfa->syntax.case_fold && isalpha (c))
             {
               zeroset (ccl);
               setbit_case_fold_c (c, ccl);
-              return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+              return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
             }
 
           return dfa->lex.lasttok = c;
@@ -1654,11 +1545,11 @@ addtok_mb (struct dfa *dfa, token t, int mbprop)
     {
       dfa->tokens = x2nrealloc (dfa->tokens, &dfa->talloc,
                                 sizeof *dfa->tokens);
-      if (dfa->multibyte)
+      if (dfa->localeinfo.multibyte)
         dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc,
                                          sizeof *dfa->multibyte_prop);
     }
-  if (dfa->multibyte)
+  if (dfa->localeinfo.multibyte)
     dfa->multibyte_prop[dfa->tindex] = mbprop;
   dfa->tokens[dfa->tindex++] = t;
 
@@ -1695,7 +1586,7 @@ static void addtok_wc (struct dfa *dfa, wint_t wc);
 static void
 addtok (struct dfa *dfa, token t)
 {
-  if (dfa->multibyte && t == MBCSET)
+  if (dfa->localeinfo.multibyte && t == MBCSET)
     {
       bool need_or = false;
       struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1];
@@ -1794,7 +1685,7 @@ add_utf8_anychar (struct dfa *dfa)
             if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
               clrbit ('\0', c);
           }
-        dfa->utf8_anychar_classes[i] = CSET + dfa_charclass_index (dfa, c);
+        dfa->utf8_anychar_classes[i] = CSET + charclass_index (dfa, c);
       }
 
   /* A valid UTF-8 character is
@@ -1878,7 +1769,7 @@ atom (struct dfa *dfa)
 
       dfa->parse.tok = lex (dfa);
     }
-  else if (dfa->parse.tok == ANYCHAR && using_utf8)
+  else if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
     {
       /* For UTF-8 expand the period to a series of CSETs that define a valid
          UTF-8 character.  This avoids using the slow multibyte path.  I'm
@@ -1939,7 +1830,7 @@ copytoks (struct dfa *dfa, size_t tindex, size_t ntokens)
 {
   size_t i;
 
-  if (dfa->multibyte)
+  if (dfa->localeinfo.multibyte)
     for (i = 0; i < ntokens; ++i)
       addtok_mb (dfa, dfa->tokens[tindex + i], dfa->multibyte_prop[tindex + i]);
   else
@@ -2025,7 +1916,7 @@ dfaparse (char const *s, size_t len, struct dfa *d)
   d->lex.lasttok = END;
   d->lex.laststart = true;
   d->lex.parens = 0;
-  if (d->multibyte)
+  if (d->localeinfo.multibyte)
     {
       d->lex.cur_mb_len = 0;
       memset (&d->mbs, 0, sizeof d->mbs);
@@ -2214,7 +2105,7 @@ state_index (struct dfa *d, position_set const *s, int context)
         }
       else if (d->tokens[s->elems[j].index] == BACKREF)
         constraint = NO_CONSTRAINT;
-      if (d->multibyte && d->tokens[s->elems[j].index] == ANYCHAR)
+      if (d->localeinfo.multibyte && d->tokens[s->elems[j].index] == ANYCHAR)
         {
           int acceptable
             = ((SUCCEEDS_IN_CONTEXT (c, context, CTX_NEWLINE)
@@ -2691,7 +2582,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
         setbit (d->tokens[pos.index], matches);
       else if (d->tokens[pos.index] >= CSET)
         copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
-      else if (d->multibyte && d->tokens[pos.index] == ANYCHAR)
+      else if (d->localeinfo.multibyte && d->tokens[pos.index] == ANYCHAR)
         {
           /* ANYCHAR must match a single character, so put it to
              D->states[s].mbps which contains the positions which can
@@ -2837,7 +2728,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
         state_letter = state;
 
       for (i = 0; i < NOTCHAR; ++i)
-        trans[i] = unibyte_word_constituent (i) ? state_letter : state;
+        trans[i] = unibyte_word_constituent (d, i) ? state_letter : state;
       trans[d->syntax.eolbyte] = state_newline;
     }
   else
@@ -2854,7 +2745,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
         for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k)
           insert (d->follows[grps[i].elems[j]].elems[k], &follows);
 
-      if (d->multibyte)
+      if (d->localeinfo.multibyte)
         {
           /* If a token in follows.elems is not 1st byte of a multibyte
              character, or the states of follows must accept the bytes
@@ -2887,7 +2778,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
 
       /* If we are building a searching matcher, throw in the positions
          of state 0 as well.  */
-      if (d->searchflag && (!d->multibyte || !next_isnt_1st_byte))
+      if (d->searchflag && (!d->localeinfo.multibyte || !next_isnt_1st_byte))
         {
           merge (&d->states[0].elems, &follows, &tmp);
           copy (&tmp, &follows);
@@ -2943,7 +2834,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
 
               if (c == d->syntax.eolbyte)
                 trans[c] = state_newline;
-              else if (unibyte_word_constituent (c))
+              else if (unibyte_word_constituent (d, c))
                 trans[c] = state_letter;
               else if (c < NOTCHAR)
                 trans[c] = state;
@@ -2984,7 +2875,7 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state)
       d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails);
       d->success = xnrealloc (d->success, newalloc, sizeof *d->success);
       d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines);
-      if (d->multibyte)
+      if (d->localeinfo.multibyte)
         {
           realtrans = d->mb_trans ? d->mb_trans - 1 : NULL;
           realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans);
@@ -2996,7 +2887,7 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state)
         {
           d->trans[oldalloc] = NULL;
           d->fails[oldalloc] = NULL;
-          if (d->multibyte)
+          if (d->localeinfo.multibyte)
             d->mb_trans[oldalloc] = NULL;
         }
     }
@@ -3030,7 +2921,7 @@ build_state (state_num s, struct dfa *d)
         }
       d->trcount = d->min_trcount;
 
-      if (d->multibyte)
+      if (d->localeinfo.multibyte)
         {
           for (i = d->min_trcount; i < d->tralloc; i++)
             {
@@ -3481,7 +3372,7 @@ dfaexec_noop (struct dfa *d, char const *begin, char *end,
   return (char *) begin;
 }
 
-/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->multibyte),
+/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
    but faster and set *BACKREF if the DFA code does not support this
    regexp usage.  */
 
@@ -3539,7 +3430,7 @@ dfa_supported (struct dfa const *d)
         case ENDWORD:
         case LIMWORD:
         case NOTLIMWORD:
-          if (!d->multibyte)
+          if (!d->localeinfo.multibyte)
             continue;
           /* fallthrough */
 
@@ -3557,7 +3448,7 @@ dfaoptimize (struct dfa *d)
   size_t i;
   bool have_backref = false;
 
-  if (!using_utf8)
+  if (!d->localeinfo.using_utf8)
     return;
 
   for (i = 0; i < d->tindex; ++i)
@@ -3587,7 +3478,7 @@ dfaoptimize (struct dfa *d)
     }
 
   free_mbdata (d);
-  d->multibyte = false;
+  d->localeinfo.multibyte = false;
   d->dfaexec = dfaexec_sb;
   d->fast = true;
 }
@@ -3602,7 +3493,7 @@ dfassbuild (struct dfa *d)
   struct dfa *sup = dfaalloc ();
 
   *sup = *d;
-  sup->multibyte = false;
+  sup->localeinfo.multibyte = false;
   sup->dfaexec = dfaexec_sb;
   sup->multibyte_prop = NULL;
   sup->mbcsets = NULL;
@@ -3635,7 +3526,7 @@ dfassbuild (struct dfa *d)
         case BACKREF:
           zeroset (ccl);
           notset (ccl);
-          sup->tokens[j++] = CSET + dfa_charclass_index (sup, ccl);
+          sup->tokens[j++] = CSET + charclass_index (sup, ccl);
           sup->tokens[j++] = STAR;
           if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
               || d->tokens[i + 1] == PLUS)
@@ -3646,7 +3537,7 @@ dfassbuild (struct dfa *d)
         case ENDWORD:
         case LIMWORD:
         case NOTLIMWORD:
-          if (d->multibyte)
+          if (d->localeinfo.multibyte)
             {
               /* These constraints aren't supported in a multibyte locale.
                  Ignore them in the superset DFA.  */
@@ -3663,7 +3554,7 @@ dfassbuild (struct dfa *d)
     }
   sup->tindex = j;
 
-  if (have_nchar && (have_achar || d->multibyte))
+  if (have_nchar && (have_achar || d->localeinfo.multibyte))
     d->superset = sup;
   else
     {
@@ -3705,7 +3596,7 @@ dfafree (struct dfa *d)
   free (d->charclasses);
   free (d->tokens);
 
-  if (d->multibyte)
+  if (d->localeinfo.multibyte)
     free_mbdata (d);
 
   for (i = 0; i < d->sindex; ++i)
@@ -4227,20 +4118,49 @@ dfamustfree (struct dfamust *dm)
 struct dfa *
 dfaalloc (void)
 {
-  struct dfa *d = xzalloc (sizeof *d);
-  d->multibyte = MB_CUR_MAX > 1;
-  d->dfaexec = d->multibyte ? dfaexec_mb : dfaexec_sb;
-  d->fast = !d->multibyte;
-  d->lex.cur_mb_len = 1;
-  return d;
+  return xmalloc (sizeof (struct dfa));
 }
 
+/* Initialize DFA.  */
 void
-dfa_init (void)
+dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
+           reg_syntax_t bits, bool fold, unsigned char eol)
 {
-  check_utf8 ();
-  check_unibyte_c ();
-  init_mbrtowc_cache ();
+  int i;
+  memset (dfa, 0, offsetof (struct dfa, dfaexec));
+  dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
+  dfa->simple_locale = using_simple_locale (linfo->multibyte);
+  dfa->localeinfo = *linfo;
+
+  dfa->fast = !dfa->localeinfo.multibyte;
+
+  dfa->lex.cur_mb_len = 1;
+  dfa->syntax.syntax_bits_set = true;
+  dfa->syntax.syntax_bits = bits;
+  dfa->syntax.case_fold = fold;
+  dfa->syntax.eolbyte = eol;
+
+  for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
+    {
+      unsigned char uc = i;
+
+      dfa->syntax.sbit[uc] = char_context (dfa, uc);
+      switch (dfa->syntax.sbit[uc])
+        {
+        case CTX_LETTER:
+          setbit (uc, dfa->syntax.letters);
+          break;
+        case CTX_NEWLINE:
+          setbit (uc, dfa->syntax.newline);
+          break;
+        }
+
+      /* POSIX requires that the five bytes in "\n\r./" (including the
+         terminating NUL) cannot occur inside a multibyte character.  */
+      dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
+                                     ? (uc & 0xc0) != 0x80
+                                     : strchr ("\n\r./", uc) != NULL);
+    }
 }
 
 /* vim:set shiftwidth=2: */
diff --git a/dfa.h b/dfa.h
index 02f56f4..1fd37ec 100644
--- a/dfa.h
+++ b/dfa.h
@@ -28,6 +28,8 @@
 
 #define _GL_ATTRIBUTE_MALLOC
 
+struct localeinfo; /* See localeinfo.h.  */
+
 /* Element of a list of strings, at least one of which is known to
    appear in any R.E. matching the DFA. */
 struct dfamust
@@ -48,17 +50,22 @@ struct dfa;
    calling dfafree() on it. */
 extern struct dfa *dfaalloc (void) _GL_ATTRIBUTE_MALLOC;
 
+/* Initialize or reinitialize a DFA.  This must be called before
+   any of the routines below.  The arguments are:
+   1. The DFA to operate on.
+   2. Information about the current locale.
+   3. The syntax bits described earlier in this file.
+   4. The case-folding flag.
+   5. The line terminator.  */
+extern void dfasyntax (struct dfa *, struct localeinfo const *,
+                       reg_syntax_t, bool, unsigned char);
+
 /* Build and return the struct dfamust from the given struct dfa. */
 extern struct dfamust *dfamust (struct dfa const *);
 
 /* Free the storage held by the components of a struct dfamust. */
 extern void dfamustfree (struct dfamust *);
 
-/* dfasyntax() takes four arguments; the first is the dfa to operate on, the
-   second sets the syntax bits described earlier in this file, the third sets
-   the case-folding flag, and the fourth specifies the line terminator. */
-extern void dfasyntax (struct dfa *, reg_syntax_t, bool, unsigned char);
-
 /* Compile the given string of the given length into the given struct dfa.
    Final argument is a flag specifying whether to build a searching or an
    exact matcher. */
@@ -103,8 +110,3 @@ extern void dfawarn (const char *);
    takes a single argument, a NUL-terminated string describing the error.
    The user must supply a dfaerror.  */
 extern _Noreturn void dfaerror (const char *);
-
-extern bool dfa_using_utf8 (void) _GL_ATTRIBUTE_PURE;
-
-/* This must be called before calling any of the above dfa*() functions. */
-extern void dfa_init (void);
diff --git a/localeinfo.c b/localeinfo.c
new file mode 100644
index 0000000..ca96afc
--- /dev/null
+++ b/localeinfo.c
@@ -0,0 +1,113 @@
+/* locale information
+
+   Copyright 2016 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+   02110-1301, USA.  */
+
+/* Written by Paul Eggert.  */
+
+#include <config.h>
+
+#include <localeinfo.h>
+
+#include <verify.h>
+
+#include <limits.h>
+#include <locale.h>
+#include <stdlib.h>
+#include <string.h>
+#include <wctype.h>
+
+/* The sbclen implementation relies on this.  */
+verify (MB_LEN_MAX <= SCHAR_MAX);
+
+/* Return true if the locale uses UTF-8.  */
+
+static bool
+is_using_utf8 (void)
+{
+  wchar_t wc;
+  mbstate_t mbs = {0};
+  return mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
+}
+
+/* Initialize *LOCALEINFO from the current locale.  */
+
+void
+init_localeinfo (struct localeinfo *localeinfo)
+{
+  int i;
+
+  localeinfo->multibyte = MB_CUR_MAX > 1;
+  localeinfo->using_utf8 = is_using_utf8 ();
+
+  for (i = CHAR_MIN; i <= CHAR_MAX; i++)
+    {
+      char c = i;
+      unsigned char uc = i;
+      mbstate_t s = {0};
+      wchar_t wc;
+      size_t len = mbrtowc (&wc, &c, 1, &s);
+      localeinfo->sbclen[uc] = len <= 1 ? 1 : - (int) - len;
+      localeinfo->sbctowc[uc] = len <= 1 ? wc : WEOF;
+    }
+}
+
+/* The set of wchar_t values C such that there's a useful locale
+   somewhere where C != towupper (C) && C != towlower (towupper (C)).
+   For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
+   towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
+   towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU).  */
+static short const lonesome_lower[] =
+  {
+    0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
+    0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
+
+    /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
+       counterpart in locales predating Unicode 4.0.0 (April 2003).  */
+    0x03F2,
+
+    0x03F5, 0x1E9B, 0x1FBE,
+  };
+
+/* Verify that the worst case fits.  This is 1 for towupper, 1 for
+   towlower, and 1 for each entry in LONESOME_LOWER.  */
+verify (1 + 1 + sizeof lonesome_lower / sizeof *lonesome_lower
+        <= CASE_FOLDED_BUFSIZE);
+
+/* Find the characters equal to C after case-folding, other than C
+   itself, and store them into FOLDED.  Return the number of characters
+   stored.  */
+
+int
+case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
+{
+  int i;
+  int n = 0;
+  wint_t uc = towupper (c);
+  wint_t lc = towlower (uc);
+  if (uc != c)
+    folded[n++] = uc;
+  if (lc != uc && lc != c && towupper (lc) == uc)
+    folded[n++] = lc;
+  for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
+    {
+      wint_t li = lonesome_lower[i];
+      if (li != lc && li != uc && li != c && towupper (li) == uc)
+        folded[n++] = li;
+    }
+  return n;
+}
diff --git a/localeinfo.h b/localeinfo.h
new file mode 100644
index 0000000..cf2f9a6
--- /dev/null
+++ b/localeinfo.h
@@ -0,0 +1,54 @@
+/* locale information
+
+   Copyright 2016 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+   02110-1301, USA.  */
+
+/* Written by Paul Eggert.  */
+
+#include <limits.h>
+#include <stdbool.h>
+#include <wchar.h>
+
+struct localeinfo
+{
+  /* MB_CUR_MAX > 1.  */
+  bool multibyte;
+
+  /* The locale uses UTF-8.  */
+  bool using_utf8;
+
+  /* An array indexed by byte values B that contains 1 if B is a
+     single-byte character, -1 if B is an encoding error, and -2 if B
+     is the leading byte of a multibyte character that contains more
+     than one byte.  */
+  signed char sbclen[UCHAR_MAX + 1];
+
+  /* An array indexed by byte values B that contains the corresponding
+     wide character (if any) for B if sbclen[B] == 1.  WEOF means the
+     byte is not a valid single-byte character, i.e., sbclen[B] == -1
+     or -2.  */
+  wint_t sbctowc[UCHAR_MAX + 1];
+};
+
+extern void init_localeinfo (struct localeinfo *);
+
+/* Maximum number of characters that can be the case-folded
+   counterparts of a single character, not counting the character
+   itself.  This is a generous upper bound.  */
+enum { CASE_FOLDED_BUFSIZE = 32 };
+
+extern int case_folded_counterparts (wchar_t, wchar_t[CASE_FOLDED_BUFSIZE]);
diff --git a/node.c b/node.c
index c6c9af8..bb2fe43 100644
--- a/node.c
+++ b/node.c
@@ -752,7 +752,7 @@ str2wstr(NODE *n, size_t **ptr)
                         * stopping early. This is particularly important
                         * for match() where we need to build the indices.
                         */
-                       if (dfa_using_utf8()) {
+                       if (using_utf8()) {
                                count = 1;
                                wc = 0xFFFD;    /* unicode replacement character */
                                goto set_wc;
diff --git a/re.c b/re.c
index c822c90..6a100db 100644
--- a/re.c
+++ b/re.c
@@ -25,10 +25,14 @@
 
 #include "awk.h"
 
+#include "localeinfo.h"
+
 static reg_syntax_t syn;
 static void check_bracket_exp(char *s, size_t len);
 const char *regexflags2str(int flags);
 
+static struct localeinfo localeinfo;
+
 /* make_regexp --- generate compiled regular expressions */
 
 Regexp *
@@ -223,7 +227,7 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
        rp->pat.newline_anchor = false; /* don't get \n in middle of string */
        if (dfa && ! no_dfa) {
                rp->dfareg = dfaalloc();
-               dfasyntax(rp->dfareg, dfa_syn, ignorecase, '\n');
+               dfasyntax(rp->dfareg, & localeinfo, dfa_syn, ignorecase, '\n');
                dfacomp(buf, len, rp->dfareg, true);
        } else
                rp->dfareg = NULL;
@@ -395,6 +399,9 @@ re_update(NODE *t)
 void
 resetup()
 {
+       // init localeinfo for dfa
+       init_localeinfo(& localeinfo);
+
        /*
         * Syntax bits: _that_ is yet another mind trip.  Recreational drugs
         * are helpful for recovering from the experience.
@@ -418,8 +425,14 @@ resetup()
                syn |= RE_INTERVALS | RE_INVALID_INTERVAL_ORD | RE_NO_BK_BRACES;
 
        (void) re_set_syntax(syn);
+}
+
+/* using_utf8 --- are we using utf8 */
 
-       dfa_init();
+bool
+using_utf8(void)
+{
+       return localeinfo.using_utf8;
 }
 
 /* reisstring --- return true if the RE match is a simple string match */
diff --git a/verify.h b/verify.h
new file mode 100644
index 0000000..5c8381d
--- /dev/null
+++ b/verify.h
@@ -0,0 +1,279 @@
+/* Compile-time assert-like macros.
+
+   Copyright (C) 2005-2006, 2009-2016 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/* Written by Paul Eggert, Bruno Haible, and Jim Meyering.  */
+
+#ifndef _GL_VERIFY_H
+#define _GL_VERIFY_H
+
+
+/* Define _GL_HAVE__STATIC_ASSERT to 1 if _Static_assert works as per C11.
+   This is supported by GCC 4.6.0 and later, in C mode, and its use
+   here generates easier-to-read diagnostics when verify (R) fails.
+
+   Define _GL_HAVE_STATIC_ASSERT to 1 if static_assert works as per C++11.
+   This will likely be supported by future GCC versions, in C++ mode.
+
+   Use this only with GCC.  If we were willing to slow 'configure'
+   down we could also use it with other compilers, but since this
+   affects only the quality of diagnostics, why bother?  */
+#if (4 < __GNUC__ + (6 <= __GNUC_MINOR__) \
+     && (201112L <= __STDC_VERSION__  || !defined __STRICT_ANSI__) \
+     && !defined __cplusplus)
+# define _GL_HAVE__STATIC_ASSERT 1
+#endif
+/* The condition (99 < __GNUC__) is temporary, until we know about the
+   first G++ release that supports static_assert.  */
+#if (99 < __GNUC__) && defined __cplusplus
+# define _GL_HAVE_STATIC_ASSERT 1
+#endif
+
+/* FreeBSD 9.1 <sys/cdefs.h>, included by <stddef.h> and lots of other
+   system headers, defines a conflicting _Static_assert that is no
+   better than ours; override it.  */
+#ifndef _GL_HAVE_STATIC_ASSERT
+# include <stddef.h>
+# undef _Static_assert
+#endif
+
+/* Each of these macros verifies that its argument R is nonzero.  To
+   be portable, R should be an integer constant expression.  Unlike
+   assert (R), there is no run-time overhead.
+
+   If _Static_assert works, verify (R) uses it directly.  Similarly,
+   _GL_VERIFY_TRUE works by packaging a _Static_assert inside a struct
+   that is an operand of sizeof.
+
+   The code below uses several ideas for C++ compilers, and for C
+   compilers that do not support _Static_assert:
+
+   * The first step is ((R) ? 1 : -1).  Given an expression R, of
+     integral or boolean or floating-point type, this yields an
+     expression of integral type, whose value is later verified to be
+     constant and nonnegative.
+
+   * Next this expression W is wrapped in a type
+     struct _gl_verify_type {
+       unsigned int _gl_verify_error_if_negative: W;
+     }.
+     If W is negative, this yields a compile-time error.  No compiler can
+     deal with a bit-field of negative size.
+
+     One might think that an array size check would have the same
+     effect, that is, that the type struct { unsigned int dummy[W]; }
+     would work as well.  However, inside a function, some compilers
+     (such as C++ compilers and GNU C) allow local parameters and
+     variables inside array size expressions.  With these compilers,
+     an array size check would not properly diagnose this misuse of
+     the verify macro:
+
+       void function (int n) { verify (n < 0); }
+
+   * For the verify macro, the struct _gl_verify_type will need to
+     somehow be embedded into a declaration.  To be portable, this
+     declaration must declare an object, a constant, a function, or a
+     typedef name.  If the declared entity uses the type directly,
+     such as in
+
+       struct dummy {...};
+       typedef struct {...} dummy;
+       extern struct {...} *dummy;
+       extern void dummy (struct {...} *);
+       extern struct {...} *dummy (void);
+
+     two uses of the verify macro would yield colliding declarations
+     if the entity names are not disambiguated.  A workaround is to
+     attach the current line number to the entity name:
+
+       #define _GL_CONCAT0(x, y) x##y
+       #define _GL_CONCAT(x, y) _GL_CONCAT0 (x, y)
+       extern struct {...} * _GL_CONCAT (dummy, __LINE__);
+
+     But this has the problem that two invocations of verify from
+     within the same macro would collide, since the __LINE__ value
+     would be the same for both invocations.  (The GCC __COUNTER__
+     macro solves this problem, but is not portable.)
+
+     A solution is to use the sizeof operator.  It yields a number,
+     getting rid of the identity of the type.  Declarations like
+
+       extern int dummy [sizeof (struct {...})];
+       extern void dummy (int [sizeof (struct {...})]);
+       extern int (*dummy (void)) [sizeof (struct {...})];
+
+     can be repeated.
+
+   * Should the implementation use a named struct or an unnamed struct?
+     Which of the following alternatives can be used?
+
+       extern int dummy [sizeof (struct {...})];
+       extern int dummy [sizeof (struct _gl_verify_type {...})];
+       extern void dummy (int [sizeof (struct {...})]);
+       extern void dummy (int [sizeof (struct _gl_verify_type {...})]);
+       extern int (*dummy (void)) [sizeof (struct {...})];
+       extern int (*dummy (void)) [sizeof (struct _gl_verify_type {...})];
+
+     In the second and sixth case, the struct type is exported to the
+     outer scope; two such declarations therefore collide.  GCC warns
+     about the first, third, and fourth cases.  So the only remaining
+     possibility is the fifth case:
+
+       extern int (*dummy (void)) [sizeof (struct {...})];
+
+   * GCC warns about duplicate declarations of the dummy function if
+     -Wredundant-decls is used.  GCC 4.3 and later have a builtin
+     __COUNTER__ macro that can let us generate unique identifiers for
+     each dummy function, to suppress this warning.
+
+   * This implementation exploits the fact that older versions of GCC,
+     which do not support _Static_assert, also do not warn about the
+     last declaration mentioned above.
+
+   * GCC warns if -Wnested-externs is enabled and verify() is used
+     within a function body; but inside a function, you can always
+     arrange to use verify_expr() instead.
+
+   * In C++, any struct definition inside sizeof is invalid.
+     Use a template type to work around the problem.  */
+
+/* Concatenate two preprocessor tokens.  */
+#define _GL_CONCAT(x, y) _GL_CONCAT0 (x, y)
+#define _GL_CONCAT0(x, y) x##y
+
+/* _GL_COUNTER is an integer, preferably one that changes each time we
+   use it.  Use __COUNTER__ if it works, falling back on __LINE__
+   otherwise.  __LINE__ isn't perfect, but it's better than a
+   constant.  */
+#if defined __COUNTER__ && __COUNTER__ != __COUNTER__
+# define _GL_COUNTER __COUNTER__
+#else
+# define _GL_COUNTER __LINE__
+#endif
+
+/* Generate a symbol with the given prefix, making it unique if
+   possible.  */
+#define _GL_GENSYM(prefix) _GL_CONCAT (prefix, _GL_COUNTER)
+
+/* Verify requirement R at compile-time, as an integer constant expression
+   that returns 1.  If R is false, fail at compile-time, preferably
+   with a diagnostic that includes the string-literal DIAGNOSTIC.  */
+
+#define _GL_VERIFY_TRUE(R, DIAGNOSTIC) \
+   (!!sizeof (_GL_VERIFY_TYPE (R, DIAGNOSTIC)))
+
+#ifdef __cplusplus
+# if !GNULIB_defined_struct__gl_verify_type
+template <int w>
+  struct _gl_verify_type {
+    unsigned int _gl_verify_error_if_negative: w;
+  };
+#  define GNULIB_defined_struct__gl_verify_type 1
+# endif
+# define _GL_VERIFY_TYPE(R, DIAGNOSTIC) \
+    _gl_verify_type<(R) ? 1 : -1>
+#elif defined _GL_HAVE__STATIC_ASSERT
+# define _GL_VERIFY_TYPE(R, DIAGNOSTIC) \
+    struct {                                   \
+      _Static_assert (R, DIAGNOSTIC);          \
+      int _gl_dummy;                          \
+    }
+#else
+# define _GL_VERIFY_TYPE(R, DIAGNOSTIC) \
+    struct { unsigned int _gl_verify_error_if_negative: (R) ? 1 : -1; }
+#endif
+
+/* Verify requirement R at compile-time, as a declaration without a
+   trailing ';'.  If R is false, fail at compile-time, preferably
+   with a diagnostic that includes the string-literal DIAGNOSTIC.
+
+   Unfortunately, unlike C11, this implementation must appear as an
+   ordinary declaration, and cannot appear inside struct { ... }.  */
+
+#ifdef _GL_HAVE__STATIC_ASSERT
+# define _GL_VERIFY _Static_assert
+#else
+# define _GL_VERIFY(R, DIAGNOSTIC)                                    \
+    extern int (*_GL_GENSYM (_gl_verify_function) (void))             \
+      [_GL_VERIFY_TRUE (R, DIAGNOSTIC)]
+#endif
+
+/* _GL_STATIC_ASSERT_H is defined if this code is copied into assert.h.  */
+#ifdef _GL_STATIC_ASSERT_H
+# if !defined _GL_HAVE__STATIC_ASSERT && !defined _Static_assert
+#  define _Static_assert(R, DIAGNOSTIC) _GL_VERIFY (R, DIAGNOSTIC)
+# endif
+# if !defined _GL_HAVE_STATIC_ASSERT && !defined static_assert
+#  define static_assert _Static_assert /* C11 requires this #define.  */
+# endif
+#endif
+
+/* @assert.h omit start@  */
+
+/* Each of these macros verifies that its argument R is nonzero.  To
+   be portable, R should be an integer constant expression.  Unlike
+   assert (R), there is no run-time overhead.
+
+   There are two macros, since no single macro can be used in all
+   contexts in C.  verify_true (R) is for scalar contexts, including
+   integer constant expression contexts.  verify (R) is for declaration
+   contexts, e.g., the top level.  */
+
+/* Verify requirement R at compile-time, as an integer constant expression.
+   Return 1.  This is equivalent to verify_expr (R, 1).
+
+   verify_true is obsolescent; please use verify_expr instead.  */
+
+#define verify_true(R) _GL_VERIFY_TRUE (R, "verify_true (" #R ")")
+
+/* Verify requirement R at compile-time.  Return the value of the
+   expression E.  */
+
+#define verify_expr(R, E) \
+   (_GL_VERIFY_TRUE (R, "verify_expr (" #R ", " #E ")") ? (E) : (E))
+
+/* Verify requirement R at compile-time, as a declaration without a
+   trailing ';'.  */
+
+#define verify(R) _GL_VERIFY (R, "verify (" #R ")")
+
+#ifndef __has_builtin
+# define __has_builtin(x) 0
+#endif
+
+/* Assume that R always holds.  This lets the compiler optimize
+   accordingly.  R should not have side-effects; it may or may not be
+   evaluated.  Behavior is undefined if R is false.  */
+
+#if (__has_builtin (__builtin_unreachable) \
+     || 4 < __GNUC__ + (5 <= __GNUC_MINOR__))
+# define assume(R) ((R) ? (void) 0 : __builtin_unreachable ())
+#elif 1200 <= _MSC_VER
+# define assume(R) __assume (R)
+#elif ((defined GCC_LINT || defined lint) \
+       && (__has_builtin (__builtin_trap) \
+           || 3 < __GNUC__ + (3 < __GNUC_MINOR__ + (4 <= __GNUC_PATCHLEVEL__))))
+  /* Doing it this way helps various packages when configured with
+     --enable-gcc-warnings, which compiles with -Dlint.  It's nicer
+     when 'assume' silences warnings even with older GCCs.  */
+# define assume(R) ((R) ? (void) 0 : __builtin_trap ())
+#else
+# define assume(R) ((void) (0 && (R)))
+#endif
+
+/* @assert.h omit end@  */
+
+#endif

-----------------------------------------------------------------------

Summary of changes:
 ChangeLog    |   18 +++-
 Makefile.am  |    3 +
 Makefile.in  |   13 ++-
 awk.h        |    1 +
 dfa.c        |  316 ++++++++++++++++++++++------------------------------------
 dfa.h        |   22 ++--
 localeinfo.c |  113 +++++++++++++++++++++
 localeinfo.h |   54 ++++++++++
 node.c       |    2 +-
 re.c         |   17 +++-
 verify.h     |  279 +++++++++++++++++++++++++++++++++++++++++++++++++++
 11 files changed, 622 insertions(+), 216 deletions(-)
 create mode 100644 localeinfo.c
 create mode 100644 localeinfo.h
 create mode 100644 verify.h


hooks/post-receive
-- 
gawk



reply via email to

[Prev in Thread] Current Thread [Next in Thread]