[PATCH 4/8] parsers: don't double escape tnames

bison-patches
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 4/8] parsers: don't double escape tnames

From:	Akim Demaille
Subject:	[PATCH 4/8] parsers: don't double escape tnames
Date:	Sat, 29 Dec 2018 17:30:23 +0100
Until now, we escaped the token aliases before saving them into
yytname.  As a consequence, we had to introduce yytnamerr to strip
this escaping when "it's useless".

Unfortunately, our escaping is too aggressive: for instance, it
destroys UTF-8 symbols (foreign languages, mathematical symbols, etc.).
Let's stop quoting these symbols.

* src/output.c (prepare_symbols): Don't escape symbols twice.
* data/skeletons/glr.c, data/skeletons/lalr1.cc,
* data/skeletons/lalr1.d, data/skeletons/lalr1.java,
* data/skeletons/yacc.c
(yytnamerr): Don't de-quote these symbols.

* tests/javapush.at, tests/regression.at: Adjust expectations.
* tests/regression.at (Token definitions): Make sure we preserve
non-ASCII symbols.
---
 data/skeletons/glr.c      | 71 ++++++++++-----------------------------
 data/skeletons/lalr1.cc   | 29 +---------------
 data/skeletons/lalr1.d    | 31 ++---------------
 data/skeletons/lalr1.java | 34 +++----------------
 data/skeletons/yacc.c     | 39 ++-------------------
 src/output.c              |  5 ++-
 tests/javapush.at         | 64 +++++++++++++++++------------------
 tests/regression.at       | 18 ++++------
 8 files changed, 69 insertions(+), 222 deletions(-)

diff --git a/data/skeletons/glr.c b/data/skeletons/glr.c
index ef26c391..02438887 100644
--- a/data/skeletons/glr.c
+++ b/data/skeletons/glr.c
@@ -534,6 +534,20 @@ static void yypdumpstack (struct yyGLRStack* yystackp)
   } while (0)
 #endif
 
+/** Grammar symbol */
+typedef int yySymbol;
+
+#if ]b4_api_PREFIX[DEBUG || YYERROR_VERBOSE
+/** A printable representation of TOKEN.  */
+static inline const char*
+yytokenName (yySymbol yytoken)
+{
+  if (yytoken == YYEMPTY)
+    return "";
+  else
+    return yytname[yytoken];
+}
+#endif
 
 #if YYERROR_VERBOSE
 
@@ -558,50 +572,14 @@ yystpcpy (char *yydest, const char *yysrc)
 # endif
 
 # ifndef yytnamerr
-/* Copy to YYRES the name of YYTOKEN after stripping away unnecessary
-   quotes and backslashes, so that it's suitable for yyerror.  The
-   heuristic is that double-quoting is unnecessary unless the string
-   contains an apostrophe, a comma, or backslash (other than
-   backslash-backslash).  YYSTR is taken from yytname.  If YYRES is
-   null, do not copy; instead, return the length of what the result
-   would have been.  */
+/* Copy to YYRES the name of YYTOKEN.  If YYRES is null, do not copy;
+   instead, return the length of what the result would have been.  */
 static size_t
 yytnamerr (char *yyres, int yytoken)
 {
-  const char *yystr = yytname[yytoken];
-  if (*yystr == '"')
-    {
-      size_t yyn = 0;
-      char const *yyp = yystr;
-
-      for (;;)
-        switch (*++yyp)
-          {
-          case '\'':
-          case ',':
-            goto do_not_strip_quotes;
-
-          case '\\':
-            if (*++yyp != '\\')
-              goto do_not_strip_quotes;
-            /* Fall through.  */
-          default:
-            if (yyres)
-              yyres[yyn] = *yyp;
-            yyn++;
-            break;
-
-          case '"':
-            if (yyres)
-              yyres[yyn] = '\0';
-            return yyn;
-          }
-    do_not_strip_quotes: ;
-    }
-
+  const char *yystr = yytokenName (yytoken);
   if (! yyres)
     return strlen (yystr);
-
   return (size_t) (yystpcpy (yyres, yystr) - yyres);
 }
 # endif
@@ -614,9 +592,6 @@ typedef int yyStateNum;
 /** Rule numbers, as in LALR(1) machine */
 typedef int yyRuleNum;
 
-/** Grammar symbol */
-typedef int yySymbol;
-
 /** Item references, as in LALR(1) machine */
 typedef short yyItemNum;
 
@@ -721,18 +696,6 @@ yyMemoryExhausted (yyGLRStack* yystackp)
   YYLONGJMP (yystackp->yyexception_buffer, 2);
 }
 
-#if ]b4_api_PREFIX[DEBUG || YYERROR_VERBOSE
-/** A printable representation of TOKEN.  */
-static inline const char*
-yytokenName (yySymbol yytoken)
-{
-  if (yytoken == YYEMPTY)
-    return "";
-  else
-    return yytname[yytoken];
-}
-#endif
-
 /** Fill in YYVSP[YYLOW1 .. YYLOW0-1] from the chain of states starting
  *  at YYVSP[YYLOW0].yystate.yypred.  Leaves YYVSP[YYLOW1].yystate.yypred
  *  containing the pointer to the next state in the chain.  */
diff --git a/data/skeletons/lalr1.cc b/data/skeletons/lalr1.cc
index 5a6091b1..7944c4e7 100644
--- a/data/skeletons/lalr1.cc
+++ b/data/skeletons/lalr1.cc
@@ -512,34 +512,7 @@ m4_if(b4_prefix, [yy], [],
   std::string
   ]b4_parser_class_name[::yytnamerr_ (int yytoken)
   {
-    const char *yystr = yytname_[yytoken];
-    if (*yystr == '"')
-      {
-        std::string yyr;
-        char const *yyp = yystr;
-
-        for (;;)
-          switch (*++yyp)
-            {
-            case '\'':
-            case ',':
-              goto do_not_strip_quotes;
-
-            case '\\':
-              if (*++yyp != '\\')
-                goto do_not_strip_quotes;
-              // Fall through.
-            default:
-              yyr += *yyp;
-              break;
-
-            case '"':
-              return yyr;
-            }
-      do_not_strip_quotes: ;
-      }
-
-    return yystr;
+    return yytname_[yytoken];
   }
 ]])[
 
diff --git a/data/skeletons/lalr1.d b/data/skeletons/lalr1.d
index 6f8ef552..4c0445ee 100644
--- a/data/skeletons/lalr1.d
+++ b/data/skeletons/lalr1.d
@@ -407,39 +407,12 @@ b4_lexer_if([[
     return YYNEWSTATE;
   }
 
-  /* The name of YYTOKEN after stripping away unnecessary quotes and
-     backslashes, so that it's suitable for yyerror.  The heuristic is
-     that double-quoting is unnecessary unless the string contains an
-     apostrophe, a comma, or backslash (other than backslash-backslash).
-     YYSTR is taken from yytname.  */
+  /* The name of YYTOKEN.  */
   private final string yytnamerr_ (int yytoken)
   {
     string yystr = yytname_[yytoken];
-    if (yystr[0] == '"')
-      {
-        string yyr;
-        strip_quotes: for (int i = 1; i < yystr.length; i++)
-          switch (yystr[i])
-            {
-            case '\'':
-            case ',':
-              break strip_quotes;
-
-            case '\\':
-              if (yystr[++i] != '\\')
-                break strip_quotes;
-              goto default;
-            default:
-              yyr ~= yystr[i];
-              break;
-
-            case '"':
-              return yyr;
-            }
-      }
-    else if (yystr=="$end")
+    if (yystr=="$end")
       return "end of input";
-
     return yystr;
   }
 
diff --git a/data/skeletons/lalr1.java b/data/skeletons/lalr1.java
index a4e48c05..2489d669 100644
--- a/data/skeletons/lalr1.java
+++ b/data/skeletons/lalr1.java
@@ -501,40 +501,14 @@ b4_define_state])[
   }
 
 ]b4_error_verbose_if([[
-  /* The name of YYTOKEN after stripping away unnecessary quotes and
-     backslashes, so that it's suitable for yyerror.  The heuristic is
-     that double-quoting is unnecessary unless the string contains an
-     apostrophe, a comma, or backslash (other than backslash-backslash).
-     YYSTR is taken from yytname.  */
+  /* The name of YYTOKEN.  */
   private final String yytnamerr_ (int yytoken)
   {
     String yystr = yytname_[yytoken];
-    if (yystr.charAt (0) == '"')
-      {
-        StringBuffer yyr = new StringBuffer ();
-        strip_quotes: for (int i = 1; i < yystr.length (); i++)
-          switch (yystr.charAt (i))
-            {
-            case '\'':
-            case ',':
-              break strip_quotes;
-
-            case '\\':
-              if (yystr.charAt(++i) != '\\')
-                break strip_quotes;
-              /* Fall through.  */
-            default:
-              yyr.append (yystr.charAt (i));
-              break;
-
-            case '"':
-              return yyr.toString ();
-            }
-      }
-    else if (yystr.equals ("$end"))
+    if (yystr.equals ("$end"))
       return "end of input";
-
-    return yystr;
+    else
+      return yystr;
   }
 ]])[
 
diff --git a/data/skeletons/yacc.c b/data/skeletons/yacc.c
index 5ec843e1..fd10a004 100644
--- a/data/skeletons/yacc.c
+++ b/data/skeletons/yacc.c
@@ -1041,47 +1041,12 @@ yy_lac (yytype_int16 *yyesa, yytype_int16 **yyes,
 # endif
 
 # ifndef yytnamerr
-/* Copy to YYRES the name of YYTOKEN after stripping away unnecessary
-   quotes and backslashes, so that it's suitable for yyerror.  The
-   heuristic is that double-quoting is unnecessary unless the string
-   contains an apostrophe, a comma, or backslash (other than
-   backslash-backslash).  YYSTR is taken from yytname.  If YYRES is
-   null, do not copy; instead, return the length of what the result
-   would have been.  */
+/* Copy to YYRES the name of YYTOKEN.  If YYRES is null, do not copy;
+   instead, return the length of what the result would have been.  */
 static YYSIZE_T
 yytnamerr (char *yyres, int yytoken)
 {
   const char *yystr = yytname[yytoken];
-  if (*yystr == '"')
-    {
-      YYSIZE_T yyn = 0;
-      char const *yyp = yystr;
-
-      for (;;)
-        switch (*++yyp)
-          {
-          case '\'':
-          case ',':
-            goto do_not_strip_quotes;
-
-          case '\\':
-            if (*++yyp != '\\')
-              goto do_not_strip_quotes;
-            /* Fall through.  */
-          default:
-            if (yyres)
-              yyres[yyn] = *yyp;
-            yyn++;
-            break;
-
-          case '"':
-            if (yyres)
-              yyres[yyn] = '\0';
-            return yyn;
-          }
-    do_not_strip_quotes: ;
-    }
-
   if (! yyres)
     return yystrlen (yystr);
 
diff --git a/src/output.c b/src/output.c
index e87f9812..a90e8266 100644
--- a/src/output.c
+++ b/src/output.c
@@ -165,7 +165,10 @@ prepare_symbols (void)
     set_quoting_flags (qo, QA_SPLIT_TRIGRAPHS);
     for (int i = 0; i < nsyms; i++)
       {
-        char *cp = quotearg_alloc (symbols[i]->tag, -1, qo);
+        char *cp =
+          symbols[i]->tag[0] == '"'
+          ? xstrdup (symbols[i]->tag)
+          : quotearg_alloc (symbols[i]->tag, -1, qo);
         /* Width of the next token, including the two quotes, the
            comma and the space.  */
         int width = strlen (cp) + 2;
diff --git a/tests/javapush.at b/tests/javapush.at
index 8749301a..557903ce 100644
--- a/tests/javapush.at
+++ b/tests/javapush.at
@@ -726,121 +726,121 @@ total = 256
 total = 64
 ]])
 
-AT_DATA([locations],[[Next token is token "number" (1.1: 1)
+AT_DATA([locations],[[Next token is token number (1.1: 1)
 Next token is token '+' (1.2: 1)
-Next token is token "number" (1.3: 2)
+Next token is token number (1.3: 2)
 Next token is token '*' (1.4: 2)
-Next token is token "number" (1.5: 3)
+Next token is token number (1.5: 3)
 Next token is token '=' (1.6: 3)
 Next token is token '=' (1.6: 3)
 Next token is token '=' (1.6: 3)
-Next token is token "number" (1.7: 7)
+Next token is token number (1.7: 7)
 Next token is token '\n' (2.0: 7)
 Next token is token '\n' (2.0: 7)
-Next token is token "number" (2.1: 1)
+Next token is token number (2.1: 1)
 Next token is token '+' (2.2: 1)
-Next token is token "number" (2.3: 2)
+Next token is token number (2.3: 2)
 Next token is token '*' (2.4: 2)
 Next token is token '-' (2.5: 2)
-Next token is token "number" (2.6: 3)
+Next token is token number (2.6: 3)
 Next token is token '=' (2.7: 3)
 Next token is token '=' (2.7: 3)
 Next token is token '=' (2.7: 3)
 Next token is token '=' (2.7: 3)
 Next token is token '-' (2.8: 3)
-Next token is token "number" (2.9: 5)
+Next token is token number (2.9: 5)
 Next token is token '\n' (3.0: 5)
 Next token is token '\n' (3.0: 5)
 Next token is token '\n' (3.0: 5)
 Next token is token '\n' (4.0: 5)
 Next token is token '-' (4.1: 5)
-Next token is token "number" (4.2: 1)
+Next token is token number (4.2: 1)
 Next token is token '^' (4.3: 1)
-Next token is token "number" (4.4: 2)
+Next token is token number (4.4: 2)
 Next token is token '=' (4.5: 2)
 Next token is token '=' (4.5: 2)
 Next token is token '=' (4.5: 2)
 Next token is token '-' (4.6: 2)
-Next token is token "number" (4.7: 1)
+Next token is token number (4.7: 1)
 Next token is token '\n' (5.0: 1)
 Next token is token '\n' (5.0: 1)
 Next token is token '\n' (5.0: 1)
 Next token is token '(' (5.1: 1)
 Next token is token '-' (5.2: 1)
-Next token is token "number" (5.3: 1)
+Next token is token number (5.3: 1)
 Next token is token ')' (5.4: 1)
 Next token is token ')' (5.4: 1)
 Next token is token '^' (5.5: 1)
-Next token is token "number" (5.6: 2)
+Next token is token number (5.6: 2)
 Next token is token '=' (5.7: 2)
 Next token is token '=' (5.7: 2)
-Next token is token "number" (5.8: 1)
+Next token is token number (5.8: 1)
 Next token is token '\n' (6.0: 1)
 Next token is token '\n' (6.0: 1)
 Next token is token '\n' (7.0: 1)
 Next token is token '-' (7.1: 1)
 Next token is token '-' (7.2: 1)
 Next token is token '-' (7.3: 1)
-Next token is token "number" (7.4: 1)
+Next token is token number (7.4: 1)
 Next token is token '=' (7.5: 1)
 Next token is token '=' (7.5: 1)
 Next token is token '=' (7.5: 1)
 Next token is token '=' (7.5: 1)
 Next token is token '-' (7.6: 1)
-Next token is token "number" (7.7: 1)
+Next token is token number (7.7: 1)
 Next token is token '\n' (8.0: 1)
 Next token is token '\n' (8.0: 1)
 Next token is token '\n' (8.0: 1)
 Next token is token '\n' (9.0: 1)
-Next token is token "number" (9.1: 1)
+Next token is token number (9.1: 1)
 Next token is token '-' (9.2: 1)
-Next token is token "number" (9.3: 2)
+Next token is token number (9.3: 2)
 Next token is token '-' (9.4: 2)
 Next token is token '-' (9.4: 2)
-Next token is token "number" (9.5: 3)
+Next token is token number (9.5: 3)
 Next token is token '=' (9.6: 3)
 Next token is token '=' (9.6: 3)
 Next token is token '-' (9.7: 3)
-Next token is token "number" (9.8: 4)
+Next token is token number (9.8: 4)
 Next token is token '\n' (10.0: 4)
 Next token is token '\n' (10.0: 4)
 Next token is token '\n' (10.0: 4)
-Next token is token "number" (10.1: 1)
+Next token is token number (10.1: 1)
 Next token is token '-' (10.2: 1)
 Next token is token '(' (10.3: 1)
-Next token is token "number" (10.4: 2)
+Next token is token number (10.4: 2)
 Next token is token '-' (10.5: 2)
-Next token is token "number" (10.6: 3)
+Next token is token number (10.6: 3)
 Next token is token ')' (10.7: 3)
 Next token is token ')' (10.7: 3)
 Next token is token '=' (10.8: 3)
 Next token is token '=' (10.8: 3)
-Next token is token "number" (10.9: 2)
+Next token is token number (10.9: 2)
 Next token is token '\n' (11.0: 2)
 Next token is token '\n' (11.0: 2)
 Next token is token '\n' (12.0: 2)
-Next token is token "number" (12.1: 2)
+Next token is token number (12.1: 2)
 Next token is token '^' (12.2: 2)
-Next token is token "number" (12.3: 2)
+Next token is token number (12.3: 2)
 Next token is token '^' (12.4: 2)
-Next token is token "number" (12.5: 3)
+Next token is token number (12.5: 3)
 Next token is token '=' (12.6: 3)
 Next token is token '=' (12.6: 3)
 Next token is token '=' (12.6: 3)
-Next token is token "number" (12.7: 256)
+Next token is token number (12.7: 256)
 Next token is token '\n' (13.0: 256)
 Next token is token '\n' (13.0: 256)
 Next token is token '(' (13.1: 256)
-Next token is token "number" (13.2: 2)
+Next token is token number (13.2: 2)
 Next token is token '^' (13.3: 2)
-Next token is token "number" (13.4: 2)
+Next token is token number (13.4: 2)
 Next token is token ')' (13.5: 2)
 Next token is token ')' (13.5: 2)
 Next token is token '^' (13.6: 2)
-Next token is token "number" (13.7: 3)
+Next token is token number (13.7: 3)
 Next token is token '=' (13.8: 3)
 Next token is token '=' (13.8: 3)
-Next token is token "number" (13.9: 64)
+Next token is token number (13.9: 64)
 Next token is token '\n' (14.0: 64)
 Next token is token '\n' (14.0: 64)
 ]])
diff --git a/tests/regression.at b/tests/regression.at
index 0530b1e5..147e2c5e 100644
--- a/tests/regression.at
+++ b/tests/regression.at
@@ -433,11 +433,12 @@ AT_DATA_GRAMMAR([input.y],
 %token 'd' D_TOKEN
 %token SPECIAL "\\\'\?\"\a\b\f\n\r\t\v\001\201\x001\x000081"
 %token SPECIAL "\\\'\?\"\a\b\f\n\r\t\v\001\201\x001\x000081"
+%token MAGIC "∃¬∩∪∀"
 %%
-exp: "a" "\\\'\?\"\a\b\f\n\r\t\v\001\201\x001\x000081??!";
+exp: "a" MAGIC;
 %%
 ]AT_YYERROR_DEFINE[
-]AT_YYLEX_DEFINE([{ SPECIAL }])[
+]AT_YYLEX_DEFINE([{ MAGIC }])[
 ]AT_MAIN_DEFINE[
 ]])
 AT_BISON_OPTION_POPDEFS
@@ -454,14 +455,9 @@ input.y:22.16-60: warning: symbol "\\'?\"\a\b\f\n\r\t\v\001\201\001\201" used mo
 ]])
 AT_COMPILE([input])
 
-# Checking the error message here guarantees that yytname, which does contain
-# C-string literals, does have the trigraph escaped correctly.  Thus, the
-# symbol name reported by the parser is exactly the same as that reported by
-# Bison itself.
-AT_DATA([experr],
-[[syntax error, unexpected "\\'?\"\a\b\f\n\r\t\v\001\201\001\201??!", expecting a
+AT_PARSER_CHECK([./input], 1, [],
+[[syntax error, unexpected ∃¬∩∪∀, expecting a
 ]])
-AT_PARSER_CHECK([./input], 1, [], [experr])
 AT_CLEANUP
 
 
@@ -736,8 +732,8 @@ static const yytype_uint8 yyrline[] =
 };
 static const char *const yytname[] =
 {
-  "$end", "error", "$undefined", "\"if\"", "\"const\"", "\"then\"",
-  "\"else\"", "$accept", "statement", "struct_stat", "if", "else", YY_NULLPTR
+  "$end", "error", "$undefined", "if", "const", "then", "else", "$accept",
+  "statement", "struct_stat", "if", "else", YY_NULLPTR
 };
 static const yytype_uint16 yytoknum[] =
 {
-- 
2.20.0
[Prev in Thread]
Current Thread
[Next in Thread]
[PATCH 0/8] Revamp the handling token string aliases in error messages, Akim Demaille, 2018/12/29
- [PATCH 6/8] tests: check that internationalization of token works, Akim Demaille, 2018/12/29
- [PATCH 7/8] translate bison's own tokens, Akim Demaille, 2018/12/29
- [PATCH 4/8] parsers: don't double escape tnames, Akim Demaille <=
- [PATCH 2/8] parsers: revamp the interface of yytnamerr, Akim Demaille, 2018/12/29
- [PATCH 1/8] yacc.c: avoid negated if, Akim Demaille, 2018/12/29
- [PATCH 3/8] tests: no longer play with trigraphs, Akim Demaille, 2018/12/29
- [PATCH 5/8] parsers: support translatable token aliases, Akim Demaille, 2018/12/29
- [PATCH 8/8] regen, Akim Demaille, 2018/12/29
Prev by Date: [PATCH 7/8] translate bison's own tokens
Next by Date: [PATCH 2/8] parsers: revamp the interface of yytnamerr
Previous by thread: [PATCH 7/8] translate bison's own tokens
Next by thread: [PATCH 2/8] parsers: revamp the interface of yytnamerr
Index(es):
- Date
- Thread