[PATCH 05/17] multistart: turn start symbols into rules on $accept

bison-patches

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH 05/17] multistart: turn start symbols into rules on $accept

From:	Akim Demaille
Subject:	[PATCH 05/17] multistart: turn start symbols into rules on $accept
Date:	Sun, 20 Sep 2020 10:37:37 +0200

Now that the parser can read several start symbols, let's process
them, and create the corresponding rules.

* src/parse-gram.y (grammar_declaration): Accept a list of start symbols.
* src/reader.h, src/reader.c (grammar_start_symbol_set): Rename as...
(grammar_start_symbols_set): this.

* src/reader.h, src/reader.c (start_flag): Replace with...
(start_symbols): this.
* src/reader.c (grammar_start_symbols_set): Build a list of start
symbols.
(switching_token, create_start_rules): New.
(check_and_convert_grammar): Use them to turn the list of start
symbols into a set of rules.
* src/reduce.c (nonterminals_reduce): Don't complain about $accept,
it's an internal detail.
(reduce_grammar): Complain about all the start symbols that don't
derive sentences.

* src/symtab.h, src/symtab.c (startsymbol, startsymbol_loc): Remove, replaced by
start_symbols.
(symbols_pack): Move the check about the start symbols
to...
* src/reader.c (check_start_symbols): here.
Adjust to multiple start symbols.
* tests/reduce.at (Empty Language): Generalize into...
(Bad start symbols): this.
---
 TODO             |  11 +++++
 src/parse-gram.y |   4 +-
 src/reader.c     | 108 ++++++++++++++++++++++++++++++++++++++---------
 src/reader.h     |   9 +++-
 src/reduce.c     |  19 ++++++---
 src/symtab.c     |  11 -----
 src/symtab.h     |   5 ---
 tests/reduce.at  |  66 ++++++++++++++++++++++++-----
 8 files changed, 179 insertions(+), 54 deletions(-)

diff --git a/TODO b/TODO
index c692b9f9..b8641a97 100644
--- a/TODO
+++ b/TODO
@@ -664,6 +664,17 @@ happen with yy_start: stmt | expr).  Then adjust the skeletons so that this
 initial token (YY_START_STMT, YY_START_EXPR) be shifted first in the
 corresponding parse function.
 
+*** Number of useless symbols
+AT_TEST(
+[[%start exp;
+exp: exp;]],
+[[input.y: warning: 2 nonterminals useless in grammar [-Wother]
+input.y: warning: 2 rules useless in grammar [-Wother]
+input.y:2.8-10: error: start symbol exp does not derive any sentence]])
+
+We should say "1 nonterminal": the other one is $accept, which should not
+participate in the count.
+
 ** %include
 This is a popular demand.  We already made many changes in the parser that
 should make this reasonably easy to implement.
diff --git a/src/parse-gram.y b/src/parse-gram.y
index c849eb80..598639b4 100644
--- a/src/parse-gram.y
+++ b/src/parse-gram.y
@@ -381,9 +381,9 @@ params:
 
 grammar_declaration:
   symbol_declaration
-| "%start" symbol
+| "%start" symbols.1
     {
-      grammar_start_symbol_set ($2, @2);
+      grammar_start_symbols_set ($2);
     }
 | code_props_type "{...}" generic_symlist
     {
diff --git a/src/reader.c b/src/reader.c
index 6932decd..5988bf35 100644
--- a/src/reader.c
+++ b/src/reader.c
@@ -21,7 +21,9 @@
 #include <config.h>
 #include "system.h"
 
+#include <c-ctype.h>
 #include <quote.h>
+#include <vasnprintf.h>
 
 #include "complain.h"
 #include "conflicts.h"
@@ -40,7 +42,7 @@ static void prepare_percent_define_front_end_variables (void);
 static void check_and_convert_grammar (void);
 
 static symbol_list *grammar = NULL;
-static bool start_flag = false;
+symbol_list *start_symbols = NULL;
 merger_list *merge_functions;
 
 /* Was %union seen?  */
@@ -54,16 +56,9 @@ bool default_prec = true;
 `-----------------------*/
 
 void
-grammar_start_symbol_set (symbol *sym, location loc)
+grammar_start_symbols_set (symbol_list *syms)
 {
-  if (start_flag)
-    complain (&loc, complaint, _("multiple %s declarations"), "%start");
-  else
-    {
-      start_flag = true;
-      startsymbol = sym;
-      startsymbol_loc = loc;
-    }
+  start_symbols = symbol_list_append (start_symbols, syms);
 }
 
 
@@ -791,6 +786,84 @@ create_start_rule (symbol *swtok, symbol *start)
   grammar = initial_rule;
 }
 
+/* Fetch (or create) a token "YY_PARSE_foo" for start symbol "foo".
+
+   We don't use the simple "YY_FOO" because (i) we might get clashes
+   with some of our symbols (e.g., cast => YY_CAST), and (ii) upcasing
+   introduces possible clashes between terminal FOO and nonterminal
+   foo.  */
+symbol *
+switching_token (const symbol *start)
+{
+  char buf[100];
+  size_t len = sizeof buf;
+  char *name
+    = asnprintf (buf, &len,
+                 "YY_PARSE_%s", start->alias ? start->alias->tag : start->tag);
+  if (!name)
+    xalloc_die ();
+  // Setting the location ensures deterministic symbol numbers.
+  symbol *res = symbol_get (name, start->location);
+  if (name != buf)
+    free (name);
+  symbol_class_set (res, token_sym, start->location, false);
+  return res;
+}
+
+/* For each start symbol "foo", create the rule "$accept: YY_PARSE_foo
+   foo $end". */
+static void
+create_start_rules (void)
+{
+  if (!start_symbols)
+    {
+      symbol *start = find_start_symbol ();
+      start_symbols = symbol_list_sym_new (start, start->location);
+    }
+
+  const bool several = start_symbols->next;
+  if (several)
+    for (symbol_list *list = start_symbols; list; list = list->next)
+      {
+        assert (list->content_type == SYMLIST_SYMBOL);
+        symbol *start = list->content.sym;
+        symbol *swtok = switching_token (start);
+        create_start_rule (swtok, start);
+      }
+  else
+    {
+      symbol *start = start_symbols->content.sym;
+      create_start_rule (NULL, start);
+    }
+}
+
+static void
+check_start_symbols (void)
+{
+  // Sanity checks on the start symbols.
+  for (symbol_list *list = start_symbols; list; list = list->next)
+    {
+      const symbol *start = list->content.sym;
+      if (start->content->class == unknown_sym)
+        {
+          complain (&start->location, complaint,
+                    _("the start symbol %s is undefined"),
+                    start->tag);
+          // I claim this situation is unreachable.  This is caught
+          // before, and we get "symbol 'foo' is used, but is not
+          // defined as a token and has no rules".
+          abort ();
+        }
+      if (start->content->class == token_sym)
+        complain (&start->location, complaint,
+                  _("the start symbol %s is a token"),
+                  start->tag);
+    }
+  if (complaint_status == status_complaint)
+    exit (EXIT_FAILURE);
+}
+
+
 /*-------------------------------------------------------------.
 | Check the grammar that has just been read, and convert it to |
 | internal form.                                               |
@@ -818,19 +891,12 @@ check_and_convert_grammar (void)
       }
     }
 
+  /* Insert the initial rule(s).  */
+  create_start_rules ();
+
   /* Report any undefined symbols and consider them nonterminals.  */
   symbols_check_defined ();
 
-  /* Find the start symbol if no %start.  */
-  if (!start_flag)
-    {
-      symbol *start = find_start_symbol ();
-      grammar_start_symbol_set (start, start->location);
-    }
-
-  /* Insert the initial rule.  */
-  create_start_rule (NULL, startsymbol);
-
   if (SYMBOL_NUMBER_MAXIMUM - nnterms < ntokens)
     complain (NULL, fatal, "too many symbols in input grammar (limit is %d)",
               SYMBOL_NUMBER_MAXIMUM);
@@ -840,6 +906,8 @@ check_and_convert_grammar (void)
   /* Assign the symbols their symbol numbers.  */
   symbols_pack ();
 
+  check_start_symbols ();
+
   /* Scan rule actions after invoking symbol_check_alias_consistency
      (in symbols_pack above) so that token types are set correctly
      before the rule action type checking.
diff --git a/src/reader.h b/src/reader.h
index 39ede8c3..49d862ad 100644
--- a/src/reader.h
+++ b/src/reader.h
@@ -38,7 +38,14 @@ typedef struct merger_list
 void free_merger_functions (void);
 extern merger_list *merge_functions;
 
-void grammar_start_symbol_set (symbol *sym, location loc);
+/* List of the start symbols.  */
+extern symbol_list *start_symbols;
+
+/* Get the token "YY_PARSE_foo" for start symbol "foo".  Create it if
+   needed. */
+symbol *switching_token (const symbol *start);
+
+void grammar_start_symbols_set (symbol_list *syms);
 
 void grammar_current_rule_begin (symbol *lhs, location loc,
                                  named_ref *lhs_named_ref);
diff --git a/src/reduce.c b/src/reduce.c
index 0061b687..c9979e0a 100644
--- a/src/reduce.c
+++ b/src/reduce.c
@@ -275,7 +275,8 @@ nonterminals_reduce (void)
       if (!bitset_test (V, i))
         {
           nterm_map[i - ntokens] = n++;
-          if (symbols[i]->content->status != used)
+          if (symbols[i]->content->status != used
+              && symbols[i] != acceptsymbol)
             complain (&symbols[i]->location, Wother,
                       _("nonterminal useless in grammar: %s"),
                       symbols[i]->tag);
@@ -381,10 +382,18 @@ reduce_grammar (void)
     {
       reduce_print ();
 
-      if (!bitset_test (N, acceptsymbol->content->number - ntokens))
-        complain (&startsymbol_loc, fatal,
-                  _("start symbol %s does not derive any sentence"),
-                  startsymbol->tag);
+      // Check that start symbols have non-empty languages.
+      bool failure = false;
+      for (symbol_list *list = start_symbols; list; list = list->next)
+        if (!bitset_test (N, list->content.sym->content->number - ntokens))
+          {
+            failure = true;
+            complain (&list->sym_loc, complaint,
+                      _("start symbol %s does not derive any sentence"),
+                      list->content.sym->tag);
+          }
+      if (failure)
+        exit (EXIT_FAILURE);
 
       /* First reduce the nonterminals, as they renumber themselves in the
          whole grammar.  If you change the order, nonterms would be
diff --git a/src/symtab.c b/src/symtab.c
index b5556715..31a3c048 100644
--- a/src/symtab.c
+++ b/src/symtab.c
@@ -60,8 +60,6 @@ symbol *errtoken = NULL;
 symbol *undeftoken = NULL;
 symbol *eoftoken = NULL;
 symbol *acceptsymbol = NULL;
-symbol *startsymbol = NULL;
-location startsymbol_loc;
 
 /* Precedence relation graph. */
 static symgraph **prec_nodes;
@@ -1146,15 +1144,6 @@ symbols_pack (void)
 
   symbols_token_translations_init ();
 
-  if (startsymbol->content->class == unknown_sym)
-    complain (&startsymbol_loc, fatal,
-              _("the start symbol %s is undefined"),
-              startsymbol->tag);
-  else if (startsymbol->content->class == token_sym)
-    complain (&startsymbol_loc, fatal,
-              _("the start symbol %s is a token"),
-              startsymbol->tag);
-
   // If some user tokens are internationalized, the internal ones
   // should be too.
   if (has_translations ())
diff --git a/src/symtab.h b/src/symtab.h
index e85e5468..1ec8042b 100644
--- a/src/symtab.h
+++ b/src/symtab.h
@@ -247,11 +247,6 @@ extern symbol *eoftoken;
    $accept: start-symbol $end */
 extern symbol *acceptsymbol;
 
-/** The user start symbol. */
-extern symbol *startsymbol;
-/** The location of the \c \%start declaration.  */
-extern location startsymbol_loc;
-
 /** Whether a symbol declared with a type tag.  */
 extern bool tag_seen;
 
diff --git a/tests/reduce.at b/tests/reduce.at
index b561100e..c1af62ee 100644
--- a/tests/reduce.at
+++ b/tests/reduce.at
@@ -445,23 +445,69 @@ AT_CLEANUP
 
 
 
-## ---------------- ##
-## Empty Language.  ##
-## ---------------- ##
+## ------------------- ##
+## Bad start symbols.  ##
+## ------------------- ##
 
-AT_SETUP([Empty Language])
+AT_SETUP([Bad start symbols])
 
+m4_pushdef([AT_TEST],
+[
 AT_DATA([[input.y]],
-[[%output "input.c"
-%%
-exp: exp;
-]])
+[%%
+$1
+])
 
 AT_BISON_CHECK([[input.y]], 1, [],
+[$2
+])
+])
+
+AT_TEST(
+[[exp: exp;]],
 [[input.y: warning: 2 nonterminals useless in grammar [-Wother]
 input.y: warning: 2 rules useless in grammar [-Wother]
-input.y:3.1-3: fatal error: start symbol exp does not derive any sentence
-]])
+input.y:2.1-3: error: start symbol exp does not derive any sentence]])
+
+AT_TEST(
+[[%start exp;
+exp: exp;]],
+[[input.y: warning: 2 nonterminals useless in grammar [-Wother]
+input.y: warning: 2 rules useless in grammar [-Wother]
+input.y:2.8-10: error: start symbol exp does not derive any sentence]])
+
+AT_TEST(
+[[%start exp stmt;
+exp: exp;
+stmt: "stmt"]],
+[[input.y: warning: 1 nonterminal useless in grammar [-Wother]
+input.y: warning: 2 rules useless in grammar [-Wother]
+input.y:2.8-10: error: start symbol exp does not derive any sentence]])
+
+AT_TEST(
+[[%start exp stmt;
+exp: exp;
+stmt: stmt]],
+[[input.y: warning: 3 nonterminals useless in grammar [-Wother]
+input.y: warning: 4 rules useless in grammar [-Wother]
+input.y:2.8-10: error: start symbol exp does not derive any sentence
+input.y:2.12-15: error: start symbol stmt does not derive any sentence]])
+
+AT_TEST(
+[[%start exp;
+stmt: stmt]],
+[[input.y:2.8-10: warning: symbol 'exp' is used, but is not defined as a token and has no rules [-Wother]
+input.y: warning: 3 nonterminals useless in grammar [-Wother]
+input.y: warning: 2 rules useless in grammar [-Wother]
+input.y:2.8-10: error: start symbol exp does not derive any sentence]])
+
+AT_TEST(
+[[%token FOO;
+%start FOO;
+stmt: FOO]],
+[[input.y:2.8-10: error: the start symbol FOO is a token]])
+
+m4_popdef([AT_TEST])
 
 AT_CLEANUP
 
-- 
2.28.0

[Prev in Thread]

Current Thread

[Next in Thread]

[PATCH 00/17] RFC: multiple start symbols, Akim Demaille, 2020/09/20
- [PATCH 01/17] gram: more debugging information, Akim Demaille, 2020/09/20
- [PATCH 02/17] reader: get ready to create several initial rules, Akim Demaille, 2020/09/20
- [PATCH 03/17] parser: expose a list of symbols, Akim Demaille, 2020/09/20
- [PATCH 04/17] regen, Akim Demaille, 2020/09/20
- [PATCH 05/17] multistart: turn start symbols into rules on $accept, Akim Demaille <=
- [PATCH 06/17] regen, Akim Demaille, 2020/09/20
- [PATCH 07/17] multistart: adjust computation of initial core and adjust reports, Akim Demaille, 2020/09/20
- [PATCH 08/17] multistart: also check the HTML report, Akim Demaille, 2020/09/20
- [PATCH 09/17] multistart: pass the list of start symbols to the backend, Akim Demaille, 2020/09/20
- [PATCH 10/17] multistart: equip yacc.c, Akim Demaille, 2020/09/20
- [PATCH 11/17] multistart: toy with it in lexcalc, Akim Demaille, 2020/09/20
- [PATCH 12/17] todo: more, Akim Demaille, 2020/09/20
- [PATCH 13/17] multistart: adjust reader checks for generated rules, Akim Demaille, 2020/09/20
- [PATCH 14/17] multistart: use b4_accept instead of action post-processing, Akim Demaille, 2020/09/20
- [PATCH 15/17] multistart: allow tokens as start symbols, Akim Demaille, 2020/09/20

Prev by Date: [PATCH 04/17] regen
Next by Date: [PATCH 06/17] regen
Previous by thread: [PATCH 04/17] regen
Next by thread: [PATCH 06/17] regen
Index(es):
- Date
- Thread