[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 05/17] multistart: turn start symbols into rules on $accept
From: Akim Demaille
Subject: [PATCH 05/17] multistart: turn start symbols into rules on $accept
Date: Sun, 20 Sep 2020 10:37:37 +0200
Now that the parser can read several start symbols, let's process
them, and create the corresponding rules.
* src/parse-gram.y (grammar_declaration): Accept a list of start symbols.
* src/reader.h, src/reader.c (grammar_start_symbol_set): Rename as...
(grammar_start_symbols_set): this.
* src/reader.h, src/reader.c (start_flag): Replace with...
(start_symbols): this.
* src/reader.c (grammar_start_symbols_set): Build a list of start
symbols.
(switching_token, create_start_rules): New.
(check_and_convert_grammar): Use them to turn the list of start
symbols into a set of rules.
* src/reduce.c (nonterminals_reduce): Don't complain about $accept,
it's an internal detail.
(reduce_grammar): Complain about all the start symbols that don't
derive sentences.
* src/symtab.c (startsymbol, startsymbol_loc): Remove, replaced by
start_symbols.
(symbols_pack): Move the check about the start symbols
to...
* src/reader.c (check_start_symbols): here.
Adjust to multiple start symbols.
* tests/reduce.at (Empty Language): Generalize into...
(Bad start symbols): this.
---
TODO | 11 +++++
src/parse-gram.y | 4 +-
src/reader.c | 108 ++++++++++++++++++++++++++++++++++++++---------
src/reader.h | 9 +++-
src/reduce.c | 19 ++++++---
src/symtab.c | 11 -----
src/symtab.h | 5 ---
tests/reduce.at | 66 ++++++++++++++++++++++++-----
8 files changed, 179 insertions(+), 54 deletions(-)
diff --git a/TODO b/TODO
index c692b9f9..b8641a97 100644
--- a/TODO
+++ b/TODO
@@ -664,6 +664,17 @@ happen with yy_start: stmt | expr). Then adjust the skeletons so that this
initial token (YY_START_STMT, YY_START_EXPR) be shifted first in the
corresponding parse function.
+*** Number of useless symbols
+AT_TEST(
+[[%start exp;
+exp: exp;]],
+[[input.y: warning: 2 nonterminals useless in grammar [-Wother]
+input.y: warning: 2 rules useless in grammar [-Wother]
+input.y:2.8-10: error: start symbol exp does not derive any sentence]])
+
+We should say "1 nonterminal": the other one is $accept, which should not
+participate in the count.
+
** %include
This is a popular demand. We already made many changes in the parser that
should make this reasonably easy to implement.
diff --git a/src/parse-gram.y b/src/parse-gram.y
index c849eb80..598639b4 100644
--- a/src/parse-gram.y
+++ b/src/parse-gram.y
@@ -381,9 +381,9 @@ params:
grammar_declaration:
symbol_declaration
-| "%start" symbol
+| "%start" symbols.1
{
- grammar_start_symbol_set ($2, @2);
+ grammar_start_symbols_set ($2);
}
| code_props_type "{...}" generic_symlist
{
diff --git a/src/reader.c b/src/reader.c
index 6932decd..5988bf35 100644
--- a/src/reader.c
+++ b/src/reader.c
@@ -21,7 +21,9 @@
#include <config.h>
#include "system.h"
+#include <c-ctype.h>
#include <quote.h>
+#include <vasnprintf.h>
#include "complain.h"
#include "conflicts.h"
@@ -40,7 +42,7 @@ static void prepare_percent_define_front_end_variables (void);
static void check_and_convert_grammar (void);
static symbol_list *grammar = NULL;
-static bool start_flag = false;
+symbol_list *start_symbols = NULL;
merger_list *merge_functions;
/* Was %union seen? */
@@ -54,16 +56,9 @@ bool default_prec = true;
`-----------------------*/
void
-grammar_start_symbol_set (symbol *sym, location loc)
+grammar_start_symbols_set (symbol_list *syms)
{
- if (start_flag)
- complain (&loc, complaint, _("multiple %s declarations"), "%start");
- else
- {
- start_flag = true;
- startsymbol = sym;
- startsymbol_loc = loc;
- }
+ start_symbols = symbol_list_append (start_symbols, syms);
}
@@ -791,6 +786,84 @@ create_start_rule (symbol *swtok, symbol *start)
grammar = initial_rule;
}
+/* Fetch (or create) a token "YY_PARSE_foo" for start symbol "foo".
+
+ We don't use the simple "YY_FOO" because (i) we might get clashes
+ with some of our symbols (e.g., cast => YY_CAST), and (ii) upcasing
+ introduces possible clashes between terminal FOO and nonterminal
+ foo. */
+symbol *
+switching_token (const symbol *start)
+{
+ char buf[100];
+ size_t len = sizeof buf;
+ char *name
+ = asnprintf (buf, &len,
+ "YY_PARSE_%s", start->alias ? start->alias->tag : start->tag);
+ if (!name)
+ xalloc_die ();
+ // Setting the location ensures deterministic symbol numbers.
+ symbol *res = symbol_get (name, start->location);
+ if (name != buf)
+ free (name);
+ symbol_class_set (res, token_sym, start->location, false);
+ return res;
+}
+
+/* For each start symbol "foo", create the rule "$accept: YY_PARSE_foo
+ foo $end". */
+static void
+create_start_rules (void)
+{
+ if (!start_symbols)
+ {
+ symbol *start = find_start_symbol ();
+ start_symbols = symbol_list_sym_new (start, start->location);
+ }
+
+ const bool several = start_symbols->next;
+ if (several)
+ for (symbol_list *list = start_symbols; list; list = list->next)
+ {
+ assert (list->content_type == SYMLIST_SYMBOL);
+ symbol *start = list->content.sym;
+ symbol *swtok = switching_token (start);
+ create_start_rule (swtok, start);
+ }
+ else
+ {
+ symbol *start = start_symbols->content.sym;
+ create_start_rule (NULL, start);
+ }
+}
+
+static void
+check_start_symbols (void)
+{
+ // Sanity checks on the start symbols.
+ for (symbol_list *list = start_symbols; list; list = list->next)
+ {
+ const symbol *start = list->content.sym;
+ if (start->content->class == unknown_sym)
+ {
+ complain (&start->location, complaint,
+ _("the start symbol %s is undefined"),
+ start->tag);
+ // I claim this situation is unreachable. This is caught
+ // before, and we get "symbol 'foo' is used, but is not
+ // defined as a token and has no rules".
+ abort ();
+ }
+ if (start->content->class == token_sym)
+ complain (&start->location, complaint,
+ _("the start symbol %s is a token"),
+ start->tag);
+ }
+ if (complaint_status == status_complaint)
+ exit (EXIT_FAILURE);
+}
+
+
/*-------------------------------------------------------------.
| Check the grammar that has just been read, and convert it to |
| internal form. |
@@ -818,19 +891,12 @@ check_and_convert_grammar (void)
}
}
+ /* Insert the initial rule(s). */
+ create_start_rules ();
+
/* Report any undefined symbols and consider them nonterminals. */
symbols_check_defined ();
- /* Find the start symbol if no %start. */
- if (!start_flag)
- {
- symbol *start = find_start_symbol ();
- grammar_start_symbol_set (start, start->location);
- }
-
- /* Insert the initial rule. */
- create_start_rule (NULL, startsymbol);
-
if (SYMBOL_NUMBER_MAXIMUM - nnterms < ntokens)
complain (NULL, fatal, "too many symbols in input grammar (limit is %d)",
SYMBOL_NUMBER_MAXIMUM);
@@ -840,6 +906,8 @@ check_and_convert_grammar (void)
/* Assign the symbols their symbol numbers. */
symbols_pack ();
+ check_start_symbols ();
+
/* Scan rule actions after invoking symbol_check_alias_consistency
(in symbols_pack above) so that token types are set correctly
before the rule action type checking.
diff --git a/src/reader.h b/src/reader.h
index 39ede8c3..49d862ad 100644
--- a/src/reader.h
+++ b/src/reader.h
@@ -38,7 +38,14 @@ typedef struct merger_list
void free_merger_functions (void);
extern merger_list *merge_functions;
-void grammar_start_symbol_set (symbol *sym, location loc);
+/* List of the start symbols. */
+extern symbol_list *start_symbols;
+
+/* Get the token "YY_PARSE_foo" for the start symbol "foo". Create it if
+ needed. */
+symbol *switching_token (const symbol *start);
+
+void grammar_start_symbols_set (symbol_list *syms);
void grammar_current_rule_begin (symbol *lhs, location loc,
named_ref *lhs_named_ref);
diff --git a/src/reduce.c b/src/reduce.c
index 0061b687..c9979e0a 100644
--- a/src/reduce.c
+++ b/src/reduce.c
@@ -275,7 +275,8 @@ nonterminals_reduce (void)
if (!bitset_test (V, i))
{
nterm_map[i - ntokens] = n++;
- if (symbols[i]->content->status != used)
+ if (symbols[i]->content->status != used
+ && symbols[i] != acceptsymbol)
complain (&symbols[i]->location, Wother,
_("nonterminal useless in grammar: %s"),
symbols[i]->tag);
@@ -381,10 +382,18 @@ reduce_grammar (void)
{
reduce_print ();
- if (!bitset_test (N, acceptsymbol->content->number - ntokens))
- complain (&startsymbol_loc, fatal,
- _("start symbol %s does not derive any sentence"),
- startsymbol->tag);
+ // Check that start symbols have non-empty languages.
+ bool failure = false;
+ for (symbol_list *list = start_symbols; list; list = list->next)
+ if (!bitset_test (N, list->content.sym->content->number - ntokens))
+ {
+ failure = true;
+ complain (&list->sym_loc, complaint,
+ _("start symbol %s does not derive any sentence"),
+ list->content.sym->tag);
+ }
+ if (failure)
+ exit (EXIT_FAILURE);
/* First reduce the nonterminals, as they renumber themselves in the
whole grammar. If you change the order, nonterms would be
diff --git a/src/symtab.c b/src/symtab.c
index b5556715..31a3c048 100644
--- a/src/symtab.c
+++ b/src/symtab.c
@@ -60,8 +60,6 @@ symbol *errtoken = NULL;
symbol *undeftoken = NULL;
symbol *eoftoken = NULL;
symbol *acceptsymbol = NULL;
-symbol *startsymbol = NULL;
-location startsymbol_loc;
/* Precedence relation graph. */
static symgraph **prec_nodes;
@@ -1146,15 +1144,6 @@ symbols_pack (void)
symbols_token_translations_init ();
- if (startsymbol->content->class == unknown_sym)
- complain (&startsymbol_loc, fatal,
- _("the start symbol %s is undefined"),
- startsymbol->tag);
- else if (startsymbol->content->class == token_sym)
- complain (&startsymbol_loc, fatal,
- _("the start symbol %s is a token"),
- startsymbol->tag);
-
// If some user tokens are internationalized, the internal ones
// should be too.
if (has_translations ())
diff --git a/src/symtab.h b/src/symtab.h
index e85e5468..1ec8042b 100644
--- a/src/symtab.h
+++ b/src/symtab.h
@@ -247,11 +247,6 @@ extern symbol *eoftoken;
$accept: start-symbol $end */
extern symbol *acceptsymbol;
-/** The user start symbol. */
-extern symbol *startsymbol;
-/** The location of the \c \%start declaration. */
-extern location startsymbol_loc;
-
/** Whether a symbol declared with a type tag. */
extern bool tag_seen;
diff --git a/tests/reduce.at b/tests/reduce.at
index b561100e..c1af62ee 100644
--- a/tests/reduce.at
+++ b/tests/reduce.at
@@ -445,23 +445,69 @@ AT_CLEANUP
-## ---------------- ##
-## Empty Language. ##
-## ---------------- ##
+## ------------------- ##
+## Bad start symbols. ##
+## ------------------- ##
-AT_SETUP([Empty Language])
+AT_SETUP([Bad start symbols])
+m4_pushdef([AT_TEST],
+[
AT_DATA([[input.y]],
-[[%output "input.c"
-%%
-exp: exp;
-]])
+[%%
+$1
+])
AT_BISON_CHECK([[input.y]], 1, [],
+[$2
+])
+])
+
+AT_TEST(
+[[exp: exp;]],
[[input.y: warning: 2 nonterminals useless in grammar [-Wother]
input.y: warning: 2 rules useless in grammar [-Wother]
-input.y:3.1-3: fatal error: start symbol exp does not derive any sentence
-]])
+input.y:2.1-3: error: start symbol exp does not derive any sentence]])
+
+AT_TEST(
+[[%start exp;
+exp: exp;]],
+[[input.y: warning: 2 nonterminals useless in grammar [-Wother]
+input.y: warning: 2 rules useless in grammar [-Wother]
+input.y:2.8-10: error: start symbol exp does not derive any sentence]])
+
+AT_TEST(
+[[%start exp stmt;
+exp: exp;
+stmt: "stmt"]],
+[[input.y: warning: 1 nonterminal useless in grammar [-Wother]
+input.y: warning: 2 rules useless in grammar [-Wother]
+input.y:2.8-10: error: start symbol exp does not derive any sentence]])
+
+AT_TEST(
+[[%start exp stmt;
+exp: exp;
+stmt: stmt]],
+[[input.y: warning: 3 nonterminals useless in grammar [-Wother]
+input.y: warning: 4 rules useless in grammar [-Wother]
+input.y:2.8-10: error: start symbol exp does not derive any sentence
+input.y:2.12-15: error: start symbol stmt does not derive any sentence]])
+
+AT_TEST(
+[[%start exp;
+stmt: stmt]],
+[[input.y:2.8-10: warning: symbol 'exp' is used, but is not defined as a token and has no rules [-Wother]
+input.y: warning: 3 nonterminals useless in grammar [-Wother]
+input.y: warning: 2 rules useless in grammar [-Wother]
+input.y:2.8-10: error: start symbol exp does not derive any sentence]])
+
+AT_TEST(
+[[%token FOO;
+%start FOO;
+stmt: FOO]],
+[[input.y:2.8-10: error: the start symbol FOO is a token]])
+
+m4_popdef([AT_TEST])
AT_CLEANUP
--
2.28.0
- [PATCH 00/17] RFC: multiple start symbols, Akim Demaille, 2020/09/20
- [PATCH 01/17] gram: more debugging information, Akim Demaille, 2020/09/20
- [PATCH 02/17] reader: get ready to create several initial rules, Akim Demaille, 2020/09/20
- [PATCH 03/17] parser: expose a list of symbols, Akim Demaille, 2020/09/20
- [PATCH 04/17] regen, Akim Demaille, 2020/09/20
- [PATCH 05/17] multistart: turn start symbols into rules on $accept,
Akim Demaille <=
- [PATCH 06/17] regen, Akim Demaille, 2020/09/20
- [PATCH 07/17] multistart: adjust computation of initial core and adjust reports, Akim Demaille, 2020/09/20
- [PATCH 08/17] multistart: also check the HTML report, Akim Demaille, 2020/09/20
- [PATCH 09/17] multistart: pass the list of start symbols to the backend, Akim Demaille, 2020/09/20
- [PATCH 10/17] multistart: equip yacc.c, Akim Demaille, 2020/09/20
- [PATCH 11/17] multistart: toy with it in lexcalc, Akim Demaille, 2020/09/20
- [PATCH 12/17] todo: more, Akim Demaille, 2020/09/20
- [PATCH 13/17] multistart: adjust reader checks for generated rules, Akim Demaille, 2020/09/20
- [PATCH 14/17] multistart: use b4_accept instead of action post-processing, Akim Demaille, 2020/09/20
- [PATCH 15/17] multistart: allow tokens as start symbols, Akim Demaille, 2020/09/20