[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: Complain about hidden left recursion in GLR parsing
From: |
Sylvain Schmitz |
Subject: |
Re: Complain about hidden left recursion in GLR parsing |
Date: |
Mon, 13 Mar 2006 18:45:29 +0100 |
User-agent: |
Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.7) Gecko/20040616 |
Sylvain Schmitz wrote:
And now I see my solution is not working in case a nonterminal has both
hidden and non-hidden left recursion like so:
s:
a s 'b'
| s 'c'
| 'x'
;
Here is a new patch, which seems to work. I am not overly happy with
my implementation, but it might give you ideas.
I also made up an example from a CSS2 grammar, showing that hidden
left recursions can occur if one is not careful.
--
Regards,
Sylvain
--- bison/src/closure.c 2005-12-10 00:51:25.000000000 +0100
+++ bison-cvs20060222/src/closure.c 2006-03-13 18:35:09.000000000 +0100
@@ -29,9 +29,11 @@
#include <quotearg.h>
#include "closure.h"
+#include "complain.h"
#include "derives.h"
#include "getargs.h"
#include "gram.h"
+#include "nullable.h"
#include "reader.h"
#include "symtab.h"
@@ -44,6 +46,7 @@
/* internal data. See comments before set_fderives and set_firsts. */
static bitsetv fderives = NULL;
static bitsetv firsts = NULL;
+static bitsetv next_vars = NULL;
/* Retrieve the FDERIVES/FIRSTS sets of the nonterminals numbered Var. */
#define FDERIVES(Var) fderives[(Var) - ntokens]
@@ -120,6 +123,10 @@
| symbols 8 3 20, the symbol 8 can be the beginning of the data for |
| symbol 5, so the bit [8 - ntokens] in first[5 - ntokens] (= FIRST |
| (5)) is set. |
+| |
+| NEXT_VARS is similar but also takes nullable symbols into |
+| account. It is used for the detection of hidden left recursion |
+| in GLR parsing. |
`------------------------------------------------------------------*/
static void
@@ -128,13 +135,29 @@
symbol_number i, j;
firsts = bitsetv_create (nvars, nvars, BITSET_FIXED);
+ next_vars = bitsetv_create (nvars, nvars, BITSET_FIXED);
for (i = ntokens; i < nsyms; i++)
for (j = 0; derives[i - ntokens][j]; ++j)
{
item_number sym = derives[i - ntokens][j]->rhs[0];
if (ISVAR (sym))
- bitset_set (FIRSTS (i), sym - ntokens);
+ {
+ int k = 1;
+ bitset_set (FIRSTS (i), sym - ntokens);
+ bitset_set (next_vars [i - ntokens], sym - ntokens);
+ /* Keep going in the rule right hand side if the current
+ symbol is nullable. */
+ while (nullable[sym - ntokens]
+ && (sym = derives[i - ntokens][j]->rhs[k]) >= 0
+ && ISVAR (sym))
+ {
+ bitset_set (next_vars [i - ntokens], sym - ntokens);
+ k++;
+ }
+ if (ISVAR (sym))
+ bitset_set (next_vars [i - ntokens], sym - ntokens);
+ }
}
if (trace_flag & trace_sets)
@@ -145,6 +168,41 @@
if (trace_flag & trace_sets)
print_firsts ();
+
+
+ if (glr_parser)
+ {
+ bitsetv_transitive_closure (next_vars);
+ for (i = ntokens; i < nsyms; i++)
+ if (bitset_test (next_vars[i - ntokens], i - ntokens))
+ for (j = 0; derives[i - ntokens][j]; ++j)
+ {
+ item_number sym = derives[i - ntokens][j]->rhs[0];
+ if (ISVAR(sym) && nullable[sym - ntokens])
+ {
+ int k = 1;
+ while ((sym = derives[i - ntokens][j]->rhs[k++]) >= 0
+ && ISVAR (sym)
+ && nullable[sym - ntokens]
+ && !bitset_test (next_vars[sym-ntokens],i-ntokens));
+
+ if ((ISVAR (sym)
+ && bitset_test (next_vars[sym-ntokens], i-ntokens))
+ ||
+ ((sym = derives[i - ntokens][j]->rhs[k++]) >= 0
+ && ISVAR (sym)
+ && bitset_test (next_vars[sym-ntokens], i-ntokens)))
+ {
+ complain_at (derives[i - ntokens][j]->location,
+ "%s: %s: ",
+ _("error"),
+ _("hidden left recursion in rule"));
+ rule_print (derives[i - ntokens][j], stderr);
+ }
+ }
+ }
+ }
+ bitsetv_free (next_vars);
}
/*-------------------------------------------------------------------.
/* css2.y
* Excerpt of the CSS2 grammar from the W3C recommendation
* <http://www.w3.org/TR/1998/REC-CSS2-19980512>:
*
* combinator:
* '+' S*
* | '-' S*
* | / * empty * /
* ;
*
* selector:
* simple_selector [ combinator simple_selector ]*
* ;
*
* simple_selector:
* element_name? [ HASH | class | attrib | pseudo ]* S*
* ;
*
* Yes, that's an infinitely ambiguous grammar (and thus it's not LL(1)) ...
* this seems to be fixed in the CSS2.1 grammar.
*/
%glr-parser
%{
#include <stdio.h>
#include <ctype.h>
char *yystring;
int yylex (void);
void yyerror (const char *);
%}
%token element_name HASH class attrib pseudo
%%
/* now, suppose for a second someone translates the EBNF
using right recursion ... */
selector:
simple_selector
| simple_selector combinator selector
;
combinator:
'+'
| '-'
| /* empty */
;
simple_selector:
element_name s_s_mlist
| s_s_mlist
;
s_s_mlist:
s_s_mlist s_s_mod
| /* empty */
;
s_s_mod:
HASH
| class
| attrib
| pseudo
;
%%
int
yylex (void)
{
while (isspace (*yystring))
yystring++;
switch (*yystring)
{
case '+': case '-': case '\0':
return *yystring++;
/* we don't need to return HASH, class, attrib nor pseudo
to witness the issue */
otherwise:
yystring++;
return element_name;
}
}
void
yyerror (const char *msg)
{
fprintf (stderr, "%s\n", msg);
}
int
main (int argc, char *argv[])
{
int ret;
yystring = argv[1];
ret = yyparse ();
return ret;
}
%glr-parser
%{
#include <stdio.h>
char *yystring;
int nbs, ncs;
int yylex (void);
void yyerror (const char *);
%}
%%
s:
a s 'c' { ncs++; }
| b
| 'x'
;
a:
/* empty */
;
b:
s 'b' { nbs++; }
;
%%
int
yylex (void)
{
return *yystring++;
}
void
yyerror (const char *msg)
{
fprintf (stderr, "%s\n", msg);
}
int
main (int argc, char *argv[])
{
int ret;
nbs = ncs = 0;
yystring = argv[1];
ret = yyparse ();
if (!ret)
{
printf ("Number of b's: %d\n", nbs);
printf ("Number of c's: %d\n", ncs);
}
return ret;
}