[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: Handling of \r
From: |
Akim Demaille |
Subject: |
Re: Handling of \r |
Date: |
Wed, 11 Sep 2019 07:55:11 +0200 |
Hi Paul,
> Le 9 sept. 2019 à 21:10, Paul Eggert <address@hidden> a écrit :
>
> On 9/9/19 9:46 AM, Akim Demaille wrote:
>> I personally prefer treating a lone \r as a regular character, as it's more
>> consistent with what my tools show me. And I think it's a problem that GCC
>> and Emacs disagree, so maybe the GCS should decide.
>> But in the case of Bison, WDYT today?
>
> I vaguely recall thinking that since Bison is a preprocessor for C/C++,
> consistency with GCC was more important than consistency with other GNU tools.
Agreed.
> The GCC tradition was because of Classic Mac OS, which used plain CR to
> terminate lines.
Aaaah! Of course! I had completely forgotten about that!
> Since Apple hasn't supported Classic Mac OS since 2002 (arund the time of my
> Bison patch you mentioned), this old tradition is now completely obsolete,
> and I'd support having Bison and GCC be consistent with everybody else now.
Great!
> So how about this idea: Change Bison to be like most other GNU tools, and
> file a bug report with the GCC folks.
I'm all for it!
The GCC's PR is here: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91733
My proposal to address this in Bison is below. I saw two paths: change
no_cr_read into no_crnl_read, or get rid of it leaving the handling of EOL to
Flex. I chose the latter because it seemed more natural to me, but you might
have a different opinion.
Cheers!
commit cac45485e14bb3e59761ee7e85b763c0a18bb89a
Author: Akim Demaille <address@hidden>
Date: Tue Sep 10 18:51:25 2019 +0200
input: \r is lone char
We used to treat lone CRs (\r) as regular NLs (\n), probably to please
classic MacOS. As of today, it makes more sense to treat \r like a
plain white space character.
https://lists.gnu.org/archive/html/bison-patches/2019-09/msg00027.html
* src/scan-gram.l (no_cr_read): Remove. Instead, use...
(eol): this new abbreviation denoting end-of-line.
* src/location.c (caret_getc): New.
(location_caret): Use it.
* tests/diagnostics.at (Carriage return): Adjust expectations.
(CR NL): New.
diff --git a/src/location.c b/src/location.c
index 40fbc04e..d2314a18 100644
--- a/src/location.c
+++ b/src/location.c
@@ -169,7 +169,7 @@ static struct
} caret_info;
void
-caret_free ()
+caret_free (void)
{
if (caret_info.source)
{
@@ -178,6 +178,23 @@ caret_free ()
}
}
+/* Getc, but smash \r\n as \n. */
+static int
+caret_getc (void)
+{
+ FILE *f = caret_info.source;
+ int res = getc (f);
+ if (res == '\r')
+ {
+ int c = getc (f);
+ if (c == '\n')
+ res = c;
+ else
+ ungetc (c, f);
+ }
+ return res;
+}
+
void
location_caret (location loc, const char *style, FILE *out)
{
@@ -230,7 +247,7 @@ location_caret (location loc, const char *style, FILE *out)
/* Advance to the line's position, keeping track of the offset. */
while (caret_info.line < loc.start.line)
{
- int c = getc (caret_info.source);
+ int c = caret_getc ();
if (c == EOF)
/* Something is wrong, that line number does not exist. */
return;
@@ -241,7 +258,7 @@ location_caret (location loc, const char *style, FILE *out)
/* Read the actual line. Don't update the offset, so that we keep a pointer
to the start of the line. */
{
- int c = getc (caret_info.source);
+ int c = caret_getc ();
if (c != EOF)
{
bool single_line = loc.start.line == loc.end.line;
@@ -268,7 +285,7 @@ location_caret (location loc, const char *style, FILE *out)
opened = true;
}
fputc (c, out);
- c = getc (caret_info.source);
+ c = caret_getc ();
++byte;
if (opened
&& (single_line
diff --git a/src/scan-gram.l b/src/scan-gram.l
index 66a8caa7..32bc69aa 100644
--- a/src/scan-gram.l
+++ b/src/scan-gram.l
@@ -49,9 +49,6 @@ static boundary scanner_cursor;
#define YY_USER_ACTION location_compute (loc, &scanner_cursor, yytext,
yyleng);
-static size_t no_cr_read (FILE *, char *, size_t);
-#define YY_INPUT(buf, result, size) ((result) = no_cr_read (yyin, buf, size))
-
/* Report that yytext is an extension, and evaluate to its token type. */
#define BISON_DIRECTIVE(Directive) \
(bison_directive (loc, yytext), PERCENT_ ## Directive)
@@ -139,12 +136,14 @@ id {letter}({letter}|[-0-9])*
int [0-9]+
xint 0[xX][0-9abcdefABCDEF]+
+eol \n|\r\n
+
/* UTF-8 Encoded Unicode Code Point, from Flex's documentation. */
mbchar
[\x09\x0A\x0D\x20-\x7E]|[\xC2-\xDF][\x80-\xBF]|\xE0[\xA0-\xBF][\x80-\xBF]|[\xE1-\xEC\xEE\xEF]([\x80-\xBF]{2})|\xED[\x80-\x9F][\x80-\xBF]|\xF0[\x\90-\xBF]([\x80-\xBF]{2})|[\xF1-\xF3]([\x80-\xBF]{3})|\xF4[\x80-\x8F]([\x80-\xBF]{2})
/* Zero or more instances of backslash-newline. Following GCC, allow
white space between the backslash and the newline. */
-splice (\\[ \f\t\v]*\n)*
+splice (\\[ \f\t\v]*{eol})*
/* An equal sign, with optional leading whitespaces. This is used in some
deprecated constructs. */
@@ -193,7 +192,7 @@ eqopt ({sp}=)?
"," {
complain (loc, Wother, _("stray ',' treated as white space"));
}
- [ \f\n\t\v] |
+ [ \f\t\v\r]|{eol} |
"//".* continue;
"/*" {
token_start = loc->start;
@@ -201,9 +200,7 @@ eqopt ({sp}=)?
BEGIN SC_YACC_COMMENT;
}
- /* #line directives are not documented, and may be withdrawn or
- modified in future versions of Bison. */
- ^"#line "{int}(" \"".*"\"")?"\n" {
+ ^"#line "{int}(" \"".*"\"")?{eol} {
handle_syncline (yytext + sizeof "#line " - 1, *loc);
}
}
@@ -330,7 +327,7 @@ eqopt ({sp}=)?
}
/* Semantic predicate. */
- "%?"[ \f\n\t\v]*"{" {
+ "%?"([ \f\t\v]|{eol})*"{" {
nesting = 0;
code_start = loc->start;
BEGIN SC_PREDICATE;
@@ -359,7 +356,7 @@ eqopt ({sp}=)?
BEGIN SC_BRACKETED_ID;
}
- [^\[%A-Za-z0-9_<>{}\"\'*;|=/, \f\n\t\v]+|. {
+ [^\[%A-Za-z0-9_<>{}\"\'*;|=/, \f\r\n\t\v]+|. {
complain (loc, complaint, "%s: %s",
ngettext ("invalid character", "invalid characters", yyleng),
quote_mem (yytext, yyleng));
@@ -458,7 +455,7 @@ eqopt ({sp}=)?
complain (loc, complaint, _("an identifier expected"));
}
- [^\].A-Za-z0-9_/ \f\n\t\v]+|. {
+ [^\].A-Za-z0-9_/ \f\r\n\t\v]+|. {
complain (loc, complaint, "%s: %s",
ngettext ("invalid character in bracketed name",
"invalid characters in bracketed name", yyleng),
@@ -491,7 +488,7 @@ eqopt ({sp}=)?
<SC_YACC_COMMENT>
{
"*/" BEGIN context_state;
- .|\n continue;
+ .|{eol} continue;
<<EOF>> unexpected_eof (token_start, "*/"); BEGIN context_state;
}
@@ -513,7 +510,7 @@ eqopt ({sp}=)?
<SC_LINE_COMMENT>
{
- "\n" STRING_GROW; BEGIN context_state;
+ {eol} STRING_GROW; BEGIN context_state;
{splice} STRING_GROW;
<<EOF>> BEGIN context_state;
}
@@ -535,7 +532,7 @@ eqopt ({sp}=)?
RETURN_VALUE (STRING, last_string);
}
<<EOF>> unexpected_eof (token_start, "\"");
- "\n" unexpected_newline (token_start, "\"");
+ {eol} unexpected_newline (token_start, "\"");
}
/*----------------------------------------------------------.
@@ -564,7 +561,7 @@ eqopt ({sp}=)?
BEGIN INITIAL;
return CHAR;
}
- "\n" unexpected_newline (token_start, "'");
+ {eol} unexpected_newline (token_start, "'");
<<EOF>> unexpected_eof (token_start, "'");
}
@@ -641,7 +638,7 @@ eqopt ({sp}=)?
else
obstack_1grow (&obstack_for_string, c);
}
- \\(.|\n) {
+ \\(.|{eol}) {
char const *p = yytext + 1;
/* Quote only if escaping won't make the character visible. */
if (c_isspace ((unsigned char) *p) && c_isprint ((unsigned char) *p))
@@ -665,14 +662,14 @@ eqopt ({sp}=)?
<SC_CHARACTER>
{
"'" STRING_GROW; BEGIN context_state;
- \n unexpected_newline (token_start, "'");
+ {eol} unexpected_newline (token_start, "'");
<<EOF>> unexpected_eof (token_start, "'");
}
<SC_STRING>
{
"\"" STRING_GROW; BEGIN context_state;
- \n unexpected_newline (token_start, "\"");
+ {eol} unexpected_newline (token_start, "\"");
<<EOF>> unexpected_eof (token_start, "\"");
}
@@ -809,53 +806,6 @@ eqopt ({sp}=)?
%%
-/* Read bytes from FP into buffer BUF of size SIZE. Return the
- number of bytes read. Remove '\r' from input, treating \r\n
- and isolated \r as \n. */
-
-static size_t
-no_cr_read (FILE *fp, char *buf, size_t size)
-{
- size_t bytes_read = fread (buf, 1, size, fp);
- if (bytes_read)
- {
- char *w = memchr (buf, '\r', bytes_read);
- if (w)
- {
- char const *r = ++w;
- char const *lim = buf + bytes_read;
-
- for (;;)
- {
- /* Found an '\r'. Treat it like '\n', but ignore any
- '\n' that immediately follows. */
- w[-1] = '\n';
- if (r == lim)
- {
- int ch = getc (fp);
- if (ch != '\n' && ungetc (ch, fp) != ch)
- break;
- }
- else if (*r == '\n')
- r++;
-
- /* Copy until the next '\r'. */
- do
- {
- if (r == lim)
- return w - buf;
- }
- while ((*w++ = *r++) != '\r');
- }
-
- return w - buf;
- }
- }
-
- return bytes_read;
-}
-
-
/*------------------------------------------------------.
| Scan NUMBER for a base-BASE integer at location LOC. |
diff --git a/tests/diagnostics.at b/tests/diagnostics.at
index d9398dd5..d89b5f64 100644
--- a/tests/diagnostics.at
+++ b/tests/diagnostics.at
@@ -274,11 +274,43 @@ AT_TEST([[Carriage return]],
%%
]],
[1],
-[[input.y:37.8-38.0: <error>error:</error> missing '"' at end of line
-input.y:37.8-38.0: <error>error:</error> syntax error, unexpected string,
expecting char or identifier or <tag>
+[[input.y:10.8-11.0: <error>error:</error> missing '"' at end of line
+ 10 | %token <error>"</error>
+ | <error>^</error>
+input.y:10.8-11.0: <error>error:</error> syntax error, unexpected string,
expecting char or identifier or <tag>
+ 10 | %token <error>"</error>
+ | <error>^</error>
]])
+## ------- ##
+## CR NL. ##
+## ------- ##
+
+# Check Windows EOLs.
+
+AT_TEST([[CR NL]],
+[[^M
+%token ^M FOO^M
+%token ^M FOO^M
+%%^M
+exp:^M
+]],
+[0],
+[[input.y:11.9-11: <warning>warning:</warning> symbol FOO redeclared
[<warning>-Wother</warning>]
+ 11 | %token
<warning>FOO</warning>
+ | <warning>^~~</warning>
+input.y:10.9-11: previous declaration
+ 10 | %token
<note>FOO</note>
+ | <note>^~~</note>
+input.y:13.5: <warning>warning:</warning> empty rule without %empty
[<warning>-Wempty-rule</warning>]
+ 13 | exp:
+ | <warning>^</warning>
+input.y: <warning>warning:</warning> fix-its can be applied. Rerun with
option '--update'. [<warning>-Wother</warning>]
+]])
+
+
+
m4_popdef([AT_TEST])