[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
bug#58558: 29.0.50; re-search-forward is slow in some buffers
From: |
Stefan Monnier |
Subject: |
bug#58558: 29.0.50; re-search-forward is slow in some buffers |
Date: |
Thu, 13 Apr 2023 00:33:52 -0400 |
User-agent: |
Gnus/5.13 (Gnus v5.13) |
>> For the former, we could probably extend the `b_property` and
>> `e_property` fields of `gl_state` (which hold charpos) to also store
>> their bytepos equivalent, which should significantly reduce the number
>> of conversions between bytepos and charpos.
> I.e. something like the patch below (which passes all tests except for
> `test/src/comp-tests` for a reason that completely escapes me).
Found the culprit!
The patch below passes `make check`.
Stefan
diff --git a/src/fns.c b/src/fns.c
index e92ef7e4c81..591b00103da 100644
--- a/src/fns.c
+++ b/src/fns.c
@@ -1194,6 +1194,8 @@ string_char_to_byte (Lisp_Object string, ptrdiff_t
char_index)
if (best_above == best_above_byte)
return char_index;
+ eassert (char_index >= 0 && char_index <= best_above);
+
if (BASE_EQ (string, string_char_byte_cache_string))
{
if (string_char_byte_cache_charpos < char_index)
@@ -1254,6 +1256,8 @@ string_byte_to_char (Lisp_Object string, ptrdiff_t
byte_index)
if (best_above == best_above_byte)
return byte_index;
+ eassert (byte_index >= 0 && byte_index <= best_above_byte);
+
if (BASE_EQ (string, string_char_byte_cache_string))
{
if (string_char_byte_cache_bytepos < byte_index)
diff --git a/src/regex-emacs.c b/src/regex-emacs.c
index 746779490ad..f75f805cd9c 100644
--- a/src/regex-emacs.c
+++ b/src/regex-emacs.c
@@ -3979,7 +3979,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
/* Prevent shrinking and relocation of buffer text if GC happens
while we are inside this function. The calls to
- UPDATE_SYNTAX_TABLE_* macros can call Lisp (via
+ RE_UPDATE_SYNTAX_TABLE_* macros can call Lisp (via
`internal--syntax-propertize`); these calls are careful to defend against
buffer modifications, but even with no modifications, the buffer text may
be relocated during GC by `compact_buffer` which would invalidate
@@ -4792,12 +4792,11 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
int s1, s2;
int dummy;
ptrdiff_t offset = POINTER_TO_OFFSET (d);
- ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (offset) - 1;
- UPDATE_SYNTAX_TABLE (charpos);
+ RE_UPDATE_SYNTAX_TABLE_BEFORE (offset);
GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
nchars++;
s1 = SYNTAX (c1);
- UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
+ RE_UPDATE_SYNTAX_TABLE_FORWARD (offset);
PREFETCH_NOLIMIT ();
GET_CHAR_AFTER (c2, d, dummy);
nchars++;
@@ -4832,8 +4831,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
int s1, s2;
int dummy;
ptrdiff_t offset = POINTER_TO_OFFSET (d);
- ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (offset);
- UPDATE_SYNTAX_TABLE (charpos);
+ RE_UPDATE_SYNTAX_TABLE (offset);
PREFETCH ();
GET_CHAR_AFTER (c2, d, dummy);
nchars++;
@@ -4848,7 +4846,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
{
GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
nchars++;
- UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
+ RE_UPDATE_SYNTAX_TABLE_BACKWARD_BEFORE (offset);
s1 = SYNTAX (c1);
/* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
@@ -4875,8 +4873,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
int s1, s2;
int dummy;
ptrdiff_t offset = POINTER_TO_OFFSET (d);
- ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (offset) - 1;
- UPDATE_SYNTAX_TABLE (charpos);
+ RE_UPDATE_SYNTAX_TABLE_BEFORE (offset);
GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
nchars++;
s1 = SYNTAX (c1);
@@ -4891,13 +4888,13 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
PREFETCH_NOLIMIT ();
GET_CHAR_AFTER (c2, d, dummy);
nchars++;
- UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
+ RE_UPDATE_SYNTAX_TABLE_FORWARD (offset);
s2 = SYNTAX (c2);
/* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
returns 0. */
if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
- goto fail;
+ goto fail;
}
}
break;
@@ -4917,8 +4914,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
int c1, c2;
int s1, s2;
ptrdiff_t offset = POINTER_TO_OFFSET (d);
- ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (offset);
- UPDATE_SYNTAX_TABLE (charpos);
+ RE_UPDATE_SYNTAX_TABLE (offset);
PREFETCH ();
c2 = RE_STRING_CHAR (d, target_multibyte);
nchars++;
@@ -4933,7 +4929,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
{
GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
nchars++;
- UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
+ RE_UPDATE_SYNTAX_TABLE_BACKWARD_BEFORE (offset);
s1 = SYNTAX (c1);
/* ... and S1 is Sword or Ssymbol. */
@@ -4958,8 +4954,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
int c1, c2;
int s1, s2;
ptrdiff_t offset = POINTER_TO_OFFSET (d);
- ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (offset) - 1;
- UPDATE_SYNTAX_TABLE (charpos);
+ RE_UPDATE_SYNTAX_TABLE_BEFORE (offset);
GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
nchars++;
s1 = SYNTAX (c1);
@@ -4974,7 +4969,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
PREFETCH_NOLIMIT ();
c2 = RE_STRING_CHAR (d, target_multibyte);
nchars++;
- UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
+ RE_UPDATE_SYNTAX_TABLE_FORWARD (offset);
s2 = SYNTAX (c2);
/* ... and S2 is Sword or Ssymbol. */
@@ -4994,8 +4989,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
PREFETCH ();
{
ptrdiff_t offset = POINTER_TO_OFFSET (d);
- ptrdiff_t pos1 = RE_SYNTAX_TABLE_BYTE_TO_CHAR (offset);
- UPDATE_SYNTAX_TABLE (pos1);
+ RE_UPDATE_SYNTAX_TABLE (offset);
}
{
int len;
diff --git a/src/syntax.c b/src/syntax.c
index e9e04e2d638..fbd08c74092 100644
--- a/src/syntax.c
+++ b/src/syntax.c
@@ -250,6 +250,8 @@ SETUP_SYNTAX_TABLE (ptrdiff_t from, ptrdiff_t count)
gl_state.b_property = BEGV;
gl_state.e_property = ZV + 1;
gl_state.object = Qnil;
+ gl_state.b_re_byte = -1;
+ gl_state.e_re_byte = -1;
if (parse_sexp_lookup_properties)
{
if (count > 0)
@@ -265,14 +267,15 @@ SETUP_SYNTAX_TABLE (ptrdiff_t from, ptrdiff_t count)
/* Same as above, but in OBJECT. If OBJECT is nil, use current buffer.
If it is t (which is only used in fast_c_string_match_ignore_case),
ignore properties altogether.
- FROMBYTE is an regexp-byteoffset. */
+ FROMBYTE is a regexp-byteoffset. */
void
-RE_SETUP_SYNTAX_TABLE_FOR_OBJECT (Lisp_Object object,
- ptrdiff_t frombyte)
+RE_SETUP_SYNTAX_TABLE_FOR_OBJECT (Lisp_Object object, ptrdiff_t frombyte)
{
SETUP_BUFFER_SYNTAX_TABLE ();
gl_state.object = object;
+ gl_state.b_re_byte = -1;
+ gl_state.e_re_byte = -1;
if (BUFFERP (gl_state.object))
{
struct buffer *buf = XBUFFER (gl_state.object);
@@ -282,21 +285,25 @@ RE_SETUP_SYNTAX_TABLE_FOR_OBJECT (Lisp_Object object,
else if (NILP (gl_state.object))
{
gl_state.b_property = BEG;
- gl_state.e_property = ZV; /* FIXME: Why not +1 like in
SETUP_SYNTAX_TABLE? */
+ gl_state.e_property = ZV;
}
else if (EQ (gl_state.object, Qt))
{
gl_state.b_property = 0;
- gl_state.e_property = PTRDIFF_MAX;
+ /* -1 so we can do +1 in `re_update_byteoffsets`. */
+ gl_state.e_property = PTRDIFF_MAX - 1;
}
else
{
gl_state.b_property = 0;
- gl_state.e_property = 1 + SCHARS (gl_state.object);
+ gl_state.e_property = SCHARS (gl_state.object);
}
if (parse_sexp_lookup_properties)
- update_syntax_table (RE_SYNTAX_TABLE_BYTE_TO_CHAR (frombyte),
- 1, 1, gl_state.object);
+ {
+ update_syntax_table (RE_SYNTAX_TABLE_BYTE_TO_CHAR (frombyte),
+ 1, 1, gl_state.object);
+ re_update_byteoffsets ();
+ }
}
/* Update gl_state to an appropriate interval which contains CHARPOS. The
diff --git a/src/syntax.h b/src/syntax.h
index 01982be25a0..420ba8f31dc 100644
--- a/src/syntax.h
+++ b/src/syntax.h
@@ -66,7 +66,7 @@ #define Vstandard_syntax_table BVAR (&buffer_defaults,
syntax_table)
struct gl_state_s
{
Lisp_Object object; /* The object we are scanning. */
- ptrdiff_t start; /* Where to stop. */
+ ptrdiff_t start; /* Where to stop(?FIXME?). */
ptrdiff_t stop; /* Where to stop. */
bool use_global; /* Whether to use global_code
or c_s_t. */
@@ -85,6 +85,11 @@ #define Vstandard_syntax_table BVAR (&buffer_defaults,
syntax_table)
and possibly at the
intervals too, depending
on: */
+ /* The regexp engine prefers byteoffsets over char positions, so
+ store those to try and reduce the number of byte<->char conversions.
+ This is only kept uptodate when used from the regexp engine. */
+ ptrdiff_t b_re_byte; /* First byteoffset where c_s_t is valid. */
+ ptrdiff_t e_re_byte; /* First byteoffset where c_s_t is not valid. */
};
extern struct gl_state_s gl_state;
@@ -145,19 +150,14 @@ SYNTAX (int c)
extern unsigned char const syntax_spec_code[0400];
-/* Convert the regexp's BYTEOFFSET into a character position,
- for the object recorded in gl_state with RE_SETUP_SYNTAX_TABLE_FOR_OBJECT.
-
- The value is meant for use in code that does nothing when
- parse_sexp_lookup_properties is false, so return 0 in that case,
- for speed. */
+/* Convert the BYTEOFFSET into a character position, for the object
+ recorded in gl_state with RE_SETUP_SYNTAX_TABLE_FOR_OBJECT. */
INLINE ptrdiff_t
RE_SYNTAX_TABLE_BYTE_TO_CHAR (ptrdiff_t byteoffset)
{
- return (! parse_sexp_lookup_properties
- ? 0
- : STRINGP (gl_state.object)
+ eassert (parse_sexp_lookup_properties);
+ return (STRINGP (gl_state.object)
? string_byte_to_char (gl_state.object, byteoffset)
: BUFFERP (gl_state.object)
? ((buf_bytepos_to_charpos
@@ -168,6 +168,44 @@ RE_SYNTAX_TABLE_BYTE_TO_CHAR (ptrdiff_t byteoffset)
: byteoffset);
}
+INLINE ptrdiff_t
+RE_SYNTAX_TABLE_CHAR_TO_BYTE (ptrdiff_t charpos)
+{
+ eassert (parse_sexp_lookup_properties);
+ return (STRINGP (gl_state.object)
+ ? string_char_to_byte (gl_state.object, charpos)
+ : BUFFERP (gl_state.object)
+ ? ((buf_charpos_to_bytepos
+ (XBUFFER (gl_state.object), charpos)
+ - BUF_BEGV_BYTE (XBUFFER (gl_state.object))))
+ : NILP (gl_state.object)
+ ? CHAR_TO_BYTE (charpos) - BEGV_BYTE
+ : charpos);
+}
+
+static void re_update_byteoffsets (void)
+{
+ gl_state.b_re_byte = RE_SYNTAX_TABLE_CHAR_TO_BYTE (gl_state.b_property);
+ eassert (gl_state.b_property
+ == RE_SYNTAX_TABLE_BYTE_TO_CHAR (gl_state.b_re_byte));
+ /* `e_property` is often set to EOB+1 (or to some value
+ much further than `stop` in narrowed buffers). */
+ gl_state.e_re_byte
+ = gl_state.e_property > gl_state.stop
+ ? 1 + RE_SYNTAX_TABLE_CHAR_TO_BYTE (gl_state.stop)
+ : RE_SYNTAX_TABLE_CHAR_TO_BYTE (gl_state.e_property);
+ eassert (gl_state.e_property > gl_state.stop
+ ? gl_state.e_property
+ >= 1 + RE_SYNTAX_TABLE_BYTE_TO_CHAR (gl_state.e_re_byte - 1)
+ : gl_state.e_property
+ == RE_SYNTAX_TABLE_BYTE_TO_CHAR (gl_state.e_re_byte));
+}
+
+/* The regexp-engine doesn't keep track of char positions, but instead
+ uses byteoffsets, so `syntax.c` uses `UPDATE_SYNTAX_TABLE_*` functions,
+ passing them `charpos`s whereas `regexp.c` uses `RE_UPDATE_SYNTAX_TABLE_*`
+ functions, passing them byteoffsets. */
+
/* Make syntax table state (gl_state) good for CHARPOS, assuming it is
currently good for a position before CHARPOS. */
@@ -178,6 +216,36 @@ UPDATE_SYNTAX_TABLE_FORWARD (ptrdiff_t charpos)
update_syntax_table_forward (charpos, false, gl_state.object);
}
+INLINE void
+RE_UPDATE_SYNTAX_TABLE_FORWARD (ptrdiff_t byteoffset)
+{ /* Performs just-in-time syntax-propertization. */
+ if (!parse_sexp_lookup_properties)
+ return;
+ eassert (gl_state.e_re_byte >= 0); /* gl_state.b_re_byte can be negative. */
+ if (byteoffset >= gl_state.e_re_byte)
+ {
+ ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (byteoffset);
+ eassert (charpos >= gl_state.e_property);
+ UPDATE_SYNTAX_TABLE_FORWARD (charpos);
+ re_update_byteoffsets ();
+ }
+}
+
+INLINE void
+RE_UPDATE_SYNTAX_TABLE_FORWARD_BEFORE (ptrdiff_t byteoffset)
+{ /* Performs just-in-time syntax-propertization. */
+ if (!parse_sexp_lookup_properties)
+ return;
+ eassert (gl_state.e_re_byte >= 0); /* gl_state.b_re_byte can be negative. */
+ if (byteoffset > gl_state.e_re_byte)
+ {
+ ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (byteoffset) - 1;
+ eassert (charpos >= gl_state.e_property);
+ UPDATE_SYNTAX_TABLE_FORWARD (charpos);
+ re_update_byteoffsets ();
+ }
+}
+
/* Make syntax table state (gl_state) good for CHARPOS, assuming it is
currently good for a position after CHARPOS. */
@@ -188,6 +256,36 @@ UPDATE_SYNTAX_TABLE_BACKWARD (ptrdiff_t charpos)
update_syntax_table (charpos, -1, false, gl_state.object);
}
+INLINE void
+RE_UPDATE_SYNTAX_TABLE_BACKWARD (ptrdiff_t byteoffset)
+{
+ if (!parse_sexp_lookup_properties)
+ return;
+ eassert (gl_state.e_re_byte >= 0); /* gl_state.b_re_byte can be negative. */
+ if (byteoffset < gl_state.b_re_byte)
+ {
+ ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (byteoffset);
+ eassert (charpos < gl_state.b_property);
+ UPDATE_SYNTAX_TABLE_FORWARD (charpos);
+ re_update_byteoffsets ();
+ }
+}
+
+INLINE void
+RE_UPDATE_SYNTAX_TABLE_BACKWARD_BEFORE (ptrdiff_t byteoffset)
+{
+ if (!parse_sexp_lookup_properties)
+ return;
+ eassert (gl_state.e_re_byte >= 0); /* gl_state.b_re_byte can be negative. */
+ if (byteoffset <= gl_state.b_re_byte)
+ {
+ ptrdiff_t charpos = RE_SYNTAX_TABLE_BYTE_TO_CHAR (byteoffset);
+ eassert (charpos <= gl_state.b_property);
+ UPDATE_SYNTAX_TABLE_FORWARD (charpos - 1);
+ re_update_byteoffsets ();
+ }
+}
+
/* Make syntax table good for CHARPOS. */
INLINE void
@@ -197,6 +295,20 @@ UPDATE_SYNTAX_TABLE (ptrdiff_t charpos)
UPDATE_SYNTAX_TABLE_FORWARD (charpos);
}
+INLINE void
+RE_UPDATE_SYNTAX_TABLE (ptrdiff_t byteoffset)
+{
+ RE_UPDATE_SYNTAX_TABLE_BACKWARD (byteoffset);
+ RE_UPDATE_SYNTAX_TABLE_FORWARD (byteoffset);
+}
+
+INLINE void
+RE_UPDATE_SYNTAX_TABLE_BEFORE (ptrdiff_t byteoffset)
+{
+ RE_UPDATE_SYNTAX_TABLE_BACKWARD_BEFORE (byteoffset);
+ RE_UPDATE_SYNTAX_TABLE_FORWARD_BEFORE (byteoffset);
+}
+
/* Set up the buffer-global syntax table. */
INLINE void
- bug#58558: 29.0.50; re-search-forward is slow in some buffers, (continued)
- bug#58558: 29.0.50; re-search-forward is slow in some buffers, Stefan Monnier, 2023/04/12
- bug#58558: 29.0.50; re-search-forward is slow in some buffers, Eli Zaretskii, 2023/04/12
- bug#58558: 29.0.50; re-search-forward is slow in some buffers, Stefan Monnier, 2023/04/12
- bug#58558: 29.0.50; re-search-forward is slow in some buffers, Eli Zaretskii, 2023/04/12
- bug#58558: 29.0.50; re-search-forward is slow in some buffers, Alan Mackenzie, 2023/04/12
- bug#58558: 29.0.50; re-search-forward is slow in some buffers, Stephen Berman, 2023/04/12
- bug#58558: 29.0.50; re-search-forward is slow in some buffers, Ihor Radchenko, 2023/04/12
- bug#58558: 29.0.50; re-search-forward is slow in some buffers, Ihor Radchenko, 2023/04/12
- bug#58558: 29.0.50; re-search-forward is slow in some buffers, Stefan Monnier, 2023/04/12
- bug#58558: 29.0.50; re-search-forward is slow in some buffers, Stefan Monnier, 2023/04/12
- bug#58558: 29.0.50; re-search-forward is slow in some buffers,
Stefan Monnier <=
- bug#58558: 29.0.50; re-search-forward is slow in some buffers, Ihor Radchenko, 2023/04/13
- bug#58558: 29.0.50; re-search-forward is slow in some buffers, Eli Zaretskii, 2023/04/13
- bug#58558: 29.0.50; re-search-forward is slow in some buffers, Eli Zaretskii, 2023/04/13
- bug#58558: 29.0.50; re-search-forward is slow in some buffers, Alan Mackenzie, 2023/04/12
- bug#58558: 29.0.50; re-search-forward is slow in some buffers, Stefan Monnier, 2023/04/12
- bug#58558: 29.0.50; re-search-forward is slow in some buffers, Stefan Monnier, 2023/04/13
- bug#58558: 29.0.50; re-search-forward is slow in some buffers, Ihor Radchenko, 2023/04/13
bug#58558: 29.0.50; re-search-forward is slow in some buffers, Mattias EngdegÄrd, 2023/04/10