diff --git a/src/pcresearch.c b/src/pcresearch.c index 5451029..6bff1e4 100644 --- a/src/pcresearch.c +++ b/src/pcresearch.c @@ -38,6 +38,8 @@ static pcre_extra *extra; # endif #endif +#define INVALID(C) (to_uchar (C) < 0x80 || to_uchar (C) > 0xbf) /* Nonzero if byte C lies outside 0x80..0xbf, i.e. is not a UTF-8 continuation byte. */ + /* Table, indexed by ! (flag & PCRE_NOTBOL), of whether the empty string matches when that flag is used. */ static int empty_match[2]; @@ -156,6 +158,7 @@ Pexecute (char const *buf, size_t size, size_t *match_size, char const *line_start = buf; int e = PCRE_ERROR_NOMATCH; char const *line_end; + int invalid = 0; /* Set to 1 once pcre_exec has returned PCRE_ERROR_BADUTF8. */ /* If the input type is unknown, the caller is still testing the input, which means the current buffer cannot contain encoding @@ -212,25 +215,54 @@ Pexecute (char const *buf, size_t size, size_t *match_size, if (multiline) options |= PCRE_NO_UTF8_CHECK; - e = pcre_exec (cre, extra, p, search_bytes, 0, - options, sub, NSUB); - if (e != PCRE_ERROR_BADUTF8) + int valid_bytes = search_bytes; + if (invalid) { - if (0 < e && multiline && sub[1] - sub[0] != 0) + /* pcre has already reported an encoding error in this call. More + are likely, and scanning for them here is faster on average than + relying on pcre. NOTE(review): overlong forms, surrogates and lead bytes 0xf5..0xf7 pass this scan; confirm that is safe with PCRE_NO_UTF8_CHECK. */ + options |= PCRE_NO_UTF8_CHECK; + char const *p2 = p; + while (p2 != line_end) { - char const *nl = memchr (p + sub[0], eolbyte, - sub[1] - sub[0]); - if (nl) + unsigned char c = p2[0]; + size_t len = + c < 0x80 ? 1 : + c < 0xc2 || c > 0xf7 || INVALID(p2[1]) ? 0 : + c < 0xe0 ? 2 : INVALID(p2[2]) ? 0 : + c < 0xf0 ? 3 : INVALID(p2[3]) ? 0 : 4; + if (len == 0) { - /* This match crosses a line boundary; reject it. 
*/ - p += sub[0]; - line_end = nl; - continue; + valid_bytes = p2 - p; /* Count of bytes before the first sequence the scan rejected. */ + break; } + p2 += len; } - break; } - int valid_bytes = sub[0]; + + if (valid_bytes == search_bytes) + { + e = pcre_exec (cre, extra, p, search_bytes, 0, + options, sub, NSUB); + if (e != PCRE_ERROR_BADUTF8) + { + if (0 < e && multiline && sub[1] - sub[0] != 0) + { + char const *nl = memchr (p + sub[0], eolbyte, + sub[1] - sub[0]); + if (nl) + { + /* This match crosses a line boundary; reject it. */ + p += sub[0]; + line_end = nl; + continue; + } + } + break; + } + invalid = 1; /* pcre_exec returned PCRE_ERROR_BADUTF8; later passes pre-scan the text themselves. */ + valid_bytes = sub[0]; + } /* Try to match the string before the encoding error. Again, handle the empty-match case specially, for speed. */