[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[SCM] gawk branch, feature/minrx, updated. gawk-4.1.0-5742-gf473b8e9
From: Arnold Robbins
Subject: [SCM] gawk branch, feature/minrx, updated. gawk-4.1.0-5742-gf473b8e9
Date: Sun, 1 Sep 2024 14:40:55 -0400 (EDT)
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "gawk".
The branch, feature/minrx has been updated
via f473b8e963c15aa9318417d680e58d1b3c224c5d (commit)
from 4c0d3b2e9140a3e901bb741db9be0deea47ae81e (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://git.sv.gnu.org/cgit/gawk.git/commit/?id=f473b8e963c15aa9318417d680e58d1b3c224c5d
commit f473b8e963c15aa9318417d680e58d1b3c224c5d
Author: Arnold D. Robbins <arnold@skeeve.com>
Date: Sun Sep 1 21:40:37 2024 +0300
MinRX updates.
diff --git a/ChangeLog b/ChangeLog
index 126de0da..604057ba 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2024-09-01 Arnold D. Robbins <arnold@skeeve.com>
+
+ * re.c (make_regexp): Add MINRX_REG_NATIVE1B flag.
+
2024-08-28 Arnold D. Robbins <arnold@skeeve.com>
* Makefile.am (CCLD): Set to $(CXX) so that builds with clang
diff --git a/re.c b/re.c
index ad7e854b..97978453 100644
--- a/re.c
+++ b/re.c
@@ -327,7 +327,8 @@ make_regexp(const char *s, size_t len, bool ignorecase,
bool dfa, bool canfatal)
} else
rp->dfareg = NULL;
} else {
- int flags = MINRX_REG_EXTENDED | MINRX_REG_BRACK_ESCAPE |
MINRX_REG_BRACE_COMPAT;
+ int flags = MINRX_REG_EXTENDED | MINRX_REG_BRACK_ESCAPE |
+ MINRX_REG_BRACE_COMPAT | MINRX_REG_NATIVE1B;
int ret;
if (ignorecase)
diff --git a/support/ChangeLog b/support/ChangeLog
index a537e8c5..b7643570 100644
--- a/support/ChangeLog
+++ b/support/ChangeLog
@@ -1,3 +1,7 @@
+2024-09-01 Arnold D. Robbins <arnold@skeeve.com>
+
+ * minrx.h, minrx.cpp: Update from Mike.
+
2024-08-28 Arnold D. Robbins <arnold@skeeve.com>
* minrx.cpp: Update from Mike.
diff --git a/support/minrx.cpp b/support/minrx.cpp
index 27403b84..e21143b8 100644
--- a/support/minrx.cpp
+++ b/support/minrx.cpp
@@ -18,6 +18,7 @@
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
+#include <cctype>
#include <climits>
#include <clocale>
#include <cstddef>
@@ -27,6 +28,7 @@
#include <cwctype>
#include <algorithm>
#include <deque>
+#include <limits>
#include <map>
#include <mutex>
#include <optional>
@@ -248,40 +250,108 @@ constexpr int32_t WCharMax = 0x10FFFF; // maximum code
point: valid for Unicode
class WConv {
public:
enum { End = -1 };
+ enum class Encoding { Byte, MBtoWC, UTF8 };
private:
+ WConv &(WConv::*const nextfn)();
const char *const bp;
const char *const ep;
const char *cp;
std::mbstate_t mbs;
WChar wch = End;
int len = 0;
+ static WConv &(WConv::*const nextfns[])();
public:
WConv(const WConv &) = default;
- WConv(const char *bp, const char *ep): bp(bp), ep(ep), cp(bp) {
std::memset(&mbs, 0, sizeof mbs); }
+ WConv(Encoding e, const char *bp, const char *ep)
+ : nextfn(nextfns[(int) e]), bp(bp), ep(ep), cp(bp) {
+ std::memset(&mbs, 0, sizeof mbs);
+ }
auto look() const { return wch; }
- auto lookahead(WConv &(WConv::*next)()) const { return
(WConv(*this).*next)().look(); }
- WConv &nextchr() {
+ auto lookahead() const { return WConv(*this).nextchr().look(); }
+ WConv &nextchr() { return (this->*nextfn)(); }
+ WConv &nextbyte() {
+ if ((cp += len) != ep)
+ len = 1, wch = (unsigned char) *cp;
+ else
+ len = 0, wch = End;
+ return *this;
+ }
+ WConv &nextmbtowc() {
wchar_t wct = L'\0';
if ((cp += len) != ep) {
auto n = mbrtowc(&wct, cp, ep - cp, &mbs);
if (n == 0 || n == (std::size_t) -1 || n ==
(std::size_t) -2) {
len = 1;
if (wct == L'\0')
- wct = *cp & 0xff; // or some
other magic or 0x80000000
- } else
+ wct = std::numeric_limits<WChar>::min()
+ (unsigned char) *cp;
+ } else {
len = n;
+ }
wch = wct;
} else {
- wch = End, len = 0;
+ len = 0, wch = End;
}
return *this;
}
+ WConv &nextutf8() {
+ if ((cp += len) != ep) {
+ WChar u = (unsigned char) cp[0];
+ if (u < 0x80) {
+ len = 1, wch = u;
+ return *this;
+ }
+ if ((u & 0x40) == 0 || cp + 1 == ep) {
+ error:
+ len = 1, wch =
std::numeric_limits<WChar>::min() + u;
+ return *this;
+ }
+ WChar v = (unsigned char) cp[1];
+ if ((v & 0xC0) != 0x80)
+ goto error;
+ if ((u & 0x20) == 0) {
+ WChar r = ((u & 0x1F) << 6) | (v & 0x3F);
+ if (r < 0x80)
+ goto error;
+ len = 2, wch = r;
+ return *this;
+ }
+ if (cp + 2 == ep)
+ goto error;
+ WChar w = (unsigned char) cp[2];
+ if ((w & 0xC0) != 0x80)
+ goto error;
+ if ((u & 0x10) == 0) {
+ WChar r = ((u & 0x0F) << 12) | ((v & 0x3F) <<
6) | (w & 0x3F);
+ if (r < 0x800)
+ goto error;
+ len = 3, wch = r;
+ return *this;
+ }
+ if (cp + 3 == ep)
+ goto error;
+ WChar x = (unsigned char) cp[3];
+ if ((x & 0xC0) != 0x80)
+ goto error;
+ if ((u & 0x08) != 0)
+ goto error;
+ WChar r = ((u & 0x07) << 18) | ((v & 0x3F) << 12) | ((w
& 0x3F) << 6) | (x & 0x3F);
+ if (r < 0x010000 || r > 0x10FFFF)
+ goto error;
+ len = 4, wch = r;
+ return *this;
+ } else {
+ len = 0, wch = End;
+ return *this;
+ }
+ }
std::size_t off() const { return cp - bp; }
auto ptr() const { return cp; }
auto save() { return std::make_tuple(cp, wch, len); }
void restore(std::tuple<const char *, WChar, int> t) { std::tie(cp,
wch, len) = t; }
};
+WConv &(WConv::*const WConv::nextfns[3])() = { &WConv::nextbyte,
&WConv::nextmbtowc, &WConv::nextutf8 };
+
struct CSet {
struct Range {
Range(WChar x, WChar y): min(std::min(x, y)), max(std::max(x,
y)) {}
@@ -299,7 +369,7 @@ struct CSet {
}
CSet &invert() { inverted = true; return *this; }
CSet &set(WChar wclo, WChar wchi) {
- auto e = Range(wclo - (wclo != -2147483648), wchi + (wchi !=
2147483647));
+ auto e = Range(wclo - (wclo !=
std::numeric_limits<WChar>::min()), wchi + (wchi !=
std::numeric_limits<WChar>::max()));
auto [x, y] = ranges.equal_range(e);
if (x == y) {
ranges.insert(Range(wclo, wchi));
@@ -317,7 +387,7 @@ struct CSet {
}
CSet &set(WChar wc) { return set(wc, wc); }
bool test(WChar wc) const {
- if (inverted && wc < 0)
+ if (wc < 0)
return false;
auto i = ranges.lower_bound(Range(wc, wc));
return inverted ^ (i != ranges.end() && wc >= i->min && wc <=
i->max);
@@ -359,17 +429,19 @@ struct Regexp {
const std::vector<Node> nodes;
std::size_t nstk;
std::size_t nsub;
+ WConv::Encoding enc;
minrx_result_t err;
};
struct Compile {
const minrx_regcomp_flags_t flags;
+ WConv::Encoding enc;
WConv wconv;
std::vector<CSet> csets;
std::optional<std::size_t> dot;
std::map<WChar, unsigned int> icmap;
NInt nsub = 0;
- Compile(const char *bp, const char *ep, minrx_regcomp_flags_t flags):
flags(flags), wconv(bp, ep) { wconv.nextchr(); }
+ Compile(WConv::Encoding e, const char *bp, const char *ep,
minrx_regcomp_flags_t flags): flags(flags), enc(e), wconv(e, bp, ep) {
wconv.nextchr(); }
static std::map<std::string, CSet> cclmemo;
static std::mutex cclmutex;
bool cclass(CSet &cs, const std::string &name) {
@@ -380,15 +452,26 @@ struct Compile {
auto i = cclmemo.find(key);
if (i == cclmemo.end()) {
CSet cs;
- for (WChar wc = 0; wc <= WCharMax; ++wc) {
- if (iswctype(wc, wct)) {
- cs.set(wc);
- if ((flags & MINRX_REG_ICASE)
!= 0) {
-
cs.set(std::towlower(wc));
-
cs.set(std::towupper(wc));
+ if (enc == WConv::Encoding::Byte)
+ for (WChar b = 0; b <= 0xFF; ++b) {
+ if
(std::iswctype(std::btowc(b), wct)) {
+ cs.set(b);
+ if ((flags &
MINRX_REG_ICASE) != 0) {
+
cs.set(std::tolower(b));
+
cs.set(std::toupper(b));
+ }
+ }
+ }
+ else
+ for (WChar wc = 0; wc <= WCharMax;
++wc) {
+ if (std::iswctype(wc, wct)) {
+ cs.set(wc);
+ if ((flags &
MINRX_REG_ICASE) != 0) {
+
cs.set(std::towlower(wc));
+
cs.set(std::towupper(wc));
+ }
}
}
- }
cclmemo.emplace(key, cs);
i = cclmemo.find(key);
}
@@ -547,7 +630,10 @@ struct Compile {
wconv.nextchr();
continue;
case L'{':
- if ((flags & MINRX_REG_BRACE_COMPAT) == 0 ||
std::iswdigit(wconv.lookahead(&WConv::nextchr))) {
+ if ((flags & MINRX_REG_BRACE_COMPAT) == 0
+ || (enc == WConv::Encoding::Byte ?
std::isdigit(wconv.lookahead())
+ :
std::iswdigit(wconv.lookahead())))
+ {
if (optional || infinite) {
lh = mkrep(lh, optional,
infinite, nstk);
optional = infinite = false;
@@ -606,21 +692,28 @@ struct Compile {
if ((flags & MINRX_REG_ICASE) == 0) {
lhs.push_back({(NInt) wc, {0, 0}, nstk});
} else {
- WChar wcl = std::towlower(wc), wcu =
std::towupper(wc);
- auto key = std::min(wc, std::min(wcl, wcu));
- if (icmap.find(key) == icmap.end()) {
- icmap.emplace(key, csets.size());
- csets.emplace_back();
- csets.back().set(wc);
- csets.back().set(wcl);
- csets.back().set(wcu);
+ WChar wcl = enc == WConv::Encoding::Byte ?
std::tolower(wc) : std::towlower(wc);
+ WChar wcu = enc == WConv::Encoding::Byte ?
std::toupper(wc) : std::towupper(wc);
+ if (wc != wcl || wc != wcu) {
+ auto key = std::min(wc, std::min(wcl,
wcu));
+ if (icmap.find(key) == icmap.end()) {
+ icmap.emplace(key,
csets.size());
+ csets.emplace_back();
+ csets.back().set(wc);
+ csets.back().set(wcl);
+ csets.back().set(wcu);
+ }
+ lhs.push_back({Node::CSet, {icmap[key],
0}, nstk});
+ } else {
+ lhs.push_back({(NInt) wc, {0, 0},
nstk});
}
- lhs.push_back({Node::CSet, {icmap[key], 0},
nstk});
}
wconv.nextchr();
break;
case L'{':
- if ((flags & MINRX_REG_BRACE_COMPAT) != 0 &&
!std::iswdigit(wconv.lookahead(&WConv::nextchr)))
+ if ((flags & MINRX_REG_BRACE_COMPAT) != 0
+ && (enc == WConv::Encoding::Byte ?
!std::isdigit(wconv.lookahead())
+ :
!std::iswdigit(wconv.lookahead())))
goto normal;
// fall through
case L'*':
@@ -710,15 +803,17 @@ struct Compile {
wc = L'-';
}
}
- if (wclo > wchi)
+ if (wclo > wchi || (wclo != wchi &&
(wclo < 0 || wchi < 0)))
return {{}, 0,
MINRX_REG_ERANGE};
- cs.set(wclo, wchi);
- if ((flags & MINRX_REG_ICASE) != 0)
- for (auto wc = wclo; wc <=
wchi; ++wc) {
-
cs.set(std::tolower(wc));
-
cs.set(std::toupper(wc));
- }
- if (range && wc == L'-' &&
wconv.lookahead(&WConv::nextchr) != L']')
+ if (wclo >= 0) {
+ cs.set(wclo, wchi);
+ if ((flags & MINRX_REG_ICASE)
!= 0)
+ for (auto wc = wclo; wc
<= wchi; ++wc) {
+ cs.set(enc ==
WConv::Encoding::Byte ? std::tolower(wc) : std::towlower(wc));
+ cs.set(enc ==
WConv::Encoding::Byte ? std::toupper(wc) : std::towupper(wc));
+ }
+ }
+ if (range && wc == L'-' &&
wconv.lookahead() != L']')
return {{}, 0,
MINRX_REG_ERANGE};
}
lhs.push_back({Node::CSet, {csets.size(), 0},
nstk});
@@ -861,7 +956,7 @@ struct Compile {
} else {
lhs.push_back({Node::Exit, {0, 0}, 0});
}
- return new Regexp{ std::move(csets), {lhs.begin(), lhs.end()},
nstk, nsub + 1, err };
+ return new Regexp{ std::move(csets), {lhs.begin(), lhs.end()},
nstk, nsub + 1, enc, err };
}
};
@@ -887,7 +982,7 @@ struct Execute {
std::optional<COWVec<std::size_t, (std::size_t) -1>> best;
QSet<NInt> epsq { r.nodes.size() };
QVec<NInt, NState> epsv { r.nodes.size() };
- Execute(const Regexp &r, minrx_regexec_flags_t flags, const char *bp,
const char *ep) : r(r), flags(flags), wconv(bp, ep) {}
+ Execute(const Regexp &r, minrx_regexec_flags_t flags, const char *bp,
const char *ep) : r(r), flags(flags), wconv(r.enc, bp, ep) {}
void add(QVec<NInt, NState> &ncsv, NInt n, const NState &ns) {
if (r.nodes[n].type <= Node::CSet) {
auto [newly, oldns] = ncsv.insert(n, ns);
@@ -899,9 +994,10 @@ struct Execute {
epsq.insert(n);
}
}
- bool is_word(int wc) { return wc == L'_' || std::iswalnum(wc); }
void epsclosure(QVec<NInt, NState> &ncsv) {
auto nodes = r.nodes;
+ auto is_word = r.enc == WConv::Encoding::Byte ? [](WChar b) {
return b == '_' || std::isalnum(b); }
+ : [](WChar wc) {
return wc == L'_' || std::iswalnum(wc); };
do {
NInt k = epsq.remove();
NState &ns = epsv.lookup(k);
@@ -1133,7 +1229,18 @@ minrx_regexec(minrx_regex_t *rx, const char *s,
std::size_t nm, minrx_regmatch_t
int
minrx_regncomp(minrx_regex_t *rx, std::size_t ns, const char *s, int flags)
{
- auto r = MinRX::Compile(s, s + ns, (minrx_regcomp_flags_t)
flags).compile();
+ auto enc = MinRX::WConv::Encoding::MBtoWC;
+ auto loc = std::setlocale(LC_CTYPE, nullptr);
+ if ((loc != nullptr && loc[0] == 'C' && loc[1] == '\0') || ((flags &
MINRX_REG_NATIVE1B) != 0 && MB_CUR_MAX == 1))
+ enc = MinRX::WConv::Encoding::Byte;
+ else if (auto utf = std::strchr(loc ? loc : "", '.');
+ utf != nullptr && (utf[1] == 'U' || utf[1] == 'u')
+ && (utf[2] == 'T' || utf[2] == 't')
+ && (utf[3] == 'F' || utf[3] == 'f')
+ && ( (utf[4] == '8' && utf[5] == '\0')
+ || (utf[4] == '-' && utf[5] == '8' &&
utf[6] == '\0')))
+ enc = MinRX::WConv::Encoding::UTF8;
+ auto r = MinRX::Compile(enc, s, s + ns, (minrx_regcomp_flags_t)
flags).compile();
rx->re_regexp = r;
rx->re_nsub = r->nsub - 1;
rx->re_compflags = (minrx_regcomp_flags_t) flags;
diff --git a/support/minrx.h b/support/minrx.h
index ddb955aa..88034af8 100644
--- a/support/minrx.h
+++ b/support/minrx.h
@@ -39,7 +39,8 @@ typedef enum { /* Flags for
minrx_reg*comp() */
MINRX_REG_BRACE_COMPAT = 16, /* { begins interval expression only
when followed by digit */
MINRX_REG_BRACK_ESCAPE = 32, /* bracket expressions [...] allow
backslash escapes */
MINRX_REG_EXTENSIONS_BSD = 64, /* enable BSD extensions \< and \> */
- MINRX_REG_EXTENSIONS_GNU = 128 /* enable GNU extensions \b \B \s \S \w
\W */
+ MINRX_REG_EXTENSIONS_GNU = 128, /* enable GNU extensions \b \B \s \S \w
\W */
+ MINRX_REG_NATIVE1B = 256 /* use native encoding for 8-bit
character sets (MB_CUR_LEN == 1) */
} minrx_regcomp_flags_t;
typedef enum { /* Flags for minrx_reg*exec() */
diff --git a/test/ChangeLog b/test/ChangeLog
index 71885b3c..00215c31 100644
--- a/test/ChangeLog
+++ b/test/ChangeLog
@@ -1,3 +1,8 @@
+2024-09-01 Arnold D. Robbins <arnold@skeeve.com>
+
+ * backbigs1.in, backbigs1.ok: Use a valid non-space
+ character instead of an invalid one.
+
2024-08-30 Arnold D. Robbins <arnold@skeeve.com>
* Makefile.am (makepmafile): Don't ues $< in the rule, that's a
diff --git a/test/backbigs1.in b/test/backbigs1.in
index 16b415f4..78981922 100644
--- a/test/backbigs1.in
+++ b/test/backbigs1.in
@@ -1 +1 @@
-
+a
diff --git a/test/backbigs1.ok b/test/backbigs1.ok
index 16b415f4..78981922 100644
--- a/test/backbigs1.ok
+++ b/test/backbigs1.ok
@@ -1 +1 @@
-
+a
-----------------------------------------------------------------------
Summary of changes:
ChangeLog | 4 ++
re.c | 3 +-
support/ChangeLog | 4 ++
support/minrx.cpp | 185 ++++++++++++++++++++++++++++++++++++++++++------------
support/minrx.h | 3 +-
test/ChangeLog | 5 ++
test/backbigs1.in | 2 +-
test/backbigs1.ok | 2 +-
8 files changed, 165 insertions(+), 43 deletions(-)
hooks/post-receive
--
gawk
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [SCM] gawk branch, feature/minrx, updated. gawk-4.1.0-5742-gf473b8e9,
Arnold Robbins <=