gawk-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[SCM] gawk branch, feature/minrx, updated. gawk-4.1.0-5742-gf473b8e9


From: Arnold Robbins
Subject: [SCM] gawk branch, feature/minrx, updated. gawk-4.1.0-5742-gf473b8e9
Date: Sun, 1 Sep 2024 14:40:55 -0400 (EDT)

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "gawk".

The branch, feature/minrx has been updated
       via  f473b8e963c15aa9318417d680e58d1b3c224c5d (commit)
      from  4c0d3b2e9140a3e901bb741db9be0deea47ae81e (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.sv.gnu.org/cgit/gawk.git/commit/?id=f473b8e963c15aa9318417d680e58d1b3c224c5d

commit f473b8e963c15aa9318417d680e58d1b3c224c5d
Author: Arnold D. Robbins <arnold@skeeve.com>
Date:   Sun Sep 1 21:40:37 2024 +0300

    MinRX updates.

diff --git a/ChangeLog b/ChangeLog
index 126de0da..604057ba 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2024-09-01         Arnold D. Robbins     <arnold@skeeve.com>
+
+       * re.c (make_regexp): Add MINRX_REG_NATIVE1B flag.
+
 2024-08-28         Arnold D. Robbins     <arnold@skeeve.com>
 
        * Makefile.am (CCLD): Set to $(CXX) so that builds with clang
diff --git a/re.c b/re.c
index ad7e854b..97978453 100644
--- a/re.c
+++ b/re.c
@@ -327,7 +327,8 @@ make_regexp(const char *s, size_t len, bool ignorecase, 
bool dfa, bool canfatal)
                } else
                        rp->dfareg = NULL;
        } else {
-               int flags = MINRX_REG_EXTENDED | MINRX_REG_BRACK_ESCAPE | 
MINRX_REG_BRACE_COMPAT;
+               int flags = MINRX_REG_EXTENDED | MINRX_REG_BRACK_ESCAPE |
+                               MINRX_REG_BRACE_COMPAT | MINRX_REG_NATIVE1B;
                int ret;
 
                if (ignorecase)
diff --git a/support/ChangeLog b/support/ChangeLog
index a537e8c5..b7643570 100644
--- a/support/ChangeLog
+++ b/support/ChangeLog
@@ -1,3 +1,7 @@
+2024-09-01         Arnold D. Robbins     <arnold@skeeve.com>
+
+       * minrx.h, minrx.cpp: Update from Mike.
+
 2024-08-28         Arnold D. Robbins     <arnold@skeeve.com>
 
        * minrx.cpp: Update from Mike.
diff --git a/support/minrx.cpp b/support/minrx.cpp
index 27403b84..e21143b8 100644
--- a/support/minrx.cpp
+++ b/support/minrx.cpp
@@ -18,6 +18,7 @@
 // along with this program.  If not, see <http://www.gnu.org/licenses/>.
 //
 
+#include <cctype>
 #include <climits>
 #include <clocale>
 #include <cstddef>
@@ -27,6 +28,7 @@
 #include <cwctype>
 #include <algorithm>
 #include <deque>
+#include <limits>
 #include <map>
 #include <mutex>
 #include <optional>
@@ -248,40 +250,108 @@ constexpr int32_t WCharMax = 0x10FFFF;   // maximum code 
point: valid for Unicode
 class WConv {
 public:
        enum { End = -1 };
+       enum class Encoding { Byte, MBtoWC, UTF8 };
 private:
+       WConv &(WConv::*const nextfn)();
        const char *const bp;
        const char *const ep;
        const char *cp;
        std::mbstate_t mbs;
        WChar wch = End;
        int len = 0;
+       static WConv &(WConv::*const nextfns[])();
 public:
        WConv(const WConv &) = default;
-       WConv(const char *bp, const char *ep): bp(bp), ep(ep), cp(bp) { 
std::memset(&mbs, 0, sizeof mbs); }
+       WConv(Encoding e, const char *bp, const char *ep)
+       : nextfn(nextfns[(int) e]), bp(bp), ep(ep), cp(bp) {
+               std::memset(&mbs, 0, sizeof mbs);
+       }
        auto look() const { return wch; }
-       auto lookahead(WConv &(WConv::*next)()) const { return 
(WConv(*this).*next)().look(); }
-       WConv &nextchr() {
+       auto lookahead() const { return WConv(*this).nextchr().look(); }
+       WConv &nextchr() { return (this->*nextfn)(); }
+       WConv &nextbyte() {
+               if ((cp += len) != ep)
+                       len = 1, wch = (unsigned char) *cp;
+               else
+                       len = 0, wch = End;
+               return *this;
+       }
+       WConv &nextmbtowc() {
                wchar_t wct = L'\0';
                if ((cp += len) != ep) {
                        auto n = mbrtowc(&wct, cp, ep - cp, &mbs);
                        if (n == 0 || n == (std::size_t) -1 || n == 
(std::size_t) -2) {
                                len = 1;
                                if (wct == L'\0')
-                                       wct = *cp & 0xff;       // or some 
other magic or 0x80000000
-                       } else
+                                       wct = std::numeric_limits<WChar>::min() 
+ (unsigned char) *cp;
+                       } else {
                                len = n;
+                       }
                        wch = wct;
                } else {
-                       wch = End, len = 0;
+                       len = 0, wch = End;
                }
                return *this;
        }
+       WConv &nextutf8() {
+               if ((cp += len) != ep) {
+                       WChar u = (unsigned char) cp[0];
+                       if (u < 0x80) {
+                               len = 1, wch = u;
+                               return *this;
+                       }
+                       if ((u & 0x40) == 0 || cp + 1 == ep) {
+                       error:
+                               len = 1, wch = 
std::numeric_limits<WChar>::min() + u;
+                               return *this;
+                       }
+                       WChar v = (unsigned char) cp[1];
+                       if ((v & 0xC0) != 0x80)
+                               goto error;
+                       if ((u & 0x20) == 0) {
+                               WChar r = ((u & 0x1F) << 6) | (v & 0x3F);
+                               if (r < 0x80)
+                                       goto error;
+                               len = 2, wch = r;
+                               return *this;
+                       }
+                       if (cp + 2 == ep)
+                               goto error;
+                       WChar w = (unsigned char) cp[2];
+                       if ((w & 0xC0) != 0x80)
+                               goto error;
+                       if ((u & 0x10) == 0) {
+                               WChar r = ((u & 0x0F) << 12) | ((v & 0x3F) << 
6) | (w & 0x3F);
+                               if (r < 0x800)
+                                       goto error;
+                               len = 3, wch = r;
+                               return *this;
+                       }
+                       if (cp + 3 == ep)
+                               goto error;
+                       WChar x = (unsigned char) cp[3];
+                       if ((x & 0xC0) != 0x80)
+                               goto error;
+                       if ((u & 0x08) != 0)
+                               goto error;
+                       WChar r = ((u & 0x07) << 18) | ((v & 0x3F) << 12) | ((w 
& 0x3F) << 6) | (x & 0x3F);
+                       if (r < 0x010000 || r > 0x10FFFF)
+                               goto error;
+                       len = 4, wch = r;
+                       return *this;
+               } else {
+                       len = 0, wch = End;
+                       return *this;
+               }
+       }
        std::size_t off() const { return cp - bp; }
        auto ptr() const { return cp; }
        auto save() { return std::make_tuple(cp, wch, len); }
        void restore(std::tuple<const char *, WChar, int> t) { std::tie(cp, 
wch, len) = t; }
 };
 
+WConv &(WConv::*const WConv::nextfns[3])() = { &WConv::nextbyte, 
&WConv::nextmbtowc, &WConv::nextutf8 };
+
 struct CSet {
        struct Range {
                Range(WChar x, WChar y): min(std::min(x, y)), max(std::max(x, 
y)) {}
@@ -299,7 +369,7 @@ struct CSet {
        }
        CSet &invert() { inverted = true; return *this; }
        CSet &set(WChar wclo, WChar wchi) {
-               auto e = Range(wclo - (wclo != -2147483648), wchi + (wchi != 
2147483647));
+               auto e = Range(wclo - (wclo != 
std::numeric_limits<WChar>::min()), wchi + (wchi != 
std::numeric_limits<WChar>::max()));
                auto [x, y] = ranges.equal_range(e);
                if (x == y) {
                        ranges.insert(Range(wclo, wchi));
@@ -317,7 +387,7 @@ struct CSet {
        }
        CSet &set(WChar wc) { return set(wc, wc); }
        bool test(WChar wc) const {
-               if (inverted && wc < 0)
+               if (wc < 0)
                        return false;
                auto i = ranges.lower_bound(Range(wc, wc));
                return inverted ^ (i != ranges.end() && wc >= i->min && wc <= 
i->max);
@@ -359,17 +429,19 @@ struct Regexp {
        const std::vector<Node> nodes;
        std::size_t nstk;
        std::size_t nsub;
+       WConv::Encoding enc;
        minrx_result_t err;
 };
 
 struct Compile {
        const minrx_regcomp_flags_t flags;
+       WConv::Encoding enc;
        WConv wconv;
        std::vector<CSet> csets;
        std::optional<std::size_t> dot;
        std::map<WChar, unsigned int> icmap;
        NInt nsub = 0;
-       Compile(const char *bp, const char *ep, minrx_regcomp_flags_t flags): 
flags(flags), wconv(bp, ep) { wconv.nextchr(); }
+       Compile(WConv::Encoding e, const char *bp, const char *ep, 
minrx_regcomp_flags_t flags): flags(flags), enc(e), wconv(e, bp, ep) { 
wconv.nextchr(); }
        static std::map<std::string, CSet> cclmemo;
        static std::mutex cclmutex;
        bool cclass(CSet &cs, const std::string &name) {
@@ -380,15 +452,26 @@ struct Compile {
                        auto i = cclmemo.find(key);
                        if (i == cclmemo.end()) {
                                CSet cs;
-                               for (WChar wc = 0; wc <= WCharMax; ++wc) {
-                                       if (iswctype(wc, wct)) {
-                                               cs.set(wc);
-                                               if ((flags & MINRX_REG_ICASE) 
!= 0) {
-                                                       
cs.set(std::towlower(wc));
-                                                       
cs.set(std::towupper(wc));
+                               if (enc == WConv::Encoding::Byte)
+                                       for (WChar b = 0; b <= 0xFF; ++b) {
+                                               if 
(std::iswctype(std::btowc(b), wct)) {
+                                                       cs.set(b);
+                                                       if ((flags & 
MINRX_REG_ICASE) != 0) {
+                                                               
cs.set(std::tolower(b));
+                                                               
cs.set(std::toupper(b));
+                                                       }
+                                               }
+                                       }
+                               else
+                                       for (WChar wc = 0; wc <= WCharMax; 
++wc) {
+                                               if (std::iswctype(wc, wct)) {
+                                                       cs.set(wc);
+                                                       if ((flags & 
MINRX_REG_ICASE) != 0) {
+                                                               
cs.set(std::towlower(wc));
+                                                               
cs.set(std::towupper(wc));
+                                                       }
                                                }
                                        }
-                               }
                                cclmemo.emplace(key, cs);
                                i = cclmemo.find(key);
                        }
@@ -547,7 +630,10 @@ struct Compile {
                                wconv.nextchr();
                                continue;
                        case L'{':
-                               if ((flags & MINRX_REG_BRACE_COMPAT) == 0 || 
std::iswdigit(wconv.lookahead(&WConv::nextchr))) {
+                               if ((flags & MINRX_REG_BRACE_COMPAT) == 0
+                                   || (enc == WConv::Encoding::Byte ? 
std::isdigit(wconv.lookahead())
+                                                                    : 
std::iswdigit(wconv.lookahead())))
+                               {
                                        if (optional || infinite) {
                                                lh = mkrep(lh, optional, 
infinite, nstk);
                                                optional = infinite = false;
@@ -606,21 +692,28 @@ struct Compile {
                        if ((flags & MINRX_REG_ICASE) == 0) {
                                lhs.push_back({(NInt) wc, {0, 0}, nstk});
                        } else {
-                               WChar wcl = std::towlower(wc), wcu = 
std::towupper(wc);
-                               auto key = std::min(wc, std::min(wcl, wcu));
-                               if (icmap.find(key) == icmap.end()) {
-                                       icmap.emplace(key, csets.size());
-                                       csets.emplace_back();
-                                       csets.back().set(wc);
-                                       csets.back().set(wcl);
-                                       csets.back().set(wcu);
+                               WChar wcl = enc == WConv::Encoding::Byte ? 
std::tolower(wc) : std::towlower(wc);
+                               WChar wcu = enc == WConv::Encoding::Byte ? 
std::toupper(wc) : std::towupper(wc);
+                               if (wc != wcl || wc != wcu) {
+                                       auto key = std::min(wc, std::min(wcl, 
wcu));
+                                       if (icmap.find(key) == icmap.end()) {
+                                               icmap.emplace(key, 
csets.size());
+                                               csets.emplace_back();
+                                               csets.back().set(wc);
+                                               csets.back().set(wcl);
+                                               csets.back().set(wcu);
+                                       }
+                                       lhs.push_back({Node::CSet, {icmap[key], 
0}, nstk});
+                               } else {
+                                       lhs.push_back({(NInt) wc, {0, 0}, 
nstk});
                                }
-                               lhs.push_back({Node::CSet, {icmap[key], 0}, 
nstk});
                        }
                        wconv.nextchr();
                        break;
                case L'{':
-                       if ((flags & MINRX_REG_BRACE_COMPAT) != 0 && 
!std::iswdigit(wconv.lookahead(&WConv::nextchr)))
+                       if ((flags & MINRX_REG_BRACE_COMPAT) != 0
+                           && (enc == WConv::Encoding::Byte ? 
!std::isdigit(wconv.lookahead())
+                                                            : 
!std::iswdigit(wconv.lookahead())))
                                goto normal;
                        // fall through
                case L'*':
@@ -710,15 +803,17 @@ struct Compile {
                                                        wc = L'-';
                                                }
                                        }
-                                       if (wclo > wchi)
+                                       if (wclo > wchi || (wclo != wchi && 
(wclo < 0 || wchi < 0)))
                                                return {{}, 0, 
MINRX_REG_ERANGE};
-                                       cs.set(wclo, wchi);
-                                       if ((flags & MINRX_REG_ICASE) != 0)
-                                               for (auto wc = wclo; wc <= 
wchi; ++wc) {
-                                                       
cs.set(std::tolower(wc));
-                                                       
cs.set(std::toupper(wc));
-                                               }
-                                       if (range && wc == L'-' && 
wconv.lookahead(&WConv::nextchr) != L']')
+                                       if (wclo >= 0) {
+                                               cs.set(wclo, wchi);
+                                               if ((flags & MINRX_REG_ICASE) 
!= 0)
+                                                       for (auto wc = wclo; wc 
<= wchi; ++wc) {
+                                                               cs.set(enc == 
WConv::Encoding::Byte ? std::tolower(wc) : std::towlower(wc));
+                                                               cs.set(enc == 
WConv::Encoding::Byte ? std::toupper(wc) : std::towupper(wc));
+                                                       }
+                                       }
+                                       if (range && wc == L'-' && 
wconv.lookahead() != L']')
                                                return {{}, 0, 
MINRX_REG_ERANGE};
                                }
                                lhs.push_back({Node::CSet, {csets.size(), 0}, 
nstk});
@@ -861,7 +956,7 @@ struct Compile {
                } else {
                        lhs.push_back({Node::Exit, {0, 0}, 0});
                }
-               return new Regexp{ std::move(csets), {lhs.begin(), lhs.end()}, 
nstk, nsub + 1, err };
+               return new Regexp{ std::move(csets), {lhs.begin(), lhs.end()}, 
nstk, nsub + 1, enc, err };
        }
 };
 
@@ -887,7 +982,7 @@ struct Execute {
        std::optional<COWVec<std::size_t, (std::size_t) -1>> best;
        QSet<NInt> epsq { r.nodes.size() };
        QVec<NInt, NState> epsv { r.nodes.size() };
-       Execute(const Regexp &r, minrx_regexec_flags_t flags, const char *bp, 
const char *ep) : r(r), flags(flags), wconv(bp, ep) {}
+       Execute(const Regexp &r, minrx_regexec_flags_t flags, const char *bp, 
const char *ep) : r(r), flags(flags), wconv(r.enc, bp, ep) {}
        void add(QVec<NInt, NState> &ncsv, NInt n, const NState &ns) {
                if (r.nodes[n].type <= Node::CSet) {
                        auto [newly, oldns] = ncsv.insert(n, ns);
@@ -899,9 +994,10 @@ struct Execute {
                                epsq.insert(n);
                }
        }
-       bool is_word(int wc) { return wc == L'_' || std::iswalnum(wc); }
        void epsclosure(QVec<NInt, NState> &ncsv) {
                auto nodes = r.nodes;
+               auto is_word = r.enc == WConv::Encoding::Byte ? [](WChar b) { 
return b == '_' || std::isalnum(b); }
+                                                             : [](WChar wc) { 
return wc == L'_' || std::iswalnum(wc); };
                do {
                        NInt k = epsq.remove();
                        NState &ns = epsv.lookup(k);
@@ -1133,7 +1229,18 @@ minrx_regexec(minrx_regex_t *rx, const char *s, 
std::size_t nm, minrx_regmatch_t
 int
 minrx_regncomp(minrx_regex_t *rx, std::size_t ns, const char *s, int flags)
 {
-       auto r = MinRX::Compile(s, s + ns, (minrx_regcomp_flags_t) 
flags).compile();
+       auto enc = MinRX::WConv::Encoding::MBtoWC;
+       auto loc = std::setlocale(LC_CTYPE, nullptr);
+       if ((loc != nullptr && loc[0] == 'C' && loc[1] == '\0') || ((flags & 
MINRX_REG_NATIVE1B) != 0 && MB_CUR_MAX == 1))
+               enc = MinRX::WConv::Encoding::Byte;
+       else if (auto utf = std::strchr(loc ? loc : "", '.');
+                utf != nullptr && (utf[1] == 'U' || utf[1] == 'u')
+                               && (utf[2] == 'T' || utf[2] == 't')
+                               && (utf[3] == 'F' || utf[3] == 'f')
+                               && (   (utf[4] == '8' && utf[5] == '\0')
+                                   || (utf[4] == '-' && utf[5] == '8' && 
utf[6] == '\0')))
+               enc = MinRX::WConv::Encoding::UTF8;
+       auto r = MinRX::Compile(enc, s, s + ns, (minrx_regcomp_flags_t) 
flags).compile();
        rx->re_regexp = r;
        rx->re_nsub = r->nsub - 1;
        rx->re_compflags = (minrx_regcomp_flags_t) flags;
diff --git a/support/minrx.h b/support/minrx.h
index ddb955aa..88034af8 100644
--- a/support/minrx.h
+++ b/support/minrx.h
@@ -39,7 +39,8 @@ typedef enum {                                /* Flags for 
minrx_reg*comp() */
        MINRX_REG_BRACE_COMPAT = 16,    /* { begins interval expression only 
when followed by digit */
        MINRX_REG_BRACK_ESCAPE = 32,    /* bracket expressions [...] allow 
backslash escapes */
        MINRX_REG_EXTENSIONS_BSD = 64,  /* enable BSD extensions \< and \> */
-       MINRX_REG_EXTENSIONS_GNU = 128  /* enable GNU extensions \b \B \s \S \w 
\W */
+       MINRX_REG_EXTENSIONS_GNU = 128, /* enable GNU extensions \b \B \s \S \w 
\W */
+       MINRX_REG_NATIVE1B = 256        /* use native encoding for 8-bit 
character sets (MB_CUR_MAX == 1) */
 } minrx_regcomp_flags_t;
 
 typedef enum {                         /* Flags for minrx_reg*exec() */
diff --git a/test/ChangeLog b/test/ChangeLog
index 71885b3c..00215c31 100644
--- a/test/ChangeLog
+++ b/test/ChangeLog
@@ -1,3 +1,8 @@
+2024-09-01         Arnold D. Robbins     <arnold@skeeve.com>
+
+       * backbigs1.in, backbigs1.ok: Use a valid non-space
+       character instead of an invalid one.
+
 2024-08-30         Arnold D. Robbins     <arnold@skeeve.com>
 
        * Makefile.am (makepmafile): Don't use $< in the rule, that's a
diff --git a/test/backbigs1.in b/test/backbigs1.in
index 16b415f4..78981922 100644
--- a/test/backbigs1.in
+++ b/test/backbigs1.in
@@ -1 +1 @@
-‚
+a
diff --git a/test/backbigs1.ok b/test/backbigs1.ok
index 16b415f4..78981922 100644
--- a/test/backbigs1.ok
+++ b/test/backbigs1.ok
@@ -1 +1 @@
-‚
+a

-----------------------------------------------------------------------

Summary of changes:
 ChangeLog         |   4 ++
 re.c              |   3 +-
 support/ChangeLog |   4 ++
 support/minrx.cpp | 185 ++++++++++++++++++++++++++++++++++++++++++------------
 support/minrx.h   |   3 +-
 test/ChangeLog    |   5 ++
 test/backbigs1.in |   2 +-
 test/backbigs1.ok |   2 +-
 8 files changed, 165 insertions(+), 43 deletions(-)


hooks/post-receive
-- 
gawk



reply via email to

[Prev in Thread] Current Thread [Next in Thread]