[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[SCM] gawk branch, feature/minrx, updated. gawk-4.1.0-5659-g5aa09a55
From: |
Arnold Robbins |
Subject: |
[SCM] gawk branch, feature/minrx, updated. gawk-4.1.0-5659-g5aa09a55 |
Date: |
Sat, 20 Jul 2024 15:24:51 -0400 (EDT) |
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "gawk".
The branch, feature/minrx has been updated
via 5aa09a55106ecb72d3145e51b025c9fffe9796e5 (commit)
from da96ad23e44068f5904121e2881531251bb45777 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://git.sv.gnu.org/cgit/gawk.git/commit/?id=5aa09a55106ecb72d3145e51b025c9fffe9796e5
commit 5aa09a55106ecb72d3145e51b025c9fffe9796e5
Author: Arnold D. Robbins <arnold@skeeve.com>
Date: Sat Jul 20 22:23:41 2024 +0300
Include minrx.{h,cpp} and some doc updates.
diff --git a/ChangeLog b/ChangeLog
index 9bd71428..f7888c52 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2024-07-20 Arnold D. Robbins <arnold@skeeve.com>
+
+ * version.c: Add "-minrx" to the version, for now.
+
2024-07-02 Arnold D. Robbins <arnold@skeeve.com>
* re.c (make_regexp): \u escapes also now treated literally
diff --git a/README_d/ChangeLog b/README_d/ChangeLog
index 0fd6b5b6..a815dc00 100644
--- a/README_d/ChangeLog
+++ b/README_d/ChangeLog
@@ -1,3 +1,7 @@
+2024-07-20 Arnold D. Robbins <arnold@skeeve.com>
+
+ * README.matchers: Updated.
+
2024-03-10 Arnold D. Robbins <arnold@skeeve.com>
* README.matchers: New file.
diff --git a/README_d/README.matchers b/README_d/README.matchers
index 80278993..50a726e3 100644
--- a/README_d/README.matchers
+++ b/README_d/README.matchers
@@ -1,11 +1,11 @@
-Sun 10 Mar 2024 10:39:42 IST
-============================
+Sat Jul 20 10:18:28 PM IDT 2024
+===============================
* I * M * P * O * R * T * A * N * T *
This release includes a new regular expression matcher, MinRX, written
by Mike Haertel, the original author of GNU grep. It's available from
-GITHUB URL HERE.
+https://github.com/mikehaertel/minrx.
This matcher is fully POSIX compliant, which the current GNU matchers
are not. In particular it follows POSIX rules for finding the longest
diff --git a/support/ChangeLog b/support/ChangeLog
index 346c2ee2..415829be 100644
--- a/support/ChangeLog
+++ b/support/ChangeLog
@@ -1,3 +1,7 @@
+2024-07-20 Arnold D. Robbins <arnold@skeeve.com>
+
+ * minrx.h, minrx.cpp: New files.
+
2024-07-16 Arnold D. Robbins <arnold@skeeve.com>
* Makefile.am (CXXFLAGS): Added for new version of minrx.
diff --git a/support/minrx.cpp b/support/minrx.cpp
new file mode 100644
index 00000000..ceb4b78b
--- /dev/null
+++ b/support/minrx.cpp
@@ -0,0 +1,1176 @@
+//
+// MinRX: a minimal matcher for POSIX Extended Regular Expressions.
+// Copyright (C) 2023, 2024 Michael J. Haertel.
+//
+// This file is part of MinRX.
+//
+// MinRX is free software; you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published
+// by the Free Software Foundation; either version 3 of the License, or
+// (at your option) any later version.
+//
+// MinRX is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+// See the GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#include <cctype>
+#include <climits>
+#include <clocale>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <cwchar>
+#include <cwctype>
+#include <algorithm>
+#include <deque>
+#include <map>
+#include <mutex>
+#include <optional>
+#include <set>
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include "minrx.h"
+
+namespace MinRX {
+
+template <typename UINT> inline auto ctz(UINT x) { return __builtin_ctz(x); }
+template <> inline auto ctz(unsigned long x) { return __builtin_ctzl(x); }
+template <> inline auto ctz(unsigned long long x) { return __builtin_ctzll(x);
}
+
+template <typename TYPE, TYPE INIT = 0>
+struct COWVec {
+ struct Storage;
+ struct Allocator {
+ std::size_t length;
+ Storage *freelist = nullptr;
+ Allocator(std::size_t length)
+ : length(length)
+ {}
+ ~Allocator() {
+ for (Storage *s = freelist, *sfreelink = nullptr; s !=
nullptr; s = sfreelink) {
+ sfreelink = s->u.freelink;
+ ::operator delete(s);
+ }
+ }
+ Storage *alloc() {
+ Storage *r;
+ if (freelist) {
+ r = freelist;
+ freelist = r->u.freelink;
+ r->u.allocator = this;
+ r->refcnt = 1;
+ } else {
+ void *p = ::operator new(sizeof (Storage) +
(length - 1) * sizeof (TYPE));
+ r = new (p) Storage(*this);
+ }
+ for (std::size_t i = 0; i != length; ++i)
+ (*r)[i] = INIT;
+ return r;
+ }
+ void dealloc(Storage *s) {
+ s->u.freelink = freelist;
+ freelist = s;
+ }
+ };
+ struct Storage {
+ union {
+ Allocator *allocator;
+ Storage *freelink;
+ } u;
+ std::size_t refcnt;
+ TYPE hack[1];
+ const TYPE &operator[](std::size_t i) const { return
(&hack[0])[i]; }
+ TYPE &operator[](std::size_t i) { return (&hack[0])[i]; }
+ Storage *clone() {
+ auto s = u.allocator->alloc();
+ for (std::size_t i = 0; i != u.allocator->length; ++i)
+ (*s)[i] = (*this)[i];
+ return s;
+ }
+ Storage(Allocator &a)
+ : u({&a})
+ , refcnt(1)
+ {}
+ };
+ Storage *storage;
+ COWVec(): storage(nullptr) {}
+ COWVec(Allocator &a): storage(a.alloc()) {}
+ COWVec(const COWVec &cv): storage(cv.storage) { ++storage->refcnt; }
+ COWVec(COWVec &&cv): storage(cv.storage) { cv.storage = nullptr; }
+ COWVec &operator=(const COWVec &cv) {
+ ++cv.storage->refcnt;
+ if (storage && --storage->refcnt == 0)
+ storage->u.allocator->dealloc(storage);
+ storage = cv.storage;
+ return *this;
+ }
+ COWVec &operator=(COWVec &&cv) {
+ if (storage && --storage->refcnt == 0)
+ storage->u.allocator->dealloc(storage);
+ storage = cv.storage;
+ cv.storage = nullptr;
+ return *this;
+ }
+ ~COWVec() { if (storage && --storage->refcnt == 0)
storage->u.allocator->dealloc(storage); }
+ bool cmpgt(const COWVec &other, std::size_t limit) const {
+ std::size_t i = 0;
+ TYPE *xv = &(*storage)[0];
+ TYPE *yv = &(*other.storage)[0];
+ while (i < limit && xv[i] == yv[i])
+ ++i;
+ if (i == limit)
+ return false;
+ return xv[i] > yv[i];
+ }
+ const TYPE &get(std::size_t idx) const { return (*storage)[idx]; }
+ COWVec &put(std::size_t idx, TYPE val) {
+ if ((*storage)[idx] == val)
+ return *this;
+ if (storage->refcnt > 1) {
+ --storage->refcnt;
+ storage = storage->clone();
+ storage->refcnt = 1;
+ }
+ (*storage)[idx] = val;
+ return *this;
+ }
+ COWVec &sub(std::size_t idx, TYPE val) {
+ if (storage->refcnt > 1) {
+ --storage->refcnt;
+ storage = storage->clone();
+ storage->refcnt = 1;
+ }
+ (*storage)[idx] -= val;
+ return *this;
+ }
+};
+
+template <typename UINT>
+struct QSet {
+ std::uint64_t *bits[10];
+ unsigned int depth = 0;
+ QSet(UINT limit) {
+ std::size_t s[10], t = 0;
+ do
+ t += (limit = s[depth++] = (limit + 63u) / 64u);
+ while (limit > 1);
+ bits[0] = (std::uint64_t *) ::operator new(t * sizeof
(std::uint64_t));
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+ for (unsigned int i = 1; i < depth; ++i)
+ bits[i] = bits[i - 1] + s[i - 1];
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+ bits[depth - 1][0] = 0;
+ }
+ ~QSet() { ::operator delete(bits[0]); }
+ bool contains(UINT k) const {
+ for (auto d = depth; d > 0; ) {
+ std::size_t idx = k >> 6 * d;
+ unsigned int bit = (k >> 6 * --d) & 0x3f;
+ if ((bits[d][idx] & ((std::uint64_t) 1 << bit)) == 0)
+ return false;
+ }
+ return true;
+ }
+ bool empty() const { return !bits[depth - 1][0]; }
+ UINT remove() {
+ UINT k = 0;
+ auto d = depth;
+ do {
+ auto m = bits[--d][k];
+ k = (k << 6) | ctz(m);
+ } while (d != 0);
+ UINT r = k;
+ for (; d < depth && !(bits[d][k >> 6] &= ~((std::uint64_t) 1 <<
(k & 0x3f))); k >>= 6, ++d)
+ ;
+ return r;
+ }
+ bool insert(UINT k) {
+ bool r = false;
+ for (auto d = depth; d-- != 0; ) {
+ auto bp = bits[d] + ((k >> 6 * d) >> 6);
+ std::uint64_t m = (std::uint64_t) 1 << ((k >> 6 * d) &
0x3f);
+ if ((*bp & m) == 0) {
+ if (d)
+ bits[d - 1][k >> (6 * d)] = 0;
+ else
+ r = true;
+ }
+ *bp |= m;
+ }
+ return r;
+ }
+};
+
+template <typename UINT, typename DATA>
+struct QVec {
+ QSet<UINT> qset;
+ DATA *storage;
+ QVec(UINT l): qset(l), storage(static_cast<DATA *>(::operator new(l *
sizeof (DATA)))) {}
+ ~QVec() {
+ clear();
+ ::operator delete(storage);
+ }
+ void clear() {
+ while (!qset.empty()) {
+ auto i = qset.remove();
+ storage[i].~DATA();
+ }
+ }
+ bool contains(UINT k) const { return qset.contains(k); }
+ bool empty() const { return qset.empty(); }
+ std::tuple<bool, DATA&> insert(UINT k, const DATA& v) {
+ bool r = qset.insert(k);
+ if (r)
+ new (storage + k) DATA(v);
+ return {r, storage[k]};
+ }
+ DATA &lookup(UINT k) { return storage[k]; }
+ const DATA &lookup(UINT k) const { return storage[k]; }
+ std::tuple<UINT, DATA> remove() {
+ auto k = qset.remove();
+ return {k, std::move(storage[k])};
+ }
+};
+
+typedef int32_t WChar; // because wchar_t may not be 32 bits
+constexpr int32_t WCharMax = 0x10FFFF; // maximum code point: valid for
Unicode and (FIXME!) blithely assumed for non-Unicode
+struct WConv {
+ enum { End = -1 };
+ const char *const bp;
+ const char *const ep;
+ const char *cp;
+ std::mbstate_t mbs;
+ WChar wch = End;
+ int len = 0;
+ WConv(const WConv &) = default;
+ WConv(const char *bp, const char *ep): bp(bp), ep(ep), cp(bp) {
std::memset(&mbs, 0, sizeof mbs); }
+ auto look() const { return wch; }
+ auto lookahead(WConv &(WConv::*next)()) const { return
(WConv(*this).*next)().look(); }
+ WConv &nextchr() {
+ wchar_t wct = L'\0';
+ if ((cp += len) != ep) {
+ auto n = mbrtowc(&wct, cp, ep - cp, &mbs);
+ if (n == 0 || n == (std::size_t) -1 || n ==
(std::size_t) -2)
+ len = 1;
+ else
+ len = n;
+ wch = wct;
+ } else {
+ wch = End, len = 0;
+ }
+ return *this;
+ }
+ auto save() { return std::make_tuple(cp, wch, len); }
+ void restore(std::tuple<const char *, WChar, int> t) { std::tie(cp,
wch, len) = t; }
+};
+
+struct CSet {
+ struct Range {
+ Range(WChar x, WChar y): min(std::min(x, y)), max(std::max(x,
y)) {}
+ WChar min, max;
+ int operator<=>(const Range &r) const {
+ return (min > r.max) - (max < r.min);
+ }
+ };
+ std::set<Range> ranges;
+ bool inverted = false;
+ CSet &operator|=(const CSet &cs) {
+ for (const auto &e : cs.ranges)
+ set(e.min, e.max);
+ return *this;
+ }
+ CSet &invert() { inverted = true; return *this; }
+ CSet &set(WChar wclo, WChar wchi) {
+ auto e = Range(wclo - (wclo != -2147483648), wchi + (wchi !=
2147483647));
+ auto [x, y] = ranges.equal_range(e);
+ if (x == y) {
+ ranges.insert(Range(wclo, wchi));
+ } else {
+ if (x->max >= e.min)
+ wclo = std::min(wclo, x->min);
+ auto z = y;
+ --z;
+ if (z->min <= e.max)
+ wchi = std::max(wchi, z->max);
+ auto i = ranges.erase(x, y);
+ ranges.insert(i, Range(wclo, wchi));
+ }
+ return *this;
+ }
+ CSet &set(WChar wc) { return set(wc, wc); }
+ bool test(WChar wc) const {
+ auto i = ranges.lower_bound(Range(wc, wc));
+ return inverted ^ (i != ranges.end() && wc >= i->min && wc <=
i->max);
+ }
+};
+
+typedef std::size_t NInt;
+
+struct Node {
+ enum Type {
+ // character-matching node types
+ /* char <= uchar max */ // no args
+ CSet = WCharMax + 1, // args = index in Regexp::csets vector
+ // epsilon-matching node types
+ Exit, // no args
+ Fork, // args = priority delta, offset to next
+ Goto, // args = offset to join, offset to next
+ Join, // args = offset to fork
+ Loop, // args = offset to next, optional flag
+ Next, // args = offset to loop, infinite flag
+ SubL, // args = minimum and maximum contained
subexpression numbers
+ SubR, // args = minimum and maximum contained
subexpression numbers
+ ZBOB, // no args - match empty string at
beginning of buffer
+ ZEOB, // no args - match empty string at end
of buffer
+ ZBOL, // no args - match empty string at
beginning of buffer or following \n
+ ZEOL, // no args - match empty string at end
of buffer or preceding \n
+ ZBOW, // no args - match empty string at
beginning of word
+ ZEOW, // no args - match empty string at end
of word
+ ZXOW, // no args - match empty string at
either end of word
+ ZNWB // no args - match empty string at
non-word-boundary
+ };
+ NInt type;
+ NInt args[2];
+ NInt nstk;
+};
+
+struct Regexp {
+ const std::vector<CSet> csets;
+ const std::vector<Node> nodes;
+ std::size_t nstk;
+ std::size_t nsub;
+ minrx_result_t err;
+};
+
+struct Compile {
+ const minrx_regcomp_flags_t flags;
+ WConv wconv;
+ std::vector<CSet> csets;
+ std::optional<std::size_t> dot;
+ std::map<WChar, unsigned int> icmap;
+ NInt nsub = 0;
+ Compile(const char *bp, const char *ep, minrx_regcomp_flags_t flags):
flags(flags), wconv(bp, ep) { wconv.nextchr(); }
+ static std::map<std::string, CSet> cclmemo;
+ static std::mutex cclmutex;
+ bool cclass(CSet &cs, const std::string &name) {
+ auto wct = std::wctype(name.c_str());
+ if (wct) {
+ std::string key = name + ":" + std::setlocale(LC_CTYPE,
NULL) + ":" + ((flags & MINRX_REG_ICASE) != 0 ? "1" : "0");
+ std::lock_guard<std::mutex> lock(cclmutex);
+ auto i = cclmemo.find(key);
+ if (i == cclmemo.end()) {
+ CSet cs;
+ for (WChar wc = 0; wc <= WCharMax; ++wc) {
+ if (iswctype(wc, wct)) {
+ cs.set(wc);
+ if ((flags & MINRX_REG_ICASE)
!= 0) {
+
cs.set(std::towlower(wc));
+
cs.set(std::towupper(wc));
+ }
+ }
+ }
+ cclmemo.emplace(key, cs);
+ i = cclmemo.find(key);
+ }
+ cs |= i->second; // N.B. could probably be safely
outside the critical section, since cclmemo entries are never deleted
+ return true;
+ }
+ return false;
+ }
+ bool num(NInt &n) {
+ auto satmul = [](NInt x, NInt y) -> NInt {
+ return (x == 0 || y == 0) ? 0 : ((x * y / x == y) ? x *
y : -1);
+ };
+ auto wc = wconv.look();
+ if (wc < L'0' || wc > L'9')
+ return false;
+ NInt v = 0;
+ do {
+ v = satmul(v, 10);
+ if (v == (NInt) -1 || (v += wc - L'0') < (NInt) wc -
L'0') {
+ do
+ wc = wconv.nextchr().look();
+ while (wc >= L'0' && wc <= L'9');
+ n = -1;
+ return true;
+ }
+ wc = wconv.nextchr().look();
+ } while (wc >= L'0' && wc <= L'9');
+ n = v;
+ return true;
+ }
+ typedef std::tuple<std::deque<Node>, std::size_t, minrx_result_t>
Subexp;
+ Subexp alt(bool nested, NInt nstk) {
+ auto [lhs, lhmaxstk, err] = cat(nested, nstk);
+ if (err)
+ return {lhs, lhmaxstk, err};
+ if (wconv.look() == L'|') {
+ for (auto &l : lhs)
+ l.nstk += 2;
+ std::vector<Subexp> alts;
+ while (wconv.look() == L'|') {
+ wconv.nextchr();
+ alts.push_back(cat(nested, nstk + 1));
+ }
+ auto [rhs, rhmaxstk, err] = alts.back();
+ if (err)
+ return {rhs, rhmaxstk, err};
+ rhs.push_front({Node::Goto, {rhs.size(), rhs.size()},
nstk + 1});
+ alts.pop_back();
+ while (!alts.empty()) {
+ auto [mhs, mhmaxstk, _] = alts.back();
+ alts.pop_back();
+ rhs.insert(rhs.begin(), mhs.begin(), mhs.end());
+ rhmaxstk = std::max(mhmaxstk, rhmaxstk);
+ rhs.push_front({Node::Goto, {rhs.size(),
mhs.size()}, nstk + 1});
+ }
+ lhs.push_front({Node::Fork, {(NInt) -1, lhs.size()},
nstk});
+ lhs.insert(lhs.end(), rhs.begin(), rhs.end());
+ lhmaxstk = std::max(lhmaxstk, rhmaxstk);
+ lhs.push_back({Node::Join, {lhs.size() - 1, 0}, nstk +
1});
+ }
+ return {lhs, lhmaxstk, MINRX_REG_SUCCESS};
+ }
+ Subexp cat(bool nested, NInt nstk) {
+ auto [lhs, lhmaxstk, err] = rep(nested, nstk);
+ if (err)
+ return {lhs, lhmaxstk, err};
+ auto wc = wconv.look();
+ while (wc != WConv::End && wc != L'|' && (wc != L')' ||
!nested)) {
+ auto [rhs, rhmaxstk, err] = rep(nested, nstk);
+ if (err)
+ return {rhs, rhmaxstk, err};
+ lhs.insert(lhs.end(), rhs.begin(), rhs.end());
+ lhmaxstk = std::max(lhmaxstk, rhmaxstk);
+ wc = wconv.look();
+ }
+ return {lhs, lhmaxstk, MINRX_REG_SUCCESS};
+ }
+ Subexp mkrep(const Subexp &lh, bool optional, bool infinite, NInt nstk)
{
+ auto [lhs, lhmaxstk, _] = lh;
+ if (optional && !infinite) {
+ for (auto &l : lhs) l.nstk += 1;
+ auto lhsize = lhs.size();
+ lhs.push_front({Node::Fork, {(NInt) 1, lhsize}, nstk});
+ lhs.push_back({Node::Join, {lhsize, 0}, nstk + 1});
+ return {lhs, lhmaxstk + 1, MINRX_REG_SUCCESS};
+ } else {
+ for (auto &l : lhs) l.nstk += 3;
+ auto lhsize = lhs.size();
+ lhs.push_front({Node::Loop, {lhsize, (NInt) optional},
nstk});
+ lhs.push_back({Node::Next, {lhsize, (NInt) infinite},
nstk + 3});
+ return {lhs, lhmaxstk + 3, MINRX_REG_SUCCESS};
+ }
+ }
+ Subexp mkrep(const Subexp &lh, NInt m, NInt n, NInt nstk) {
+ if ((m != (NInt) -1 && m > RE_DUP_MAX) || (n != (NInt) -1 && n
> RE_DUP_MAX) || m > n)
+ return {{}, 0, MINRX_REG_BADBR};
+ if (n == 0)
+ return {{}, 0, MINRX_REG_SUCCESS};
+ if (m == 0 && n == 1)
+ return mkrep(lh, true, false, nstk);
+ if (m == 0 && n == (NInt) -1)
+ return mkrep(lh, true, true, nstk);
+ if (m == 1 && n == 1)
+ return lh;
+ if (m == 1 && n == (NInt) -1)
+ return mkrep(lh, false, true, nstk);
+ auto [lhs, lhmaxstk, _] = lh;
+ auto [rhs, rhmaxstk, __] = lh;
+ NInt k;
+ for (k = 1; k < m; ++k)
+ lhs.insert(lhs.end(), rhs.begin(), rhs.end());
+ if (n != (NInt) -1 && k < n) {
+ lhmaxstk += 1;
+ rhmaxstk += 1;
+ for (auto &r : rhs)
+ r.nstk += 1;
+ auto rhsize = rhs.size();
+ rhs.push_front({Node::Fork, {1, rhsize}, nstk});
+ rhs.push_back({Node::Join, {rhsize, 0}, nstk + 1});
+ for (; k < n; ++k)
+ lhs.insert(lhs.end(), rhs.begin(), rhs.end());
+ }
+ if (n == (NInt) -1) {
+ lhmaxstk += 3;
+ rhmaxstk += 3;
+ for (auto &r : rhs)
+ r.nstk += 3;
+ auto rhsize = rhs.size();
+ rhs.push_front({Node::Loop, {rhsize, 0}, nstk});
+ rhs.push_back({Node::Next, {rhsize, 1}, nstk + 3});
+ lhs.insert(lhs.end(), rhs.begin(), rhs.end());
+ }
+ if (m == 0)
+ return mkrep({lhs, rhmaxstk, MINRX_REG_SUCCESS}, true,
false, nstk);
+ else
+ return {lhs, rhmaxstk, MINRX_REG_SUCCESS};
+ }
+ Subexp rep(bool nested, NInt nstk) {
+ auto lh = chr(nested, nstk);
+ if (std::get<minrx_result_t>(lh))
+ return lh;
+ bool optional = false, infinite = false;
+ for (;;) {
+ auto wc = wconv.look();
+ switch (wc) {
+ case L'?':
+ optional = true;
+ wconv.nextchr();
+ continue;
+ case L'*':
+ optional = infinite = true;
+ wconv.nextchr();
+ continue;
+ case L'+':
+ infinite = true;
+ wconv.nextchr();
+ continue;
+ case L'{':
+ if ((flags & MINRX_REG_BRACE_COMPAT) == 0 ||
std::isdigit(wconv.lookahead(&WConv::nextchr))) {
+ if (optional || infinite) {
+ lh = mkrep(lh, optional,
infinite, nstk);
+ optional = infinite = false;
+ }
+ wc = wconv.nextchr().look();
+ if (wc == WConv::End)
+ return {{}, 0,
MINRX_REG_EBRACE};
+ NInt m, n;
+ if (!num(m))
+ return {{}, 0, MINRX_REG_BADBR};
+ wc = wconv.look();
+ if (wc == L'}') {
+ lh = mkrep(lh, m, m, nstk);
+ wconv.nextchr();
+ continue;
+ }
+ if (wc == WConv::End)
+ return {{}, 0,
MINRX_REG_EBRACE};
+ if (wc != L',')
+ return {{}, 0, MINRX_REG_BADBR};
+ wc = wconv.nextchr().look();
+ if (wc == L'}') {
+ lh = mkrep(lh, m, -1, nstk);
+ wconv.nextchr();
+ continue;
+ }
+ if (!num(n))
+ return {{}, 0, MINRX_REG_BADBR};
+ wc = wconv.look();
+ if (wc == WConv::End)
+ return {{}, 0,
MINRX_REG_EBRACE};
+ if (wc != L'}')
+ return {{}, 0, MINRX_REG_BADBR};
+ lh = mkrep(lh, m, n, nstk);
+ wconv.nextchr();
+ continue;
+ }
+ // fall through
+ default:
+ if (optional || infinite) {
+ lh = mkrep(lh, optional, infinite,
nstk);
+ optional = infinite = false;
+ }
+ return lh;
+ }
+ }
+ }
+ Subexp chr(bool nested, NInt nstk) {
+ std::deque<Node> lhs;
+ std::size_t lhmaxstk;
+ auto wc = wconv.look();
+ switch (wc) {
+ default:
+ normal:
+ lhmaxstk = nstk;
+ if ((flags & MINRX_REG_ICASE) == 0) {
+ lhs.push_back({(NInt) wc, {0, 0}, nstk});
+ } else {
+ WChar wcl = std::towlower(wc), wcu =
std::towupper(wc);
+ auto key = std::min(wc, std::min(wcl, wcu));
+ if (icmap.find(key) == icmap.end()) {
+ icmap.emplace(key, csets.size());
+ csets.emplace_back();
+ csets.back().set(wc);
+ csets.back().set(wcl);
+ csets.back().set(wcu);
+ }
+ lhs.push_back({Node::CSet, {icmap[key], 0},
nstk});
+ }
+ wconv.nextchr();
+ break;
+ case L'{':
+ if ((flags & MINRX_REG_BRACE_COMPAT) != 0 &&
!std::isdigit(wconv.lookahead(&WConv::nextchr)))
+ goto normal;
+ // fall through
+ case L'*':
+ case L'+':
+ case L'?':
+ return {{}, 0, MINRX_REG_BADRPT};
+ case L'[':
+ {
+ lhmaxstk = nstk;
+ wc = wconv.nextchr().look();
+ bool invert = wc == L'^';
+ if (invert)
+ wc = wconv.nextchr().look();
+ CSet cs;
+ for (bool first = true;; first = false) {
+ auto wclo = wc, wchi = wc;
+ if (wclo == WConv::End)
+ return {{}, 0,
MINRX_REG_EBRACK};
+ wc = wconv.nextchr().look();
+ if (wclo == L']' && !first)
+ break;
+ if (wclo == L'\\' && (flags &
MINRX_REG_BRACK_ESCAPE) != 0) {
+ if (wc != WConv::End) {
+ wclo = wchi = wc;
+ wc =
wconv.nextchr().look();
+ } else {
+ return {{}, 0,
MINRX_REG_EESCAPE};
+ }
+ } else if (wclo == L'[') {
+ if (wc == L'.') {
+ wc =
wconv.nextchr().look();
+ wclo = wchi = wc;
+ wc =
wconv.nextchr().look();
+ if (wc != L'.' || (wc =
wconv.nextchr().look() != L']'))
+ return {{}, 0,
MINRX_REG_ECOLLATE};
+ } else if (wc == L':') {
+ wconv.nextchr();
+ // FIXME: search for
the matching :] using wconv.nextchr() rather than memchr()
+ const char *colon =
(const char *) std::memchr(wconv.cp, ':', wconv.ep - wconv.cp);
+ if (colon && wconv.ep -
colon >= 2 && colon[1] == ']') {
+ auto cclname =
std::string(wconv.cp, colon);
+ if (cclass(cs,
cclname)) {
+
wconv.cp = colon + 2;
+
wconv.len = 0;
+ wc =
wconv.nextchr().look();
+
continue; // can't be range endpoint
+ } else {
+ return
{{}, 0, MINRX_REG_ECTYPE};
+ }
+ } else {
+ return {{}, 0,
MINRX_REG_ECTYPE};
+ }
+ } else if (wc == L'=') {
+ // FIXME: recognize
some equivalence classes.
+ return {{}, 0,
MINRX_REG_ECOLLATE};
+ }
+ }
+ bool range = false;
+ if (wc == L'-') {
+ auto save = wconv.save();
+ wc = wconv.nextchr().look();
+ if (wc == WConv::End)
+ return {{}, 0,
MINRX_REG_EBRACK};
+ if (wc != L']') {
+ wchi = wc;
+ wc =
wconv.nextchr().look();
+ if (wchi == L'\\' &&
(flags & MINRX_REG_BRACK_ESCAPE) != 0) {
+ if (wc !=
WConv::End) {
+ wchi =
wc;
+ wc =
wconv.nextchr().look();
+ } else {
+ return
{{}, 0, MINRX_REG_EESCAPE};
+ }
+ } else if (wchi ==
L'[') {
+ if (wc == L'.')
{
+ wchi =
wconv.nextchr().look();
+ wc =
wconv.nextchr().look();
+ if (wc
!= L'.' || (wc = wconv.nextchr().look()) != L']')
+
return {{}, 0, MINRX_REG_ECOLLATE};
+ wc =
wconv.nextchr().look();
+ } else if (wc
== L':' || wc == L'=') {
+ return
{{}, 0, MINRX_REG_ERANGE}; // can't be range endpoint
+ }
+ }
+ range = true;
+ } else {
+ wconv.restore(save);
+ wc = L'-';
+ }
+ }
+ if (wclo > wchi)
+ return {{}, 0,
MINRX_REG_ERANGE};
+ cs.set(wclo, wchi);
+ if ((flags & MINRX_REG_ICASE) != 0)
+ for (auto wc = wclo; wc <=
wchi; ++wc) {
+
cs.set(std::tolower(wc));
+
cs.set(std::toupper(wc));
+ }
+ if (range && wc == L'-' &&
wconv.lookahead(&WConv::nextchr) != L']')
+ return {{}, 0,
MINRX_REG_ERANGE};
+ }
+ lhs.push_back({Node::CSet, {csets.size(), 0},
nstk});
+ if (invert) {
+ if ((flags & MINRX_REG_NEWLINE) != 0)
+ cs.set(L'\n');
+ cs.invert();
+ }
+ csets.emplace_back(cs);
+ }
+ break;
+ case L'.':
+ if (!dot.has_value()) {
+ dot = csets.size();
+ csets.emplace_back();
+ if ((flags & MINRX_REG_NEWLINE) != 0)
+ csets.back().set(L'\n');
+ csets.back().invert();
+ }
+ lhmaxstk = nstk;
+ lhs.push_back({Node::CSet, {*dot, 0}, nstk});
+ wconv.nextchr();
+ break;
+ case L'^':
+ lhmaxstk = nstk;
+ lhs.push_back({(flags & MINRX_REG_NEWLINE) == 0 ?
Node::ZBOB : Node::ZBOL, {0, 0}, nstk});
+ wconv.nextchr();
+ break;
+ case L'$':
+ lhmaxstk = nstk;
+ lhs.push_back({(flags & MINRX_REG_NEWLINE) == 0 ?
Node::ZEOB : Node::ZEOL, {0, 0}, nstk});
+ wconv.nextchr();
+ break;
+ case L'\\':
+ lhmaxstk = nstk;
+ wc = wconv.nextchr().look();
+ switch (wc) {
+ case L'<':
+ if ((flags & MINRX_REG_EXTENSIONS_BSD) == 0)
+ goto normal;
+ lhs.push_back({Node::ZBOW, {0, 0}, nstk});
+ break;
+ case L'>':
+ if ((flags & MINRX_REG_EXTENSIONS_BSD) == 0)
+ goto normal;
+ lhs.push_back({Node::ZEOW, {0, 0}, nstk});
+ break;
+ case L'`':
+ if ((flags & MINRX_REG_EXTENSIONS_GNU) == 0)
+ goto normal;
+ lhs.push_back({Node::ZBOB, {0, 0}, nstk});
+ break;
+ case L'\'':
+ if ((flags & MINRX_REG_EXTENSIONS_GNU) == 0)
+ goto normal;
+ lhs.push_back({Node::ZEOB, {0, 0}, nstk});
+ break;
+ case L'b':
+ if ((flags & MINRX_REG_EXTENSIONS_GNU) == 0)
+ goto normal;
+ lhs.push_back({Node::ZXOW, {0, 0}, nstk});
+ break;
+ case L'B':
+ if ((flags & MINRX_REG_EXTENSIONS_GNU) == 0)
+ goto normal;
+ lhs.push_back({Node::ZNWB, {0, 0}, nstk});
+ break;
+ case L's':
+ if ((flags & MINRX_REG_EXTENSIONS_GNU) == 0)
+ goto normal;
+ lhs.push_back({Node::CSet, {csets.size(), 0},
nstk});
+ csets.emplace_back();
+ cclass(csets.back(), "space");
+ break;
+ case L'S':
+ if ((flags & MINRX_REG_EXTENSIONS_GNU) == 0)
+ goto normal;
+ lhs.push_back({Node::CSet, {csets.size(), 0},
nstk});
+ csets.emplace_back();
+ cclass(csets.back(), "space");
+ csets.back().invert();
+ break;
+ case L'w':
+ if ((flags & MINRX_REG_EXTENSIONS_GNU) == 0)
+ goto normal;
+ lhs.push_back({Node::CSet, {csets.size(), 0},
nstk});
+ csets.emplace_back();
+ cclass(csets.back(), "alnum");
+ csets.back().set(L'_');
+ break;
+ case L'W':
+ if ((flags & MINRX_REG_EXTENSIONS_GNU) == 0)
+ goto normal;
+ lhs.push_back({Node::CSet, {csets.size(), 0},
nstk});
+ csets.emplace_back();
+ cclass(csets.back(), "alnum");
+ csets.back().set(L'_');
+ csets.back().invert();
+ break;
+ case WConv::End:
+ return {{}, 0, MINRX_REG_EESCAPE};
+ default:
+ goto normal;
+ }
+ wconv.nextchr();
+ break;
+ case L'(':
+ {
+ NInt n = ++nsub;
+ wconv.nextchr();
+ minrx_result_t err;
+ std::tie(lhs, lhmaxstk, err) = alt(true, nstk +
1);
+ if (err)
+ return {lhs, lhmaxstk, err};
+ if (wconv.look() != L')')
+ return {{}, 0, MINRX_REG_EPAREN};
+ lhs.push_front({Node::SubL, {n, nsub}, nstk});
+ lhs.push_back({Node::SubR, {n, nsub}, nstk +
1});
+ wconv.nextchr();
+ }
+ break;
+ case L')':
+ if (!nested)
+ goto normal;
+ // fall through
+ case L'|':
+ case WConv::End:
+ lhmaxstk = nstk;
+ break;
+ }
+ return {lhs, lhmaxstk, MINRX_REG_SUCCESS};
+ }
+ Regexp *compile() {
+ auto [lhs, nstk, err] = alt(false, 0);
+ if (err) {
+ csets.clear();
+ lhs.clear();
+ nstk = 0;
+ nsub = 0;
+ } else {
+ lhs.push_back({Node::Exit, {0, 0}, 0});
+ }
+ return new Regexp{ std::move(csets), {lhs.begin(), lhs.end()},
nstk, nsub + 1, err };
+ }
+};
+
+std::map<std::string, CSet> Compile::cclmemo;
+std::mutex Compile::cclmutex;
+
+struct Execute {
+ typedef COWVec<std::size_t, (std::size_t) -1> Vec;
+ struct NState {
+ const char *bp = nullptr;
+ Vec substack;
+ NState() {}
+ NState(Vec::Allocator &allocator): substack(allocator) {}
+ bool cmpgt(const NState &ns, std::size_t nstk) const {
+ return bp != ns.bp ? bp < ns.bp :
substack.cmpgt(ns.substack, nstk);
+ }
+ };
+ const Regexp &r;
+ const minrx_regexec_flags_t flags;
+ WConv wconv;
+ WChar lookback = WConv::End;
+ Vec::Allocator allocator { r.nstk + 2 * r.nsub };
+ std::optional<COWVec<std::size_t, (std::size_t) -1>> best;
+ QSet<NInt> epsq { r.nodes.size() };
+ QVec<NInt, NState> epsv { r.nodes.size() };
+ Execute(const Regexp &r, minrx_regexec_flags_t flags, const char *bp,
const char *ep) : r(r), flags(flags), wconv(bp, ep) {}
+ void add(QVec<NInt, NState> &ncsv, NInt n, const NState &ns) {
+ if (r.nodes[n].type <= Node::CSet) {
+ auto [newly, oldns] = ncsv.insert(n, ns);
+ if (!newly && ns.cmpgt(oldns, r.nodes[n].nstk))
+ oldns = ns;
+ } else {
+ auto [newly, oldns] = epsv.insert(n, ns);
+ if (newly || (ns.cmpgt(oldns, r.nodes[n].nstk) &&
(oldns = ns, true)))
+ epsq.insert(n);
+ }
+ }
+ bool is_word(int wc) { return wc == L'_' || std::iswalnum(wc); }
+ void epsclosure(QVec<NInt, NState> &ncsv) {
+ auto nodes = r.nodes;
+ do {
+ NInt k = epsq.remove();
+ NState &ns = epsv.lookup(k);
+ if (best.has_value() && (std::size_t) (ns.bp -
wconv.bp) > best->get(r.nstk + 0))
+ continue;
+ const auto &n = nodes[k];
+ switch (n.type) {
+ case Node::Exit:
+ {
+ std::size_t b = ns.bp - wconv.bp, e =
wconv.cp - wconv.bp;
+ if (!best.has_value()
+ || b < best->get(r.nstk + 0)
+ || (b == best->get(r.nstk + 0) && e
>= best->get(r.nstk + 1)))
+ {
+ best = ns.substack;
+ best->put(r.nstk + 0, b);
+ best->put(r.nstk + 1, e);
+ }
+ }
+ break;
+ case Node::Fork:
+ {
+ NState nscopy = ns;
+ NInt pridelta = n.args[0];
+ NInt priority = 0;
+ do {
+ nscopy.substack.put(n.nstk,
priority += pridelta);
+ add(ncsv, k + 1, nscopy);
+ k = k + 1 + nodes[k].args[1];
+ } while (nodes[k].type != Node::Join);
+ if (priority == pridelta) {
+ nscopy.substack.put(n.nstk,
priority += pridelta);
+ add(ncsv, k, nscopy);
+ }
+ }
+ break;
+ case Node::Goto:
+ add(ncsv, k + 1 + n.args[1], ns);
+ break;
+ case Node::Join:
+ add(ncsv, k + 1, ns);
+ break;
+ case Node::Loop:
+ {
+ NState nscopy = ns;
+ nscopy.substack.put(n.nstk, wconv.cp -
wconv.bp);
+ nscopy.substack.put(n.nstk + 1, -1);
+ nscopy.substack.put(n.nstk + 2,
wconv.cp - wconv.bp);
+ add(ncsv, k + 1, nscopy);
+ if (n.args[1]) {
+ nscopy.substack.put(n.nstk + 1,
0);
+ add(ncsv, k + 1 + n.args[0],
nscopy);
+ }
+ }
+ break;
+ case Node::Next:
+ {
+ add(ncsv, k + 1, ns);
+ if (n.args[1] && (std::size_t)
(wconv.cp - wconv.bp) > ns.substack.get(n.nstk - 1)) {
+ NState nscopy = ns;
+ nscopy.substack.sub(n.nstk - 2,
1);
+ nscopy.substack.put(n.nstk - 1,
wconv.cp - wconv.bp);
+ add(ncsv, k - n.args[0],
nscopy);
+ }
+ }
+ break;
+ case Node::SubL:
+ {
+ NState nscopy = ns;
+ nscopy.substack.put(n.nstk, wconv.cp -
wconv.bp);
+ if (n.args[0] != (NInt) -1)
+ for (auto i = n.args[0]; i <=
n.args[1]; ++i) {
+
nscopy.substack.put(r.nstk + i * 2, -1);
+
nscopy.substack.put(r.nstk + i * 2 + 1, -1);
+ }
+ add(ncsv, k + 1, nscopy);
+ }
+ break;
+ case Node::SubR:
+ if (n.args[0] != (NInt) -1) {
+ NState nscopy = ns;
+ nscopy.substack.put(r.nstk + n.args[0]
* 2 + 0, ns.substack.get(n.nstk - 1));
+ nscopy.substack.put(r.nstk + n.args[0]
* 2 + 1, wconv.cp - wconv.bp);
+ add(ncsv, k + 1, nscopy);
+ } else {
+ add(ncsv, k + 1, ns);
+ }
+ break;
+ case Node::ZBOB:
+ if (wconv.cp == wconv.bp && (flags &
MINRX_REG_NOTBOL) == 0)
+ add(ncsv, k + 1, ns);
+ break;
+ case Node::ZEOB:
+ if (wconv.cp == wconv.ep && (flags &
MINRX_REG_NOTEOL) == 0)
+ add(ncsv, k + 1, ns);
+ break;
+ case Node::ZBOL:
+ if (((wconv.cp == wconv.bp && (flags &
MINRX_REG_NOTBOL) == 0)) || lookback == L'\n')
+ add(ncsv, k + 1, ns);
+ break;
+ case Node::ZEOL:
+ if (((wconv.cp == wconv.ep && (flags &
MINRX_REG_NOTEOL) == 0)) || wconv.look() == L'\n')
+ add(ncsv, k + 1, ns);
+ break;
+ case Node::ZBOW:
+ if ((wconv.cp == wconv.bp ||
!is_word(lookback)) && (wconv.cp != wconv.ep && is_word(wconv.look())))
+ add(ncsv, k + 1, ns);
+ break;
+ case Node::ZEOW:
+ if ((wconv.cp != wconv.bp && is_word(lookback))
&& (wconv.cp == wconv.ep || !is_word(wconv.look())))
+ add(ncsv, k + 1, ns);
+ break;
+ case Node::ZXOW:
+ if ( ((wconv.cp == wconv.bp ||
!is_word(lookback)) && (wconv.cp != wconv.ep && is_word(wconv.look())))
+ || ((wconv.cp != wconv.bp &&
is_word(lookback)) && (wconv.cp == wconv.ep || !is_word(wconv.look()))))
+ add(ncsv, k + 1, ns);
+ break;
+ case Node::ZNWB:
+ if ( (wconv.cp == wconv.bp && wconv.cp ==
wconv.ep)
+ || (wconv.cp == wconv.bp && wconv.cp !=
wconv.ep && !is_word(wconv.look()))
+ || (wconv.cp != wconv.bp && wconv.cp ==
wconv.ep && !is_word(lookback))
+ || (wconv.cp != wconv.bp && wconv.cp !=
wconv.ep && is_word(lookback) == is_word(wconv.look())))
+ add(ncsv, k + 1, ns);
+ break;
+ default:
+ abort();
+ break;
+ }
+ } while (!epsq.empty());
+ }
+ int execute(std::size_t nm, minrx_regmatch_t *rm) {
+ QVec<NInt, NState> mcsvs[2] { r.nodes.size(), r.nodes.size() };
+ auto nodes = &r.nodes[0];
+ wconv.nextchr();
+ if ((flags & MINRX_REG_RESUME) != 0 && rm && rm[0].rm_eo > 0)
+ while (wconv.cp != wconv.ep && wconv.cp - wconv.bp !=
rm[0].rm_eo)
+ lookback = wconv.look(), wconv.nextchr();
+ NState nsinit(allocator);
+ nsinit.bp = wconv.cp;
+ add(mcsvs[0], 0, nsinit);
+ if (!epsq.empty())
+ epsclosure(mcsvs[0]);
+ auto wc = wconv.look();
+ for (;;) { // unrolled to ping-pong roles of mcsvs[0]/[1]
+ if (wc == WConv::End)
+ break;
+ epsv.clear();
+ while (!mcsvs[0].empty()) {
+ auto [n, ns] = mcsvs[0].remove();
+ auto t = nodes[n].type;
+ if (t <= WCharMax) {
+ if (wc != t)
+ continue;
+ } else {
+ if (!r.csets[nodes[n].args[0]].test(wc))
+ continue;
+ }
+ add(mcsvs[1], n + 1, ns);
+ }
+ wconv.nextchr(), lookback = wc, wc = wconv.look();
+ if (!best.has_value()) {
+ nsinit.bp = wconv.cp;
+ add(mcsvs[1], 0, nsinit);
+ }
+ if (!epsq.empty())
+ epsclosure(mcsvs[1]);
+ if (best.has_value() && mcsvs[1].empty())
+ break;
+ if (wc == WConv::End)
+ break;
+ epsv.clear();
+ while (!mcsvs[1].empty()) {
+ auto [n, ns] = mcsvs[1].remove();
+ auto t = nodes[n].type;
+ if (t <= WCharMax) {
+ if (wc != t)
+ continue;
+ } else {
+ if (!r.csets[nodes[n].args[0]].test(wc))
+ continue;
+ }
+ add(mcsvs[0], n + 1, ns);
+ }
+ wconv.nextchr(), lookback = wc, wc = wconv.look();
+ if (!best.has_value()) {
+ nsinit.bp = wconv.cp;
+ add(mcsvs[0], 0, nsinit);
+ }
+ if (!epsq.empty())
+ epsclosure(mcsvs[0]);
+ if (best.has_value() && mcsvs[0].empty())
+ break;
+ }
+ if (best.has_value()) {
+ if (rm) {
+ std::size_t nsub = std::min(nm, r.nsub);
+ std::size_t i;
+ for (i = 0; i < nsub; ++i) {
+ rm[i].rm_so = (*best->storage)[r.nstk +
i * 2];
+ rm[i].rm_eo = (*best->storage)[r.nstk +
i * 2 + 1];
+ }
+ for (; i < nm; ++i)
+ rm[i].rm_so = rm[i].rm_eo = -1;
+ }
+ return 0;
+ } else {
+ if (rm)
+ for (std::size_t i = 0; i < nm; ++i)
+ rm[i].rm_so = rm[i].rm_eo = -1;
+ return MINRX_REG_NOMATCH;
+ }
+ }
+};
+
+}
+
+int
+minrx_regcomp(minrx_regex_t *rx, const char *s, int flags)
+{
+ return minrx_regncomp(rx, strlen(s), s, flags);
+}
+
+int
+minrx_regexec(minrx_regex_t *rx, const char *s, std::size_t nm,
minrx_regmatch_t *rm, int flags)
+{
+ return minrx_regnexec(rx, strlen(s), s, nm, rm, flags);
+}
+
+int
+minrx_regncomp(minrx_regex_t *rx, std::size_t ns, const char *s, int flags)
+{
+ auto r = MinRX::Compile(s, s + ns, (minrx_regcomp_flags_t)
flags).compile();
+ rx->re_regexp = r;
+ rx->re_nsub = r->nsub;
+ rx->re_compflags = (minrx_regcomp_flags_t) flags;
+ return r->err;
+}
+
+int
+minrx_regnexec(minrx_regex_t *rx, std::size_t ns, const char *s, std::size_t
nm, minrx_regmatch_t *rm, int flags)
+{
+ MinRX::Regexp *r = reinterpret_cast<MinRX::Regexp *>(rx->re_regexp);
+ return MinRX::Execute(*r, (minrx_regexec_flags_t) flags, s, s +
ns).execute(nm, rm);
+}
+
+void
+minrx_regfree(minrx_regex_t *rx)
+{
+ delete reinterpret_cast<MinRX::Regexp *>(rx->re_regexp);
+ rx->re_regexp = nullptr;
+}
+
+size_t
+minrx_regerror(int errcode, const minrx_regex_t *, char *errbuf, size_t
errsize)
+{
+ static const char *const messages[] = {
+ "success",
+ "bad pattern",
+ "invalid contents of {}",
+ "? * + or {interval} not preceded by valid subpattern",
+ "unbalanced {",
+ "unbalanced [",
+ "invalid collating element",
+ "invalid character class name",
+ "invalid trailing backslash",
+ "unbalanced (",
+ "invalid range endpoint",
+ "memory allocation failed",
+ "invalid \\digit",
+ "match not found",
+ "unknown error code",
+ };
+ if (errcode < 0 || errcode > MINRX_REG_UNKNOWN)
+ errcode = MINRX_REG_UNKNOWN;
+ size_t size = snprintf(errbuf, errsize, "%s", messages[errcode]);
+ if (errsize != 0 && size == errsize)
+ errbuf[errsize - 1] = '\0';
+ return size + 1;
+}
diff --git a/support/minrx.h b/support/minrx.h
new file mode 100644
index 00000000..ecae246e
--- /dev/null
+++ b/support/minrx.h
@@ -0,0 +1,94 @@
+//
+// MinRX: a minimal matcher for POSIX Extended Regular Expressions.
+// Copyright (C) 2023, 2024 Michael J. Haertel.
+//
+// This file is part of MinRX.
+//
+// MinRX is free software; you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published
+// by the Free Software Foundation; either version 3 of the License, or
+// (at your option) any later version.
+//
+// MinRX is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+// See the GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+
+#ifndef _MINRX_H
+#define _MINRX_H
+
+#include <stddef.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Names and types in this file have been chosen so that if the minrx_ and
MINRX_ prefixes
+ * are erased throughout the result will be usable as a POSIX-conforming
regex.h.
+ */
+
+typedef enum { /* Flags for minrx_reg*comp() */
+ MINRX_REG_EXTENDED = 1, /* required in current version of MinRX
*/
+ MINRX_REG_ICASE = 2, /* ignore case in pattern and search */
+ MINRX_REG_NEWLINE = 4, /* exclude \n from . and [^...] and
treat as boundary for ^ and $ */
+ MINRX_REG_NOSUB = 8, /* yes/no result only; no regmatch_t
substrings results */
+ MINRX_REG_BRACE_COMPAT = 16, /* { begins interval expression only
when followed by digit */
+ MINRX_REG_BRACK_ESCAPE = 32, /* bracket expressions [...] allow
backslash escapes */
+ MINRX_REG_EXTENSIONS_BSD = 64, /* enable BSD extensions \< and \> */
+ MINRX_REG_EXTENSIONS_GNU = 128 /* enable GNU extensions \b \B \s \S \w
\W \y */
+} minrx_regcomp_flags_t;
+
+typedef enum { /* Flags for minrx_reg*exec() */
+ MINRX_REG_NOTBOL = 1, /* don't match ^ at beginning of string
*/
+ MINRX_REG_NOTEOL = 2, /* don't match $ at end of string */
+ MINRX_REG_RESUME = 4, /* MinRX extension: resume search from
rm[0].rm_eo */
+} minrx_regexec_flags_t;
+
+typedef enum { /* Return values from minrx_reg*comp()
and minrx_reg*exec() */
+ MINRX_REG_SUCCESS = 0, /* regcomp: compiled successfully;
regexec: match found */
+ MINRX_REG_BADPAT, /* regcomp: bad pattern (generic) */
+ MINRX_REG_BADBR, /* regcomp: invalid contents of {} */
+ MINRX_REG_BADRPT, /* regcomp: ? * + or {interval} not
preceded by valid subpattern */
+ MINRX_REG_EBRACE, /* regcomp: unbalanced { */
+ MINRX_REG_EBRACK, /* regcomp: unbalanced [ */
+ MINRX_REG_ECOLLATE, /* regcomp: invalid collating element */
+ MINRX_REG_ECTYPE, /* regcomp: invalid character class
name */
+ MINRX_REG_EESCAPE, /* regcomp: invalid backslash */
+ MINRX_REG_EPAREN, /* regcomp: unbalanced ( */
+ MINRX_REG_ERANGE, /* regcomp: invalid range endpoint */
+ MINRX_REG_ESPACE, /* regcomp: memory allocation failed */
+ MINRX_REG_ESUBREG, /* regcomp: invalid \digit */
+ MINRX_REG_NOMATCH, /* regexec: match not found */
+ MINRX_REG_UNKNOWN /* unknown error code */
+} minrx_result_t;
+
+typedef struct {
+ void *re_regexp;
+ size_t re_nsub;
+ minrx_regcomp_flags_t re_compflags;
+} minrx_regex_t;
+
+typedef struct {
+ ptrdiff_t rm_so;
+ ptrdiff_t rm_eo;
+} minrx_regmatch_t;
+
+/* Compile from user syntax to internal regex_t; returns minrx_result_t (as
integer) */
+int minrx_regcomp(minrx_regex_t *, const char * /* regexp (NUL terminated) */,
int /* minrx_regcomp_flags_t */);
+int minrx_regncomp(minrx_regex_t *, size_t /* nregexp */, const char * /*
regexp[nregexp] */, int /* minrx_regcomp_flags_t */);
+
+/* Search for previously-compiled regexp; return minrx_result_t (as integer) */
+int minrx_regexec(minrx_regex_t *, const char * /* string NUL-terminated */,
size_t /* nrm */, minrx_regmatch_t * /* rm[nrm] */, int /*
minrx_regexec_flags_t */);
+int minrx_regnexec(minrx_regex_t *, size_t /* nstring */, const char * /*
string[nstring] */, size_t /* nrm */, minrx_regmatch_t * /* rm[nrm] */, int /*
minrx_regexec_flags_t */);
+
+size_t minrx_regerror(int /* minrx_result_t */, const minrx_regex_t *, char *
/* errbuf[nerrbuf] */, size_t /* nerrbuf */);
+void minrx_regfree(minrx_regex_t *);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+#endif /* MINRX_H */
diff --git a/version.c b/version.c
index f5cf6ac0..2e45635a 100644
--- a/version.c
+++ b/version.c
@@ -1,3 +1,3 @@
#include "config.h"
-const char *version_string = PACKAGE_STRING;
+const char *version_string = PACKAGE_STRING "-minrx";
-----------------------------------------------------------------------
Summary of changes:
ChangeLog | 4 +
README_d/ChangeLog | 4 +
README_d/README.matchers | 6 +-
support/ChangeLog | 4 +
support/minrx.cpp | 1176 ++++++++++++++++++++++++++++++++++++++++++++++
support/minrx.h | 94 ++++
version.c | 2 +-
7 files changed, 1286 insertions(+), 4 deletions(-)
create mode 100644 support/minrx.cpp
create mode 100644 support/minrx.h
hooks/post-receive
--
gawk
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [SCM] gawk branch, feature/minrx, updated. gawk-4.1.0-5659-g5aa09a55,
Arnold Robbins <=