mirror of https://github.com/tildeclub/ex-vi.git
878 lines
19 KiB
C
878 lines
19 KiB
C
/*
|
|
* Changes by Gunnar Ritter, Freiburg i. Br., Germany, November 2002.
|
|
*
|
|
* Sccsid @(#)regdfa.c 1.9 (gritter) 9/22/03
|
|
*/
|
|
/* UNIX(R) Regular Expresssion Library
|
|
*
|
|
* Note: Code is released under the GNU LGPL
|
|
*
|
|
* Copyright (C) 2001 Caldera International, Inc.
|
|
*
|
|
* This library is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2 of the License, or (at your option) any later version.
|
|
*
|
|
* This library is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with this library; if not, write to:
|
|
* Free Software Foundation, Inc.
|
|
* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
*/
|
|
|
|
/* #include "synonyms.h" */
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <ctype.h>
|
|
#include "regdfa.h"
|
|
|
|
/*
|
|
* Deterministic Finite Automata.
|
|
*/
|
|
|
|
/*
|
|
* Postorder traversal that returns a copy of the subtree,
|
|
* except that ROP_BKT becomes ROP_BKTCOPY (since they
|
|
* share the same pointed to Bracket object).
|
|
*/
|
|
static Tree *
|
|
copy(regex_t *ep, Tree *tp)
|
|
{
|
|
Tree *np;
|
|
|
|
if ((np = malloc(sizeof(Tree))) == 0)
|
|
return 0;
|
|
switch (np->op = tp->op) /* almost always correct */
|
|
{
|
|
case ROP_BKT:
|
|
np->op = ROP_BKTCOPY;
|
|
/*FALLTHROUGH*/
|
|
case ROP_BKTCOPY:
|
|
np->right.info.bkt = tp->right.info.bkt;
|
|
/*FALLTHROUGH*/
|
|
default:
|
|
np->left.pos = ep->re_dfa->nposn++;
|
|
/*FALLTHROUGH*/
|
|
case ROP_EMPTY:
|
|
return np;
|
|
case ROP_CAT:
|
|
case ROP_OR:
|
|
if ((np->right.ptr = copy(ep, tp->right.ptr)) == 0)
|
|
{
|
|
free(np);
|
|
return 0;
|
|
}
|
|
np->right.ptr->parent = np;
|
|
/*FALLTHROUGH*/
|
|
case ROP_STAR:
|
|
case ROP_PLUS:
|
|
case ROP_QUEST:
|
|
case ROP_LP:
|
|
if ((np->left.ptr = copy(ep, tp->left.ptr)) == 0)
|
|
break;
|
|
np->left.ptr->parent = np;
|
|
return np;
|
|
}
|
|
libuxre_regdeltree(np, 1);
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Postorder traversal.
|
|
* Assign unique ascending integer values to the leaves.
|
|
* Since the right child is traversed before the left,
|
|
* the position for ROP_END is guaranteed to be zero.
|
|
* The parse tree is rewritten in two cases:
|
|
* - Each ROP_BRACE is replaced by an equivalent--sometimes
|
|
* large--subtree using only ROP_CAT, ROP_QUEST, and
|
|
* ROP_PLUS.
|
|
* - If REG_ICASE, replace each simple character that has
|
|
* an uppercase equivalent with a ROP_OR subtree over the
|
|
* two versions.
|
|
* Since these rewrites occur bottom up, they have already
|
|
* been applied before any subtrees passed to copy().
|
|
*/
|
|
static Tree *
|
|
findposn(regex_t *ep, Tree *tp, int mb_cur_max)
|
|
{
|
|
unsigned int lo, hi;
|
|
Tree *ptr, *par;
|
|
w_type wc;
|
|
|
|
switch (tp->op)
|
|
{
|
|
default:
|
|
if (ep->re_flags & REG_ICASE
|
|
&& (wc = to_upper(tp->op)) != tp->op)
|
|
{
|
|
if ((ptr = libuxre_reg1tree(tp->op, 0)) == 0)
|
|
return 0;
|
|
ptr->parent = tp;
|
|
ptr->left.pos = ep->re_dfa->nposn++;
|
|
tp->op = ROP_OR;
|
|
tp->left.ptr = ptr;
|
|
ptr = libuxre_reg1tree(wc, 0);
|
|
if ((tp->right.ptr = ptr) == 0)
|
|
return 0;
|
|
ptr->parent = tp;
|
|
ptr->left.pos = ep->re_dfa->nposn++;
|
|
return tp;
|
|
}
|
|
/*FALLTHROUGH*/
|
|
case ROP_BOL:
|
|
case ROP_EOL:
|
|
case ROP_ALL:
|
|
case ROP_ANYCH:
|
|
case ROP_NOTNL:
|
|
case ROP_NONE:
|
|
case ROP_BKT:
|
|
case ROP_BKTCOPY:
|
|
case ROP_END:
|
|
tp->left.pos = ep->re_dfa->nposn++;
|
|
return tp;
|
|
case ROP_EMPTY:
|
|
return tp;
|
|
case ROP_OR:
|
|
case ROP_CAT:
|
|
if ((tp->right.ptr = findposn(ep, tp->right.ptr,
|
|
mb_cur_max)) == 0)
|
|
return 0;
|
|
/*FALLTHROUGH*/
|
|
case ROP_STAR:
|
|
case ROP_PLUS:
|
|
case ROP_QUEST:
|
|
case ROP_LP:
|
|
if ((tp->left.ptr = findposn(ep, tp->left.ptr,
|
|
mb_cur_max)) == 0)
|
|
return 0;
|
|
return tp;
|
|
case ROP_BRACE:
|
|
if ((tp->left.ptr = findposn(ep, tp->left.ptr,
|
|
mb_cur_max)) == 0)
|
|
return 0;
|
|
break;
|
|
}
|
|
/*
|
|
* ROP_BRACE as is cannot be handled in a DFA. This code
|
|
* duplicates the ROP_BRACE subtree as a left-towering
|
|
* series of ROP_CAT nodes, the first "lo" of which are
|
|
* direct copies of the original subtree. The tail of
|
|
* the series are either some number of ROP_QUESTs over
|
|
* copies of the original subtree, or a single ROP_PLUS
|
|
* over a copy (when "hi" is infinity).
|
|
*
|
|
* All interesting cases {lo,hi}:
|
|
* {0,0} -> ROP_EMPTY, parsing, temporary
|
|
* {0,1} -> ROP_QUEST, parsing
|
|
* {0,2} -> CAT(QUEST(left), QUEST(copy))
|
|
* {0,n} -> CAT({0,n-1}, QUEST(copy))
|
|
* {0,} -> ROP_STAR, parsing
|
|
*
|
|
* {1,1} -> ROP_NOP, parsing, temporary
|
|
* {1,2} -> CAT(left, QUEST(copy))
|
|
* {1,n} -> CAT({1,n-1}, QUEST(copy))
|
|
* {1,} -> ROP_PLUS, parsing
|
|
*
|
|
* {2,2} -> CAT(left, copy)
|
|
* {2,n} -> CAT({2,n-1}, QUEST(copy))
|
|
* {2,} -> CAT(left, PLUS(copy))
|
|
*
|
|
* {3,3} -> CAT({2,2}, copy)
|
|
* {3,n} -> CAT({3,n-1}, QUEST(copy))
|
|
* {3,} -> CAT({2,2}, PLUS(copy))
|
|
*
|
|
* {n,} -> CAT({n-1,n-1}, PLUS(copy))
|
|
*
|
|
* In all cases, the ROP_BRACE node is turned into the
|
|
* left-most ROP_CAT, and a copy of its original subtree
|
|
* is connected as the right child. Note that the bottom-
|
|
* up nature of this duplication guarantees that copy()
|
|
* never sees a ROP_BRACE node.
|
|
*/
|
|
par = tp->parent;
|
|
lo = tp->right.info.num[0];
|
|
hi = tp->right.info.num[1];
|
|
if ((ptr = copy(ep, tp->left.ptr)) == 0)
|
|
return 0;
|
|
ptr->parent = tp;
|
|
tp->op = ROP_CAT;
|
|
tp->right.ptr = ptr;
|
|
if (lo == 0)
|
|
{
|
|
if ((tp->left.ptr = libuxre_reg1tree(ROP_QUEST, tp->left.ptr))
|
|
== 0)
|
|
return 0;
|
|
tp->left.ptr->parent = tp;
|
|
}
|
|
else
|
|
{
|
|
if (hi == BRACE_INF || (hi -= lo) == 0)
|
|
lo--; /* lo > 1; no extra needed */
|
|
while (--lo != 0)
|
|
{
|
|
if ((tp = libuxre_reg2tree(ROP_CAT, tp, copy(ep, ptr)))
|
|
== 0)
|
|
return 0;
|
|
}
|
|
}
|
|
if (hi == BRACE_INF)
|
|
{
|
|
if ((tp->right.ptr = libuxre_reg1tree(ROP_PLUS, tp->right.ptr))
|
|
== 0)
|
|
return 0;
|
|
tp->right.ptr->parent = tp;
|
|
}
|
|
else if (hi != 0)
|
|
{
|
|
if ((tp->right.ptr = libuxre_reg1tree(ROP_QUEST, tp->right.ptr))
|
|
== 0)
|
|
return 0;
|
|
ptr = tp->right.ptr;
|
|
ptr->parent = tp;
|
|
while (--hi != 0)
|
|
{
|
|
if ((tp = libuxre_reg2tree(ROP_CAT, tp, copy(ep, ptr)))
|
|
== 0)
|
|
return 0;
|
|
}
|
|
}
|
|
tp->parent = par;
|
|
return tp;
|
|
}
|
|
|
|
/*
|
|
* Postorder traversal, but not always entire subtree.
|
|
* For each leaf reachable by the empty string, add it
|
|
* to the set. Return 0 if the subtree can match empty.
|
|
*/
|
|
static int
|
|
first(Dfa *dp, Tree *tp)
|
|
{
|
|
switch (tp->op)
|
|
{
|
|
case ROP_BOL:
|
|
if (dp->flags & REG_NOTBOL)
|
|
return 0;
|
|
break;
|
|
case ROP_EOL:
|
|
if (dp->flags & REG_NOTEOL)
|
|
return 0;
|
|
break;
|
|
case ROP_EMPTY:
|
|
return 0;
|
|
case ROP_OR:
|
|
return first(dp, tp->left.ptr) & first(dp, tp->right.ptr);
|
|
case ROP_CAT:
|
|
if (first(dp, tp->left.ptr) != 0)
|
|
return 1;
|
|
return first(dp, tp->right.ptr);
|
|
case ROP_BRACE:
|
|
if (tp->right.info.num[0] != 0 && first(dp, tp->left.ptr) != 0)
|
|
return 1;
|
|
/*FALLTHROUGH*/
|
|
case ROP_STAR:
|
|
case ROP_QUEST:
|
|
first(dp, tp->left.ptr);
|
|
return 0;
|
|
case ROP_LP:
|
|
case ROP_PLUS:
|
|
return first(dp, tp->left.ptr);
|
|
}
|
|
if (dp->posset[tp->left.pos] == 0)
|
|
{
|
|
dp->posset[tp->left.pos] = 1;
|
|
dp->nset++;
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
/*
|
|
* Walk from leaf up (most likely not to root).
|
|
* Determine follow set for the leaf by filling
|
|
* set[] with the positions reachable.
|
|
*/
|
|
static void
|
|
follow(Dfa *dp, Tree *tp)
|
|
{
|
|
Tree *pp;
|
|
|
|
switch ((pp = tp->parent)->op)
|
|
{
|
|
case ROP_CAT:
|
|
if (pp->left.ptr == tp && first(dp, pp->right.ptr) != 0)
|
|
break;
|
|
/*FALLTHROUGH*/
|
|
case ROP_OR:
|
|
case ROP_QUEST:
|
|
case ROP_LP:
|
|
follow(dp, pp);
|
|
break;
|
|
case ROP_STAR:
|
|
case ROP_PLUS:
|
|
case ROP_BRACE:
|
|
first(dp, tp);
|
|
follow(dp, pp);
|
|
break;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Postorder traversal.
|
|
* At each leaf, copy it into posn[] and assign its follow set.
|
|
* Because the left-most subtree is ROP_ALL under ROP_STAR, the
|
|
* follow set for its leaf (position dp->nposn-1) is the same
|
|
* as the initial state's signature (prior to any ROP_BOL).
|
|
*/
|
|
static int
|
|
posnfoll(Dfa *dp, Tree *tp)
|
|
{
|
|
unsigned char *s;
|
|
size_t i, n;
|
|
size_t *fp;
|
|
Posn *p;
|
|
int ret;
|
|
|
|
switch (tp->op)
|
|
{
|
|
case ROP_OR:
|
|
case ROP_CAT:
|
|
if ((ret = posnfoll(dp, tp->right.ptr)) != 0)
|
|
return ret;
|
|
/*FALLTHROUGH*/
|
|
case ROP_STAR:
|
|
case ROP_PLUS:
|
|
case ROP_QUEST:
|
|
case ROP_LP:
|
|
if ((ret = posnfoll(dp, tp->left.ptr)) != 0)
|
|
return ret;
|
|
return 0;
|
|
case ROP_END: /* keeps follow() from walking above the root */
|
|
p = &dp->posn[tp->left.pos];
|
|
p->op = tp->op;
|
|
p->seti = 0;
|
|
p->nset = 0;
|
|
return 0;
|
|
case ROP_BKT:
|
|
case ROP_BKTCOPY:
|
|
p = &dp->posn[tp->left.pos];
|
|
p->bkt = tp->right.info.bkt;
|
|
goto skip;
|
|
case ROP_BOL:
|
|
dp->flags |= REG_NOTBOL; /* adjacent ROP_BOLs match empty */
|
|
break;
|
|
case ROP_EOL:
|
|
dp->flags |= REG_NOTEOL; /* adjacent ROP_EOLs match empty */
|
|
break;
|
|
}
|
|
p = &dp->posn[tp->left.pos];
|
|
skip:;
|
|
p->op = tp->op;
|
|
memset(dp->posset, 0, dp->nposn);
|
|
dp->nset = 0;
|
|
follow(dp, tp);
|
|
dp->flags &= ~(REG_NOTBOL | REG_NOTEOL);
|
|
fp = dp->posfoll;
|
|
if ((p->nset = dp->nset) > dp->avail) /* need more */
|
|
{
|
|
if ((n = p->nset << 1) < dp->nposn)
|
|
n = dp->nposn;
|
|
dp->avail += n;
|
|
if ((fp = realloc(dp->posfoll,
|
|
sizeof(size_t) * (dp->avail + dp->used))) == 0)
|
|
{
|
|
return REG_ESPACE;
|
|
}
|
|
dp->posfoll = fp;
|
|
}
|
|
p->seti = dp->used;
|
|
if ((i = dp->nset) != 0)
|
|
{
|
|
dp->used += i;
|
|
dp->avail -= i;
|
|
fp += p->seti;
|
|
s = dp->posset;
|
|
n = 0;
|
|
do
|
|
{
|
|
if (*s++ != 0)
|
|
{
|
|
*fp++ = n;
|
|
if (--i == 0)
|
|
break;
|
|
}
|
|
} while (++n != dp->nposn);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
static int
|
|
addstate(Dfa *dp) /* install state if unique; return its index */
|
|
{
|
|
size_t *sp, *fp;
|
|
size_t t, n, i;
|
|
int flushed;
|
|
|
|
/*
|
|
* Compare dp->nset/dp->cursig[] against remembered states.
|
|
*/
|
|
t = dp->top;
|
|
do
|
|
{
|
|
if (dp->nsig[--t] != dp->nset)
|
|
continue;
|
|
if ((n = dp->nset) != 0)
|
|
{
|
|
fp = &dp->sigfoll[dp->sigi[t]];
|
|
sp = &dp->cursig[0];
|
|
loop:;
|
|
if (*fp++ != *sp++)
|
|
continue; /* to the do-while */
|
|
if (--n != 0)
|
|
goto loop;
|
|
}
|
|
return t + 1;
|
|
} while (t != 0);
|
|
/*
|
|
* Not in currently cached states; add it.
|
|
*/
|
|
flushed = 0;
|
|
if ((t = dp->top) >= CACHESZ) /* need to flush the cache */
|
|
{
|
|
flushed = 1;
|
|
n = dp->anybol;
|
|
n = dp->sigi[n] + dp->nsig[n]; /* past invariant states */
|
|
dp->avail += dp->used - n;
|
|
dp->used = n;
|
|
dp->top = n = dp->nfix;
|
|
memset((void *)&dp->trans, 0, sizeof(dp->trans));
|
|
memset((void *)&dp->acc[n], 0, CACHESZ - n);
|
|
t = n;
|
|
}
|
|
dp->top++;
|
|
fp = dp->sigfoll;
|
|
if ((n = dp->nset) > dp->avail) /* grow strip */
|
|
{
|
|
i = dp->avail + n << 1;
|
|
if ((fp = realloc(fp, sizeof(size_t) * (i + dp->used))) == 0)
|
|
return 0;
|
|
dp->avail = i;
|
|
dp->sigfoll = fp;
|
|
}
|
|
dp->acc[t] = 0;
|
|
if ((dp->nsig[t] = n) != 0)
|
|
{
|
|
sp = dp->cursig;
|
|
if (sp[0] == 0)
|
|
dp->acc[t] = 1;
|
|
dp->sigi[t] = i = dp->used;
|
|
dp->used += n;
|
|
dp->avail -= n;
|
|
fp += i;
|
|
do
|
|
*fp++ = *sp++;
|
|
while (--n != 0);
|
|
}
|
|
t++;
|
|
if (flushed)
|
|
return -t;
|
|
return t;
|
|
}
|
|
|
|
void
|
|
libuxre_regdeldfa(Dfa *dp)
|
|
{
|
|
Posn *pp;
|
|
size_t np;
|
|
|
|
if (dp->posfoll != 0)
|
|
free(dp->posfoll);
|
|
if (dp->sigfoll != 0)
|
|
free(dp->sigfoll);
|
|
if (dp->cursig != 0)
|
|
free(dp->cursig);
|
|
if ((pp = dp->posn) != 0)
|
|
{
|
|
/*
|
|
* Need to walk the positions list to free any
|
|
* space used for ROP_BKTs.
|
|
*/
|
|
np = dp->nposn;
|
|
do
|
|
{
|
|
if (pp->op == ROP_BKT)
|
|
{
|
|
libuxre_bktfree(pp->bkt);
|
|
free(pp->bkt);
|
|
}
|
|
} while (++pp, --np != 0);
|
|
free(dp->posn);
|
|
}
|
|
free(dp);
|
|
}
|
|
|
|
int
|
|
regtrans(Dfa *dp, int st, w_type wc, int mb_cur_max)
|
|
{
|
|
const unsigned char *s;
|
|
size_t *fp, *sp;
|
|
size_t i, n;
|
|
Posn *pp;
|
|
int nst;
|
|
|
|
if ((n = dp->nsig[st]) == 0) /* dead state */
|
|
return st + 1; /* stay here */
|
|
memset(dp->posset, 0, dp->nposn);
|
|
dp->nset = 0;
|
|
fp = &dp->sigfoll[dp->sigi[st]];
|
|
do
|
|
{
|
|
pp = &dp->posn[*fp];
|
|
switch (pp->op)
|
|
{
|
|
case ROP_EOL:
|
|
if (wc == '\0' && (dp->flags & REG_NOTEOL) == 0)
|
|
break;
|
|
/*FALLTHROUGH*/
|
|
case ROP_BOL:
|
|
default:
|
|
if (pp->op == wc)
|
|
break;
|
|
/*FALLTHROUGH*/
|
|
case ROP_END:
|
|
case ROP_NONE:
|
|
continue;
|
|
case ROP_NOTNL:
|
|
if (wc == '\n')
|
|
continue;
|
|
/*FALLTHROUGH*/
|
|
case ROP_ANYCH:
|
|
if (wc <= '\0')
|
|
continue;
|
|
break;
|
|
case ROP_ALL:
|
|
if (wc == '\0')
|
|
continue;
|
|
break;
|
|
case ROP_BKT:
|
|
case ROP_BKTCOPY:
|
|
/*
|
|
* Note that multiple character bracket matches
|
|
* are precluded from DFAs. (See regparse.c and
|
|
* regcomp.c.) Thus, the continuation string
|
|
* argument is not used in libuxre_bktmbexec().
|
|
*/
|
|
if (wc > '\0' &&
|
|
libuxre_bktmbexec(pp->bkt, wc, 0, mb_cur_max) == 0)
|
|
break;
|
|
continue;
|
|
}
|
|
/*
|
|
* Current character matches this position.
|
|
* For each position in its follow list,
|
|
* add that position to the new state's signature.
|
|
*/
|
|
i = pp->nset;
|
|
sp = &dp->posfoll[pp->seti];
|
|
do
|
|
{
|
|
if (dp->posset[*sp] == 0)
|
|
{
|
|
dp->posset[*sp] = 1;
|
|
dp->nset++;
|
|
}
|
|
} while (++sp, --i != 0);
|
|
} while (++fp, --n != 0);
|
|
/*
|
|
* Move the signature (if any) into cursig[] and install it.
|
|
*/
|
|
if ((i = dp->nset) != 0)
|
|
{
|
|
fp = dp->cursig;
|
|
s = dp->posset;
|
|
for (n = 0;; n++)
|
|
{
|
|
if (*s++ != 0)
|
|
{
|
|
*fp++ = n;
|
|
if (--i == 0)
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
if ((nst = addstate(dp)) < 0) /* flushed cache */
|
|
nst = -nst;
|
|
else if (nst > 0 && (wc & ~(long)(NCHAR - 1)) == 0)
|
|
dp->trans[st][wc] = nst;
|
|
return nst;
|
|
}
|
|
|
|
LIBUXRE_STATIC int
|
|
libuxre_regdfacomp(regex_t *ep, Tree *tp, Lex *lxp)
|
|
{
|
|
Tree *lp;
|
|
Dfa *dp;
|
|
Posn *p;
|
|
int st;
|
|
|
|
/*
|
|
* It's convenient to insert an STAR(ALL) subtree to the
|
|
* immediate left of the current tree. This makes the
|
|
* "any match" libuxre_regdfaexec() not a special case,
|
|
* and the initial state signature will fall out when
|
|
* building the follow sets for all the leaves.
|
|
*/
|
|
if ((lp = libuxre_reg1tree(ROP_ALL, 0)) == 0
|
|
|| (lp = libuxre_reg1tree(ROP_STAR, lp)) == 0
|
|
|| (tp->left.ptr = lp
|
|
= libuxre_reg2tree(ROP_CAT, lp, tp->left.ptr)) == 0)
|
|
{
|
|
return REG_ESPACE;
|
|
}
|
|
lp->parent = tp;
|
|
if ((dp = calloc(1, sizeof(Dfa))) == 0)
|
|
return REG_ESPACE;
|
|
ep->re_dfa = dp;
|
|
/*
|
|
* Just in case null pointers aren't just all bits zero...
|
|
*/
|
|
dp->posfoll = 0;
|
|
dp->sigfoll = 0;
|
|
dp->cursig = 0;
|
|
dp->posn = 0;
|
|
/*
|
|
* Assign position values to each of the tree's leaves
|
|
* (the important parts), meanwhile potentially rewriting
|
|
* the parse tree so that it fits within the restrictions
|
|
* of our DFA.
|
|
*/
|
|
if ((tp = findposn(ep, tp, lxp->mb_cur_max)) == 0)
|
|
goto err;
|
|
/*
|
|
* Get space for the array of positions and current set,
|
|
* now that the number of positions is known.
|
|
*/
|
|
if ((dp->posn = malloc(sizeof(Posn) * dp->nposn + dp->nposn)) == 0)
|
|
goto err;
|
|
dp->posset = (unsigned char *)&dp->posn[dp->nposn];
|
|
/*
|
|
* Get follow sets for each position.
|
|
*/
|
|
if (posnfoll(dp, tp) != 0)
|
|
goto err;
|
|
/*
|
|
* Set up the special invariant states:
|
|
* - dead state (no valid transitions); index 0.
|
|
* - initial state for any match [STAR(ALL) follow set]; index 1.
|
|
* - initial state for any match after ROP_BOL.
|
|
* - initial state for left-most longest if REG_NOTBOL.
|
|
* - initial state for left-most longest after ROP_BOL.
|
|
* The final two are not allocated if leftmost() cannot be called.
|
|
* The pairs of initial states are the same if there is no
|
|
* explicit ROP_BOL transition.
|
|
*/
|
|
dp->avail += dp->used;
|
|
dp->used = 0;
|
|
if ((dp->sigfoll = malloc(sizeof(size_t) * dp->avail)) == 0)
|
|
goto err;
|
|
p = &dp->posn[dp->nposn - 1]; /* same as first(root) */
|
|
dp->cursig = &dp->posfoll[p->seti];
|
|
dp->nset = p->nset;
|
|
dp->top = 1; /* index 0 is dead state */
|
|
addstate(dp); /* must be state index 1 (returns 2) */
|
|
if ((dp->cursig = malloc(sizeof(size_t) * dp->nposn)) == 0)
|
|
goto err;
|
|
dp->nfix = 2;
|
|
if ((st = regtrans(dp, 1, ROP_BOL, lxp->mb_cur_max)) == 0)
|
|
goto err;
|
|
if ((dp->anybol = st - 1) == 2) /* new state */
|
|
dp->nfix = 3;
|
|
if ((ep->re_flags & REG_NOSUB) == 0) /* leftmost() might be called */
|
|
{
|
|
/*
|
|
* leftmost() initial states are the same as the
|
|
* "any match" ones without the STAR(ALL) position.
|
|
*/
|
|
dp->sigi[dp->nfix] = 0;
|
|
dp->nsig[dp->nfix] = dp->nsig[1] - 1;
|
|
dp->acc[dp->nfix] = dp->acc[1];
|
|
dp->leftbol = dp->leftmost = dp->nfix;
|
|
dp->nfix++;
|
|
if (dp->anybol != 1) /* distinct state w/BOL */
|
|
{
|
|
dp->sigi[dp->nfix] = dp->sigi[2];
|
|
dp->nsig[dp->nfix] = dp->nsig[2] - 1;
|
|
dp->acc[dp->nfix] = dp->acc[2];
|
|
dp->leftbol = dp->nfix;
|
|
dp->nfix++;
|
|
}
|
|
dp->top = dp->nfix;
|
|
}
|
|
return 0;
|
|
err:;
|
|
libuxre_regdeldfa(dp);
|
|
return REG_ESPACE;
|
|
}
|
|
|
|
static int
|
|
leftmost(Dfa *dp, Exec *xp)
|
|
{
|
|
const unsigned char *s, *beg, *end;
|
|
int i, nst, st, mb_cur_max;
|
|
w_type wc;
|
|
|
|
mb_cur_max = xp->mb_cur_max;
|
|
beg = s = xp->str;
|
|
end = 0;
|
|
st = dp->leftbol;
|
|
if (xp->flags & REG_NOTBOL)
|
|
st = dp->leftmost;
|
|
if (dp->acc[st] && (xp->flags & REG_NONEMPTY) == 0)
|
|
end = s; /* initial empty match allowed */
|
|
for (;;)
|
|
{
|
|
if ((wc = *s++) == '\n')
|
|
{
|
|
if (xp->flags & REG_NEWLINE)
|
|
wc = ROP_EOL;
|
|
}
|
|
else if (!ISONEBYTE(wc) && (i = libuxre_mb2wc(&wc, s)) > 0)
|
|
s += i;
|
|
if ((wc & ~(long)(NCHAR - 1)) != 0
|
|
|| (nst = dp->trans[st][wc]) == 0)
|
|
{
|
|
if ((nst=regtrans(dp, st, wc, mb_cur_max)) == 0)
|
|
return REG_ESPACE;
|
|
if (wc == ROP_EOL) /* REG_NEWLINE only */
|
|
{
|
|
if (dp->acc[nst - 1])
|
|
{
|
|
if (end == 0 || end < s)
|
|
end = s;
|
|
break;
|
|
}
|
|
beg = s;
|
|
st = dp->leftbol;
|
|
goto newst;
|
|
}
|
|
}
|
|
if ((st = nst - 1) == 0) /* dead state */
|
|
{
|
|
if (end != 0)
|
|
break;
|
|
if ((wc = *beg++) == '\0')
|
|
return REG_NOMATCH;
|
|
else if (!ISONEBYTE(wc) &&
|
|
(i = libuxre_mb2wc(&wc, beg)) > 0)
|
|
beg += i;
|
|
s = beg;
|
|
st = dp->leftmost;
|
|
goto newst;
|
|
}
|
|
if (wc == '\0')
|
|
{
|
|
if (dp->acc[st])
|
|
{
|
|
s--; /* don't include \0 */
|
|
if (end == 0 || end < s)
|
|
end = s;
|
|
break;
|
|
}
|
|
if (end != 0)
|
|
break;
|
|
return REG_NOMATCH;
|
|
}
|
|
newst:;
|
|
if (dp->acc[st])
|
|
{
|
|
if (end == 0 || end < s)
|
|
end = s;
|
|
}
|
|
}
|
|
xp->match[0].rm_so = beg - xp->str;
|
|
xp->match[0].rm_eo = end - xp->str;
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Optimization by simplification: singlebyte locale and REG_NEWLINE not set.
|
|
* Performance gain for grep is 25% so it's worth the hack.
|
|
*/
|
|
static int
|
|
regdfaexec_opt(Dfa *dp, Exec *xp)
|
|
{
|
|
const unsigned char *s;
|
|
int nst, st;
|
|
|
|
s = xp->str;
|
|
st = dp->anybol;
|
|
if (xp->flags & REG_NOTBOL)
|
|
st = 1;
|
|
if (dp->acc[st] && (xp->flags & REG_NONEMPTY) == 0)
|
|
return 0; /* initial empty match allowed */
|
|
do
|
|
{
|
|
if ((nst = dp->trans[st][*s]) == 0)
|
|
{
|
|
if ((nst = regtrans(dp, st, *s, 1)) == 0)
|
|
return REG_ESPACE;
|
|
}
|
|
if (dp->acc[st = nst - 1])
|
|
return 0;
|
|
} while (*s++ != '\0'); /* st != 0 */
|
|
return REG_NOMATCH;
|
|
}
|
|
|
|
LIBUXRE_STATIC int
|
|
libuxre_regdfaexec(Dfa *dp, Exec *xp)
|
|
{
|
|
const unsigned char *s;
|
|
int i, nst, st, mb_cur_max;
|
|
w_type wc;
|
|
|
|
dp->flags = xp->flags & REG_NOTEOL; /* for regtrans() */
|
|
mb_cur_max = xp->mb_cur_max;
|
|
if (xp->nmatch != 0)
|
|
return leftmost(dp, xp);
|
|
if (mb_cur_max == 1 && (xp->flags & REG_NEWLINE) == 0)
|
|
return regdfaexec_opt(dp, xp);
|
|
s = xp->str;
|
|
st = dp->anybol;
|
|
if (xp->flags & REG_NOTBOL)
|
|
st = 1;
|
|
if (dp->acc[st] && (xp->flags & REG_NONEMPTY) == 0)
|
|
return 0; /* initial empty match allowed */
|
|
for (;;)
|
|
{
|
|
if ((wc = *s++) == '\n')
|
|
{
|
|
if (xp->flags & REG_NEWLINE)
|
|
wc = ROP_EOL;
|
|
}
|
|
else if (!ISONEBYTE(wc) && (i = libuxre_mb2wc(&wc, s)) > 0)
|
|
s += i;
|
|
if ((wc & ~(long)(NCHAR - 1)) != 0
|
|
|| (nst = dp->trans[st][wc]) == 0)
|
|
{
|
|
if ((nst=regtrans(dp, st, wc, mb_cur_max)) == 0)
|
|
return REG_ESPACE;
|
|
if (wc == ROP_EOL) /* REG_NEWLINE only */
|
|
{
|
|
if (dp->acc[nst - 1])
|
|
return 0;
|
|
if (dp->acc[st = dp->anybol])
|
|
return 0;
|
|
continue;
|
|
}
|
|
}
|
|
if (dp->acc[st = nst - 1])
|
|
return 0;
|
|
if (wc == '\0') /* st == 0 */
|
|
return REG_NOMATCH;
|
|
}
|
|
}
|