/* * Changes by Gunnar Ritter, Freiburg i. Br., Germany, November 2002. * * Sccsid @(#)regdfa.c 1.9 (gritter) 9/22/03 */ /* UNIX(R) Regular Expresssion Library * * Note: Code is released under the GNU LGPL * * Copyright (C) 2001 Caldera International, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to: * Free Software Foundation, Inc. * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* #include "synonyms.h" */ #include #include #include #include "regdfa.h" /* * Deterministic Finite Automata. */ /* * Postorder traversal that returns a copy of the subtree, * except that ROP_BKT becomes ROP_BKTCOPY (since they * share the same pointed to Bracket object). */ static Tree * copy(regex_t *ep, Tree *tp) { Tree *np; if ((np = malloc(sizeof(Tree))) == 0) return 0; switch (np->op = tp->op) /* almost always correct */ { case ROP_BKT: np->op = ROP_BKTCOPY; /*FALLTHROUGH*/ case ROP_BKTCOPY: np->right.info.bkt = tp->right.info.bkt; /*FALLTHROUGH*/ default: np->left.pos = ep->re_dfa->nposn++; /*FALLTHROUGH*/ case ROP_EMPTY: return np; case ROP_CAT: case ROP_OR: if ((np->right.ptr = copy(ep, tp->right.ptr)) == 0) { free(np); return 0; } np->right.ptr->parent = np; /*FALLTHROUGH*/ case ROP_STAR: case ROP_PLUS: case ROP_QUEST: case ROP_LP: if ((np->left.ptr = copy(ep, tp->left.ptr)) == 0) break; np->left.ptr->parent = np; return np; } libuxre_regdeltree(np, 1); return 0; } /* * Postorder traversal. * Assign unique ascending integer values to the leaves. * Since the right child is traversed before the left, * the position for ROP_END is guaranteed to be zero. * The parse tree is rewritten in two cases: * - Each ROP_BRACE is replaced by an equivalent--sometimes * large--subtree using only ROP_CAT, ROP_QUEST, and * ROP_PLUS. * - If REG_ICASE, replace each simple character that has * an uppercase equivalent with a ROP_OR subtree over the * two versions. * Since these rewrites occur bottom up, they have already * been applied before any subtrees passed to copy(). */ static Tree * findposn(regex_t *ep, Tree *tp, int mb_cur_max) { unsigned int lo, hi; Tree *ptr, *par; w_type wc; switch (tp->op) { default: if (ep->re_flags & REG_ICASE && (wc = to_upper(tp->op)) != tp->op) { if ((ptr = libuxre_reg1tree(tp->op, 0)) == 0) return 0; ptr->parent = tp; ptr->left.pos = ep->re_dfa->nposn++; tp->op = ROP_OR; tp->left.ptr = ptr; ptr = libuxre_reg1tree(wc, 0); if ((tp->right.ptr = ptr) == 0) return 0; ptr->parent = tp; ptr->left.pos = ep->re_dfa->nposn++; return tp; } /*FALLTHROUGH*/ case ROP_BOL: case ROP_EOL: case ROP_ALL: case ROP_ANYCH: case ROP_NOTNL: case ROP_NONE: case ROP_BKT: case ROP_BKTCOPY: case ROP_END: tp->left.pos = ep->re_dfa->nposn++; return tp; case ROP_EMPTY: return tp; case ROP_OR: case ROP_CAT: if ((tp->right.ptr = findposn(ep, tp->right.ptr, mb_cur_max)) == 0) return 0; /*FALLTHROUGH*/ case ROP_STAR: case ROP_PLUS: case ROP_QUEST: case ROP_LP: if ((tp->left.ptr = findposn(ep, tp->left.ptr, mb_cur_max)) == 0) return 0; return tp; case ROP_BRACE: if ((tp->left.ptr = findposn(ep, tp->left.ptr, mb_cur_max)) == 0) return 0; break; } /* * ROP_BRACE as is cannot be handled in a DFA. This code * duplicates the ROP_BRACE subtree as a left-towering * series of ROP_CAT nodes, the first "lo" of which are * direct copies of the original subtree. The tail of * the series are either some number of ROP_QUESTs over * copies of the original subtree, or a single ROP_PLUS * over a copy (when "hi" is infinity). * * All interesting cases {lo,hi}: * {0,0} -> ROP_EMPTY, parsing, temporary * {0,1} -> ROP_QUEST, parsing * {0,2} -> CAT(QUEST(left), QUEST(copy)) * {0,n} -> CAT({0,n-1}, QUEST(copy)) * {0,} -> ROP_STAR, parsing * * {1,1} -> ROP_NOP, parsing, temporary * {1,2} -> CAT(left, QUEST(copy)) * {1,n} -> CAT({1,n-1}, QUEST(copy)) * {1,} -> ROP_PLUS, parsing * * {2,2} -> CAT(left, copy) * {2,n} -> CAT({2,n-1}, QUEST(copy)) * {2,} -> CAT(left, PLUS(copy)) * * {3,3} -> CAT({2,2}, copy) * {3,n} -> CAT({3,n-1}, QUEST(copy)) * {3,} -> CAT({2,2}, PLUS(copy)) * * {n,} -> CAT({n-1,n-1}, PLUS(copy)) * * In all cases, the ROP_BRACE node is turned into the * left-most ROP_CAT, and a copy of its original subtree * is connected as the right child. Note that the bottom- * up nature of this duplication guarantees that copy() * never sees a ROP_BRACE node. */ par = tp->parent; lo = tp->right.info.num[0]; hi = tp->right.info.num[1]; if ((ptr = copy(ep, tp->left.ptr)) == 0) return 0; ptr->parent = tp; tp->op = ROP_CAT; tp->right.ptr = ptr; if (lo == 0) { if ((tp->left.ptr = libuxre_reg1tree(ROP_QUEST, tp->left.ptr)) == 0) return 0; tp->left.ptr->parent = tp; } else { if (hi == BRACE_INF || (hi -= lo) == 0) lo--; /* lo > 1; no extra needed */ while (--lo != 0) { if ((tp = libuxre_reg2tree(ROP_CAT, tp, copy(ep, ptr))) == 0) return 0; } } if (hi == BRACE_INF) { if ((tp->right.ptr = libuxre_reg1tree(ROP_PLUS, tp->right.ptr)) == 0) return 0; tp->right.ptr->parent = tp; } else if (hi != 0) { if ((tp->right.ptr = libuxre_reg1tree(ROP_QUEST, tp->right.ptr)) == 0) return 0; ptr = tp->right.ptr; ptr->parent = tp; while (--hi != 0) { if ((tp = libuxre_reg2tree(ROP_CAT, tp, copy(ep, ptr))) == 0) return 0; } } tp->parent = par; return tp; } /* * Postorder traversal, but not always entire subtree. * For each leaf reachable by the empty string, add it * to the set. Return 0 if the subtree can match empty. */ static int first(Dfa *dp, Tree *tp) { switch (tp->op) { case ROP_BOL: if (dp->flags & REG_NOTBOL) return 0; break; case ROP_EOL: if (dp->flags & REG_NOTEOL) return 0; break; case ROP_EMPTY: return 0; case ROP_OR: return first(dp, tp->left.ptr) & first(dp, tp->right.ptr); case ROP_CAT: if (first(dp, tp->left.ptr) != 0) return 1; return first(dp, tp->right.ptr); case ROP_BRACE: if (tp->right.info.num[0] != 0 && first(dp, tp->left.ptr) != 0) return 1; /*FALLTHROUGH*/ case ROP_STAR: case ROP_QUEST: first(dp, tp->left.ptr); return 0; case ROP_LP: case ROP_PLUS: return first(dp, tp->left.ptr); } if (dp->posset[tp->left.pos] == 0) { dp->posset[tp->left.pos] = 1; dp->nset++; } return 1; } /* * Walk from leaf up (most likely not to root). * Determine follow set for the leaf by filling * set[] with the positions reachable. */ static void follow(Dfa *dp, Tree *tp) { Tree *pp; switch ((pp = tp->parent)->op) { case ROP_CAT: if (pp->left.ptr == tp && first(dp, pp->right.ptr) != 0) break; /*FALLTHROUGH*/ case ROP_OR: case ROP_QUEST: case ROP_LP: follow(dp, pp); break; case ROP_STAR: case ROP_PLUS: case ROP_BRACE: first(dp, tp); follow(dp, pp); break; } } /* * Postorder traversal. * At each leaf, copy it into posn[] and assign its follow set. * Because the left-most subtree is ROP_ALL under ROP_STAR, the * follow set for its leaf (position dp->nposn-1) is the same * as the initial state's signature (prior to any ROP_BOL). */ static int posnfoll(Dfa *dp, Tree *tp) { unsigned char *s; size_t i, n; size_t *fp; Posn *p; int ret; switch (tp->op) { case ROP_OR: case ROP_CAT: if ((ret = posnfoll(dp, tp->right.ptr)) != 0) return ret; /*FALLTHROUGH*/ case ROP_STAR: case ROP_PLUS: case ROP_QUEST: case ROP_LP: if ((ret = posnfoll(dp, tp->left.ptr)) != 0) return ret; return 0; case ROP_END: /* keeps follow() from walking above the root */ p = &dp->posn[tp->left.pos]; p->op = tp->op; p->seti = 0; p->nset = 0; return 0; case ROP_BKT: case ROP_BKTCOPY: p = &dp->posn[tp->left.pos]; p->bkt = tp->right.info.bkt; goto skip; case ROP_BOL: dp->flags |= REG_NOTBOL; /* adjacent ROP_BOLs match empty */ break; case ROP_EOL: dp->flags |= REG_NOTEOL; /* adjacent ROP_EOLs match empty */ break; } p = &dp->posn[tp->left.pos]; skip:; p->op = tp->op; memset(dp->posset, 0, dp->nposn); dp->nset = 0; follow(dp, tp); dp->flags &= ~(REG_NOTBOL | REG_NOTEOL); fp = dp->posfoll; if ((p->nset = dp->nset) > dp->avail) /* need more */ { if ((n = p->nset << 1) < dp->nposn) n = dp->nposn; dp->avail += n; if ((fp = realloc(dp->posfoll, sizeof(size_t) * (dp->avail + dp->used))) == 0) { return REG_ESPACE; } dp->posfoll = fp; } p->seti = dp->used; if ((i = dp->nset) != 0) { dp->used += i; dp->avail -= i; fp += p->seti; s = dp->posset; n = 0; do { if (*s++ != 0) { *fp++ = n; if (--i == 0) break; } } while (++n != dp->nposn); } return 0; } static int addstate(Dfa *dp) /* install state if unique; return its index */ { size_t *sp, *fp; size_t t, n, i; int flushed; /* * Compare dp->nset/dp->cursig[] against remembered states. */ t = dp->top; do { if (dp->nsig[--t] != dp->nset) continue; if ((n = dp->nset) != 0) { fp = &dp->sigfoll[dp->sigi[t]]; sp = &dp->cursig[0]; loop:; if (*fp++ != *sp++) continue; /* to the do-while */ if (--n != 0) goto loop; } return t + 1; } while (t != 0); /* * Not in currently cached states; add it. */ flushed = 0; if ((t = dp->top) >= CACHESZ) /* need to flush the cache */ { flushed = 1; n = dp->anybol; n = dp->sigi[n] + dp->nsig[n]; /* past invariant states */ dp->avail += dp->used - n; dp->used = n; dp->top = n = dp->nfix; memset((void *)&dp->trans, 0, sizeof(dp->trans)); memset((void *)&dp->acc[n], 0, CACHESZ - n); t = n; } dp->top++; fp = dp->sigfoll; if ((n = dp->nset) > dp->avail) /* grow strip */ { i = dp->avail + n << 1; if ((fp = realloc(fp, sizeof(size_t) * (i + dp->used))) == 0) return 0; dp->avail = i; dp->sigfoll = fp; } dp->acc[t] = 0; if ((dp->nsig[t] = n) != 0) { sp = dp->cursig; if (sp[0] == 0) dp->acc[t] = 1; dp->sigi[t] = i = dp->used; dp->used += n; dp->avail -= n; fp += i; do *fp++ = *sp++; while (--n != 0); } t++; if (flushed) return -t; return t; } void libuxre_regdeldfa(Dfa *dp) { Posn *pp; size_t np; if (dp->posfoll != 0) free(dp->posfoll); if (dp->sigfoll != 0) free(dp->sigfoll); if (dp->cursig != 0) free(dp->cursig); if ((pp = dp->posn) != 0) { /* * Need to walk the positions list to free any * space used for ROP_BKTs. */ np = dp->nposn; do { if (pp->op == ROP_BKT) { libuxre_bktfree(pp->bkt); free(pp->bkt); } } while (++pp, --np != 0); free(dp->posn); } free(dp); } int regtrans(Dfa *dp, int st, w_type wc, int mb_cur_max) { const unsigned char *s; size_t *fp, *sp; size_t i, n; Posn *pp; int nst; if ((n = dp->nsig[st]) == 0) /* dead state */ return st + 1; /* stay here */ memset(dp->posset, 0, dp->nposn); dp->nset = 0; fp = &dp->sigfoll[dp->sigi[st]]; do { pp = &dp->posn[*fp]; switch (pp->op) { case ROP_EOL: if (wc == '\0' && (dp->flags & REG_NOTEOL) == 0) break; /*FALLTHROUGH*/ case ROP_BOL: default: if (pp->op == wc) break; /*FALLTHROUGH*/ case ROP_END: case ROP_NONE: continue; case ROP_NOTNL: if (wc == '\n') continue; /*FALLTHROUGH*/ case ROP_ANYCH: if (wc <= '\0') continue; break; case ROP_ALL: if (wc == '\0') continue; break; case ROP_BKT: case ROP_BKTCOPY: /* * Note that multiple character bracket matches * are precluded from DFAs. (See regparse.c and * regcomp.c.) Thus, the continuation string * argument is not used in libuxre_bktmbexec(). */ if (wc > '\0' && libuxre_bktmbexec(pp->bkt, wc, 0, mb_cur_max) == 0) break; continue; } /* * Current character matches this position. * For each position in its follow list, * add that position to the new state's signature. */ i = pp->nset; sp = &dp->posfoll[pp->seti]; do { if (dp->posset[*sp] == 0) { dp->posset[*sp] = 1; dp->nset++; } } while (++sp, --i != 0); } while (++fp, --n != 0); /* * Move the signature (if any) into cursig[] and install it. */ if ((i = dp->nset) != 0) { fp = dp->cursig; s = dp->posset; for (n = 0;; n++) { if (*s++ != 0) { *fp++ = n; if (--i == 0) break; } } } if ((nst = addstate(dp)) < 0) /* flushed cache */ nst = -nst; else if (nst > 0 && (wc & ~(long)(NCHAR - 1)) == 0) dp->trans[st][wc] = nst; return nst; } LIBUXRE_STATIC int libuxre_regdfacomp(regex_t *ep, Tree *tp, Lex *lxp) { Tree *lp; Dfa *dp; Posn *p; int st; /* * It's convenient to insert an STAR(ALL) subtree to the * immediate left of the current tree. This makes the * "any match" libuxre_regdfaexec() not a special case, * and the initial state signature will fall out when * building the follow sets for all the leaves. */ if ((lp = libuxre_reg1tree(ROP_ALL, 0)) == 0 || (lp = libuxre_reg1tree(ROP_STAR, lp)) == 0 || (tp->left.ptr = lp = libuxre_reg2tree(ROP_CAT, lp, tp->left.ptr)) == 0) { return REG_ESPACE; } lp->parent = tp; if ((dp = calloc(1, sizeof(Dfa))) == 0) return REG_ESPACE; ep->re_dfa = dp; /* * Just in case null pointers aren't just all bits zero... */ dp->posfoll = 0; dp->sigfoll = 0; dp->cursig = 0; dp->posn = 0; /* * Assign position values to each of the tree's leaves * (the important parts), meanwhile potentially rewriting * the parse tree so that it fits within the restrictions * of our DFA. */ if ((tp = findposn(ep, tp, lxp->mb_cur_max)) == 0) goto err; /* * Get space for the array of positions and current set, * now that the number of positions is known. */ if ((dp->posn = malloc(sizeof(Posn) * dp->nposn + dp->nposn)) == 0) goto err; dp->posset = (unsigned char *)&dp->posn[dp->nposn]; /* * Get follow sets for each position. */ if (posnfoll(dp, tp) != 0) goto err; /* * Set up the special invariant states: * - dead state (no valid transitions); index 0. * - initial state for any match [STAR(ALL) follow set]; index 1. * - initial state for any match after ROP_BOL. * - initial state for left-most longest if REG_NOTBOL. * - initial state for left-most longest after ROP_BOL. * The final two are not allocated if leftmost() cannot be called. * The pairs of initial states are the same if there is no * explicit ROP_BOL transition. */ dp->avail += dp->used; dp->used = 0; if ((dp->sigfoll = malloc(sizeof(size_t) * dp->avail)) == 0) goto err; p = &dp->posn[dp->nposn - 1]; /* same as first(root) */ dp->cursig = &dp->posfoll[p->seti]; dp->nset = p->nset; dp->top = 1; /* index 0 is dead state */ addstate(dp); /* must be state index 1 (returns 2) */ if ((dp->cursig = malloc(sizeof(size_t) * dp->nposn)) == 0) goto err; dp->nfix = 2; if ((st = regtrans(dp, 1, ROP_BOL, lxp->mb_cur_max)) == 0) goto err; if ((dp->anybol = st - 1) == 2) /* new state */ dp->nfix = 3; if ((ep->re_flags & REG_NOSUB) == 0) /* leftmost() might be called */ { /* * leftmost() initial states are the same as the * "any match" ones without the STAR(ALL) position. */ dp->sigi[dp->nfix] = 0; dp->nsig[dp->nfix] = dp->nsig[1] - 1; dp->acc[dp->nfix] = dp->acc[1]; dp->leftbol = dp->leftmost = dp->nfix; dp->nfix++; if (dp->anybol != 1) /* distinct state w/BOL */ { dp->sigi[dp->nfix] = dp->sigi[2]; dp->nsig[dp->nfix] = dp->nsig[2] - 1; dp->acc[dp->nfix] = dp->acc[2]; dp->leftbol = dp->nfix; dp->nfix++; } dp->top = dp->nfix; } return 0; err:; libuxre_regdeldfa(dp); return REG_ESPACE; } static int leftmost(Dfa *dp, Exec *xp) { const unsigned char *s, *beg, *end; int i, nst, st, mb_cur_max; w_type wc; mb_cur_max = xp->mb_cur_max; beg = s = xp->str; end = 0; st = dp->leftbol; if (xp->flags & REG_NOTBOL) st = dp->leftmost; if (dp->acc[st] && (xp->flags & REG_NONEMPTY) == 0) end = s; /* initial empty match allowed */ for (;;) { if ((wc = *s++) == '\n') { if (xp->flags & REG_NEWLINE) wc = ROP_EOL; } else if (!ISONEBYTE(wc) && (i = libuxre_mb2wc(&wc, s)) > 0) s += i; if ((wc & ~(long)(NCHAR - 1)) != 0 || (nst = dp->trans[st][wc]) == 0) { if ((nst=regtrans(dp, st, wc, mb_cur_max)) == 0) return REG_ESPACE; if (wc == ROP_EOL) /* REG_NEWLINE only */ { if (dp->acc[nst - 1]) { if (end == 0 || end < s) end = s; break; } beg = s; st = dp->leftbol; goto newst; } } if ((st = nst - 1) == 0) /* dead state */ { if (end != 0) break; if ((wc = *beg++) == '\0') return REG_NOMATCH; else if (!ISONEBYTE(wc) && (i = libuxre_mb2wc(&wc, beg)) > 0) beg += i; s = beg; st = dp->leftmost; goto newst; } if (wc == '\0') { if (dp->acc[st]) { s--; /* don't include \0 */ if (end == 0 || end < s) end = s; break; } if (end != 0) break; return REG_NOMATCH; } newst:; if (dp->acc[st]) { if (end == 0 || end < s) end = s; } } xp->match[0].rm_so = beg - xp->str; xp->match[0].rm_eo = end - xp->str; return 0; } /* * Optimization by simplification: singlebyte locale and REG_NEWLINE not set. * Performance gain for grep is 25% so it's worth the hack. */ static int regdfaexec_opt(Dfa *dp, Exec *xp) { const unsigned char *s; int nst, st; s = xp->str; st = dp->anybol; if (xp->flags & REG_NOTBOL) st = 1; if (dp->acc[st] && (xp->flags & REG_NONEMPTY) == 0) return 0; /* initial empty match allowed */ do { if ((nst = dp->trans[st][*s]) == 0) { if ((nst = regtrans(dp, st, *s, 1)) == 0) return REG_ESPACE; } if (dp->acc[st = nst - 1]) return 0; } while (*s++ != '\0'); /* st != 0 */ return REG_NOMATCH; } LIBUXRE_STATIC int libuxre_regdfaexec(Dfa *dp, Exec *xp) { const unsigned char *s; int i, nst, st, mb_cur_max; w_type wc; dp->flags = xp->flags & REG_NOTEOL; /* for regtrans() */ mb_cur_max = xp->mb_cur_max; if (xp->nmatch != 0) return leftmost(dp, xp); if (mb_cur_max == 1 && (xp->flags & REG_NEWLINE) == 0) return regdfaexec_opt(dp, xp); s = xp->str; st = dp->anybol; if (xp->flags & REG_NOTBOL) st = 1; if (dp->acc[st] && (xp->flags & REG_NONEMPTY) == 0) return 0; /* initial empty match allowed */ for (;;) { if ((wc = *s++) == '\n') { if (xp->flags & REG_NEWLINE) wc = ROP_EOL; } else if (!ISONEBYTE(wc) && (i = libuxre_mb2wc(&wc, s)) > 0) s += i; if ((wc & ~(long)(NCHAR - 1)) != 0 || (nst = dp->trans[st][wc]) == 0) { if ((nst=regtrans(dp, st, wc, mb_cur_max)) == 0) return REG_ESPACE; if (wc == ROP_EOL) /* REG_NEWLINE only */ { if (dp->acc[nst - 1]) return 0; if (dp->acc[st = dp->anybol]) return 0; continue; } } if (dp->acc[st = nst - 1]) return 0; if (wc == '\0') /* st == 0 */ return REG_NOMATCH; } }