ex-vi/libuxre/bracket.c

/*
 * Changes by Gunnar Ritter, Freiburg i. Br., Germany, November 2002.
 *
 * Sccsid @(#)bracket.c	1.14 (gritter) 10/18/03
 */
/*  UNIX(R) Regular Expresssion Library
 *
 *  Note: Code is released under the GNU LGPL
 *
 *  Copyright (C) 2001 Caldera International, Inc.
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to:
 *        Free Software Foundation, Inc.
 *        59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

/*	#include "synonyms.h"	*/
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include "re.h"

/*
* Build and match the [...] part of REs.
*
* In general, each compiled bracket construct holds a set of mapped
* wide character values and a set of character classifications.
* The mapping applied (when the current LC_COLLATE is not CHF_ENCODED)
* is the "basic" weight (cep->weight[0]); otherwise the actual wide
* character is used.
*
* To support simplified range handling, this code assumes that a w_type,
* a signed integer type, can hold all valid basic weight values (as well
* as all wide character values for CHF_ENCODED locales) and that these
* are all positive.  Negative values indicate error conditions (BKT_*);
* zero (which must be the same as WGHT_IGNORE) indicates success, but
* that the item installed is not a range endpoint.
*/

static int
addwide(Bracket *bp, wchar_t ord)
{
	unsigned int nw;

	if ((nw = bp->nwide) < NWIDE)
		bp->wide[nw] = ord;
	else
	{
		if (nw % NWIDE == 0 && (bp->exwide =
			realloc(bp->exwide, nw * sizeof(wchar_t))) == 0)
		{
			return BKT_ESPACE;
		}
		nw -= NWIDE;
		bp->exwide[nw] = ord;
	}
	bp->nwide++;
	return 0;
}

#if USHRT_MAX == 65535	/* have 16 bits */
#define PLIND(n)	((n) >> 4)
#define PLBIT(n)	(1 << ((n) & 0xf))
#else
#define PLIND(n)	((n) / CHAR_BIT)
#define PLBIT(n)	(1 << ((n) % CHAR_BIT))
#endif

#define RANGE	((wchar_t)'-')	/* separates wide chars in ranges */

static int
addrange(Bracket *bp, wchar_t ord, w_type prev)
{
	int ret;

	if (prev > 0 && prev != ord) /* try for range */
	{
		if (prev > ord)
		{
			if (bp->flags & BKT_ODDRANGE)	/* prev only - done */
				return 0;
			else if ((bp->flags & BKT_BADRANGE) == 0)
				return BKT_ERANGE;
		}
		else
		{
			if (++prev <= UCHAR_MAX) /* "prev" already there */
			{
				do
				{
					bp->byte[PLIND(prev)] |= PLBIT(prev);
					if (prev == ord)
						return 0;
				} while (++prev <= UCHAR_MAX);
			}
			if ((ret = addwide(bp, prev)) != 0)
				return ret;
			if (++prev > ord)
				return 0;
			if (prev < ord && (ret = addwide(bp, RANGE)) != 0)
				return ret;
			return addwide(bp, ord);
		}
	}
	if (ord <= UCHAR_MAX)
	{
		bp->byte[PLIND(ord)] |= PLBIT(ord);
		return 0;
	}
	if (prev == ord) /* don't bother */
		return 0;
	return addwide(bp, ord);
}

static w_type
place(Bracket *bp, wchar_t wc, w_type prev, int mb_cur_max)
{
	const CollElem *cep;
	CollElem spare;
	int ret;

	if ((cep = libuxre_collelem(bp->col, &spare, wc)) != ELEM_ENCODED)
	{
		if (cep == ELEM_BADCHAR)
			return BKT_BADCHAR;
		wc = cep->weight[0];
	}
	if ((ret = addrange(bp, wc, prev)) != 0)
		return ret;
	return wc;
}

#ifndef CHARCLASS_NAME_MAX
#   define CHARCLASS_NAME_MAX	127
#endif

static w_type
chcls(Bracket *bp, const unsigned char *s, int n)
{
	char clsstr[CHARCLASS_NAME_MAX + 1];
	unsigned int nt;
	wctype_t wct;

	if (n > CHARCLASS_NAME_MAX)
		return BKT_ECTYPE;
	(void)memcpy(clsstr, s, n);
	clsstr[n] = '\0';
	if ((wct = wctype(clsstr)) == 0)
		return BKT_ECTYPE;
	if ((nt = bp->ntype) < NTYPE)
		bp->type[nt] = wct;
	else
	{
		if (nt % NTYPE == 0 && (bp->extype =
			realloc(bp->extype, nt * sizeof(wctype_t))) == 0)
		{
			return BKT_ESPACE;
		}
		nt -= NTYPE;
		bp->extype[nt] = wct;
	}
	bp->ntype++;
	return 0; /* cannot be end point of a range */
}

	/*
	* The purpose of mcce() and its Mcce structure is to locate
	* the next full collation element from "wc" and "s".  It is
	* called both at compile and execute time.  These two differ
	* primarily in that at compile time there is an exact number
	* of bytes to be consumed, while at execute time the longest
	* valid collation element is to be found.
	*
	* When BKT_ONECASE is set, MCCEs become particularly messy.
	* There is no guarantee that all possible combinations of
	* upper/lower case are defined as MCCEs.  Thus, this code
	* tries both lower- and uppercase (in that order) for each
	* character than might be part of an MCCE.
	*/

typedef struct
{
	const unsigned char	*max;	/* restriction by caller */
	const unsigned char	*aft;	/* longest successful */
	Bracket			*bp;	/* readonly */
	struct lc_collate	*col;	/* readonly */
	const CollElem		*cep;	/* entry matching longest */
	wchar_t			ch;	/* initial character (if any) */
	w_type			wc;	/* character matching "aft" */
} Mcce;

static int
mcce(Mcce *mcp, const CollElem *cep, const unsigned char *s, int mb_cur_max,
		int compile_time)
{
	const CollElem *nxt;
	CollElem spare;
	w_type ch, wc;
	int i;

	/*
	* Get next character.
	*/
	if ((wc = mcp->ch) != '\0')
	{
		mcp->ch = '\0';
	}
	else if (ISONEBYTE(wc = *s++))
	{
		if (wc == '\0')
			return 0;
	}
	else if ((i = libuxre_mb2wc(&wc, s)) > 0)
	{
		s += i;
		if (mcp->max != 0 && s > mcp->max)
			return 0;
	}
	else if (i < 0)
		return BKT_ILLSEQ;
	/*
	* Try out the this character as part of an MCCE.
	* If BKT_ONECASE is set, this code tries both the lower- and
	* uppercase version, continuing if it matches so far.
	*/
	ch = wc;
	if (mcp->bp->flags & BKT_ONECASE)
	{
		if ((wc = to_lower(wc)) == ch)
			ch = to_upper(wc);
	}
	for (;;) /* at most twice */
	{
		if (cep == ELEM_BADCHAR) /* first character */
		{
			if ((nxt = libuxre_collelem(mcp->col, &spare, wc))
				== ELEM_ENCODED
				|| (mcp->col->flags & CHF_MULTICH) == 0
				|| s == mcp->max)
			{
				mcp->aft = s;
				mcp->cep = nxt;
				mcp->wc = wc;
				break;
			}
		}
		else
		{
			nxt = libuxre_collmult(mcp->col, cep, wc);
		}
		if (nxt != ELEM_BADCHAR)
		{
			/*
			* Okay so far.  Record this collating element
			* if it's really one (not WGHT_IGNORE) and
			* we've reached a new high point or it's the
			* first match.
			*
			* If there's a possibility for more, call mcce()
			* recursively for the subsequent characters.
			*/
			if (nxt->weight[0] != WGHT_IGNORE
				&& (mcp->aft < s || mcp->cep == ELEM_BADCHAR))
			{
				mcp->aft = s;
				mcp->cep = nxt;
				mcp->wc = wc;
			}
			if (nxt->multbeg != 0
				&& (mcp->max == 0 || s < mcp->max))
			{
				if ((i = mcce(mcp, nxt, s, mb_cur_max,
						compile_time)) != 0)
					return i;
			}
		}
		if (wc == ch)
			break;
		wc = ch;
	}
	return 0;
}

static w_type
eqcls(Bracket *bp, const unsigned char *s, int n, w_type prev, int mb_cur_max)
{
	w_type last;
	Mcce mcbuf;
	int err;

	mcbuf.max = &s[n];
	mcbuf.aft = &s[0];
	mcbuf.bp = bp;
	mcbuf.col = bp->col;
	mcbuf.cep = ELEM_BADCHAR;
	mcbuf.ch = '\0';
	if ((err = mcce(&mcbuf, ELEM_BADCHAR, s, mb_cur_max, 1)) != 0)
		return err;
	if (mcbuf.cep == ELEM_BADCHAR || mcbuf.aft != mcbuf.max)
		return BKT_EEQUIV;
	last = mcbuf.wc;
	if (mcbuf.cep != ELEM_ENCODED && mcbuf.col->nweight > 1)
	{
		const CollElem *cep;

		/*
		* The first and last weight[0] values for equivalence
		* classes are stuffed into the terminator for the
		* multiple character lists.  If these values are
		* scattered (elements that are not part of this
		* equivalence class have weight[0] values between the
		* two end points), then SUBN_SPECIAL is placed in
		* this terminator.  Note that weight[1] of the
		* terminator must be other than WGHT_IGNORE, too.
		*/
		last = mcbuf.cep->weight[0];
		if ((cep = libuxre_collmult(bp->col, mcbuf.cep, 0))
				!= ELEM_BADCHAR
			&& cep->weight[1] != WGHT_IGNORE)
		{
			last = cep->weight[1];
			if (cep->subnbeg == SUBN_SPECIAL)
			{
				unsigned int nq;

				/*
				* Permit ranges up to the first and
				* after the last.
				*/
				if (prev > 0 && prev != cep->weight[0]
					&& (prev = addrange(bp,
						cep->weight[0], prev)) != 0)
				{
					return prev;
				}
				/*
				* Record the equivalence class by storing
				* the primary weight.
				*/
				if ((nq = bp->nquiv) < NQUIV)
					bp->quiv[nq] = mcbuf.cep->weight[1];
				else
				{
					if (nq % NQUIV == 0 && (bp->exquiv =
						realloc(bp->exquiv,
							nq * sizeof(wuchar_type)))
						== 0)
					{
						return REG_ESPACE;
					}
					nq -= NQUIV;
					bp->exquiv[nq] = mcbuf.cep->weight[1];
				}
				bp->nquiv++;
				return last;
			}
			mcbuf.cep = cep;
		}
		mcbuf.wc = mcbuf.cep->weight[0];
	}
	/*
	* Determine range, if any, to install.
	*
	* If there's a pending low (prev > 0), then try to use it.
	*
	* Otherwise, try to use mcbuf.wc as the low end of the range.
	* Since addrange() assumes that the low point has already been
	* placed, we try to fool it by using a prev of one less than
	* mcbuf.wc.  But, if that value would not look like a valid
	* low point of a range, we have to explicitly place mcbuf.wc.
	*/
	if (prev <= 0 && (prev = mcbuf.wc - 1) <= 0)
	{
		if ((prev = addrange(bp, mcbuf.wc, 0)) != 0)
			return prev;
	}
	if ((mcbuf.wc = addrange(bp, last, prev)) != 0)
		return mcbuf.wc;
	return last;
}

static w_type
clsym(Bracket *bp, const unsigned char *s, int n, w_type prev, int mb_cur_max)
{
	Mcce mcbuf;
	int err;

	mcbuf.max = &s[n];
	mcbuf.aft = &s[0];
	mcbuf.bp = bp;
	mcbuf.col = bp->col;
	mcbuf.cep = ELEM_BADCHAR;
	mcbuf.ch = '\0';
	if ((err = mcce(&mcbuf, ELEM_BADCHAR, s, mb_cur_max, 1)) != 0)
		return err;
	if (mcbuf.cep == ELEM_BADCHAR || mcbuf.aft != mcbuf.max)
		return BKT_ECOLLATE;
	if (mcbuf.cep != ELEM_ENCODED)
		mcbuf.wc = mcbuf.cep->weight[0];
	if ((err = addrange(bp, mcbuf.wc, prev)) != 0)
		return err;
	return mcbuf.wc;
}

	/*
	* Scans the rest of a bracket construction within a regular
	* expression and fills in a description for it.
	* The leading [ and the optional set complement indicator
	* were handled already by the caller.
	* Returns:
	*	<0 error (a BKT_* value)
	*	>0 success; equals how many bytes were scanned.
	*/
LIBUXRE_STATIC int
libuxre_bktmbcomp(Bracket *bp, const unsigned char *pat0,
		int flags, int mb_cur_max)
{
	static const Bracket zero = {0};
	const unsigned char *pat = pat0;
	struct lc_collate *savecol;
	w_type n, wc, prev = 0;

	/*
	* Set represented set to empty.  Easiest to copy an empty
	* version over the caller's, (re)setting col and flags.
	*/
	savecol = bp->col;
	*bp = zero;
	bp->col = savecol;
	bp->flags = flags
		& (BKT_NEGATED | BKT_ONECASE | BKT_NOTNL | BKT_BADRANGE |
				BKT_ODDRANGE);
	/*
	* Handle optional "empty" brackets; typically only used
	* in combination with BKT_QUOTE or BKT_ESCAPE.
	*/
	if ((wc = *pat) == ']' && (flags & BKT_EMPTY) != 0)
		return 1;
	/*
	* Populate *bp.
	*/
	for (;; prev = n)
	{
		switch (wc)
		{
		case '\0':
		ebrack:;
			n = BKT_EBRACK;
			goto err;
		case '\n':
			if (flags & BKT_NLBAD)
				goto ebrack;
			goto regular;
		case '/':
			if (flags & BKT_SLASHBAD)
				goto ebrack;
			goto regular;
		case '\\':
			if ((flags & (BKT_ESCAPE | BKT_QUOTE
				| BKT_ESCNL | BKT_ESCSEQ)) == 0)
			{
				goto regular;
			}
			switch (wc = *++pat)
			{
			default:
			noesc:;
				if ((flags & BKT_ESCAPE) == 0)
				{
					wc = '\\';
					pat--;
				}
				break;
			case '\\':
			case ']':
			case '-':
			case '^':
				if ((flags & BKT_QUOTE) == 0)
					goto noesc;
				break;
			case 'a':
				if ((flags & BKT_ESCSEQ) == 0 ||
						(flags & BKT_OLDESC))
					goto noesc;
				wc = '\a';
				break;
			case 'b':
				if ((flags & BKT_ESCSEQ) == 0)
					goto noesc;
				wc = '\b';
				break;
			case 'f':
				if ((flags & BKT_ESCSEQ) == 0)
					goto noesc;
				wc = '\f';
				break;
			case 'n':
				if ((flags & (BKT_ESCSEQ | BKT_ESCNL)) == 0)
					goto noesc;
				wc = '\n';
				break;
			case 'r':
				if ((flags & BKT_ESCSEQ) == 0)
					goto noesc;
				wc = '\r';
				break;
			case 't':
				if ((flags & BKT_ESCSEQ) == 0)
					goto noesc;
				wc = '\t';
				break;
			case 'v':
				if ((flags & BKT_ESCSEQ) == 0 ||
						(flags & BKT_OLDESC))
					goto noesc;
				wc = '\v';
				break;
			case 'x':
				if ((flags & BKT_ESCSEQ) == 0 ||
						(flags & BKT_OLDESC))
					goto noesc;
				if (!isxdigit(wc = *++pat))
				{
					pat--;
					goto noesc;
				}
				/*
				* Take as many hex digits as possible,
				* ignoring overflows.
				* Any positive result is okay.
				*/
				n = 0;
				do
				{
					if (isdigit(wc))
						wc -= '0';
					else if (isupper(wc))
						wc -= 'A' + 10;
					else
						wc -= 'a' + 10;
					n <<= 4;
					n |= wc;
				} while (isxdigit(wc = *++pat));
				pat--;
				if ((wc = n) <= 0)
				{
					n = BKT_BADESC;
					goto err;
				}
				break;
			case '0':
			case '1':
			case '2':
			case '3':
			case '4':
			case '5':
			case '6':
			case '7':
			case '8':
			case '9':
				if ((flags & BKT_ESCSEQ) == 0 ||
						(flags & BKT_OLDESC))
					goto noesc;
				/*
				* For compatibility (w/awk),
				* permit "octal" 8 and 9.
				*/
				n = wc - '0';
				if ((wc = *++pat) >= '0' && wc <= '9')
				{
					n <<= 3;
					n += wc - '0';
					if ((wc = *++pat) >= '0' && wc <= '9')
					{
						n <<= 3;
						n += wc - '0';
					}
				}
				pat--;
				if ((wc = n) <= 0)
				{
					n = BKT_BADESC;
					goto err;
				}
				break;
			}
			goto regular;
		case '[':
			if (((wc = *++pat) == ':' || wc == '=' || wc == '.') &&
					(flags & BKT_NOI18N) == 0)
			{
				n = 0;
				while (*++pat != wc || pat[1] != ']')
				{
					if (*pat == '\0')
					{
					badpat:;
						n = BKT_BADPAT;
						goto err;
					}
					else if (*pat == '/')
					{
						if (flags & BKT_SLASHBAD)
							goto badpat;
					}
					else if (*pat == '\n')
					{
						if (flags & BKT_NLBAD)
							goto badpat;
					}
					n++;
				}
				if (n == 0)
				{
					n = BKT_EMPTYSUBBKT;
					goto err;
				}
				if (wc == ':')
					n = chcls(bp, &pat[-n], n);
				else if (wc == '=')
					n = eqcls(bp, &pat[-n], n, prev,
							mb_cur_max);
				else /* wc == '.' */
					n = clsym(bp, &pat[-n], n, prev,
							mb_cur_max);
				pat++;
				break;
			}
			wc = '[';
			pat--;
			goto regular;
		default:
			if (!ISONEBYTE(wc) &&
					(n = libuxre_mb2wc(&wc, pat + 1)) > 0)
				pat += n;
		regular:;
			n = place(bp, wc, prev, mb_cur_max);
			break;
		}
		if (n < 0) {
			n = BKT_ILLSEQ;
			goto err;
		}
		if ((wc = *++pat) == ']')
			break;
		if (wc == '-' && n != 0)
		{
			if (prev == 0 || (flags & BKT_SEPRANGE) == 0)
			{
				if ((wc = *++pat) != ']')
					continue; /* valid range */
				wc = '-';
				pat--;
			}
		}
		n = 0;	/* no range this time */
	}
	return pat - pat0 + 1;
err:;
	libuxre_bktfree(bp);
	return n;
}

LIBUXRE_STATIC void
libuxre_bktfree(Bracket *bp)
{
	if (bp->extype != 0)
		free(bp->extype);
	if (bp->exquiv != 0)
		free(bp->exquiv);
	if (bp->exwide != 0)
		free(bp->exwide);
}

LIBUXRE_STATIC int
libuxre_bktmbexec(Bracket *bp, wchar_t wc,
		const unsigned char *str, int mb_cur_max)
{
	unsigned int i;
	wchar_t lc, uc;
	Mcce mcbuf;

	mcbuf.aft = str; /* in case of match in character classes */
	mcbuf.ch = wc;
	/*
	* First: check the single wc against any character classes.
	* Since multiple character collating elements are not part
	* of this world, they don't apply here.
	*/
	if ((i = bp->ntype) != 0)
	{
		wctype_t *wctp = &bp->type[0];

		if (bp->flags & BKT_ONECASE)
		{
			if ((wc = to_lower(wc)) == mcbuf.ch)
				mcbuf.ch = to_upper(wc);
		}
		for (;;)
		{
			if (iswctype(mb_cur_max==1?btowc(wc):wc, *wctp))
				goto match;
			if (wc != mcbuf.ch &&
			    iswctype(mb_cur_max==1?btowc(mcbuf.ch):mcbuf.ch,
				    *wctp))
				goto match;
			if (--i == 0)
				break;
			if (++wctp == &bp->type[NTYPE])
				wctp = &bp->extype[0];
		}
	}
	/*
	* The main match is determined by the weight[0] value
	* of the character (or characters, if the input can be
	* taken as a multiple character collating element).
	*/
	mcbuf.max = 0;
	mcbuf.bp = bp;
	mcbuf.col = bp->col;
	mcbuf.cep = ELEM_BADCHAR;
	mcce(&mcbuf, ELEM_BADCHAR, str, mb_cur_max, 0);
	if (mcbuf.cep == ELEM_BADCHAR)
		return -1;	/* never matches */
	if (mcbuf.cep != ELEM_ENCODED)
		mcbuf.wc = mcbuf.cep->weight[0];
	/*
	* POSIX.2 demands that both a character and its case counterpart
	* can match if REG_ICASE is set. This means that [B-z] matches
	* 'A', 'a', and '['.
	*/
	if (bp->flags & BKT_ONECASE)
	{
		lc = to_lower(mcbuf.wc);
		uc = to_upper(mcbuf.wc);
	}
	else
		lc = uc = mcbuf.wc;
	/*
	* See if it's in the set.  Note that the list of true wide
	* character values has explicit ranges.
	*/
	if (mcbuf.wc <= UCHAR_MAX)
	{
		if (bp->byte[PLIND(lc)] & PLBIT(lc))
			goto match;
		if (lc != uc && (bp->byte[PLIND(uc)] & PLBIT(uc)))
			goto match;
	}
	else if ((i = bp->nwide) != 0)
	{
		wchar_t *wcp = &bp->wide[0];
		long lcmp, ucmp;

		for (;;)
		{
			if ((lcmp = lc - *wcp) == 0)
				goto match;
			ucmp = uc - *wcp;
			if (lc != uc && ucmp == 0)
				goto match;
			if (--i == 0)
				break;
			if (++wcp == &bp->wide[NWIDE])
				wcp = &bp->exwide[0];
			if (*wcp == RANGE)
			{
				if (++wcp == &bp->wide[NWIDE])
					wcp = &bp->exwide[0];
				if (lcmp > 0 && lc <= *wcp)
					goto match;
				if (lc != uc && ucmp > 0 && uc < *wcp)
					goto match;
				if ((i -= 2) == 0)
					break;
				if (++wcp == &bp->wide[NWIDE])
					wcp = &bp->exwide[0];
			}
		}
	}
	/*
	* The last chance for a match is if an equivalence class
	* was specified for which the primary weights are scattered
	* through the weight[0]s.
	*/
	if ((i = bp->nquiv) != 0 && mcbuf.cep != ELEM_ENCODED)
	{
		wuchar_type *wucp = &bp->quiv[0];

		mcbuf.wc = mcbuf.cep->weight[1];
		for (;;)
		{
			if (mcbuf.wc == *wucp)
				goto match;
			if (--i == 0)
				break;
			if (++wucp == &bp->quiv[NQUIV])
				wucp = &bp->exquiv[0];
		}
	}
	/*
	* Only here when no match against the set was found.
	* One final special case w/r/t newline.
	*/
	if (bp->flags & BKT_NEGATED)
	{
		if (wc != '\n' || (bp->flags & BKT_NOTNL) == 0)
			return mcbuf.aft - str;
	}
	return -1;
match:;
	/*
	* Only here when a match against the described set is found.
	*/
	if (bp->flags & BKT_NEGATED)
		return -1;
	return mcbuf.aft - str;
}