lobase

Linux port of OpenBSDs userland.
Log | Files | Refs | README

commit 287009f8e2cfa312ff99cc7be1bfb9b5b5906b7f
parent dd1611d7e5bade415f1fd4e559b06b222c24786e
Author: Duncaen <mail@duncano.de>
Date:   Wed,  8 Mar 2017 02:07:12 +0100

lib/libopenbsd: import regex(3)

Diffstat:
include/regex.h | 113+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
lib/libopenbsd/Makefile | 2++
lib/libopenbsd/regex/COPYRIGHT | 54++++++++++++++++++++++++++++++++++++++++++++++++++++++
lib/libopenbsd/regex/Makefile.inc | 9+++++++++
lib/libopenbsd/regex/cclass.h | 68++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
lib/libopenbsd/regex/cname.h | 139+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
lib/libopenbsd/regex/engine.c | 1036+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
lib/libopenbsd/regex/re_format.7 | 762+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
lib/libopenbsd/regex/regcomp.c | 1528+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
lib/libopenbsd/regex/regerror.c | 129+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
lib/libopenbsd/regex/regex.3 | 709+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
lib/libopenbsd/regex/regex2.h | 157+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
lib/libopenbsd/regex/regexec.c | 163+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
lib/libopenbsd/regex/regfree.c | 69+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
lib/libopenbsd/regex/utils.h | 55+++++++++++++++++++++++++++++++++++++++++++++++++++++++
15 files changed, 4993 insertions(+), 0 deletions(-)

diff --git a/include/regex.h b/include/regex.h @@ -0,0 +1,113 @@ +/* $OpenBSD: regex.h,v 1.7 2012/12/05 23:19:57 deraadt Exp $ */ +/* $NetBSD: regex.h,v 1.4.6.1 1996/06/10 18:57:07 explorer Exp $ */ + +/*- + * Copyright (c) 1992 Henry Spencer. + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Henry Spencer of the University of Toronto. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)regex.h 8.1 (Berkeley) 6/2/93 + */ + +#ifndef _REGEX_H_ +#define _REGEX_H_ + +#include <sys/types.h> + +#define regoff_t openbsd_regoff_t +#define regex_t openbsd_regex_t +#define regmatch_t openbsd_regmatch_t + +/* types */ +typedef off_t regoff_t; + +typedef struct { + int re_magic; + size_t re_nsub; /* number of parenthesized subexpressions */ + const char *re_endp; /* end pointer for REG_PEND */ + struct re_guts *re_g; /* none of your business :-) */ +} regex_t; + +typedef struct { + regoff_t rm_so; /* start of match */ + regoff_t rm_eo; /* end of match */ +} regmatch_t; + +/* regcomp() flags */ +#define REG_BASIC 0000 +#define REG_EXTENDED 0001 +#define REG_ICASE 0002 +#define REG_NOSUB 0004 +#define REG_NEWLINE 0010 +#define REG_NOSPEC 0020 +#define REG_PEND 0040 +#define REG_DUMP 0200 + +/* regerror() flags */ +#define REG_NOMATCH 1 +#define REG_BADPAT 2 +#define REG_ECOLLATE 3 +#define REG_ECTYPE 4 +#define REG_EESCAPE 5 +#define REG_ESUBREG 6 +#define REG_EBRACK 7 +#define REG_EPAREN 8 +#define REG_EBRACE 9 +#define REG_BADBR 10 +#define REG_ERANGE 11 +#define REG_ESPACE 12 +#define REG_BADRPT 13 +#define REG_EMPTY 14 +#define REG_ASSERT 15 +#define REG_INVARG 16 +#define REG_ATOI 255 /* convert name to number (!) */ +#define REG_ITOA 0400 /* convert number to name (!) */ + +/* regexec() flags */ +#define REG_NOTBOL 00001 +#define REG_NOTEOL 00002 +#define REG_STARTEND 00004 +#define REG_TRACE 00400 /* tracing of execution */ +#define REG_LARGE 01000 /* force large representation */ +#define REG_BACKR 02000 /* force use of backref code */ + +#define regcomp openbsd_regcomp +#define regerror openbsd_regerror +#define regexec openbsd_regexec +#define regfree openbsd_regfree + +__BEGIN_DECLS +int regcomp(regex_t *, const char *, int); +size_t regerror(int, const regex_t *, char *, size_t); +int regexec(const regex_t *, const char *, size_t, regmatch_t [], int); +void regfree(regex_t *); +__END_DECLS + +#endif /* !_REGEX_H_ */ diff --git a/lib/libopenbsd/Makefile b/lib/libopenbsd/Makefile @@ -20,6 +20,8 @@ SRCS += gen/errc.c gen/fts.c gen/warnc.c gen/verrc.c gen/vwarnc.c gen/setmode.c\ SRCS += stdio/fgetwln.c SRCS += net/base64.c +SRCS += regex/regcomp.c regex/regerror.c regex/regexec.c regex/regfree.c + include config.mk ifneq ($(HAVE_EXPLICIT_BZERO),1) diff --git a/lib/libopenbsd/regex/COPYRIGHT b/lib/libopenbsd/regex/COPYRIGHT @@ -0,0 +1,54 @@ +$OpenBSD: COPYRIGHT,v 1.3 2003/06/02 20:18:36 millert Exp $ + +Copyright 1992, 1993, 1994 Henry Spencer. All rights reserved. +This software is not subject to any license of the American Telephone +and Telegraph Company or of the Regents of the University of California. + +Permission is granted to anyone to use this software for any purpose on +any computer system, and to alter it and redistribute it, subject +to the following restrictions: + +1. The author is not responsible for the consequences of use of this + software, no matter how awful, even if they arise from flaws in it. + +2. The origin of this software must not be misrepresented, either by + explicit claim or by omission. Since few users ever read sources, + credits must appear in the documentation. + +3. Altered versions must be plainly marked as such, and must not be + misrepresented as being the original software. Since few users + ever read sources, credits must appear in the documentation. + +4. This notice may not be removed or altered. + +=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +/*- + * Copyright (c) 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)COPYRIGHT 8.1 (Berkeley) 3/16/94 + */ diff --git a/lib/libopenbsd/regex/Makefile.inc b/lib/libopenbsd/regex/Makefile.inc @@ -0,0 +1,9 @@ +# $OpenBSD: Makefile.inc,v 1.8 2016/03/30 06:38:41 jmc Exp $ +# @(#)Makefile.inc 8.1 (Berkeley) 6/4/93 + +# regex sources +VPATH+= ${LIBCSRCDIR}/regex + +SRCS+= regcomp.c regerror.c regexec.c regfree.c + +MAN+= regex.3 re_format.7 diff --git a/lib/libopenbsd/regex/cclass.h b/lib/libopenbsd/regex/cclass.h @@ -0,0 +1,68 @@ +/* $OpenBSD: cclass.h,v 1.5 2003/06/02 20:18:36 millert Exp $ */ + +/*- + * Copyright (c) 1992, 1993, 1994 Henry Spencer. + * Copyright (c) 1992, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Henry Spencer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)cclass.h 8.3 (Berkeley) 3/20/94 + */ + +/* character-class table */ +static struct cclass { + char *name; + char *chars; + char *multis; +} cclasses[] = { + { "alnum", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ +0123456789", ""} , + { "alpha", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", + ""} , + { "blank", " \t", ""} , + { "cntrl", "\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\ +\25\26\27\30\31\32\33\34\35\36\37\177", ""} , + { "digit", "0123456789", ""} , + { "graph", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ +0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", + ""} , + { "lower", "abcdefghijklmnopqrstuvwxyz", + ""} , + { "print", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ +0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ", + ""} , + { "punct", "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", + ""} , + { "space", "\t\n\v\f\r ", ""} , + { "upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", + ""} , + { "xdigit", "0123456789ABCDEFabcdef", + ""} , + { NULL, 0, "" } +}; diff --git a/lib/libopenbsd/regex/cname.h b/lib/libopenbsd/regex/cname.h @@ -0,0 +1,139 @@ +/* $OpenBSD: cname.h,v 1.5 2003/06/02 20:18:36 millert Exp $ */ + +/*- + * Copyright (c) 1992, 1993, 1994 Henry Spencer. + * Copyright (c) 1992, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Henry Spencer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)cname.h 8.3 (Berkeley) 3/20/94 + */ + +/* character-name table */ +static struct cname { + char *name; + char code; +} cnames[] = { + { "NUL", '\0' }, + { "SOH", '\001' }, + { "STX", '\002' }, + { "ETX", '\003' }, + { "EOT", '\004' }, + { "ENQ", '\005' }, + { "ACK", '\006' }, + { "BEL", '\007' }, + { "alert", '\007' }, + { "BS", '\010' }, + { "backspace", '\b' }, + { "HT", '\011' }, + { "tab", '\t' }, + { "LF", '\012' }, + { "newline", '\n' }, + { "VT", '\013' }, + { "vertical-tab", '\v' }, + { "FF", '\014' }, + { "form-feed", '\f' }, + { "CR", '\015' }, + { "carriage-return", '\r' }, + { "SO", '\016' }, + { "SI", '\017' }, + { "DLE", '\020' }, + { "DC1", '\021' }, + { "DC2", '\022' }, + { "DC3", '\023' }, + { "DC4", '\024' }, + { "NAK", '\025' }, + { "SYN", '\026' }, + { "ETB", '\027' }, + { "CAN", '\030' }, + { "EM", '\031' }, + { "SUB", '\032' }, + { "ESC", '\033' }, + { "IS4", '\034' }, + { "FS", '\034' }, + { "IS3", '\035' }, + { "GS", '\035' }, + { "IS2", '\036' }, + { "RS", '\036' }, + { "IS1", '\037' }, + { "US", '\037' }, + { "space", ' ' }, + { "exclamation-mark", '!' }, + { "quotation-mark", '"' }, + { "number-sign", '#' }, + { "dollar-sign", '$' }, + { "percent-sign", '%' }, + { "ampersand", '&' }, + { "apostrophe", '\'' }, + { "left-parenthesis", '(' }, + { "right-parenthesis", ')' }, + { "asterisk", '*' }, + { "plus-sign", '+' }, + { "comma", ',' }, + { "hyphen", '-' }, + { "hyphen-minus", '-' }, + { "period", '.' }, + { "full-stop", '.' }, + { "slash", '/' }, + { "solidus", '/' }, + { "zero", '0' }, + { "one", '1' }, + { "two", '2' }, + { "three", '3' }, + { "four", '4' }, + { "five", '5' }, + { "six", '6' }, + { "seven", '7' }, + { "eight", '8' }, + { "nine", '9' }, + { "colon", ':' }, + { "semicolon", ';' }, + { "less-than-sign", '<' }, + { "equals-sign", '=' }, + { "greater-than-sign", '>' }, + { "question-mark", '?' }, + { "commercial-at", '@' }, + { "left-square-bracket", '[' }, + { "backslash", '\\' }, + { "reverse-solidus", '\\' }, + { "right-square-bracket", ']' }, + { "circumflex", '^' }, + { "circumflex-accent", '^' }, + { "underscore", '_' }, + { "low-line", '_' }, + { "grave-accent", '`' }, + { "left-brace", '{' }, + { "left-curly-bracket", '{' }, + { "vertical-line", '|' }, + { "right-brace", '}' }, + { "right-curly-bracket", '}' }, + { "tilde", '~' }, + { "DEL", '\177' }, + { NULL, 0 } +}; diff --git a/lib/libopenbsd/regex/engine.c b/lib/libopenbsd/regex/engine.c @@ -0,0 +1,1036 @@ +/* $OpenBSD: engine.c,v 1.23 2016/05/26 05:46:44 martijn Exp $ */ + +/*- + * Copyright (c) 1992, 1993, 1994 Henry Spencer. + * Copyright (c) 1992, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Henry Spencer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)engine.c 8.5 (Berkeley) 3/20/94 + */ + +/* + * The matching engine and friends. This file is #included by regexec.c + * after suitable #defines of a variety of macros used herein, so that + * different state representations can be used without duplicating masses + * of code. + */ + +#ifdef SNAMES +#define matcher smatcher +#define fast sfast +#define slow sslow +#define dissect sdissect +#define backref sbackref +#define step sstep +#define print sprint +#define at sat +#define match smat +#define nope snope +#endif +#ifdef LNAMES +#define matcher lmatcher +#define fast lfast +#define slow lslow +#define dissect ldissect +#define backref lbackref +#define step lstep +#define print lprint +#define at lat +#define match lmat +#define nope lnope +#endif + +/* another structure passed up and down to avoid zillions of parameters */ +struct match { + struct re_guts *g; + int eflags; + regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ + char *offp; /* offsets work from here */ + char *beginp; /* start of string -- virtual NUL precedes */ + char *endp; /* end of string -- virtual NUL here */ + char *coldp; /* can be no match starting before here */ + char **lastpos; /* [nplus+1] */ + STATEVARS; + states st; /* current states */ + states fresh; /* states for a fresh start */ + states tmp; /* temporary */ + states empty; /* empty set of states */ +}; + +static int matcher(struct re_guts *, char *, size_t, regmatch_t[], int); +static char *dissect(struct match *, char *, char *, sopno, sopno); +static char *backref(struct match *, char *, char *, sopno, sopno, sopno, int); +static char *fast(struct match *, char *, char *, sopno, sopno); +static char *slow(struct match *, char *, char *, sopno, sopno); +static states step(struct re_guts *, sopno, sopno, states, int, states); +#define MAX_RECURSION 100 +#define BOL (OUT+1) +#define EOL (BOL+1) +#define BOLEOL (BOL+2) +#define NOTHING (BOL+3) +#define BOW (BOL+4) +#define EOW (BOL+5) +/* update nonchars[] array below when adding fake chars here */ +#define CODEMAX (BOL+5) /* highest code used */ +#define NONCHAR(c) ((c) > CHAR_MAX) +#define NNONCHAR (CODEMAX-CHAR_MAX) +#ifdef REDEBUG +static void print(struct match *, char *, states, int, FILE *); +#endif +#ifdef REDEBUG +static void at(struct match *, char *, char *, char *, sopno, sopno); +#endif +#ifdef REDEBUG +static const char *pchar(int); +#endif + +#ifdef REDEBUG +#define SP(t, s, c) print(m, t, s, c, stdout) +#define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) +#define NOTE(str) { if (m->eflags&REG_TRACE) (void)printf("=%s\n", (str)); } +static int nope = 0; +#else +#define SP(t, s, c) /* nothing */ +#define AT(t, p1, p2, s1, s2) /* nothing */ +#define NOTE(s) /* nothing */ +#endif + +/* + - matcher - the actual matching engine + */ +static int /* 0 success, REG_NOMATCH failure */ +matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], + int eflags) +{ + char *endp; + int i; + struct match mv; + struct match *m = &mv; + char *dp; + const sopno gf = g->firststate+1; /* +1 for OEND */ + const sopno gl = g->laststate; + char *start; + char *stop; + + /* simplify the situation where possible */ + if (g->cflags&REG_NOSUB) + nmatch = 0; + if (eflags&REG_STARTEND) { + start = string + pmatch[0].rm_so; + stop = string + pmatch[0].rm_eo; + } else { + start = string; + stop = start + strlen(start); + } + if (stop < start) + return(REG_INVARG); + + /* prescreening; this does wonders for this rather slow code */ + if (g->must != NULL) { + for (dp = start; dp < stop; dp++) + if (*dp == g->must[0] && stop - dp >= g->mlen && + memcmp(dp, g->must, (size_t)g->mlen) == 0) + break; + if (dp == stop) /* we didn't find g->must */ + return(REG_NOMATCH); + } + + /* match struct setup */ + m->g = g; + m->eflags = eflags; + m->pmatch = NULL; + m->lastpos = NULL; + m->offp = string; + m->beginp = start; + m->endp = stop; + STATESETUP(m, 4); + SETUP(m->st); + SETUP(m->fresh); + SETUP(m->tmp); + SETUP(m->empty); + CLEAR(m->empty); + + /* this loop does only one repetition except for backrefs */ + for (;;) { + endp = fast(m, start, stop, gf, gl); + if (endp == NULL) { /* a miss */ + free(m->pmatch); + free(m->lastpos); + STATETEARDOWN(m); + return(REG_NOMATCH); + } + if (nmatch == 0 && !g->backrefs) + break; /* no further info needed */ + + /* where? */ + assert(m->coldp != NULL); + for (;;) { + NOTE("finding start"); + endp = slow(m, m->coldp, stop, gf, gl); + if (endp != NULL) + break; + assert(m->coldp < m->endp); + m->coldp++; + } + if (nmatch == 1 && !g->backrefs) + break; /* no further info needed */ + + /* oh my, he wants the subexpressions... */ + if (m->pmatch == NULL) + m->pmatch = reallocarray(NULL, m->g->nsub + 1, + sizeof(regmatch_t)); + if (m->pmatch == NULL) { + STATETEARDOWN(m); + return(REG_ESPACE); + } + for (i = 1; i <= m->g->nsub; i++) + m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; + if (!g->backrefs && !(m->eflags&REG_BACKR)) { + NOTE("dissecting"); + dp = dissect(m, m->coldp, endp, gf, gl); + } else { + if (g->nplus > 0 && m->lastpos == NULL) + m->lastpos = reallocarray(NULL, + g->nplus+1, sizeof(char *)); + if (g->nplus > 0 && m->lastpos == NULL) { + free(m->pmatch); + STATETEARDOWN(m); + return(REG_ESPACE); + } + NOTE("backref dissect"); + dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); + } + if (dp != NULL) + break; + + /* uh-oh... we couldn't find a subexpression-level match */ + assert(g->backrefs); /* must be back references doing it */ + assert(g->nplus == 0 || m->lastpos != NULL); + for (;;) { + if (dp != NULL || endp <= m->coldp) + break; /* defeat */ + NOTE("backoff"); + endp = slow(m, m->coldp, endp-1, gf, gl); + if (endp == NULL) + break; /* defeat */ + /* try it on a shorter possibility */ +#ifndef NDEBUG + for (i = 1; i <= m->g->nsub; i++) { + assert(m->pmatch[i].rm_so == -1); + assert(m->pmatch[i].rm_eo == -1); + } +#endif + NOTE("backoff dissect"); + dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); + } + assert(dp == NULL || dp == endp); + if (dp != NULL) /* found a shorter one */ + break; + + /* despite initial appearances, there is no match here */ + NOTE("false alarm"); + if (m->coldp == stop) + break; + start = m->coldp + 1; /* recycle starting later */ + } + + /* fill in the details if requested */ + if (nmatch > 0) { + pmatch[0].rm_so = m->coldp - m->offp; + pmatch[0].rm_eo = endp - m->offp; + } + if (nmatch > 1) { + assert(m->pmatch != NULL); + for (i = 1; i < nmatch; i++) + if (i <= m->g->nsub) + pmatch[i] = m->pmatch[i]; + else { + pmatch[i].rm_so = -1; + pmatch[i].rm_eo = -1; + } + } + + free(m->pmatch); + free(m->lastpos); + STATETEARDOWN(m); + return(0); +} + +/* + - dissect - figure out what matched what, no back references + */ +static char * /* == stop (success) always */ +dissect(struct match *m, char *start, char *stop, sopno startst, sopno stopst) +{ + int i; + sopno ss; /* start sop of current subRE */ + sopno es; /* end sop of current subRE */ + char *sp; /* start of string matched by it */ + char *stp; /* string matched by it cannot pass here */ + char *rest; /* start of rest of string */ + char *tail; /* string unmatched by rest of RE */ + sopno ssub; /* start sop of subsubRE */ + sopno esub; /* end sop of subsubRE */ + char *ssp; /* start of string matched by subsubRE */ + char *sep; /* end of string matched by subsubRE */ + char *oldssp; /* previous ssp */ + char *dp; + + AT("diss", start, stop, startst, stopst); + sp = start; + for (ss = startst; ss < stopst; ss = es) { + /* identify end of subRE */ + es = ss; + switch (OP(m->g->strip[es])) { + case OPLUS_: + case OQUEST_: + es += OPND(m->g->strip[es]); + break; + case OCH_: + while (OP(m->g->strip[es]) != O_CH) + es += OPND(m->g->strip[es]); + break; + } + es++; + + /* figure out what it matched */ + switch (OP(m->g->strip[ss])) { + case OEND: + assert(nope); + break; + case OCHAR: + sp++; + break; + case OBOL: + case OEOL: + case OBOW: + case OEOW: + break; + case OANY: + case OANYOF: + sp++; + break; + case OBACK_: + case O_BACK: + assert(nope); + break; + /* cases where length of match is hard to find */ + case OQUEST_: + stp = stop; + for (;;) { + /* how long could this one be? */ + rest = slow(m, sp, stp, ss, es); + assert(rest != NULL); /* it did match */ + /* could the rest match the rest? */ + tail = slow(m, rest, stop, es, stopst); + if (tail == stop) + break; /* yes! */ + /* no -- try a shorter match for this one */ + stp = rest - 1; + assert(stp >= sp); /* it did work */ + } + ssub = ss + 1; + esub = es - 1; + /* did innards match? */ + if (slow(m, sp, rest, ssub, esub) != NULL) { + dp = dissect(m, sp, rest, ssub, esub); + assert(dp == rest); + } else /* no */ + assert(sp == rest); + sp = rest; + break; + case OPLUS_: + stp = stop; + for (;;) { + /* how long could this one be? */ + rest = slow(m, sp, stp, ss, es); + assert(rest != NULL); /* it did match */ + /* could the rest match the rest? */ + tail = slow(m, rest, stop, es, stopst); + if (tail == stop) + break; /* yes! */ + /* no -- try a shorter match for this one */ + stp = rest - 1; + assert(stp >= sp); /* it did work */ + } + ssub = ss + 1; + esub = es - 1; + ssp = sp; + oldssp = ssp; + for (;;) { /* find last match of innards */ + sep = slow(m, ssp, rest, ssub, esub); + if (sep == NULL || sep == ssp) + break; /* failed or matched null */ + oldssp = ssp; /* on to next try */ + ssp = sep; + } + if (sep == NULL) { + /* last successful match */ + sep = ssp; + ssp = oldssp; + } + assert(sep == rest); /* must exhaust substring */ + assert(slow(m, ssp, sep, ssub, esub) == rest); + dp = dissect(m, ssp, sep, ssub, esub); + assert(dp == sep); + sp = rest; + break; + case OCH_: + stp = stop; + for (;;) { + /* how long could this one be? */ + rest = slow(m, sp, stp, ss, es); + assert(rest != NULL); /* it did match */ + /* could the rest match the rest? */ + tail = slow(m, rest, stop, es, stopst); + if (tail == stop) + break; /* yes! */ + /* no -- try a shorter match for this one */ + stp = rest - 1; + assert(stp >= sp); /* it did work */ + } + ssub = ss + 1; + esub = ss + OPND(m->g->strip[ss]) - 1; + assert(OP(m->g->strip[esub]) == OOR1); + for (;;) { /* find first matching branch */ + if (slow(m, sp, rest, ssub, esub) == rest) + break; /* it matched all of it */ + /* that one missed, try next one */ + assert(OP(m->g->strip[esub]) == OOR1); + esub++; + assert(OP(m->g->strip[esub]) == OOR2); + ssub = esub + 1; + esub += OPND(m->g->strip[esub]); + if (OP(m->g->strip[esub]) == OOR2) + esub--; + else + assert(OP(m->g->strip[esub]) == O_CH); + } + dp = dissect(m, sp, rest, ssub, esub); + assert(dp == rest); + sp = rest; + break; + case O_PLUS: + case O_QUEST: + case OOR1: + case OOR2: + case O_CH: + assert(nope); + break; + case OLPAREN: + i = OPND(m->g->strip[ss]); + assert(0 < i && i <= m->g->nsub); + m->pmatch[i].rm_so = sp - m->offp; + break; + case ORPAREN: + i = OPND(m->g->strip[ss]); + assert(0 < i && i <= m->g->nsub); + m->pmatch[i].rm_eo = sp - m->offp; + break; + default: /* uh oh */ + assert(nope); + break; + } + } + + assert(sp == stop); + return(sp); +} + +/* + - backref - figure out what matched what, figuring in back references + */ +static char * /* == stop (success) or NULL (failure) */ +backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, + sopno lev, int rec) /* PLUS nesting level */ +{ + int i; + sopno ss; /* start sop of current subRE */ + char *sp; /* start of string matched by it */ + sopno ssub; /* start sop of subsubRE */ + sopno esub; /* end sop of subsubRE */ + char *ssp; /* start of string matched by subsubRE */ + char *dp; + size_t len; + int hard; + sop s; + regoff_t offsave; + cset *cs; + + AT("back", start, stop, startst, stopst); + sp = start; + + /* get as far as we can with easy stuff */ + hard = 0; + for (ss = startst; !hard && ss < stopst; ss++) + switch (OP(s = m->g->strip[ss])) { + case OCHAR: + if (sp == stop || *sp++ != (char)OPND(s)) + return(NULL); + break; + case OANY: + if (sp == stop) + return(NULL); + sp++; + break; + case OANYOF: + cs = &m->g->sets[OPND(s)]; + if (sp == stop || !CHIN(cs, *sp++)) + return(NULL); + break; + case OBOL: + if ((sp == m->beginp && !(m->eflags&REG_NOTBOL)) || + (sp > m->offp && sp < m->endp && + *(sp-1) == '\n' && (m->g->cflags&REG_NEWLINE))) + { /* yes */ } + else + return(NULL); + break; + case OEOL: + if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) || + (sp < m->endp && *sp == '\n' && + (m->g->cflags&REG_NEWLINE)) ) + { /* yes */ } + else + return(NULL); + break; + case OBOW: + if (sp < m->endp && ISWORD(*sp) && + ((sp == m->beginp && !(m->eflags&REG_NOTBOL)) || + (sp > m->offp && !ISWORD(*(sp-1))))) + { /* yes */ } + else + return(NULL); + break; + case OEOW: + if (( (sp == m->endp && !(m->eflags&REG_NOTEOL)) || + (sp < m->endp && *sp == '\n' && + (m->g->cflags&REG_NEWLINE)) || + (sp < m->endp && !ISWORD(*sp)) ) && + (sp > m->beginp && ISWORD(*(sp-1))) ) + { /* yes */ } + else + return(NULL); + break; + case O_QUEST: + break; + case OOR1: /* matches null but needs to skip */ + ss++; + s = m->g->strip[ss]; + do { + assert(OP(s) == OOR2); + ss += OPND(s); + } while (OP(s = m->g->strip[ss]) != O_CH); + /* note that the ss++ gets us past the O_CH */ + break; + default: /* have to make a choice */ + hard = 1; + break; + } + if (!hard) { /* that was it! */ + if (sp != stop) + return(NULL); + return(sp); + } + ss--; /* adjust for the for's final increment */ + + /* the hard stuff */ + AT("hard", sp, stop, ss, stopst); + s = m->g->strip[ss]; + switch (OP(s)) { + case OBACK_: /* the vilest depths */ + i = OPND(s); + assert(0 < i && i <= m->g->nsub); + if (m->pmatch[i].rm_eo == -1) + return(NULL); + assert(m->pmatch[i].rm_so != -1); + len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; + if (len == 0 && rec++ > MAX_RECURSION) + return(NULL); + assert(stop - m->beginp >= len); + if (sp > stop - len) + return(NULL); /* not enough left to match */ + ssp = m->offp + m->pmatch[i].rm_so; + if (memcmp(sp, ssp, len) != 0) + return(NULL); + while (m->g->strip[ss] != SOP(O_BACK, i)) + ss++; + return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); + break; + case OQUEST_: /* to null or not */ + dp = backref(m, sp, stop, ss+1, stopst, lev, rec); + if (dp != NULL) + return(dp); /* not */ + return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); + break; + case OPLUS_: + assert(m->lastpos != NULL); + assert(lev+1 <= m->g->nplus); + m->lastpos[lev+1] = sp; + return(backref(m, sp, stop, ss+1, stopst, lev+1, rec)); + break; + case O_PLUS: + if (sp == m->lastpos[lev]) /* last pass matched null */ + return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); + /* try another pass */ + m->lastpos[lev] = sp; + dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); + if (dp == NULL) + return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); + else + return(dp); + break; + case OCH_: /* find the right one, if any */ + ssub = ss + 1; + esub = ss + OPND(s) - 1; + assert(OP(m->g->strip[esub]) == OOR1); + for (;;) { /* find first matching branch */ + dp = backref(m, sp, stop, ssub, esub, lev, rec); + if (dp != NULL) + return(dp); + /* that one missed, try next one */ + if (OP(m->g->strip[esub]) == O_CH) + return(NULL); /* there is none */ + esub++; + assert(OP(m->g->strip[esub]) == OOR2); + ssub = esub + 1; + esub += OPND(m->g->strip[esub]); + if (OP(m->g->strip[esub]) == OOR2) + esub--; + else + assert(OP(m->g->strip[esub]) == O_CH); + } + break; + case OLPAREN: /* must undo assignment if rest fails */ + i = OPND(s); + assert(0 < i && i <= m->g->nsub); + offsave = m->pmatch[i].rm_so; + m->pmatch[i].rm_so = sp - m->offp; + dp = backref(m, sp, stop, ss+1, stopst, lev, rec); + if (dp != NULL) + return(dp); + m->pmatch[i].rm_so = offsave; + return(NULL); + break; + case ORPAREN: /* must undo assignment if rest fails */ + i = OPND(s); + assert(0 < i && i <= m->g->nsub); + offsave = m->pmatch[i].rm_eo; + m->pmatch[i].rm_eo = sp - m->offp; + dp = backref(m, sp, stop, ss+1, stopst, lev, rec); + if (dp != NULL) + return(dp); + m->pmatch[i].rm_eo = offsave; + return(NULL); + break; + default: /* uh oh */ + assert(nope); + break; + } + + /* "can't happen" */ + assert(nope); + /* NOTREACHED */ + return NULL; +} + +/* + - fast - step through the string at top speed + */ +static char * /* where tentative match ended, or NULL */ +fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst) +{ + states st = m->st; + states fresh = m->fresh; + states tmp = m->tmp; + char *p = start; + int c; + int lastc; /* previous c */ + int flagch; + int i; + char *coldp; /* last p after which no match was underway */ + + if (start == m->offp || (start == m->beginp && !(m->eflags&REG_NOTBOL))) + c = OUT; + else + c = *(start-1); + + CLEAR(st); + SET1(st, startst); + st = step(m->g, startst, stopst, st, NOTHING, st); + ASSIGN(fresh, st); + SP("start", st, *p); + coldp = NULL; + for (;;) { + /* next character */ + lastc = c; + c = (p == m->endp) ? OUT : *p; + if (EQ(st, fresh)) + coldp = p; + + /* is there an EOL and/or BOL between lastc and c? */ + flagch = '\0'; + i = 0; + if ((lastc == '\n' && m->g->cflags&REG_NEWLINE) || + (lastc == OUT && !(m->eflags&REG_NOTBOL))) { + flagch = BOL; + i = m->g->nbol; + } + if ((c == '\n' && m->g->cflags&REG_NEWLINE) || + (c == OUT && !(m->eflags&REG_NOTEOL)) ) { + flagch = (flagch == BOL) ? BOLEOL : EOL; + i += m->g->neol; + } + if (i != 0) { + for (; i > 0; i--) + st = step(m->g, startst, stopst, + st, flagch, st); + SP("boleol", st, c); + } + + /* how about a word boundary? */ + if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && + (c != OUT && ISWORD(c))) + flagch = BOW; + if ((lastc != OUT && ISWORD(lastc)) && + (flagch == EOL || (c != OUT && !ISWORD(c)))) + flagch = EOW; + if (flagch == BOW || flagch == EOW) { + st = step(m->g, startst, stopst, st, flagch, st); + SP("boweow", st, c); + } + + /* are we done? */ + if (ISSET(st, stopst) || p == stop) + break; /* NOTE BREAK OUT */ + + /* no, we must deal with this character */ + ASSIGN(tmp, st); + ASSIGN(st, fresh); + assert(c != OUT); + st = step(m->g, startst, stopst, tmp, c, st); + SP("aft", st, c); + assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); + p++; + } + + assert(coldp != NULL); + m->coldp = coldp; + if (ISSET(st, stopst)) + return(p+1); + else + return(NULL); +} + +/* + - slow - step through the string more deliberately + */ +static char * /* where it ended */ +slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst) +{ + states st = m->st; + states empty = m->empty; + states tmp = m->tmp; + char *p = start; + int c; + int lastc; /* previous c */ + int flagch; + int i; + char *matchp; /* last p at which a match ended */ + + if (start == m->offp || (start == m->beginp && !(m->eflags&REG_NOTBOL))) + c = OUT; + else + c = *(start-1); + + AT("slow", start, stop, startst, stopst); + CLEAR(st); + SET1(st, startst); + SP("sstart", st, *p); + st = step(m->g, startst, stopst, st, NOTHING, st); + matchp = NULL; + for (;;) { + /* next character */ + lastc = c; + c = (p == m->endp) ? OUT : *p; + + /* is there an EOL and/or BOL between lastc and c? */ + flagch = '\0'; + i = 0; + if ((lastc == '\n' && m->g->cflags&REG_NEWLINE) || + (lastc == OUT && !(m->eflags&REG_NOTBOL))) { + flagch = BOL; + i = m->g->nbol; + } + if ((c == '\n' && m->g->cflags&REG_NEWLINE) || + (c == OUT && !(m->eflags&REG_NOTEOL))) { + flagch = (flagch == BOL) ? BOLEOL : EOL; + i += m->g->neol; + } + if (i != 0) { + for (; i > 0; i--) + st = step(m->g, startst, stopst, + st, flagch, st); + SP("sboleol", st, c); + } + + /* how about a word boundary? */ + if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && + (c != OUT && ISWORD(c))) + flagch = BOW; + if ((lastc != OUT && ISWORD(lastc)) && + (flagch == EOL || (c != OUT && !ISWORD(c)))) + flagch = EOW; + if (flagch == BOW || flagch == EOW) { + st = step(m->g, startst, stopst, st, flagch, st); + SP("sboweow", st, c); + } + + /* are we done? */ + if (ISSET(st, stopst)) + matchp = p; + if (EQ(st, empty) || p == stop) + break; /* NOTE BREAK OUT */ + + /* no, we must deal with this character */ + ASSIGN(tmp, st); + ASSIGN(st, empty); + assert(c != OUT); + st = step(m->g, startst, stopst, tmp, c, st); + SP("saft", st, c); + assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); + p++; + } + + return(matchp); +} + + +/* + - step - map set of states reachable before char to set reachable after + */ +static states +step(struct re_guts *g, + sopno start, /* start state within strip */ + sopno stop, /* state after stop state within strip */ + states bef, /* states reachable before */ + int ch, /* character or NONCHAR code */ + states aft) /* states already known reachable after */ +{ + cset *cs; + sop s; + sopno pc; + onestate here; /* note, macros know this name */ + sopno look; + int i; + + for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { + s = g->strip[pc]; + switch (OP(s)) { + case OEND: + assert(pc == stop-1); + break; + case OCHAR: + /* only characters can match */ + assert(!NONCHAR(ch) || ch != (char)OPND(s)); + if (ch == (char)OPND(s)) + FWD(aft, bef, 1); + break; + case OBOL: + if (ch == BOL || ch == BOLEOL) + FWD(aft, bef, 1); + break; + case OEOL: + if (ch == EOL || ch == BOLEOL) + FWD(aft, bef, 1); + break; + case OBOW: + if (ch == BOW) + FWD(aft, bef, 1); + break; + case OEOW: + if (ch == EOW) + FWD(aft, bef, 1); + break; + case OANY: + if (!NONCHAR(ch)) + FWD(aft, bef, 1); + break; + case OANYOF: + cs = &g->sets[OPND(s)]; + if (!NONCHAR(ch) && CHIN(cs, ch)) + FWD(aft, bef, 1); + break; + case OBACK_: /* ignored here */ + case O_BACK: + FWD(aft, aft, 1); + break; + case OPLUS_: /* forward, this is just an empty */ + FWD(aft, aft, 1); + break; + case O_PLUS: /* both forward and back */ + FWD(aft, aft, 1); + i = ISSETBACK(aft, OPND(s)); + BACK(aft, aft, OPND(s)); + if (!i && ISSETBACK(aft, OPND(s))) { + /* oho, must reconsider loop body */ + pc -= OPND(s) + 1; + INIT(here, pc); + } + break; + case OQUEST_: /* two branches, both forward */ + FWD(aft, aft, 1); + FWD(aft, aft, OPND(s)); + break; + case O_QUEST: /* just an empty */ + FWD(aft, aft, 1); + break; + case OLPAREN: /* not significant here */ + case ORPAREN: + FWD(aft, aft, 1); + break; + case OCH_: /* mark the first two branches */ + FWD(aft, aft, 1); + assert(OP(g->strip[pc+OPND(s)]) == OOR2); + FWD(aft, aft, OPND(s)); + break; + case OOR1: /* done a branch, find the O_CH */ + if (ISSTATEIN(aft, here)) { + for (look = 1; + OP(s = g->strip[pc+look]) != O_CH; + look += OPND(s)) + assert(OP(s) == OOR2); + FWD(aft, aft, look); + } + break; + case OOR2: /* propagate OCH_'s marking */ + FWD(aft, aft, 1); + if (OP(g->strip[pc+OPND(s)]) != O_CH) { + assert(OP(g->strip[pc+OPND(s)]) == OOR2); + FWD(aft, aft, OPND(s)); + } + break; + case O_CH: /* just empty */ + FWD(aft, aft, 1); + break; + default: /* ooooops... */ + assert(nope); + break; + } + } + + return(aft); +} + +#ifdef REDEBUG +/* + - print - print a set of states + */ +static void +print(struct match *m, char *caption, states st, int ch, FILE *d) +{ + struct re_guts *g = m->g; + int i; + int first = 1; + + if (!(m->eflags&REG_TRACE)) + return; + + (void)fprintf(d, "%s", caption); + (void)fprintf(d, " %s", pchar(ch)); + for (i = 0; i < g->nstates; i++) { + if (ISSET(st, i)) { + (void)fprintf(d, "%s%d", (first) ? "\t" : ", ", i); + first = 0; + } + } + (void)fprintf(d, "\n"); +} + +/* + - at - print current situation + */ +static void +at(struct match *m, char *title, char *start, char *stop, sopno startst, + sopno stopst) +{ + if (!(m->eflags&REG_TRACE)) + return; + + (void)printf("%s %s-", title, pchar(*start)); + (void)printf("%s ", pchar(*stop)); + (void)printf("%ld-%ld\n", (long)startst, (long)stopst); +} + +#ifndef PCHARDONE +#define PCHARDONE /* never again */ +static const char *nonchars[] = + { "OUT", "BOL", "EOL", "BOLEOL", "NOTHING", "BOW", "EOW" }; +#define PNONCHAR(c) \ + ((c) - OUT < (sizeof(nonchars)/sizeof(nonchars[0])) \ + ? nonchars[(c) - OUT] : "invalid") + +/* + - pchar - make a character printable + * + * Is this identical to regchar() over in debug.c? Well, yes. But a + * duplicate here avoids having a debugging-capable regexec.o tied to + * a matching debug.o, and this is convenient. It all disappears in + * the non-debug compilation anyway, so it doesn't matter much. + */ +static const char * /* -> representation */ +pchar(int ch) +{ + static char pbuf[10]; + + if (NONCHAR(ch)) { + if (ch - OUT < (sizeof(nonchars)/sizeof(nonchars[0]))) + return nonchars[ch - OUT]; + return "invalid"; + } + if (isprint((unsigned char)ch) || ch == ' ') + (void)snprintf(pbuf, sizeof pbuf, "%c", ch); + else + (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch); + return(pbuf); +} +#endif +#endif + +#undef matcher +#undef fast +#undef slow +#undef dissect +#undef backref +#undef step +#undef print +#undef at +#undef match +#undef nope diff --git a/lib/libopenbsd/regex/re_format.7 b/lib/libopenbsd/regex/re_format.7 @@ -0,0 +1,762 @@ +.\" $OpenBSD: re_format.7,v 1.22 2015/09/14 20:06:58 schwarze Exp $ +.\" +.\" Copyright (c) 1997, Phillip F Knaack. All rights reserved. +.\" +.\" Copyright (c) 1992, 1993, 1994 Henry Spencer. +.\" Copyright (c) 1992, 1993, 1994 +.\" The Regents of the University of California. All rights reserved. +.\" +.\" This code is derived from software contributed to Berkeley by +.\" Henry Spencer. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 3. Neither the name of the University nor the names of its contributors +.\" may be used to endorse or promote products derived from this software +.\" without specific prior written permission. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" @(#)re_format.7 8.3 (Berkeley) 3/20/94 +.\" +.Dd $Mdocdate: September 14 2015 $ +.Dt RE_FORMAT 7 +.Os +.Sh NAME +.Nm re_format +.Nd POSIX regular expressions +.Sh DESCRIPTION +Regular expressions (REs), +as defined in +.St -p1003.1-2004 , +come in two forms: +basic regular expressions +(BREs) +and extended regular expressions +(EREs). +Both forms of regular expressions are supported +by the interfaces described in +.Xr regex 3 . +Applications dealing with regular expressions +may use one or the other form +(or indeed both). +For example, +.Xr ed 1 +uses BREs, +whilst +.Xr egrep 1 +talks EREs. +Consult the manual page for the specific application to find out which +it uses. +.Pp +POSIX leaves some aspects of RE syntax and semantics open; +.Sq ** +marks decisions on these aspects that +may not be fully portable to other POSIX implementations. +.Pp +This manual page first describes regular expressions in general, +specifically extended regular expressions, +and then discusses differences between them and basic regular expressions. +.Sh EXTENDED REGULAR EXPRESSIONS +An ERE is one** or more non-empty** +.Em branches , +separated by +.Sq | . +It matches anything that matches one of the branches. +.Pp +A branch is one** or more +.Em pieces , +concatenated. +It matches a match for the first, followed by a match for the second, etc. +.Pp +A piece is an +.Em atom +possibly followed by a single** +.Sq * , +.Sq + , +.Sq ?\& , +or +.Em bound . +An atom followed by +.Sq * +matches a sequence of 0 or more matches of the atom. +An atom followed by +.Sq + +matches a sequence of 1 or more matches of the atom. +An atom followed by +.Sq ?\& +matches a sequence of 0 or 1 matches of the atom. +.Pp +A bound is +.Sq { +followed by an unsigned decimal integer, +possibly followed by +.Sq ,\& +possibly followed by another unsigned decimal integer, +always followed by +.Sq } . +The integers must lie between 0 and +.Dv RE_DUP_MAX +(255**) inclusive, +and if there are two of them, the first may not exceed the second. +An atom followed by a bound containing one integer +.Ar i +and no comma matches +a sequence of exactly +.Ar i +matches of the atom. +An atom followed by a bound +containing one integer +.Ar i +and a comma matches +a sequence of +.Ar i +or more matches of the atom. +An atom followed by a bound +containing two integers +.Ar i +and +.Ar j +matches a sequence of +.Ar i +through +.Ar j +(inclusive) matches of the atom. +.Pp +An atom is a regular expression enclosed in +.Sq () +(matching a part of the regular expression), +an empty set of +.Sq () +(matching the null string)**, +a +.Em bracket expression +(see below), +.Sq .\& +(matching any single character), +.Sq ^ +(matching the null string at the beginning of a line), +.Sq $ +(matching the null string at the end of a line), +a +.Sq \e +followed by one of the characters +.Sq ^.[$()|*+?{\e +(matching that character taken as an ordinary character), +a +.Sq \e +followed by any other character** +(matching that character taken as an ordinary character, +as if the +.Sq \e +had not been present**), +or a single character with no other significance (matching that character). +A +.Sq { +followed by a character other than a digit is an ordinary character, +not the beginning of a bound**. +It is illegal to end an RE with +.Sq \e . +.Pp +A bracket expression is a list of characters enclosed in +.Sq [] . +It normally matches any single character from the list (but see below). +If the list begins with +.Sq ^ , +it matches any single character +.Em not +from the rest of the list +(but see below). +If two characters in the list are separated by +.Sq - , +this is shorthand for the full +.Em range +of characters between those two (inclusive) in the +collating sequence, e.g.\& +.Sq [0-9] +in ASCII matches any decimal digit. +It is illegal** for two ranges to share an endpoint, e.g.\& +.Sq a-c-e . +Ranges are very collating-sequence-dependent, +and portable programs should avoid relying on them. +.Pp +To include a literal +.Sq ]\& +in the list, make it the first character +(following a possible +.Sq ^ ) . +To include a literal +.Sq - , +make it the first or last character, +or the second endpoint of a range. +To use a literal +.Sq - +as the first endpoint of a range, +enclose it in +.Sq [. +and +.Sq .] +to make it a collating element (see below). +With the exception of these and some combinations using +.Sq \&[ +(see next paragraphs), +all other special characters, including +.Sq \e , +lose their special significance within a bracket expression. +.Pp +Within a bracket expression, a collating element +(a character, +a multi-character sequence that collates as if it were a single character, +or a collating-sequence name for either) +enclosed in +.Sq [. +and +.Sq .] +stands for the sequence of characters of that collating element. +The sequence is a single element of the bracket expression's list. +A bracket expression containing a multi-character collating element +can thus match more than one character, +e.g. if the collating sequence includes a +.Sq ch +collating element, +then the RE +.Sq [[.ch.]]*c +matches the first five characters of +.Sq chchcc . +.Pp +Within a bracket expression, a collating element enclosed in +.Sq [= +and +.Sq =] +is an equivalence class, standing for the sequences of characters +of all collating elements equivalent to that one, including itself. +(If there are no other equivalent collating elements, +the treatment is as if the enclosing delimiters were +.Sq [. +and +.Sq .] . ) +For example, if +.Sq x +and +.Sq y +are the members of an equivalence class, +then +.Sq [[=x=]] , +.Sq [[=y=]] , +and +.Sq [xy] +are all synonymous. +An equivalence class may not** be an endpoint of a range. +.Pp +Within a bracket expression, the name of a +.Em character class +enclosed +in +.Sq [: +and +.Sq :] +stands for the list of all characters belonging to that class. +Standard character class names are: +.Bd -literal -offset indent +alnum digit punct +alpha graph space +blank lower upper +cntrl print xdigit +.Ed +.Pp +These stand for the character classes defined in +.Xr isalnum 3 , +.Xr isalpha 3 , +and so on. +A character class may not be used as an endpoint of a range. +.Pp +There are two special cases** of bracket expressions: +the bracket expressions +.Sq [[:<:]] +and +.Sq [[:>:]] +match the null string at the beginning and end of a word, respectively. +A word is defined as a sequence of +characters starting and ending with a word character +which is neither preceded nor followed by +word characters. +A word character is an +.Em alnum +character (as defined by +.Xr isalnum 3 ) +or an underscore. +This is an extension, +compatible with but not specified by POSIX, +and should be used with +caution in software intended to be portable to other systems. +The additional word delimiters +.Ql \e< +and +.Ql \e> +are provided to ease compatibility with traditional SVR4 +systems but are not portable and should be avoided. +.Pp +In the event that an RE could match more than one substring of a given +string, +the RE matches the one starting earliest in the string. +If the RE could match more than one substring starting at that point, +it matches the longest. +Subexpressions also match the longest possible substrings, subject to +the constraint that the whole match be as long as possible, +with subexpressions starting earlier in the RE taking priority over +ones starting later. +Note that higher-level subexpressions thus take priority over +their lower-level component subexpressions. +.Pp +Match lengths are measured in characters, not collating elements. +A null string is considered longer than no match at all. +For example, +.Sq bb* +matches the three middle characters of +.Sq abbbc ; +.Sq (wee|week)(knights|nights) +matches all ten characters of +.Sq weeknights ; +when +.Sq (.*).* +is matched against +.Sq abc , +the parenthesized subexpression matches all three characters; +and when +.Sq (a*)* +is matched against +.Sq bc , +both the whole RE and the parenthesized subexpression match the null string. +.Pp +If case-independent matching is specified, +the effect is much as if all case distinctions had vanished from the +alphabet. +When an alphabetic that exists in multiple cases appears as an +ordinary character outside a bracket expression, it is effectively +transformed into a bracket expression containing both cases, +e.g.\& +.Sq x +becomes +.Sq [xX] . +When it appears inside a bracket expression, +all case counterparts of it are added to the bracket expression, +so that, for example, +.Sq [x] +becomes +.Sq [xX] +and +.Sq [^x] +becomes +.Sq [^xX] . +.Pp +No particular limit is imposed on the length of REs**. +Programs intended to be portable should not employ REs longer +than 256 bytes, +as an implementation can refuse to accept such REs and remain +POSIX-compliant. +.Pp +The following is a list of extended regular expressions: +.Bl -tag -width Ds +.It Ar c +Any character +.Ar c +not listed below matches itself. +.It \e Ns Ar c +Any backslash-escaped character +.Ar c +matches itself. +.It \&. +Matches any single character that is not a newline +.Pq Sq \en . +.It Bq Ar char-class +Matches any single character in +.Ar char-class . +To include a +.Ql \&] +in +.Ar char-class , +it must be the first character. +A range of characters may be specified by separating the end characters +of the range with a +.Ql - ; +e.g.\& +.Ar a-z +specifies the lower case characters. +The following literal expressions can also be used in +.Ar char-class +to specify sets of characters: +.Bd -unfilled -offset indent +[:alnum:] [:cntrl:] [:lower:] [:space:] +[:alpha:] [:digit:] [:print:] [:upper:] +[:blank:] [:graph:] [:punct:] [:xdigit:] +.Ed +.Pp +If +.Ql - +appears as the first or last character of +.Ar char-class , +then it matches itself. +All other characters in +.Ar char-class +match themselves. +.Pp +Patterns in +.Ar char-class +of the form +.Eo [. +.Ar col-elm +.Ec .]\& +or +.Eo [= +.Ar col-elm +.Ec =]\& , +where +.Ar col-elm +is a collating element, are interpreted according to +.Xr setlocale 3 +.Pq not currently supported . +.It Bq ^ Ns Ar char-class +Matches any single character, other than newline, not in +.Ar char-class . +.Ar char-class +is defined as above. +.It ^ +If +.Sq ^ +is the first character of a regular expression, then it +anchors the regular expression to the beginning of a line. +Otherwise, it matches itself. +.It $ +If +.Sq $ +is the last character of a regular expression, +it anchors the regular expression to the end of a line. +Otherwise, it matches itself. +.It [[:<:]] +Anchors the single character regular expression or subexpression +immediately following it to the beginning of a word. +.It [[:>:]] +Anchors the single character regular expression or subexpression +immediately preceding it to the end of a word. +.It Pq Ar re +Defines a subexpression +.Ar re . +Any set of characters enclosed in parentheses +matches whatever the set of characters without parentheses matches +(that is a long-winded way of saying the constructs +.Sq (re) +and +.Sq re +match identically). +.It * +Matches the single character regular expression or subexpression +immediately preceding it zero or more times. +If +.Sq * +is the first character of a regular expression or subexpression, +then it matches itself. +The +.Sq * +operator sometimes yields unexpected results. +For example, the regular expression +.Ar b* +matches the beginning of the string +.Qq abbb +(as opposed to the substring +.Qq bbb ) , +since a null match is the only leftmost match. +.It + +Matches the singular character regular expression +or subexpression immediately preceding it +one or more times. +.It ? +Matches the singular character regular expression +or subexpression immediately preceding it +0 or 1 times. +.Sm off +.It Xo +.Pf { Ar n , m No }\ \& +.Pf { Ar n , No }\ \& +.Pf { Ar n No } +.Xc +.Sm on +Matches the single character regular expression or subexpression +immediately preceding it at least +.Ar n +and at most +.Ar m +times. +If +.Ar m +is omitted, then it matches at least +.Ar n +times. +If the comma is also omitted, then it matches exactly +.Ar n +times. +.It | +Used to separate patterns. +For example, +the pattern +.Sq cat|dog +matches either +.Sq cat +or +.Sq dog . +.El +.Sh BASIC REGULAR EXPRESSIONS +Basic regular expressions differ in several respects: +.Bl -bullet -offset 3n +.It +.Sq | , +.Sq + , +and +.Sq ?\& +are ordinary characters and there is no equivalent +for their functionality. +.It +The delimiters for bounds are +.Sq \e{ +and +.Sq \e} , +with +.Sq { +and +.Sq } +by themselves ordinary characters. +.It +The parentheses for nested subexpressions are +.Sq \e( +and +.Sq \e) , +with +.Sq \&( +and +.Sq )\& +by themselves ordinary characters. +.It +.Sq ^ +is an ordinary character except at the beginning of the +RE or** the beginning of a parenthesized subexpression. +.It +.Sq $ +is an ordinary character except at the end of the +RE or** the end of a parenthesized subexpression. +.It +.Sq * +is an ordinary character if it appears at the beginning of the +RE or the beginning of a parenthesized subexpression +(after a possible leading +.Sq ^ ) . +.It +Finally, there is one new type of atom, a +.Em back-reference : +.Sq \e +followed by a non-zero decimal digit +.Ar d +matches the same sequence of characters matched by the +.Ar d Ns th +parenthesized subexpression +(numbering subexpressions by the positions of their opening parentheses, +left to right), +so that, for example, +.Sq \e([bc]\e)\e1 +matches +.Sq bb\& +or +.Sq cc +but not +.Sq bc . +.El +.Pp +The following is a list of basic regular expressions: +.Bl -tag -width Ds +.It Ar c +Any character +.Ar c +not listed below matches itself. +.It \e Ns Ar c +Any backslash-escaped character +.Ar c , +except for +.Sq { , +.Sq } , +.Sq \&( , +and +.Sq \&) , +matches itself. +.It \&. +Matches any single character that is not a newline +.Pq Sq \en . +.It Bq Ar char-class +Matches any single character in +.Ar char-class . +To include a +.Ql \&] +in +.Ar char-class , +it must be the first character. +A range of characters may be specified by separating the end characters +of the range with a +.Ql - ; +e.g.\& +.Ar a-z +specifies the lower case characters. +The following literal expressions can also be used in +.Ar char-class +to specify sets of characters: +.Bd -unfilled -offset indent +[:alnum:] [:cntrl:] [:lower:] [:space:] +[:alpha:] [:digit:] [:print:] [:upper:] +[:blank:] [:graph:] [:punct:] [:xdigit:] +.Ed +.Pp +If +.Ql - +appears as the first or last character of +.Ar char-class , +then it matches itself. +All other characters in +.Ar char-class +match themselves. +.Pp +Patterns in +.Ar char-class +of the form +.Eo [. +.Ar col-elm +.Ec .]\& +or +.Eo [= +.Ar col-elm +.Ec =]\& , +where +.Ar col-elm +is a collating element, are interpreted according to +.Xr setlocale 3 +.Pq not currently supported . +.It Bq ^ Ns Ar char-class +Matches any single character, other than newline, not in +.Ar char-class . +.Ar char-class +is defined as above. +.It ^ +If +.Sq ^ +is the first character of a regular expression, then it +anchors the regular expression to the beginning of a line. +Otherwise, it matches itself. +.It $ +If +.Sq $ +is the last character of a regular expression, +it anchors the regular expression to the end of a line. +Otherwise, it matches itself. +.It [[:<:]] +Anchors the single character regular expression or subexpression +immediately following it to the beginning of a word. +.It [[:>:]] +Anchors the single character regular expression or subexpression +immediately following it to the end of a word. +.It \e( Ns Ar re Ns \e) +Defines a subexpression +.Ar re . +Subexpressions may be nested. +A subsequent backreference of the form +.Pf \e Ar n , +where +.Ar n +is a number in the range [1,9], expands to the text matched by the +.Ar n Ns th +subexpression. +For example, the regular expression +.Ar \e(.*\e)\e1 +matches any string consisting of identical adjacent substrings. +Subexpressions are ordered relative to their left delimiter. +.It * +Matches the single character regular expression or subexpression +immediately preceding it zero or more times. +If +.Sq * +is the first character of a regular expression or subexpression, +then it matches itself. +The +.Sq * +operator sometimes yields unexpected results. +For example, the regular expression +.Ar b* +matches the beginning of the string +.Qq abbb +(as opposed to the substring +.Qq bbb ) , +since a null match is the only leftmost match. +.Sm off +.It Xo +.Pf \e{ Ar n , m No \e}\ \& +.Pf \e{ Ar n , No \e}\ \& +.Pf \e{ Ar n No \e} +.Xc +.Sm on +Matches the single character regular expression or subexpression +immediately preceding it at least +.Ar n +and at most +.Ar m +times. +If +.Ar m +is omitted, then it matches at least +.Ar n +times. +If the comma is also omitted, then it matches exactly +.Ar n +times. +.El +.Sh SEE ALSO +.Xr regex 3 +.Sh STANDARDS +.St -p1003.1-2004 : +Base Definitions, Chapter 9 (Regular Expressions). +.Sh BUGS +Having two kinds of REs is a botch. +.Pp +The current POSIX spec says that +.Sq )\& +is an ordinary character in the absence of an unmatched +.Sq \&( ; +this was an unintentional result of a wording error, +and change is likely. +Avoid relying on it. +.Pp +Back-references are a dreadful botch, +posing major problems for efficient implementations. +They are also somewhat vaguely defined +(does +.Sq a\e(\e(b\e)*\e2\e)*d +match +.Sq abbbd ? ) . +Avoid using them. +.Pp +POSIX's specification of case-independent matching is vague. +The +.Dq one case implies all cases +definition given above +is the current consensus among implementors as to the right interpretation. +.Pp +The syntax for word boundaries is incredibly ugly. diff --git a/lib/libopenbsd/regex/regcomp.c b/lib/libopenbsd/regex/regcomp.c @@ -0,0 +1,1528 @@ +/* $OpenBSD: regcomp.c,v 1.28 2015/12/28 22:08:18 mmcc Exp $ */ +/*- + * Copyright (c) 1992, 1993, 1994 Henry Spencer. + * Copyright (c) 1992, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Henry Spencer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 + */ + +#include <sys/types.h> +#include <stdio.h> +#include <string.h> +#include <ctype.h> +#include <limits.h> +#include <stdlib.h> +#include <regex.h> + +#include "utils.h" +#include "regex2.h" + +#include "cclass.h" +#include "cname.h" + +/* + * parse structure, passed up and down to avoid global variables and + * other clumsinesses + */ +struct parse { + char *next; /* next character in RE */ + char *end; /* end of string (-> NUL normally) */ + int error; /* has an error been seen? */ + sop *strip; /* malloced strip */ + sopno ssize; /* malloced strip size (allocated) */ + sopno slen; /* malloced strip length (used) */ + int ncsalloc; /* number of csets allocated */ + struct re_guts *g; +# define NPAREN 10 /* we need to remember () 1-9 for back refs */ + sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ + sopno pend[NPAREN]; /* -> ) ([0] unused) */ +}; + +static void p_ere(struct parse *, int); +static void p_ere_exp(struct parse *); +static void p_str(struct parse *); +static void p_bre(struct parse *, int, int); +static int p_simp_re(struct parse *, int); +static int p_count(struct parse *); +static void p_bracket(struct parse *); +static void p_b_term(struct parse *, cset *); +static void p_b_cclass(struct parse *, cset *); +static void p_b_eclass(struct parse *, cset *); +static char p_b_symbol(struct parse *); +static char p_b_coll_elem(struct parse *, int); +static char othercase(int); +static void bothcases(struct parse *, int); +static void ordinary(struct parse *, int); +static void backslash(struct parse *, int); +static void nonnewline(struct parse *); +static void repeat(struct parse *, sopno, int, int); +static int seterr(struct parse *, int); +static cset *allocset(struct parse *); +static void freeset(struct parse *, cset *); +static int freezeset(struct parse *, cset *); +static int firstch(struct parse *, cset *); +static int nch(struct parse *, cset *); +static void mcadd(struct parse *, cset *, char *); +static void mcinvert(struct parse *, cset *); +static void mccase(struct parse *, cset *); +static int isinsets(struct re_guts *, int); +static int samesets(struct re_guts *, int, int); +static void categorize(struct parse *, struct re_guts *); +static sopno dupl(struct parse *, sopno, sopno); +static void doemit(struct parse *, sop, size_t); +static void doinsert(struct parse *, sop, size_t, sopno); +static void dofwd(struct parse *, sopno, sop); +static int enlarge(struct parse *, sopno); +static void stripsnug(struct parse *, struct re_guts *); +static void findmust(struct parse *, struct re_guts *); +static sopno pluscount(struct parse *, struct re_guts *); + +static char nuls[10]; /* place to point scanner in event of error */ + +/* + * macros for use with parse structure + * BEWARE: these know that the parse structure is named `p' !!! + */ +#define PEEK() (*p->next) +#define PEEK2() (*(p->next+1)) +#define MORE() (p->next < p->end) +#define MORE2() (p->next+1 < p->end) +#define SEE(c) (MORE() && PEEK() == (c)) +#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) +#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) +#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) +#define NEXT() (p->next++) +#define NEXT2() (p->next += 2) +#define NEXTn(n) (p->next += (n)) +#define GETNEXT() (*p->next++) +#define SETERROR(e) seterr(p, (e)) +#define REQUIRE(co, e) ((co) || SETERROR(e)) +#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) +#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) +#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) +#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) +#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) +#define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) +#define ASTERN(sop, pos) EMIT(sop, HERE()-pos) +#define HERE() (p->slen) +#define THERE() (p->slen - 1) +#define THERETHERE() (p->slen - 2) +#define DROP(n) (p->slen -= (n)) + +#ifndef NDEBUG +static int never = 0; /* for use in asserts; shuts lint up */ +#else +#define never 0 /* some <assert.h>s have bugs too */ +#endif + +/* + - regcomp - interface for parser and compilation + */ +int /* 0 success, otherwise REG_something */ +regcomp(regex_t *preg, const char *pattern, int cflags) +{ + struct parse pa; + struct re_guts *g; + struct parse *p = &pa; + int i; + size_t len; +#ifdef REDEBUG +# define GOODFLAGS(f) (f) +#else +# define GOODFLAGS(f) ((f)&~REG_DUMP) +#endif + + cflags = GOODFLAGS(cflags); + if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC)) + return(REG_INVARG); + + if (cflags&REG_PEND) { + if (preg->re_endp < pattern) + return(REG_INVARG); + len = preg->re_endp - pattern; + } else + len = strlen((char *)pattern); + + /* do the mallocs early so failure handling is easy */ + g = malloc(sizeof(struct re_guts)); + if (g == NULL) + return(REG_ESPACE); + p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ + p->strip = reallocarray(NULL, p->ssize, sizeof(sop)); + p->slen = 0; + if (p->strip == NULL) { + free(g); + return(REG_ESPACE); + } + + /* set things up */ + p->g = g; + p->next = (char *)pattern; /* convenience; we do not modify it */ + p->end = p->next + len; + p->error = 0; + p->ncsalloc = 0; + for (i = 0; i < NPAREN; i++) { + p->pbegin[i] = 0; + p->pend[i] = 0; + } + g->csetsize = NC; + g->sets = NULL; + g->setbits = NULL; + g->ncsets = 0; + g->cflags = cflags; + g->iflags = 0; + g->nbol = 0; + g->neol = 0; + g->must = NULL; + g->mlen = 0; + g->nsub = 0; + g->ncategories = 1; /* category 0 is "everything else" */ + g->categories = &g->catspace[-(CHAR_MIN)]; + memset(g->catspace, 0, sizeof(g->catspace)); + g->backrefs = 0; + + /* do it */ + EMIT(OEND, 0); + g->firststate = THERE(); + if (cflags&REG_EXTENDED) + p_ere(p, OUT); + else if (cflags&REG_NOSPEC) + p_str(p); + else + p_bre(p, OUT, OUT); + EMIT(OEND, 0); + g->laststate = THERE(); + + /* tidy up loose ends and fill things in */ + categorize(p, g); + stripsnug(p, g); + findmust(p, g); + g->nplus = pluscount(p, g); + g->magic = MAGIC2; + preg->re_nsub = g->nsub; + preg->re_g = g; + preg->re_magic = MAGIC1; +#ifndef REDEBUG + /* not debugging, so can't rely on the assert() in regexec() */ + if (g->iflags&BAD) + SETERROR(REG_ASSERT); +#endif + + /* win or lose, we're done */ + if (p->error != 0) /* lose */ + regfree(preg); + return(p->error); +} + +/* + - p_ere - ERE parser top level, concatenation and alternation + */ +static void +p_ere(struct parse *p, int stop) /* character this ERE should end at */ +{ + char c; + sopno prevback; + sopno prevfwd; + sopno conc; + int first = 1; /* is this the first alternative? */ + + for (;;) { + /* do a bunch of concatenated expressions */ + conc = HERE(); + while (MORE() && (c = PEEK()) != '|' && c != stop) + p_ere_exp(p); + REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ + + if (!EAT('|')) + break; /* NOTE BREAK OUT */ + + if (first) { + INSERT(OCH_, conc); /* offset is wrong */ + prevfwd = conc; + prevback = conc; + first = 0; + } + ASTERN(OOR1, prevback); + prevback = THERE(); + AHEAD(prevfwd); /* fix previous offset */ + prevfwd = HERE(); + EMIT(OOR2, 0); /* offset is very wrong */ + } + + if (!first) { /* tail-end fixups */ + AHEAD(prevfwd); + ASTERN(O_CH, prevback); + } + + assert(!MORE() || SEE(stop)); +} + +/* + - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op + */ +static void +p_ere_exp(struct parse *p) +{ + char c; + sopno pos; + int count; + int count2; + sopno subno; + int wascaret = 0; + + assert(MORE()); /* caller should have ensured this */ + c = GETNEXT(); + + pos = HERE(); + switch (c) { + case '(': + REQUIRE(MORE(), REG_EPAREN); + p->g->nsub++; + subno = p->g->nsub; + if (subno < NPAREN) + p->pbegin[subno] = HERE(); + EMIT(OLPAREN, subno); + if (!SEE(')')) + p_ere(p, ')'); + if (subno < NPAREN) { + p->pend[subno] = HERE(); + assert(p->pend[subno] != 0); + } + EMIT(ORPAREN, subno); + MUSTEAT(')', REG_EPAREN); + break; + case '^': + EMIT(OBOL, 0); + p->g->iflags |= USEBOL; + p->g->nbol++; + wascaret = 1; + break; + case '$': + EMIT(OEOL, 0); + p->g->iflags |= USEEOL; + p->g->neol++; + break; + case '|': + SETERROR(REG_EMPTY); + break; + case '*': + case '+': + case '?': + SETERROR(REG_BADRPT); + break; + case '.': + if (p->g->cflags&REG_NEWLINE) + nonnewline(p); + else + EMIT(OANY, 0); + break; + case '[': + p_bracket(p); + break; + case '\\': + REQUIRE(MORE(), REG_EESCAPE); + c = GETNEXT(); + backslash(p, c); + break; + case '{': /* okay as ordinary except if digit follows */ + REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); + /* FALLTHROUGH */ + default: + ordinary(p, c); + break; + } + + if (!MORE()) + return; + c = PEEK(); + /* we call { a repetition if followed by a digit */ + if (!( c == '*' || c == '+' || c == '?' || + (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) + return; /* no repetition, we're done */ + NEXT(); + + REQUIRE(!wascaret, REG_BADRPT); + switch (c) { + case '*': /* implemented as +? */ + /* this case does not require the (y|) trick, noKLUDGE */ + INSERT(OPLUS_, pos); + ASTERN(O_PLUS, pos); + INSERT(OQUEST_, pos); + ASTERN(O_QUEST, pos); + break; + case '+': + INSERT(OPLUS_, pos); + ASTERN(O_PLUS, pos); + break; + case '?': + /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ + INSERT(OCH_, pos); /* offset slightly wrong */ + ASTERN(OOR1, pos); /* this one's right */ + AHEAD(pos); /* fix the OCH_ */ + EMIT(OOR2, 0); /* offset very wrong... */ + AHEAD(THERE()); /* ...so fix it */ + ASTERN(O_CH, THERETHERE()); + break; + case '{': + count = p_count(p); + if (EAT(',')) { + if (isdigit((uch)PEEK())) { + count2 = p_count(p); + REQUIRE(count <= count2, REG_BADBR); + } else /* single number with comma */ + count2 = INFINITY; + } else /* just a single number */ + count2 = count; + repeat(p, pos, count, count2); + if (!EAT('}')) { /* error heuristics */ + while (MORE() && PEEK() != '}') + NEXT(); + REQUIRE(MORE(), REG_EBRACE); + SETERROR(REG_BADBR); + } + break; + } + + if (!MORE()) + return; + c = PEEK(); + if (!( c == '*' || c == '+' || c == '?' || + (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) + return; + SETERROR(REG_BADRPT); +} + +/* + - p_str - string (no metacharacters) "parser" + */ +static void +p_str(struct parse *p) +{ + REQUIRE(MORE(), REG_EMPTY); + while (MORE()) + ordinary(p, GETNEXT()); +} + +/* + - p_bre - BRE parser top level, anchoring and concatenation + * Giving end1 as OUT essentially eliminates the end1/end2 check. + * + * This implementation is a bit of a kludge, in that a trailing $ is first + * taken as an ordinary character and then revised to be an anchor. The + * only undesirable side effect is that '$' gets included as a character + * category in such cases. This is fairly harmless; not worth fixing. + * The amount of lookahead needed to avoid this kludge is excessive. + */ +static void +p_bre(struct parse *p, + int end1, /* first terminating character */ + int end2) /* second terminating character */ +{ + sopno start = HERE(); + int first = 1; /* first subexpression? */ + int wasdollar = 0; + + if (EAT('^')) { + EMIT(OBOL, 0); + p->g->iflags |= USEBOL; + p->g->nbol++; + } + while (MORE() && !SEETWO(end1, end2)) { + wasdollar = p_simp_re(p, first); + first = 0; + } + if (wasdollar) { /* oops, that was a trailing anchor */ + DROP(1); + EMIT(OEOL, 0); + p->g->iflags |= USEEOL; + p->g->neol++; + } + + REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ +} + +/* + - p_simp_re - parse a simple RE, an atom possibly followed by a repetition + */ +static int /* was the simple RE an unbackslashed $? */ +p_simp_re(struct parse *p, + int starordinary) /* is a leading * an ordinary character? */ +{ + int c; + int count; + int count2; + sopno pos; + int i; + sopno subno; +# define BACKSL (1<<CHAR_BIT) + + pos = HERE(); /* repetion op, if any, covers from here */ + + assert(MORE()); /* caller should have ensured this */ + c = GETNEXT(); + if (c == '\\') { + REQUIRE(MORE(), REG_EESCAPE); + c = BACKSL | GETNEXT(); + } + switch (c) { + case '.': + if (p->g->cflags&REG_NEWLINE) + nonnewline(p); + else + EMIT(OANY, 0); + break; + case '[': + p_bracket(p); + break; + case BACKSL|'<': + EMIT(OBOW, 0); + break; + case BACKSL|'>': + EMIT(OEOW, 0); + break; + case BACKSL|'{': + SETERROR(REG_BADRPT); + break; + case BACKSL|'(': + p->g->nsub++; + subno = p->g->nsub; + if (subno < NPAREN) + p->pbegin[subno] = HERE(); + EMIT(OLPAREN, subno); + /* the MORE here is an error heuristic */ + if (MORE() && !SEETWO('\\', ')')) + p_bre(p, '\\', ')'); + if (subno < NPAREN) { + p->pend[subno] = HERE(); + assert(p->pend[subno] != 0); + } + EMIT(ORPAREN, subno); + REQUIRE(EATTWO('\\', ')'), REG_EPAREN); + break; + case BACKSL|')': /* should not get here -- must be user */ + case BACKSL|'}': + SETERROR(REG_EPAREN); + break; + case BACKSL|'1': + case BACKSL|'2': + case BACKSL|'3': + case BACKSL|'4': + case BACKSL|'5': + case BACKSL|'6': + case BACKSL|'7': + case BACKSL|'8': + case BACKSL|'9': + i = (c&~BACKSL) - '0'; + assert(i < NPAREN); + if (p->pend[i] != 0) { + assert(i <= p->g->nsub); + EMIT(OBACK_, i); + assert(p->pbegin[i] != 0); + assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); + assert(OP(p->strip[p->pend[i]]) == ORPAREN); + (void) dupl(p, p->pbegin[i]+1, p->pend[i]); + EMIT(O_BACK, i); + } else + SETERROR(REG_ESUBREG); + p->g->backrefs = 1; + break; + case '*': + REQUIRE(starordinary, REG_BADRPT); + /* FALLTHROUGH */ + default: + ordinary(p, (char)c); + break; + } + + if (EAT('*')) { /* implemented as +? */ + /* this case does not require the (y|) trick, noKLUDGE */ + INSERT(OPLUS_, pos); + ASTERN(O_PLUS, pos); + INSERT(OQUEST_, pos); + ASTERN(O_QUEST, pos); + } else if (EATTWO('\\', '{')) { + count = p_count(p); + if (EAT(',')) { + if (MORE() && isdigit((uch)PEEK())) { + count2 = p_count(p); + REQUIRE(count <= count2, REG_BADBR); + } else /* single number with comma */ + count2 = INFINITY; + } else /* just a single number */ + count2 = count; + repeat(p, pos, count, count2); + if (!EATTWO('\\', '}')) { /* error heuristics */ + while (MORE() && !SEETWO('\\', '}')) + NEXT(); + REQUIRE(MORE(), REG_EBRACE); + SETERROR(REG_BADBR); + } + } else if (c == '$') /* $ (but not \$) ends it */ + return(1); + + return(0); +} + +/* + - p_count - parse a repetition count + */ +static int /* the value */ +p_count(struct parse *p) +{ + int count = 0; + int ndigits = 0; + + while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { + count = count*10 + (GETNEXT() - '0'); + ndigits++; + } + + REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); + return(count); +} + +/* + - p_bracket - parse a bracketed character list + * + * Note a significant property of this code: if the allocset() did SETERROR, + * no set operations are done. + */ +static void +p_bracket(struct parse *p) +{ + cset *cs; + int invert = 0; + + /* Dept of Truly Sickening Special-Case Kludges */ + if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { + EMIT(OBOW, 0); + NEXTn(6); + return; + } + if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { + EMIT(OEOW, 0); + NEXTn(6); + return; + } + + if ((cs = allocset(p)) == NULL) { + /* allocset did set error status in p */ + return; + } + + if (EAT('^')) + invert++; /* make note to invert set at end */ + if (EAT(']')) + CHadd(cs, ']'); + else if (EAT('-')) + CHadd(cs, '-'); + while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) + p_b_term(p, cs); + if (EAT('-')) + CHadd(cs, '-'); + MUSTEAT(']', REG_EBRACK); + + if (p->error != 0) { /* don't mess things up further */ + freeset(p, cs); + return; + } + + if (p->g->cflags&REG_ICASE) { + int i; + int ci; + + for (i = p->g->csetsize - 1; i >= 0; i--) + if (CHIN(cs, i) && isalpha(i)) { + ci = othercase(i); + if (ci != i) + CHadd(cs, ci); + } + if (cs->multis != NULL) + mccase(p, cs); + } + if (invert) { + int i; + + for (i = p->g->csetsize - 1; i >= 0; i--) + if (CHIN(cs, i)) + CHsub(cs, i); + else + CHadd(cs, i); + if (p->g->cflags&REG_NEWLINE) + CHsub(cs, '\n'); + if (cs->multis != NULL) + mcinvert(p, cs); + } + + assert(cs->multis == NULL); /* xxx */ + + if (nch(p, cs) == 1) { /* optimize singleton sets */ + ordinary(p, firstch(p, cs)); + freeset(p, cs); + } else + EMIT(OANYOF, freezeset(p, cs)); +} + +/* + - p_b_term - parse one term of a bracketed character list + */ +static void +p_b_term(struct parse *p, cset *cs) +{ + char c; + char start, finish; + int i; + + /* classify what we've got */ + switch ((MORE()) ? PEEK() : '\0') { + case '[': + c = (MORE2()) ? PEEK2() : '\0'; + break; + case '-': + SETERROR(REG_ERANGE); + return; /* NOTE RETURN */ + break; + default: + c = '\0'; + break; + } + + switch (c) { + case ':': /* character class */ + NEXT2(); + REQUIRE(MORE(), REG_EBRACK); + c = PEEK(); + REQUIRE(c != '-' && c != ']', REG_ECTYPE); + p_b_cclass(p, cs); + REQUIRE(MORE(), REG_EBRACK); + REQUIRE(EATTWO(':', ']'), REG_ECTYPE); + break; + case '=': /* equivalence class */ + NEXT2(); + REQUIRE(MORE(), REG_EBRACK); + c = PEEK(); + REQUIRE(c != '-' && c != ']', REG_ECOLLATE); + p_b_eclass(p, cs); + REQUIRE(MORE(), REG_EBRACK); + REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); + break; + default: /* symbol, ordinary character, or range */ +/* xxx revision needed for multichar stuff */ + start = p_b_symbol(p); + if (SEE('-') && MORE2() && PEEK2() != ']') { + /* range */ + NEXT(); + if (EAT('-')) + finish = '-'; + else + finish = p_b_symbol(p); + } else + finish = start; +/* xxx what about signed chars here... */ + REQUIRE(start <= finish, REG_ERANGE); + for (i = start; i <= finish; i++) + CHadd(cs, i); + break; + } +} + +/* + - p_b_cclass - parse a character-class name and deal with it + */ +static void +p_b_cclass(struct parse *p, cset *cs) +{ + char *sp = p->next; + struct cclass *cp; + size_t len; + char *u; + char c; + + while (MORE() && isalpha((uch)PEEK())) + NEXT(); + len = p->next - sp; + for (cp = cclasses; cp->name != NULL; cp++) + if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') + break; + if (cp->name == NULL) { + /* oops, didn't find it */ + SETERROR(REG_ECTYPE); + return; + } + + u = cp->chars; + while ((c = *u++) != '\0') + CHadd(cs, c); + for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) + MCadd(p, cs, u); +} + +/* + - p_b_eclass - parse an equivalence-class name and deal with it + * + * This implementation is incomplete. xxx + */ +static void +p_b_eclass(struct parse *p, cset *cs) +{ + char c; + + c = p_b_coll_elem(p, '='); + CHadd(cs, c); +} + +/* + - p_b_symbol - parse a character or [..]ed multicharacter collating symbol + */ +static char /* value of symbol */ +p_b_symbol(struct parse *p) +{ + char value; + + REQUIRE(MORE(), REG_EBRACK); + if (!EATTWO('[', '.')) + return(GETNEXT()); + + /* collating symbol */ + value = p_b_coll_elem(p, '.'); + REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); + return(value); +} + +/* + - p_b_coll_elem - parse a collating-element name and look it up + */ +static char /* value of collating element */ +p_b_coll_elem(struct parse *p, + int endc) /* name ended by endc,']' */ +{ + char *sp = p->next; + struct cname *cp; + int len; + + while (MORE() && !SEETWO(endc, ']')) + NEXT(); + if (!MORE()) { + SETERROR(REG_EBRACK); + return(0); + } + len = p->next - sp; + for (cp = cnames; cp->name != NULL; cp++) + if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') + return(cp->code); /* known name */ + if (len == 1) + return(*sp); /* single character */ + SETERROR(REG_ECOLLATE); /* neither */ + return(0); +} + +/* + - othercase - return the case counterpart of an alphabetic + */ +static char /* if no counterpart, return ch */ +othercase(int ch) +{ + ch = (uch)ch; + assert(isalpha(ch)); + if (isupper(ch)) + return ((uch)tolower(ch)); + else if (islower(ch)) + return ((uch)toupper(ch)); + else /* peculiar, but could happen */ + return(ch); +} + +/* + - bothcases - emit a dualcase version of a two-case character + * + * Boy, is this implementation ever a kludge... + */ +static void +bothcases(struct parse *p, int ch) +{ + char *oldnext = p->next; + char *oldend = p->end; + char bracket[3]; + + ch = (uch)ch; + assert(othercase(ch) != ch); /* p_bracket() would recurse */ + p->next = bracket; + p->end = bracket+2; + bracket[0] = ch; + bracket[1] = ']'; + bracket[2] = '\0'; + p_bracket(p); + assert(p->next == bracket+2); + p->next = oldnext; + p->end = oldend; +} + +/* + - ordinary - emit an ordinary character + */ +static void +ordinary(struct parse *p, int ch) +{ + cat_t *cap = p->g->categories; + + if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch) + bothcases(p, ch); + else { + EMIT(OCHAR, (uch)ch); + if (cap[ch] == 0) + cap[ch] = p->g->ncategories++; + } +} + +/* + * do something magic with this character, but only if it's extra magic + */ +static void +backslash(struct parse *p, int ch) +{ + switch (ch) { + case '<': + EMIT(OBOW, 0); + break; + case '>': + EMIT(OEOW, 0); + break; + default: + ordinary(p, ch); + break; + } +} + +/* + - nonnewline - emit REG_NEWLINE version of OANY + * + * Boy, is this implementation ever a kludge... + */ +static void +nonnewline(struct parse *p) +{ + char *oldnext = p->next; + char *oldend = p->end; + char bracket[4]; + + p->next = bracket; + p->end = bracket+3; + bracket[0] = '^'; + bracket[1] = '\n'; + bracket[2] = ']'; + bracket[3] = '\0'; + p_bracket(p); + assert(p->next == bracket+3); + p->next = oldnext; + p->end = oldend; +} + +/* + - repeat - generate code for a bounded repetition, recursively if needed + */ +static void +repeat(struct parse *p, + sopno start, /* operand from here to end of strip */ + int from, /* repeated from this number */ + int to) /* to this number of times (maybe INFINITY) */ +{ + sopno finish = HERE(); +# define N 2 +# define INF 3 +# define REP(f, t) ((f)*8 + (t)) +# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) + sopno copy; + + if (p->error != 0) /* head off possible runaway recursion */ + return; + + assert(from <= to); + + switch (REP(MAP(from), MAP(to))) { + case REP(0, 0): /* must be user doing this */ + DROP(finish-start); /* drop the operand */ + break; + case REP(0, 1): /* as x{1,1}? */ + case REP(0, N): /* as x{1,n}? */ + case REP(0, INF): /* as x{1,}? */ + /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ + INSERT(OCH_, start); /* offset is wrong... */ + repeat(p, start+1, 1, to); + ASTERN(OOR1, start); + AHEAD(start); /* ... fix it */ + EMIT(OOR2, 0); + AHEAD(THERE()); + ASTERN(O_CH, THERETHERE()); + break; + case REP(1, 1): /* trivial case */ + /* done */ + break; + case REP(1, N): /* as x?x{1,n-1} */ + /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ + INSERT(OCH_, start); + ASTERN(OOR1, start); + AHEAD(start); + EMIT(OOR2, 0); /* offset very wrong... */ + AHEAD(THERE()); /* ...so fix it */ + ASTERN(O_CH, THERETHERE()); + copy = dupl(p, start+1, finish+1); + assert(copy == finish+4); + repeat(p, copy, 1, to-1); + break; + case REP(1, INF): /* as x+ */ + INSERT(OPLUS_, start); + ASTERN(O_PLUS, start); + break; + case REP(N, N): /* as xx{m-1,n-1} */ + copy = dupl(p, start, finish); + repeat(p, copy, from-1, to-1); + break; + case REP(N, INF): /* as xx{n-1,INF} */ + copy = dupl(p, start, finish); + repeat(p, copy, from-1, to); + break; + default: /* "can't happen" */ + SETERROR(REG_ASSERT); /* just in case */ + break; + } +} + +/* + - seterr - set an error condition + */ +static int /* useless but makes type checking happy */ +seterr(struct parse *p, int e) +{ + if (p->error == 0) /* keep earliest error condition */ + p->error = e; + p->next = nuls; /* try to bring things to a halt */ + p->end = nuls; + return(0); /* make the return value well-defined */ +} + +/* + - allocset - allocate a set of characters for [] + */ +static cset * +allocset(struct parse *p) +{ + int no = p->g->ncsets++; + size_t nc; + size_t nbytes; + cset *cs; + size_t css = (size_t)p->g->csetsize; + int i; + + if (no >= p->ncsalloc) { /* need another column of space */ + void *ptr; + + p->ncsalloc += CHAR_BIT; + nc = p->ncsalloc; + assert(nc % CHAR_BIT == 0); + + ptr = reallocarray(p->g->sets, nc, sizeof(cset)); + if (ptr == NULL) + goto nomem; + p->g->sets = ptr; + + ptr = reallocarray(p->g->setbits, nc / CHAR_BIT, css); + if (ptr == NULL) + goto nomem; + nbytes = (nc / CHAR_BIT) * css; + p->g->setbits = ptr; + + for (i = 0; i < no; i++) + p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); + + (void) memset((char *)p->g->setbits + (nbytes - css), 0, css); + } + /* XXX should not happen */ + if (p->g->sets == NULL || p->g->setbits == NULL) + goto nomem; + + cs = &p->g->sets[no]; + cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); + cs->mask = 1 << ((no) % CHAR_BIT); + cs->hash = 0; + cs->smultis = 0; + cs->multis = NULL; + + return(cs); +nomem: + free(p->g->sets); + p->g->sets = NULL; + free(p->g->setbits); + p->g->setbits = NULL; + + SETERROR(REG_ESPACE); + /* caller's responsibility not to do set ops */ + return(NULL); +} + +/* + - freeset - free a now-unused set + */ +static void +freeset(struct parse *p, cset *cs) +{ + int i; + cset *top = &p->g->sets[p->g->ncsets]; + size_t css = (size_t)p->g->csetsize; + + for (i = 0; i < css; i++) + CHsub(cs, i); + if (cs == top-1) /* recover only the easy case */ + p->g->ncsets--; +} + +/* + - freezeset - final processing on a set of characters + * + * The main task here is merging identical sets. This is usually a waste + * of time (although the hash code minimizes the overhead), but can win + * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash + * is done using addition rather than xor -- all ASCII [aA] sets xor to + * the same value! + */ +static int /* set number */ +freezeset(struct parse *p, cset *cs) +{ + uch h = cs->hash; + int i; + cset *top = &p->g->sets[p->g->ncsets]; + cset *cs2; + size_t css = (size_t)p->g->csetsize; + + /* look for an earlier one which is the same */ + for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) + if (cs2->hash == h && cs2 != cs) { + /* maybe */ + for (i = 0; i < css; i++) + if (!!CHIN(cs2, i) != !!CHIN(cs, i)) + break; /* no */ + if (i == css) + break; /* yes */ + } + + if (cs2 < top) { /* found one */ + freeset(p, cs); + cs = cs2; + } + + return((int)(cs - p->g->sets)); +} + +/* + - firstch - return first character in a set (which must have at least one) + */ +static int /* character; there is no "none" value */ +firstch(struct parse *p, cset *cs) +{ + int i; + size_t css = (size_t)p->g->csetsize; + + for (i = 0; i < css; i++) + if (CHIN(cs, i)) + return((char)i); + assert(never); + return(0); /* arbitrary */ +} + +/* + - nch - number of characters in a set + */ +static int +nch(struct parse *p, cset *cs) +{ + int i; + size_t css = (size_t)p->g->csetsize; + int n = 0; + + for (i = 0; i < css; i++) + if (CHIN(cs, i)) + n++; + return(n); +} + +/* + - mcadd - add a collating element to a cset + */ +static void +mcadd( struct parse *p, cset *cs, char *cp) +{ + size_t oldend = cs->smultis; + void *np; + + cs->smultis += strlen(cp) + 1; + np = realloc(cs->multis, cs->smultis); + if (np == NULL) { + free(cs->multis); + cs->multis = NULL; + SETERROR(REG_ESPACE); + return; + } + cs->multis = np; + + strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1); +} + +/* + - mcinvert - invert the list of collating elements in a cset + * + * This would have to know the set of possibilities. Implementation + * is deferred. + */ +static void +mcinvert(struct parse *p, cset *cs) +{ + assert(cs->multis == NULL); /* xxx */ +} + +/* + - mccase - add case counterparts of the list of collating elements in a cset + * + * This would have to know the set of possibilities. Implementation + * is deferred. + */ +static void +mccase(struct parse *p, cset *cs) +{ + assert(cs->multis == NULL); /* xxx */ +} + +/* + - isinsets - is this character in any sets? + */ +static int /* predicate */ +isinsets(struct re_guts *g, int c) +{ + uch *col; + int i; + int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; + unsigned uc = (uch)c; + + for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) + if (col[uc] != 0) + return(1); + return(0); +} + +/* + - samesets - are these two characters in exactly the same sets? + */ +static int /* predicate */ +samesets(struct re_guts *g, int c1, int c2) +{ + uch *col; + int i; + int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; + unsigned uc1 = (uch)c1; + unsigned uc2 = (uch)c2; + + for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) + if (col[uc1] != col[uc2]) + return(0); + return(1); +} + +/* + - categorize - sort out character categories + */ +static void +categorize(struct parse *p, struct re_guts *g) +{ + cat_t *cats = g->categories; + int c; + int c2; + cat_t cat; + + /* avoid making error situations worse */ + if (p->error != 0) + return; + + for (c = CHAR_MIN; c <= CHAR_MAX; c++) + if (cats[c] == 0 && isinsets(g, c)) { + cat = g->ncategories++; + cats[c] = cat; + for (c2 = c+1; c2 <= CHAR_MAX; c2++) + if (cats[c2] == 0 && samesets(g, c, c2)) + cats[c2] = cat; + } +} + +/* + - dupl - emit a duplicate of a bunch of sops + */ +static sopno /* start of duplicate */ +dupl(struct parse *p, + sopno start, /* from here */ + sopno finish) /* to this less one */ +{ + sopno ret = HERE(); + sopno len = finish - start; + + assert(finish >= start); + if (len == 0) + return(ret); + if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */ + return(ret); + (void) memcpy((char *)(p->strip + p->slen), + (char *)(p->strip + start), (size_t)len*sizeof(sop)); + p->slen += len; + return(ret); +} + +/* + - doemit - emit a strip operator + * + * It might seem better to implement this as a macro with a function as + * hard-case backup, but it's just too big and messy unless there are + * some changes to the data structures. Maybe later. + */ +static void +doemit(struct parse *p, sop op, size_t opnd) +{ + /* avoid making error situations worse */ + if (p->error != 0) + return; + + /* deal with oversize operands ("can't happen", more or less) */ + assert(opnd < 1<<OPSHIFT); + + /* deal with undersized strip */ + if (p->slen >= p->ssize) + if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ + return; + + /* finally, it's all reduced to the easy case */ + p->strip[p->slen++] = SOP(op, opnd); +} + +/* + - doinsert - insert a sop into the strip + */ +static void +doinsert(struct parse *p, sop op, size_t opnd, sopno pos) +{ + sopno sn; + sop s; + int i; + + /* avoid making error situations worse */ + if (p->error != 0) + return; + + sn = HERE(); + EMIT(op, opnd); /* do checks, ensure space */ + assert(HERE() == sn+1); + s = p->strip[sn]; + + /* adjust paren pointers */ + assert(pos > 0); + for (i = 1; i < NPAREN; i++) { + if (p->pbegin[i] >= pos) { + p->pbegin[i]++; + } + if (p->pend[i] >= pos) { + p->pend[i]++; + } + } + + memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], + (HERE()-pos-1)*sizeof(sop)); + p->strip[pos] = s; +} + +/* + - dofwd - complete a forward reference + */ +static void +dofwd(struct parse *p, sopno pos, sop value) +{ + /* avoid making error situations worse */ + if (p->error != 0) + return; + + assert(value < 1<<OPSHIFT); + p->strip[pos] = OP(p->strip[pos]) | value; +} + +/* + - enlarge - enlarge the strip + */ +static int +enlarge(struct parse *p, sopno size) +{ + sop *sp; + + if (p->ssize >= size) + return 1; + + sp = reallocarray(p->strip, size, sizeof(sop)); + if (sp == NULL) { + SETERROR(REG_ESPACE); + return 0; + } + p->strip = sp; + p->ssize = size; + return 1; +} + +/* + - stripsnug - compact the strip + */ +static void +stripsnug(struct parse *p, struct re_guts *g) +{ + g->nstates = p->slen; + g->strip = reallocarray(p->strip, p->slen, sizeof(sop)); + if (g->strip == NULL) { + SETERROR(REG_ESPACE); + g->strip = p->strip; + } +} + +/* + - findmust - fill in must and mlen with longest mandatory literal string + * + * This algorithm could do fancy things like analyzing the operands of | + * for common subsequences. Someday. This code is simple and finds most + * of the interesting cases. + * + * Note that must and mlen got initialized during setup. + */ +static void +findmust(struct parse *p, struct re_guts *g) +{ + sop *scan; + sop *start; /* start initialized in the default case, after that */ + sop *newstart; /* newstart was initialized in the OCHAR case */ + sopno newlen; + sop s; + char *cp; + sopno i; + + /* avoid making error situations worse */ + if (p->error != 0) + return; + + /* find the longest OCHAR sequence in strip */ + newlen = 0; + scan = g->strip + 1; + do { + s = *scan++; + switch (OP(s)) { + case OCHAR: /* sequence member */ + if (newlen == 0) /* new sequence */ + newstart = scan - 1; + newlen++; + break; + case OPLUS_: /* things that don't break one */ + case OLPAREN: + case ORPAREN: + break; + case OQUEST_: /* things that must be skipped */ + case OCH_: + scan--; + do { + scan += OPND(s); + s = *scan; + /* assert() interferes w debug printouts */ + if (OP(s) != O_QUEST && OP(s) != O_CH && + OP(s) != OOR2) { + g->iflags |= BAD; + return; + } + } while (OP(s) != O_QUEST && OP(s) != O_CH); + /* fallthrough */ + default: /* things that break a sequence */ + if (newlen > g->mlen) { /* ends one */ + start = newstart; + g->mlen = newlen; + } + newlen = 0; + break; + } + } while (OP(s) != OEND); + + if (g->mlen == 0) /* there isn't one */ + return; + + /* turn it into a character string */ + g->must = malloc((size_t)g->mlen + 1); + if (g->must == NULL) { /* argh; just forget it */ + g->mlen = 0; + return; + } + cp = g->must; + scan = start; + for (i = g->mlen; i > 0; i--) { + while (OP(s = *scan++) != OCHAR) + continue; + assert(cp < g->must + g->mlen); + *cp++ = (char)OPND(s); + } + assert(cp == g->must + g->mlen); + *cp++ = '\0'; /* just on general principles */ +} + +/* + - pluscount - count + nesting + */ +static sopno /* nesting depth */ +pluscount(struct parse *p, struct re_guts *g) +{ + sop *scan; + sop s; + sopno plusnest = 0; + sopno maxnest = 0; + + if (p->error != 0) + return(0); /* there may not be an OEND */ + + scan = g->strip + 1; + do { + s = *scan++; + switch (OP(s)) { + case OPLUS_: + plusnest++; + break; + case O_PLUS: + if (plusnest > maxnest) + maxnest = plusnest; + plusnest--; + break; + } + } while (OP(s) != OEND); + if (plusnest != 0) + g->iflags |= BAD; + return(maxnest); +} diff --git a/lib/libopenbsd/regex/regerror.c b/lib/libopenbsd/regex/regerror.c @@ -0,0 +1,129 @@ +/* $OpenBSD: regerror.c,v 1.14 2015/11/01 03:45:29 guenther Exp $ */ +/*- + * Copyright (c) 1992, 1993, 1994 Henry Spencer. + * Copyright (c) 1992, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Henry Spencer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)regerror.c 8.4 (Berkeley) 3/20/94 + */ + +#include <sys/types.h> +#include <stdio.h> +#include <string.h> +#include <ctype.h> +#include <limits.h> +#include <stdlib.h> +#include <regex.h> + +#include "utils.h" + +static char *regatoi(const regex_t *, char *, int); + +static struct rerr { + int code; + char *name; + char *explain; +} rerrs[] = { + { REG_NOMATCH, "REG_NOMATCH", "regexec() failed to match" }, + { REG_BADPAT, "REG_BADPAT", "invalid regular expression" }, + { REG_ECOLLATE, "REG_ECOLLATE", "invalid collating element" }, + { REG_ECTYPE, "REG_ECTYPE", "invalid character class" }, + { REG_EESCAPE, "REG_EESCAPE", "trailing backslash (\\)" }, + { REG_ESUBREG, "REG_ESUBREG", "invalid backreference number" }, + { REG_EBRACK, "REG_EBRACK", "brackets ([ ]) not balanced" }, + { REG_EPAREN, "REG_EPAREN", "parentheses not balanced" }, + { REG_EBRACE, "REG_EBRACE", "braces not balanced" }, + { REG_BADBR, "REG_BADBR", "invalid repetition count(s)" }, + { REG_ERANGE, "REG_ERANGE", "invalid character range" }, + { REG_ESPACE, "REG_ESPACE", "out of memory" }, + { REG_BADRPT, "REG_BADRPT", "repetition-operator operand invalid" }, + { REG_EMPTY, "REG_EMPTY", "empty (sub)expression" }, + { REG_ASSERT, "REG_ASSERT", "\"can't happen\" -- you found a bug" }, + { REG_INVARG, "REG_INVARG", "invalid argument to regex routine" }, + { 0, "", "*** unknown regexp error code ***" } +}; + +/* + - regerror - the interface to error numbers + = extern size_t regerror(int, const regex_t *, char *, size_t); + */ +size_t +regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size) +{ + struct rerr *r; + size_t len; + int target = errcode &~ REG_ITOA; + char *s; + char convbuf[50]; + + if (errcode == REG_ATOI) + s = regatoi(preg, convbuf, sizeof convbuf); + else { + for (r = rerrs; r->code != 0; r++) + if (r->code == target) + break; + + if (errcode&REG_ITOA) { + if (r->code != 0) { + assert(strlen(r->name) < sizeof(convbuf)); + (void) strlcpy(convbuf, r->name, sizeof convbuf); + } else + (void)snprintf(convbuf, sizeof convbuf, + "REG_0x%x", target); + s = convbuf; + } else + s = r->explain; + } + + len = strlen(s) + 1; + if (errbuf_size > 0) { + strlcpy(errbuf, s, errbuf_size); + } + + return(len); +} + +/* + - regatoi - internal routine to implement REG_ATOI + */ +static char * +regatoi(const regex_t *preg, char *localbuf, int localbufsize) +{ + struct rerr *r; + + for (r = rerrs; r->code != 0; r++) + if (strcmp(r->name, preg->re_endp) == 0) + break; + if (r->code == 0) + return("0"); + + (void)snprintf(localbuf, localbufsize, "%d", r->code); + return(localbuf); +} diff --git a/lib/libopenbsd/regex/regex.3 b/lib/libopenbsd/regex/regex.3 @@ -0,0 +1,709 @@ +.\" $OpenBSD: regex.3,v 1.27 2016/05/26 05:46:44 martijn Exp $ +.\" +.\" Copyright (c) 1997, Phillip F Knaack. All rights reserved. +.\" +.\" Copyright (c) 1992, 1993, 1994 Henry Spencer. +.\" Copyright (c) 1992, 1993, 1994 +.\" The Regents of the University of California. All rights reserved. +.\" +.\" This code is derived from software contributed to Berkeley by +.\" Henry Spencer. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 3. Neither the name of the University nor the names of its contributors +.\" may be used to endorse or promote products derived from this software +.\" without specific prior written permission. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" @(#)regex.3 8.4 (Berkeley) 3/20/94 +.\" +.Dd $Mdocdate: May 26 2016 $ +.Dt REGEXEC 3 +.Os +.Sh NAME +.Nm regcomp , +.Nm regexec , +.Nm regerror , +.Nm regfree +.Nd regular expression routines +.Sh SYNOPSIS +.In sys/types.h +.In regex.h +.Ft int +.Fn regcomp "regex_t *preg" "const char *pattern" "int cflags" +.Pp +.Ft int +.Fn regexec "const regex_t *preg" "const char *string" "size_t nmatch" \ + "regmatch_t pmatch[]" "int eflags" +.Pp +.Ft size_t +.Fn regerror "int errcode" "const regex_t *preg" "char *errbuf" \ + "size_t errbuf_size" +.Pp +.Ft void +.Fn regfree "regex_t *preg" +.Sh DESCRIPTION +These routines implement +.St -p1003.2 +regular expressions +.Pq Dq REs ; +see +.Xr re_format 7 . +.Fn regcomp +compiles an RE written as a string into an internal form, +.Fn regexec +matches that internal form against a string and reports results, +.Fn regerror +transforms error codes from either into human-readable messages, and +.Fn regfree +frees any dynamically allocated storage used by the internal form +of an RE. +.Pp +The header +.In regex.h +declares two structure types, +.Li regex_t +and +.Li regmatch_t , +the former for compiled internal forms and the latter for match reporting. +It also declares the four functions, +a type +.Li regoff_t , +and a number of constants with names starting with +.Dv REG_ . +.Pp +.Fn regcomp +compiles the regular expression contained in the +.Fa pattern +string, +subject to the flags in +.Fa cflags , +and places the results in the +.Li regex_t +structure pointed to by +.Fa preg . +The +.Fa cflags +argument is the bitwise +.Tn OR +of zero or more of the following values: +.Bl -tag -width XREG_EXTENDEDX +.It Dv REG_EXTENDED +Compile modern +.Pq Dq extended +REs, +rather than the obsolete +.Pq Dq basic +REs that are the default. +.It Dv REG_BASIC +This is a synonym for 0, +provided as a counterpart to +.Dv REG_EXTENDED +to improve readability. +.It Dv REG_NOSPEC +Compile with recognition of all special characters turned off. +All characters are thus considered ordinary, +so the RE is a literal string. +This is an extension, +compatible with but not specified by +.St -p1003.2 , +and should be used with +caution in software intended to be portable to other systems. +.Dv REG_EXTENDED +and +.Dv REG_NOSPEC +may not be used in the same call to +.Fn regcomp . +.It Dv REG_ICASE +Compile for matching that ignores upper/lower case distinctions. +See +.Xr re_format 7 . +.It Dv REG_NOSUB +Compile for matching that need only report success or failure, +not what was matched. +.It Dv REG_NEWLINE +Compile for newline-sensitive matching. +By default, newline is a completely ordinary character with no special +meaning in either REs or strings. +With this flag, +.Ql \&[^ +bracket expressions and +.Ql \&. +never match newline, +a +.Ql ^ +anchor matches the null string after any newline in the string +in addition to its normal function, +and the +.Ql $ +anchor matches the null string before any newline in the +string in addition to its normal function. +.It Dv REG_PEND +The regular expression ends, +not at the first NUL, +but just before the character pointed to by the +.Fa re_endp +member of the structure pointed to by +.Fa preg . +The +.Fa re_endp +member is of type +.Fa const\ char\ * . +This flag permits inclusion of NULs in the RE; +they are considered ordinary characters. +This is an extension, +compatible with but not specified by +.St -p1003.2 , +and should be used with +caution in software intended to be portable to other systems. +.El +.Pp +When successful, +.Fn regcomp +returns 0 and fills in the structure pointed to by +.Fa preg . +One member of that structure +(other than +.Fa re_endp ) +is publicized: +.Fa re_nsub , +of type +.Fa size_t , +contains the number of parenthesized subexpressions within the RE +(except that the value of this member is undefined if the +.Dv REG_NOSUB +flag was used). +If +.Fn regcomp +fails, it returns a non-zero error code; +see DIAGNOSTICS. +.Pp +.Fn regexec +matches the compiled RE pointed to by +.Fa preg +against the +.Fa string , +subject to the flags in +.Fa eflags , +and reports results using +.Fa nmatch , +.Fa pmatch , +and the returned value. +The RE must have been compiled by a previous invocation of +.Fn regcomp . +The compiled form is not altered during execution of +.Fn regexec , +so a single compiled RE can be used simultaneously by multiple threads. +.Pp +By default, +the NUL-terminated string pointed to by +.Fa string +is considered to be the text of an entire line, minus any terminating +newline. +The +.Fa eflags +argument is the bitwise +.Tn OR +of zero or more of the following values: +.Bl -tag -width XREG_STARTENDX +.It Dv REG_NOTBOL +The first character of the string is treated as the continuation +of a line. +This means that the anchors +.Ql ^ , +.Ql [[:<:]] , +and +.Ql \e< +do not match before it; but see +.Dv REG_STARTEND +below. +This does not affect the behavior of newlines under +.Dv REG_NEWLINE . +.It Dv REG_NOTEOL +The NUL terminating +the string +does not end a line, so the +.Ql $ +anchor does not match before it. +This does not affect the behavior of newlines under +.Dv REG_NEWLINE . +.It Dv REG_STARTEND +The string is considered to start at +.Fa string No + +.Fa pmatch Ns [0]. Ns Fa rm_so +and to end before the byte located at +.Fa string No + +.Fa pmatch Ns [0]. Ns Fa rm_eo , +regardless of the value of +.Fa nmatch . +See below for the definition of +.Fa pmatch +and +.Fa nmatch . +This is an extension, +compatible with but not specified by +.St -p1003.2 , +and should be used with +caution in software intended to be portable to other systems. +.Pp +Without +.Dv REG_NOTBOL , +the position +.Fa rm_so +is considered the beginning of a line, such that +.Ql ^ +matches before it, and the beginning of a word if there is a word +character at this position, such that +.Ql [[:<:]] +and +.Ql \e< +match before it. +.Pp +With +.Dv REG_NOTBOL , +the character at position +.Fa rm_so +is treated as the continuation of a line, and if +.Fa rm_so +is greater than 0, the preceding character is taken into consideration. +If the preceding character is a newline and the regular expression was compiled +with +.Dv REG_NEWLINE , +.Ql ^ +matches before the string; if the preceding character is not a word character +but the string starts with a word character, +.Ql [[:<:]] +and +.Ql \e< +match before the string. +.El +.Pp +See +.Xr re_format 7 +for a discussion of what is matched in situations where an RE or a +portion thereof could match any of several substrings of +.Fa string . +.Pp +Normally, +.Fn regexec +returns 0 for success and the non-zero code +.Dv REG_NOMATCH +for failure. +Other non-zero error codes may be returned in exceptional situations; +see DIAGNOSTICS. +.Pp +If +.Dv REG_NOSUB +was specified in the compilation of the RE, +or if +.Fa nmatch +is 0, +.Fn regexec +ignores the +.Fa pmatch +argument (but see below for the case where +.Dv REG_STARTEND +is specified). +Otherwise, +.Fa pmatch +points to an array of +.Fa nmatch +structures of type +.Li regmatch_t . +Such a structure has at least the members +.Fa rm_so +and +.Fa rm_eo , +both of type +.Fa regoff_t +(a signed arithmetic type at least as large as an +.Li off_t +and a +.Li ssize_t ) , +containing respectively the offset of the first character of a substring +and the offset of the first character after the end of the substring. +Offsets are measured from the beginning of the +.Fa string +argument given to +.Fn regexec . +An empty substring is denoted by equal offsets, +both indicating the character following the empty substring. +.Pp +The 0th member of the +.Fa pmatch +array is filled in to indicate what substring of +.Fa string +was matched by the entire RE. +Remaining members report what substring was matched by parenthesized +subexpressions within the RE; +member +.Va i +reports subexpression +.Va i , +with subexpressions counted (starting at 1) by the order of their opening +parentheses in the RE, left to right. +Unused entries in the array\(emcorresponding either to subexpressions that +did not participate in the match at all, or to subexpressions that do not +exist in the RE (that is, \fIi\fR\ > \fIpreg\fR\->\fIre_nsub\fR)\(emhave both +.Fa rm_so +and +.Fa rm_eo +set to \-1. +If a subexpression participated in the match several times, +the reported substring is the last one it matched. +(Note, as an example in particular, that when the RE +.Dq (b*)+ +matches +.Dq bbb , +the parenthesized subexpression matches each of the three +.Sq b Ns s +and then +an infinite number of empty strings following the last +.Sq b , +so the reported substring is one of the empties.) +.Pp +If +.Dv REG_STARTEND +is specified, +.Fa pmatch +must point to at least one +.Li regmatch_t +(even if +.Fa nmatch +is 0 or +.Dv REG_NOSUB +was specified), +to hold the input offsets for +.Dv REG_STARTEND . +Use for output is still entirely controlled by +.Fa nmatch ; +if +.Fa nmatch +is 0 or +.Dv REG_NOSUB +was specified, +the value of +.Fa pmatch[0] +will not be changed by a successful +.Fn regexec . +.Pp +.Fn regerror +maps a non-zero +.Va errcode +from either +.Fn regcomp +or +.Fn regexec +to a human-readable, printable message. +If +.Fa preg +is non-NULL, +the error code should have arisen from use of +the +.Li regex_t +pointed to by +.Fa preg , +and if the error code came from +.Fn regcomp , +it should have been the result from the most recent +.Fn regcomp +using that +.Li regex_t . +.Pf ( Fn regerror +may be able to supply a more detailed message using information +from the +.Li regex_t . ) +.Fn regerror +places the NUL-terminated message into the buffer pointed to by +.Fa errbuf , +limiting the length (including the NUL) to at most +.Fa errbuf_size +bytes. +If the whole message won't fit, +as much of it as will fit before the terminating NUL is supplied. +In any case, +the returned value is the size of buffer needed to hold the whole +message (including the terminating NUL). +If +.Fa errbuf_size +is 0, +.Fa errbuf +is ignored but the return value is still correct. +.Pp +If the +.Fa errcode +given to +.Fn regerror +is first +.Tn OR Ns 'ed +with +.Dv REG_ITOA , +the +.Dq message +that results is the printable name of the error code, +e.g., +.Dq REG_NOMATCH , +rather than an explanation thereof. +If +.Fa errcode +is +.Dv REG_ATOI , +then +.Fa preg +shall be non-null and the +.Fa re_endp +member of the structure it points to +must point to the printable name of an error code; +in this case, the result in +.Fa errbuf +is the decimal digits of +the numeric value of the error code +(0 if the name is not recognized). +.Dv REG_ITOA +and +.Dv REG_ATOI +are intended primarily as debugging facilities; +they are extensions, +compatible with but not specified by +.St -p1003.2 +and should be used with +caution in software intended to be portable to other systems. +Be warned also that they are considered experimental and changes are possible. +.Pp +.Fn regfree +frees any dynamically allocated storage associated with the compiled RE +pointed to by +.Fa preg . +The remaining +.Li regex_t +is no longer a valid compiled RE +and the effect of supplying it to +.Fn regexec +or +.Fn regerror +is undefined. +.Pp +None of these functions references global variables except for tables +of constants; +all are safe for use from multiple threads if the arguments are safe. +.Sh IMPLEMENTATION CHOICES +There are a number of decisions that +.St -p1003.2 +leaves up to the implementor, +either by explicitly saying +.Dq undefined +or by virtue of them being +forbidden by the RE grammar. +This implementation treats them as follows. +.Pp +See +.Xr re_format 7 +for a discussion of the definition of case-independent matching. +.Pp +There is no particular limit on the length of REs, +except insofar as memory is limited. +Memory usage is approximately linear in RE size, and largely insensitive +to RE complexity, except for bounded repetitions. +See +.Sx BUGS +for one short RE using them +that will run almost any system out of memory. +.Pp +A backslashed character other than one specifically given a magic meaning +by +.St -p1003.2 +(such magic meanings occur only in obsolete REs) +is taken as an ordinary character. +.Pp +Any unmatched +.Ql \&[ +is a +.Dv REG_EBRACK +error. +.Pp +Equivalence classes cannot begin or end bracket-expression ranges. +The endpoint of one range cannot begin another. +.Pp +RE_DUP_MAX, the limit on repetition counts in bounded repetitions, is 255. +.Pp +A repetition operator (?, *, +, or bounds) cannot follow another +repetition operator. +A repetition operator cannot begin an expression or subexpression +or follow +.Ql ^ +or +.Ql | . +.Pp +A +.Ql | +cannot appear first or last in a (sub)expression, or after another +.Ql | , +i.e., an operand of +.Ql | +cannot be an empty subexpression. +An empty parenthesized subexpression, +.Ql \&(\&) , +is legal and matches an +empty (sub)string. +An empty string is not a legal RE. +.Pp +A +.Ql { +followed by a digit is considered the beginning of bounds for a +bounded repetition, which must then follow the syntax for bounds. +A +.Ql { +.Em not +followed by a digit is considered an ordinary character. +.Pp +.Ql ^ +and +.Ql $ +beginning and ending subexpressions in obsolete +.Pq Dq basic +REs are anchors, not ordinary characters. +.Sh DIAGNOSTICS +Non-zero error codes from +.Fn regcomp +and +.Fn regexec +include the following: +.Pp +.Bl -tag -compact -width XREG_ECOLLATEX +.It Er REG_NOMATCH +regexec() failed to match +.It Er REG_BADPAT +invalid regular expression +.It Er REG_ECOLLATE +invalid collating element +.It Er REG_ECTYPE +invalid character class +.It Er REG_EESCAPE +\e applied to unescapable character +.It Er REG_ESUBREG +invalid backreference number +.It Er REG_EBRACK +brackets [ ] not balanced +.It Er REG_EPAREN +parentheses ( ) not balanced +.It Er REG_EBRACE +braces { } not balanced +.It Er REG_BADBR +invalid repetition count(s) in { } +.It Er REG_ERANGE +invalid character range in [ ] +.It Er REG_ESPACE +ran out of memory +.It Er REG_BADRPT +?, *, or + operand invalid +.It Er REG_EMPTY +empty (sub)expression +.It Er REG_ASSERT +.Dq can't happen +\(emyou found a bug +.It Er REG_INVARG +invalid argument, e.g., negative-length string +.El +.Sh SEE ALSO +.Xr grep 1 , +.Xr re_format 7 +.Pp +.St -p1003.2 , +sections 2.8 (Regular Expression Notation) +and +B.5 (C Binding for Regular Expression Matching). +.Sh HISTORY +Predecessors to +.Fn regcomp +and +.Fn regexec , +.Fn regcmp +and +.Fn regex , +first appeared in the Programmer's Workbench (PWB/UNIX). +.Pp +The present functions were originally written by Henry Spencer +and altered for inclusion in the +.Bx 4.4 +distribution. +.Sh BUGS +This is an alpha release with known defects. +Please report problems. +.Pp +There is one known functionality bug. +The implementation of internationalization is incomplete: +the locale is always assumed to be the default one of +.St -p1003.2 , +and only the collating elements etc. of that locale are available. +.Pp +The back-reference code is subtle and doubts linger about its correctness +in complex cases. +.Pp +.Fn regexec +performance is poor. +This will improve with later releases. +.Fa nmatch +exceeding 0 is expensive; +.Fa nmatch +exceeding 1 is worse. +.Fn regexec +is largely insensitive to RE complexity +.Em except +that back references are massively expensive. +RE length does matter; in particular, there is a strong speed bonus +for keeping RE length under about 30 characters, +with most special characters counting roughly double. +.Pp +.Fn regcomp +implements bounded repetitions by macro expansion, +which is costly in time and space if counts are large +or bounded repetitions are nested. +A RE like, say, +.Dq ((((a{1,100}){1,100}){1,100}){1,100}){1,100} +will (eventually) run almost any existing machine out of swap space. +.Pp +There are suspected problems with response to obscure error conditions. +Notably, +certain kinds of internal overflow, +produced only by truly enormous REs or by multiply nested bounded repetitions, +are probably not handled well. +.Pp +Due to a mistake in +.St -p1003.2 , +things like +.Ql a)b +are legal REs because +.Ql \&) +is +a special character only in the presence of a previous unmatched +.Ql \&( . +This can't be fixed until the spec is fixed. +.Pp +The standard's definition of back references is vague. +For example, does +.Dq a\e(\e(b\e)*\e2\e)*d +match +.Dq abbbd ? +Until the standard is clarified, +behavior in such cases should not be relied on. +.Pp +The implementation of word-boundary matching is a bit of a kludge, +and bugs may lurk in combinations of word-boundary matching and anchoring. diff --git a/lib/libopenbsd/regex/regex2.h b/lib/libopenbsd/regex/regex2.h @@ -0,0 +1,157 @@ +/* $OpenBSD: regex2.h,v 1.8 2013/04/17 17:39:29 tedu Exp $ */ + +/*- + * Copyright (c) 1992, 1993, 1994 Henry Spencer. + * Copyright (c) 1992, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Henry Spencer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)regex2.h 8.4 (Berkeley) 3/20/94 + */ + +/* + * internals of regex_t + */ +#define MAGIC1 ((('r'^0200)<<8) | 'e') + +/* + * The internal representation is a *strip*, a sequence of + * operators ending with an endmarker. (Some terminology etc. is a + * historical relic of earlier versions which used multiple strips.) + * Certain oddities in the representation are there to permit running + * the machinery backwards; in particular, any deviation from sequential + * flow must be marked at both its source and its destination. Some + * fine points: + * + * - OPLUS_ and O_PLUS are *inside* the loop they create. + * - OQUEST_ and O_QUEST are *outside* the bypass they create. + * - OCH_ and O_CH are *outside* the multi-way branch they create, while + * OOR1 and OOR2 are respectively the end and the beginning of one of + * the branches. Note that there is an implicit OOR2 following OCH_ + * and an implicit OOR1 preceding O_CH. + * + * In state representations, an operator's bit is on to signify a state + * immediately *preceding* "execution" of that operator. + */ +typedef unsigned long sop; /* strip operator */ +typedef long sopno; +#define OPRMASK 0xf8000000LU +#define OPDMASK 0x07ffffffLU +#define OPSHIFT ((unsigned)27) +#define OP(n) ((n)&OPRMASK) +#define OPND(n) ((n)&OPDMASK) +#define SOP(op, opnd) ((op)|(opnd)) +/* operators meaning operand */ +/* (back, fwd are offsets) */ +#define OEND (1LU<<OPSHIFT) /* endmarker - */ +#define OCHAR (2LU<<OPSHIFT) /* character unsigned char */ +#define OBOL (3LU<<OPSHIFT) /* left anchor - */ +#define OEOL (4LU<<OPSHIFT) /* right anchor - */ +#define OANY (5LU<<OPSHIFT) /* . - */ +#define OANYOF (6LU<<OPSHIFT) /* [...] set number */ +#define OBACK_ (7LU<<OPSHIFT) /* begin \d paren number */ +#define O_BACK (8LU<<OPSHIFT) /* end \d paren number */ +#define OPLUS_ (9LU<<OPSHIFT) /* + prefix fwd to suffix */ +#define O_PLUS (10LU<<OPSHIFT) /* + suffix back to prefix */ +#define OQUEST_ (11LU<<OPSHIFT) /* ? prefix fwd to suffix */ +#define O_QUEST (12LU<<OPSHIFT) /* ? suffix back to prefix */ +#define OLPAREN (13LU<<OPSHIFT) /* ( fwd to ) */ +#define ORPAREN (14LU<<OPSHIFT) /* ) back to ( */ +#define OCH_ (15LU<<OPSHIFT) /* begin choice fwd to OOR2 */ +#define OOR1 (16LU<<OPSHIFT) /* | pt. 1 back to OOR1 or OCH_ */ +#define OOR2 (17LU<<OPSHIFT) /* | pt. 2 fwd to OOR2 or O_CH */ +#define O_CH (18LU<<OPSHIFT) /* end choice back to OOR1 */ +#define OBOW (19LU<<OPSHIFT) /* begin word - */ +#define OEOW (20LU<<OPSHIFT) /* end word - */ + +/* + * Structure for [] character-set representation. Character sets are + * done as bit vectors, grouped 8 to a byte vector for compactness. + * The individual set therefore has both a pointer to the byte vector + * and a mask to pick out the relevant bit of each byte. A hash code + * simplifies testing whether two sets could be identical. + * + * This will get trickier for multicharacter collating elements. As + * preliminary hooks for dealing with such things, we also carry along + * a string of multi-character elements, and decide the size of the + * vectors at run time. + */ +typedef struct { + uch *ptr; /* -> uch [csetsize] */ + uch mask; /* bit within array */ + uch hash; /* hash code */ + size_t smultis; + char *multis; /* -> char[smulti] ab\0cd\0ef\0\0 */ +} cset; +/* note that CHadd and CHsub are unsafe, and CHIN doesn't yield 0/1 */ +#define CHadd(cs, c) ((cs)->ptr[(uch)(c)] |= (cs)->mask, (cs)->hash += (c)) +#define CHsub(cs, c) ((cs)->ptr[(uch)(c)] &= ~(cs)->mask, (cs)->hash -= (c)) +#define CHIN(cs, c) ((cs)->ptr[(uch)(c)] & (cs)->mask) +#define MCadd(p, cs, cp) mcadd(p, cs, cp) /* regcomp() internal fns */ +#define MCsub(p, cs, cp) mcsub(p, cs, cp) +#define MCin(p, cs, cp) mcin(p, cs, cp) + +/* stuff for character categories */ +typedef unsigned char cat_t; + +/* + * main compiled-expression structure + */ +struct re_guts { + int magic; +# define MAGIC2 ((('R'^0200)<<8)|'E') + sop *strip; /* malloced area for strip */ + int csetsize; /* number of bits in a cset vector */ + int ncsets; /* number of csets in use */ + cset *sets; /* -> cset [ncsets] */ + uch *setbits; /* -> uch[csetsize][ncsets/CHAR_BIT] */ + int cflags; /* copy of regcomp() cflags argument */ + sopno nstates; /* = number of sops */ + sopno firststate; /* the initial OEND (normally 0) */ + sopno laststate; /* the final OEND */ + int iflags; /* internal flags */ +# define USEBOL 01 /* used ^ */ +# define USEEOL 02 /* used $ */ +# define BAD 04 /* something wrong */ + int nbol; /* number of ^ used */ + int neol; /* number of $ used */ + int ncategories; /* how many character categories */ + cat_t *categories; /* ->catspace[-CHAR_MIN] */ + char *must; /* match must contain this string */ + int mlen; /* length of must */ + size_t nsub; /* copy of re_nsub */ + int backrefs; /* does it use back references? */ + sopno nplus; /* how deep does it nest +s? */ + /* catspace must be last */ + cat_t catspace[NC]; /* actually [NC] */ +}; + +/* misc utilities */ +#define OUT (CHAR_MAX+1) /* a non-character value */ +#define ISWORD(c) (isalnum(c) || (c) == '_') diff --git a/lib/libopenbsd/regex/regexec.c b/lib/libopenbsd/regex/regexec.c @@ -0,0 +1,163 @@ +/* $OpenBSD: regexec.c,v 1.13 2014/10/11 04:23:12 doug Exp $ */ +/*- + * Copyright (c) 1992, 1993, 1994 Henry Spencer. + * Copyright (c) 1992, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Henry Spencer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)regexec.c 8.3 (Berkeley) 3/20/94 + */ + +/* + * the outer shell of regexec() + * + * This file includes engine.c *twice*, after muchos fiddling with the + * macros that code uses. This lets the same code operate on two different + * representations for state sets. + */ +#include <sys/types.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <limits.h> +#include <ctype.h> +#include <regex.h> + +#include "utils.h" +#include "regex2.h" + +/* macros for manipulating states, small version */ +#define states long +#define states1 states /* for later use in regexec() decision */ +#define CLEAR(v) ((v) = 0) +#define SET0(v, n) ((v) &= ~((unsigned long)1 << (n))) +#define SET1(v, n) ((v) |= (unsigned long)1 << (n)) +#define ISSET(v, n) (((v) & ((unsigned long)1 << (n))) != 0) +#define ASSIGN(d, s) ((d) = (s)) +#define EQ(a, b) ((a) == (b)) +#define STATEVARS long dummy /* dummy version */ +#define STATESETUP(m, n) /* nothing */ +#define STATETEARDOWN(m) /* nothing */ +#define SETUP(v) ((v) = 0) +#define onestate long +#define INIT(o, n) ((o) = (unsigned long)1 << (n)) +#define INC(o) ((o) <<= 1) +#define ISSTATEIN(v, o) (((v) & (o)) != 0) +/* some abbreviations; note that some of these know variable names! */ +/* do "if I'm here, I can also be there" etc without branches */ +#define FWD(dst, src, n) ((dst) |= ((unsigned long)(src)&(here)) << (n)) +#define BACK(dst, src, n) ((dst) |= ((unsigned long)(src)&(here)) >> (n)) +#define ISSETBACK(v, n) (((v) & ((unsigned long)here >> (n))) != 0) +/* function names */ +#define SNAMES /* engine.c looks after details */ + +#include "engine.c" + +/* now undo things */ +#undef states +#undef CLEAR +#undef SET0 +#undef SET1 +#undef ISSET +#undef ASSIGN +#undef EQ +#undef STATEVARS +#undef STATESETUP +#undef STATETEARDOWN +#undef SETUP +#undef onestate +#undef INIT +#undef INC +#undef ISSTATEIN +#undef FWD +#undef BACK +#undef ISSETBACK +#undef SNAMES + +/* macros for manipulating states, large version */ +#define states char * +#define CLEAR(v) memset(v, 0, m->g->nstates) +#define SET0(v, n) ((v)[n] = 0) +#define SET1(v, n) ((v)[n] = 1) +#define ISSET(v, n) ((v)[n]) +#define ASSIGN(d, s) memcpy(d, s, m->g->nstates) +#define EQ(a, b) (memcmp(a, b, m->g->nstates) == 0) +#define STATEVARS long vn; char *space +#define STATESETUP(m, nv) { (m)->space = reallocarray(NULL, \ + (m)->g->nstates, (nv)); \ + if ((m)->space == NULL) return(REG_ESPACE); \ + (m)->vn = 0; } +#define STATETEARDOWN(m) { free((m)->space); } +#define SETUP(v) ((v) = &m->space[m->vn++ * m->g->nstates]) +#define onestate long +#define INIT(o, n) ((o) = (n)) +#define INC(o) ((o)++) +#define ISSTATEIN(v, o) ((v)[o]) +/* some abbreviations; note that some of these know variable names! */ +/* do "if I'm here, I can also be there" etc without branches */ +#define FWD(dst, src, n) ((dst)[here+(n)] |= (src)[here]) +#define BACK(dst, src, n) ((dst)[here-(n)] |= (src)[here]) +#define ISSETBACK(v, n) ((v)[here - (n)]) +/* function names */ +#define LNAMES /* flag */ + +#include "engine.c" + +/* + - regexec - interface for matching + * + * We put this here so we can exploit knowledge of the state representation + * when choosing which matcher to call. Also, by this point the matchers + * have been prototyped. + */ +int /* 0 success, REG_NOMATCH failure */ +regexec(const regex_t *preg, const char *string, size_t nmatch, + regmatch_t pmatch[], int eflags) +{ + struct re_guts *g = preg->re_g; + char *s = (char *)string; /* XXX fucking gcc XXX */ + +#ifdef REDEBUG +# define GOODFLAGS(f) (f) +#else +# define GOODFLAGS(f) ((f)&(REG_NOTBOL|REG_NOTEOL|REG_STARTEND)) +#endif + + if (preg->re_magic != MAGIC1 || g->magic != MAGIC2) + return(REG_BADPAT); + assert(!(g->iflags&BAD)); + if (g->iflags&BAD) /* backstop for no-debug case */ + return(REG_BADPAT); + eflags = GOODFLAGS(eflags); + + if (g->nstates <= CHAR_BIT*sizeof(states1) && !(eflags&REG_LARGE)) + return(smatcher(g, s, nmatch, pmatch, eflags)); + else + return(lmatcher(g, s, nmatch, pmatch, eflags)); +} diff --git a/lib/libopenbsd/regex/regfree.c b/lib/libopenbsd/regex/regfree.c @@ -0,0 +1,69 @@ +/* $OpenBSD: regfree.c,v 1.11 2015/12/28 22:27:03 mmcc Exp $ */ +/*- + * Copyright (c) 1992, 1993, 1994 Henry Spencer. + * Copyright (c) 1992, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Henry Spencer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)regfree.c 8.3 (Berkeley) 3/20/94 + */ + +#include <sys/types.h> +#include <stdio.h> +#include <stdlib.h> +#include <regex.h> +#include <limits.h> + +#include "utils.h" +#include "regex2.h" + +/* + - regfree - free everything + */ +void +regfree(regex_t *preg) +{ + struct re_guts *g; + + if (preg->re_magic != MAGIC1) /* oops */ + return; /* nice to complain, but hard */ + + g = preg->re_g; + if (g == NULL || g->magic != MAGIC2) /* oops again */ + return; + preg->re_magic = 0; /* mark it invalid */ + g->magic = 0; /* mark it invalid */ + + free(g->strip); + free(g->sets); + free(g->setbits); + free(g->must); + free(g); +} +DEF_WEAK(regfree); diff --git a/lib/libopenbsd/regex/utils.h b/lib/libopenbsd/regex/utils.h @@ -0,0 +1,55 @@ +/* $OpenBSD: utils.h,v 1.4 2003/06/02 20:18:36 millert Exp $ */ + +/*- + * Copyright (c) 1992, 1993, 1994 Henry Spencer. + * Copyright (c) 1992, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Henry Spencer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)utils.h 8.3 (Berkeley) 3/20/94 + */ + +/* utility definitions */ +#define DUPMAX _POSIX2_RE_DUP_MAX /* xxx is this right? */ +#define INFINITY (DUPMAX + 1) +#define NC (CHAR_MAX - CHAR_MIN + 1) +typedef unsigned char uch; + +/* switch off assertions (if not already off) if no REDEBUG */ +#ifndef REDEBUG +#ifndef NDEBUG +#define NDEBUG /* no assertions please */ +#endif +#endif +#include <assert.h> + +/* for old systems with bcopy() but no memmove() */ +#ifdef USEBCOPY +#define memmove(d, s, c) bcopy(s, d, c) +#endif