From 7bcca6c9c8668abc5bf03fdc23a19676dc25b222 Mon Sep 17 00:00:00 2001 From: fcolin Date: Thu, 1 Feb 2007 13:04:14 +0000 Subject: Utilisateur : Fcolin Date : 16/06/00 Heure : 10:14 Créé (vss 1) --- Ivy/Regexp.cxx | 1757 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1757 insertions(+) create mode 100644 Ivy/Regexp.cxx (limited to 'Ivy/Regexp.cxx') diff --git a/Ivy/Regexp.cxx b/Ivy/Regexp.cxx new file mode 100644 index 0000000..6d44def --- /dev/null +++ b/Ivy/Regexp.cxx @@ -0,0 +1,1757 @@ +// Win32 porting notes. + +#if defined( _MBCS ) +#pragma message( __FILEINFO__ "This code is broken under _MBCS, " \ + "see the comments at the top of this file." ) +#endif //_MBCS +// +// +// In case this isn't obvious from the later comments this is an ALTERED +// version of the software. If you like my changes then cool, but nearly +// all of the functionality here is derived from Henry Spencer's original +// work. +// +// This code should work correctly under both _SBCS and _UNICODE, I did +// start working on making it work with _MBCS but gave up after a while +// since I don't need this particular port and it's not going to be as +// straight forward as the other two. +// +// The problem stems from the compiled program being stored as TCHARS, +// the individual items need to be wide enough to hold whatever character +// is thrown at them, but currently they are accessed as an array of +// whatever size integral type is appropriate. _MBCS would cause this +// to be char, but at times it would need to be larger. This would +// require making the program be an array of short with the appropriate +// conversions used everywhere. Certainly it's doable, but it's a pain. +// What's worse is that the current code will compile and run under _MBCS, +// only breaking when it gets wide characters thrown against it. +// +// I've marked at least one bit of code with #pragma messages, I may not +// get all of them, but they should be a start +// +// Guy Gascoigne - Piggford (ggp@bigfoot.com) Friday, February 27, 1998 + + +// regcomp and regexec -- regsub and regerror are elsewhere +// @(#)regexp.c 1.3 of 18 April 87 +// +// Copyright (c) 1986 by University of Toronto. +// Written by Henry Spencer. Not derived from licensed software. +// +// Permission is granted to anyone to use this software for any +// purpose on any computer system, and to redistribute it freely, +// subject to the following restrictions: +// +// 1. The author is not responsible for the consequences of use of +// this software, no matter how awful, even if they arise +// from defects in it. +// +// 2. The origin of this software must not be misrepresented, either +// by explicit claim or by omission. +// +// 3. Altered versions must be plainly marked as such, and must not +// be misrepresented as being the original software. +// *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, +// *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching +// *** as in BSD grep and ex. +// *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, +// *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \. +// *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods, +// *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy. +// *** THIS IS AN ALTERED VERSION. It was altered by Geoffrey Noer, +// *** THIS IS AN ALTERED VERSION. It was altered by Guy Gascoigne - Piggford +// *** guy@wyrdrune.com, on 15 March 1998, porting it to C++ and converting +// *** it to be the engine for the Regexp class +// +// Beware that some of this code is subtly aware of the way operator +// precedence is structured in regular expressions. Serious changes in +// regular-expression syntax might require a total rethink. + +#include "stdafx.h" +#include "regexp.h" + +// The first byte of the regexp internal "program" is actually this magic +// number; the start node begins in the second byte. + +const char MAGIC = '\234'; + +#pragma warning( disable : 4711 ) // automatic inline selected + +// The "internal use only" fields in regexp.h are present to pass info from +// compile to execute that permits the execute phase to run lots faster on +// simple cases. They are: +// +// regstart char that must begin a match; '\0' if none obvious +// reganch is the match anchored (at beginning-of-line only)? +// regmust string (pointer into program) that match must include, or NULL +// regmlen length of regmust string +// +// Regstart and reganch permit very fast decisions on suitable starting +// points for a match, cutting down the work a lot. Regmust permits fast +// rejection of lines that cannot possibly match. The regmust tests are +// costly enough that regcomp() supplies a regmust only if the +// r.e. contains something potentially expensive (at present, the only +// such thing detected is * or + at the start of the r.e., which can +// involve a lot of backup). Regmlen is supplied because the test in +// regexec() needs it and regcomp() is computing it anyway. + +// Structure for regexp "program". This is essentially a linear encoding +// of a nondeterministic finite-state machine (aka syntax charts or +// "railroad normal form" in parsing technology). Each node is an opcode +// plus a "next" pointer, possibly plus an operand. "Next" pointers of +// all nodes except BRANCH implement concatenation; a "next" pointer with +// a BRANCH on both ends of it is connecting two alternatives. (Here we +// have one of the subtle syntax dependencies: an individual BRANCH (as +// opposed to a collection of them) is never concatenated with anything +// because of operator precedence.) The operand of some types of node is +// a literal string; for others, it is a node leading into a sub-FSM. In +// particular, the operand of a BRANCH node is the first node of the +// branch. (NB this is *not* a tree structure: the tail of the branch +// connects to the thing following the set of BRANCHes.) The opcodes +// are: + +enum { +// definition number opnd? meaning + END = 0, // no End of program. + BOL = 1, // no Match beginning of line. + EOL = 2, // no Match end of line. + ANY = 3, // no Match any character. + ANYOF = 4, // str Match any of these. + ANYBUT = 5, // str Match any but one of these. + BRANCH = 6, // node Match this, or the next..\&. + BACK = 7, // no "next" ptr points backward. + EXACTLY = 8, // str Match this string. + NOTHING = 9, // no Match empty string. + STAR = 10, // node Match this 0 or more times. + PLUS = 11, // node Match this 1 or more times. + WORDA = 12, // no Match "" at wordchar, where prev is nonword + WORDZ = 13, // no Match "" at nonwordchar, where prev is word + OPEN = 20, // no Sub-RE starts here. + // OPEN+1 is number 1, etc. + CLOSE = 40 // no Analogous to OPEN. +}; + +// Opcode notes: +// +// BRANCH The set of branches constituting a single choice are hooked +// together with their "next" pointers, since precedence prevents +// anything being concatenated to any individual branch. The +// "next" pointer of the last BRANCH in a choice points to the +// thing following the whole choice. This is also where the +// final "next" pointer of each individual branch points; each +// branch starts with the operand node of a BRANCH node. +// +// BACK Normal "next" pointers all implicitly point forward; BACK +// exists to make loop structures possible. +// +// STAR,PLUS '?', and complex '*' and '+', are implemented as circular +// BRANCH structures using BACK. Simple cases (one character +// per match) are implemented with STAR and PLUS for speed +// and to minimize recursive plunges. +// +// OPEN,CLOSE ...are numbered at compile time. + +// A node is one char of opcode followed by two chars of "next" pointer. +// "Next" pointers are stored as two 8-bit pieces, high order first. The +// value is a positive offset from the opcode of the node containing it. +// An operand, if any, simply follows the node. (Note that much of the +// code generation knows about this implicit relationship.) +// +// Using two bytes for the "next" pointer is vast overkill for most things, +// but allows patterns to get big without disasters. + + +enum +{ + REGERR_SENTINEL_VALUE = 0, + REGERR_NULLARG = 1, REGERR_CORRUPTED, REGERR_CORRUPTION, REGERR_CORRUPTED_POINTERS, + REGERR_BAD_REGREPEAT, REGERR_CORRUPTED_OPCODE, REGERR_NULL_TO_REGSUB, + REGERR_DAMAGED_REGEXP_REGSUB, REGERR_DAMAGED_MATCH_STRING, REGERR_NULL_TO_REGCOMP, + REGERR_TO_BIG, REGERR_TO_MANY_PAREN, REGERR_UNTERMINATED_PAREN, REGERR_UNMATCHED_PAREN, + REGERR_INTERNAL_ERROR_JUNK, REGERR_OP_COULD_BE_EMPTY, REGERR_NESTED_OP, REGERR_INVALID_RANGE, + REGERR_UNMATCHED_BRACE, REGERR_INTERNAL_UNEXPECTED_CHAR, REGERR_OP_FOLLOWS_NOTHING, + REGERR_TRAILING_ESC, REGERR_INTERNAL_STRSCSPN, REGERR_NO_REGEXP +}; + +struct regErr +{ + int m_id; + const char * m_err; +} errors[] = { + { REGERR_NULLARG, "NULL argument to regexec" }, + { REGERR_CORRUPTED, "corrupted regexp" }, + { REGERR_CORRUPTION, "regexp corruption" }, + { REGERR_CORRUPTED_POINTERS, "corrupted pointers" }, + { REGERR_BAD_REGREPEAT, "internal error: bad call of regrepeat" }, + { REGERR_CORRUPTED_OPCODE, "corrupted opcode" }, + { REGERR_NULL_TO_REGSUB, "NULL parm to regsub" }, + { REGERR_DAMAGED_REGEXP_REGSUB, "damaged regexp fed to regsub" }, + { REGERR_DAMAGED_MATCH_STRING, "damaged match string" }, + { REGERR_NULL_TO_REGCOMP, "NULL argument to regcomp" }, + { REGERR_TO_BIG, "regexp too big" }, + { REGERR_TO_MANY_PAREN, "too many ()" }, + { REGERR_UNTERMINATED_PAREN, "unterminated ()" }, + { REGERR_UNMATCHED_PAREN, "unmatched ()" }, + { REGERR_INTERNAL_ERROR_JUNK, "internal error: junk on end" }, + { REGERR_OP_COULD_BE_EMPTY, "*+ operand could be empty" }, + { REGERR_NESTED_OP, "nested *?+" }, + { REGERR_INVALID_RANGE, "invalid [] range" }, + { REGERR_UNMATCHED_BRACE, "unmatched []" }, + { REGERR_INTERNAL_UNEXPECTED_CHAR, "internal error: \\0|) unexpected" }, + { REGERR_OP_FOLLOWS_NOTHING, "?+* follows nothing" }, + { REGERR_TRAILING_ESC, "trailing \\" }, + { REGERR_INTERNAL_STRSCSPN, "internal error: strcspn 0" }, + { REGERR_NO_REGEXP, "NULL regexp" }, + { REGERR_SENTINEL_VALUE, "Unknown error" } // must be last value +}; + +// Flags to be passed up and down. + +enum { + HASWIDTH = 01, // Known never to match null string. + SIMPLE = 02, // Simple enough to be STAR/PLUS operand. + SPSTART = 04, // Starts with * or +. + WORST = 0 // Worst case. +}; + +/////////////////////////////////////////////////////////////////////////////// + +class CRegErrorHandler +{ + friend Regexp; + mutable string m_szError; + static const char * FindErr( int id ); +protected: + void ClearErrorString() const; + void regerror( const char * s ) const; + void regerror( int id ) const; +public: + CRegErrorHandler() { } + CRegErrorHandler( const CRegErrorHandler & reh ) : m_szError( reh.m_szError ) {} + + const string & GetErrorString() const; +}; + +void CRegErrorHandler::regerror( const char * s ) const +{ + TRACE( "regerror: %s\n", s ); + m_szError = s; +} + +void CRegErrorHandler::regerror( int id ) const +{ + regerror( FindErr( id ) ); +} + +const string & CRegErrorHandler::GetErrorString() const +{ + return m_szError; +} + +void CRegErrorHandler::ClearErrorString() const +{ + m_szError = "" ; +} + +const char * CRegErrorHandler::FindErr( int id ) +{ + for ( struct regErr * perr = errors; perr->m_id != REGERR_SENTINEL_VALUE; perr++ ) + if ( perr->m_id == id ) + return perr->m_err; + + return perr->m_err; // since we've fallen off the array, perr->m_id == 0 +} + +/////////////////////////////////////////////////////////////////////////////// + +// All of the functions required to directly access the 'program' +class CRegProgramAccessor : public CRegErrorHandler +{ +public: + static inline char OP( char * p ) + { + return (*(p)); + } + static inline char * OPERAND( char * p ) + { + return p + 3; + } + static inline char * regnext( char * p ) + { + const short &offset = *((short*)(p+1)); + + if (offset == 0) + return(NULL); + + return((OP(p) == BACK) ? p-offset : p+offset); + } +#ifdef _RE_DEBUG + char * CRegProgramAccessor::regprop( char * op ); +#endif +}; + +/////////////////////////////////////////////////////////////////////////////// + +// The internal interface to the regexp, wrapping the compilation as well as the +// execution of the regexp (matching) + +class regexp : public CRegProgramAccessor +{ + friend class CRegExecutor; + friend class Regexp; + + int m_programSize; + char * startp[Regexp::NSUBEXP]; + char * endp[Regexp::NSUBEXP]; + char regstart; // Internal use only. + char reganch; // Internal use only. + char * regmust; // Internal use only. + int regmlen; // Internal use only. + char * program; + + bool status; + int count; // used by Regexp to manage the reference counting of regexps + int numSubs; +public: + + regexp( const char * exp, bool iCase ); + regexp( const regexp & r ); + ~regexp(); + + void ignoreCase( const char * in, char * out ); + + bool regcomp( const char * exp ); + bool regexec( const char * string ); + bool Status() const { return status; } + + string GetReplaceString( const char* sReplaceExp ) const; + + regexp * getCopy(); + +#ifdef _RE_DEBUG + void regdump(); +#endif + +#ifdef _DEBUG + string m_originalPattern; + string m_modifiedPattern; +#endif +}; + +/////////////////////////////////////////////////////////////////////////////// +// Compile / Validate the regular expression - ADT + +class CRegCompilerBase : public CRegProgramAccessor +{ +public: + CRegCompilerBase( const char * parse ); + + char * reg(int paren, int *flagp); +protected: + char * regparse; // Input-scan pointer. + int regnpar; // () count. + + char * regbranch(int *flagp); + char * regpiece(int *flagp); + char * regatom(int *flagp); + inline bool ISREPN( char c ) { return ((c) == '*' || (c) == '+' || (c) == '?'); } + + virtual void regc(int c) = 0; + virtual char * regnode(int op) = 0; + virtual void reginsert(char op, char * opnd) = 0; + virtual void regtail(char * p, char * val) = 0; + virtual void regoptail(char * p, char * val) = 0; +}; + +/////////////////////////////////////////////////////////////////////////////// +// First pass over the expression, testing for validity and returning the +// program size + +class CRegValidator : public CRegCompilerBase +{ +public: + CRegValidator( const char * parse ); + + inline long Size() const { return regsize; } +private: + long regsize; // Code size. + char regdummy[3]; // NOTHING, 0 next ptr +protected: + char * regnode(int) { regsize += 3; return regdummy; } + void regc(int) { regsize++; } + void reginsert(char, char *) { regsize += 3; } + void regtail(char *, char *) { return; } + void regoptail(char *, char *) { return; } +}; + +/////////////////////////////////////////////////////////////////////////////// +// Second pass, actually generating the program + +class CRegCompiler : public CRegCompilerBase +{ +public: + CRegCompiler( const char * parse, char * prog ); +private: + char * regcode; +protected: + // regc - emit (if appropriate) a byte of code + void regc(int b) + { + *regcode++ = (char)b; + } + char * regnode(int op); + void reginsert(char op, char * opnd); + void regtail(char * p, char * val); + void regoptail(char * p, char * val); +}; + +// regnode - emit a node +char * CRegCompiler::regnode(int op) +{ + char * const ret = regcode; + + char * ptr = ret; + *ptr++ = (char)op; + *ptr++ = '\0'; // Null next pointer. + *ptr++ = '\0'; + regcode = ptr; + + return(ret); +} + +// reginsert - insert an operator in front of already-emitted operand +// +// Means relocating the operand. +void CRegCompiler::reginsert(char op, char * opnd) +{ + char * place; + + (void) memmove(opnd+3, opnd, (size_t)((regcode - opnd)*sizeof(char))); + regcode += 3; + + place = opnd; // Op node, where operand used to be. + *place++ = op; + *place++ = '\0'; + *place++ = '\0'; +} + +// regtail - set the next-pointer at the end of a node chain +void CRegCompiler::regtail(char * p, char * val) +{ + char * scan; + char * temp; + + // Find last node. + for (scan = p; (temp = regnext(scan)) != NULL; scan = temp) + continue; + + *((short *)(scan+1)) = (short)((OP(scan) == BACK) ? scan - val : val - scan); +} + +// regoptail - regtail on operand of first argument; nop if operandless +void CRegCompiler::regoptail(char * p, char * val) +{ + // "Operandless" and "op != BRANCH" are synonymous in practice. + if (OP(p) == BRANCH) + regtail(OPERAND(p), val); +} + +/////////////////////////////////////////////////////////////////////////////// + +CRegCompilerBase::CRegCompilerBase( const char * parse ) + : regparse( (char *)parse ), + regnpar(1) +{ +} + +CRegValidator::CRegValidator( const char * parse ) + : CRegCompilerBase( parse ), + regsize(0) +{ + regc(MAGIC); + regdummy[0] = NOTHING; + regdummy[1] = regdummy[2] = 0; +} + +CRegCompiler::CRegCompiler( const char * parse, char * prog ) + : CRegCompilerBase( parse ), + regcode(prog) +{ + regc(MAGIC); +} + +/////////////////////////////////////////////////////////////////////////////// + +regexp::regexp( const char * exp, bool iCase ) + : regstart(0), + reganch(0), + regmust(0), + regmlen(0), + program(0), + m_programSize(0) +{ +#if _DEBUG + m_originalPattern = exp; // keep a version of the pattern for debugging +#endif + + if ( iCase ) + { + char * out = new char[(strlen( exp ) * 4) + 1]; + ignoreCase( exp, out ); + +#if _DEBUG + m_modifiedPattern = out; // and the modified version if there is one +#endif + status = regcomp( out ); + delete [] out; + } + else + status = regcomp( exp ); + + count = numSubs = 0; +} + +regexp::regexp( const regexp & orig ) + : regstart(orig.regstart), + reganch(orig.reganch), + regmlen(orig.regmlen), + m_programSize(orig.m_programSize), + numSubs(orig.numSubs), + regmust(0) +{ +#if _DEBUG + m_originalPattern = orig.m_originalPattern; + m_modifiedPattern = orig.m_modifiedPattern; +#endif + status = orig.status; + count = 0; + program = new char[m_programSize]; + memcpy( program, orig.program, m_programSize * sizeof( char ) ); + if ( orig.regmust ) + regmust = program + ( orig.regmust - orig.program ); + + for ( int i = Regexp::NSUBEXP - 1; i >= 0; i--) + { + startp[i] = orig.startp[i]; + endp[i] = orig.endp[i]; + } +} + +regexp::~regexp() +{ + delete [] program; +} + + +// regcomp - compile a regular expression into internal code +// +// We can't allocate space until we know how big the compiled form will +// be, but we can't compile it (and thus know how big it is) until we've +// got a place to put the code. So we cheat: we compile it twice, once +// with code generation turned off and size counting turned on, and once +// "for real". This also means that we don't allocate space until we are +// sure that the thing really will compile successfully, and we never +// have to move the code and thus invalidate pointers into it. (Note +// that it has to be in one piece because free() must be able to free it +// all.) +// +// Beware that the optimization-preparation code in here knows about some +// of the structure of the compiled regexp. + +bool regexp::regcomp(const char * exp) +{ + char * scan; + int flags; + + if (exp == NULL) + { + regerror( REGERR_NULL_TO_REGCOMP ); + return NULL; + } + + // First pass: determine size, legality. + CRegValidator tester( exp ); + + if (tester.reg(0, &flags) == NULL) + return false; + + // Small enough for pointer-storage convention? + if (tester.Size() >= 0x7fffL) // Probably could be 0xffffL. + { + regerror(REGERR_TO_BIG); + return NULL; + } + + m_programSize = tester.Size(); + // Allocate space. + program = new char[m_programSize]; + + CRegCompiler comp( exp, program ); + // Second pass: emit code. + if (comp.reg(0, &flags) == NULL) + return false; + + scan = program + 1; // First BRANCH. + if (OP(regnext(scan)) == END) + { // Only one top-level choice. + scan = OPERAND(scan); + + // Starting-point info. + if (OP(scan) == EXACTLY) + regstart = *OPERAND(scan); + else if (OP(scan) == BOL) + reganch = 1; + + // If there's something expensive in the r.e., find the + // longest literal string that must appear and make it the + // regmust. Resolve ties in favor of later strings, since + // the regstart check works with the beginning of the r.e. + // and avoiding duplication strengthens checking. Not a + // strong reason, but sufficient in the absence of others. + + if (flags&SPSTART) + { + char * longest = NULL; + size_t len = 0; + + for (; scan != NULL; scan = regnext(scan)) + if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) + { + longest = OPERAND(scan); + len = strlen(OPERAND(scan)); + } + regmust = longest; + regmlen = (int)len; + } + } + + return true; +} + +regexp * regexp::getCopy() +{ + return new regexp( *this ); +} + +// reg - regular expression, i.e. main body or parenthesized thing +// +// Caller must absorb opening parenthesis. +// +// Combining parenthesis handling with the base level of regular expression +// is a trifle forced, but the need to tie the tails of the branches to what +// follows makes it hard to avoid. + +char * CRegCompilerBase::reg( int paren, int *flagp ) +{ + char * ret = NULL; + char * br; + char * ender; + int parno = 0; + int flags; + + *flagp = HASWIDTH; // Tentatively. + + if (paren) + { + // Make an OPEN node. + if (regnpar >= Regexp::NSUBEXP) + { + regerror(REGERR_TO_MANY_PAREN); + return NULL; + } + parno = regnpar; + regnpar++; + ret = regnode(OPEN+parno); + } + + // Pick up the branches, linking them together. + br = regbranch(&flags); + if (br == NULL) + return(NULL); + if (paren) + regtail(ret, br); // OPEN -> first. + else + ret = br; + *flagp &= ~(~flags&HASWIDTH); // Clear bit if bit 0. + *flagp |= flags&SPSTART; + while (*regparse == '|') + { + regparse++; + br = regbranch(&flags); + if (br == NULL) + return(NULL); + regtail(ret, br); // BRANCH -> BRANCH. + *flagp &= ~(~flags&HASWIDTH); + *flagp |= flags&SPSTART; + } + + // Make a closing node, and hook it on the end. + ender = regnode((paren) ? CLOSE+parno : END); + regtail(ret, ender); + + // Hook the tails of the branches to the closing node. + for (br = ret; br != NULL; br = regnext(br)) + regoptail(br, ender); + + // Check for proper termination. + if (paren && *regparse++ != ')') + { + regerror( REGERR_UNTERMINATED_PAREN ); + return NULL; + } + else if (!paren && *regparse != '\0') + { + if (*regparse == ')') + { + regerror( REGERR_UNMATCHED_PAREN ); + return NULL; + } + else + { + regerror( REGERR_INTERNAL_ERROR_JUNK ); + return NULL; + } + // NOTREACHED + } + + return(ret); +} + +// regbranch - one alternative of an | operator +// +// Implements the concatenation operator. + +char * CRegCompilerBase::regbranch(int *flagp) +{ + char * ret; + char * chain; + char * latest; + int flags; + int c; + + *flagp = WORST; // Tentatively. + + ret = regnode(BRANCH); + chain = NULL; + while ((c = *regparse) != '\0' && c != '|' && c != ')') + { + latest = regpiece(&flags); + if (latest == NULL) + return(NULL); + *flagp |= flags&HASWIDTH; + if (chain == NULL) // First piece. + *flagp |= flags&SPSTART; + else + regtail(chain, latest); + chain = latest; + } + if (chain == NULL) // Loop ran zero times. + (void) regnode(NOTHING); + + return(ret); +} + +// regpiece - something followed by possible [*+?] +// +// Note that the branching code sequences used for ? and the general cases +// of * and + are somewhat optimized: they use the same NOTHING node as +// both the endmarker for their branch list and the body of the last branch. +// It might seem that this node could be dispensed with entirely, but the +// endmarker role is not redundant. + +char * CRegCompilerBase::regpiece(int *flagp) +{ + char * ret; + char op; + char * next; + int flags; + + ret = regatom(&flags); + if (ret == NULL) + return(NULL); + + op = *regparse; + if (!ISREPN(op)) + { + *flagp = flags; + return(ret); + } + + if (!(flags&HASWIDTH) && op != '?' ) + { + regerror( REGERR_OP_COULD_BE_EMPTY ); + return NULL; + } + switch (op) + { + case '*' : *flagp = WORST|SPSTART; break; + case '+' : *flagp = WORST|SPSTART|HASWIDTH; break; + case '?' : *flagp = WORST; break; + } + + if (op == '*' && (flags&SIMPLE)) + reginsert(STAR, ret); + else if (op == '*' ) + { + // Emit x* as (x&|), where & means "self". + reginsert(BRANCH, ret); // Either x + regoptail(ret, regnode(BACK)); // and loop + regoptail(ret, ret); // back + regtail(ret, regnode(BRANCH)); // or + regtail(ret, regnode(NOTHING)); // null. + } + else if (op == '+' && (flags&SIMPLE)) + reginsert(PLUS, ret); + else if (op == '+' ) + { + // Emit x+ as x(&|), where & means "self". + next = regnode(BRANCH); // Either + regtail(ret, next); + regtail(regnode(BACK), ret); // loop back + regtail(next, regnode(BRANCH)); // or + regtail(ret, regnode(NOTHING)); // null. + } + else if (op == '?' ) + { + // Emit x? as (x|) + reginsert(BRANCH, ret); // Either x + regtail(ret, regnode(BRANCH)); // or + next = regnode(NOTHING); // null. + regtail(ret, next); + regoptail(ret, next); + } + regparse++; + if (ISREPN(*regparse)) + { + regerror( REGERR_NESTED_OP ); + return NULL; + } + + return(ret); +} + +// regatom - the lowest level +// +// Optimization: gobbles an entire sequence of ordinary characters so that +// it can turn them into a single node, which is smaller to store and +// faster to run. Backslashed characters are exceptions, each becoming a +// separate node; the code is simpler that way and it's not worth fixing. + +char * CRegCompilerBase::regatom(int * flagp) +{ + char * ret; + int flags; + + *flagp = WORST; // Tentatively. + + switch ( *regparse++ ) + { + // FIXME: these chars only have meaning at beg/end of pat? + case '^': + ret = regnode(BOL); + break; + case '$': + ret = regnode(EOL); + break; + case '.': + ret = regnode(ANY); + *flagp |= HASWIDTH|SIMPLE; + break; + case '[': + { + int range; + int rangeend; + int c; + + if (*regparse == '^') + { // Complement of range. + ret = regnode(ANYBUT); + regparse++; + } + else + ret = regnode(ANYOF); + if ((c = *regparse) == ']' || c == '-') + { + regc(c); + regparse++; + } + while ((c = *regparse++ ) != '\0' && c != ']') + { + if (c != '-') + regc(c); + else if ((c = *regparse) == ']' || c == '\0') + regc('-'); + else + { + range = (char)*(regparse-2); + rangeend = (char)c; + if (range > rangeend) + { + regerror( REGERR_INVALID_RANGE ); + return NULL; + } + for (range++; range <= rangeend; range++) + regc(range); + regparse++; + } + } + regc('\0'); + if (c != ']') + { + regerror( REGERR_UNMATCHED_BRACE ); + return NULL; + } + *flagp |= HASWIDTH|SIMPLE; + break; + } + case '(': + ret = reg(1, &flags); + if (ret == NULL) + return(NULL); + *flagp |= flags&(HASWIDTH|SPSTART); + break; + case '\0': + case '|': + case ')': + // supposed to be caught earlier + regerror( REGERR_INTERNAL_UNEXPECTED_CHAR ); + return NULL; + case '?': + case '+': + case '*': + { + regerror( REGERR_OP_FOLLOWS_NOTHING ); + return NULL; + } + case '\\': + switch (*regparse++) + { + case '\0': + { + regerror( REGERR_TRAILING_ESC ); + return NULL; + } + case '<': + ret = regnode(WORDA); + break; + case '>': + ret = regnode(WORDZ); + break; + /* FIXME: Someday handle \1, \2, ... */ + default: + /* Handle general quoted chars in exact-match routine */ + goto de_fault; + } + break; + de_fault: + default: + // Encode a string of characters to be matched exactly. + // + // This is a bit tricky due to quoted chars and due to + // '*', '+', and '?' taking the SINGLE char previous + // as their operand. + // + // On entry, the char at regparse[-1] is going to go + // into the string, no matter what it is. (It could be + // following a \ if we are entered from the '\' case.) + // + // Basic idea is to pick up a good char in ch and + // examine the next char. If it's *+? then we twiddle. + // If it's \ then we frozzle. If it's other magic char + // we push ch and terminate the string. If none of the + // above, we push ch on the string and go around again. + // + // regprev is used to remember where "the current char" + // starts in the string, if due to a *+? we need to back + // up and put the current char in a separate, 1-char, string. + // When regprev is NULL, ch is the only char in the + // string; this is used in *+? handling, and in setting + // flags |= SIMPLE at the end. + { + char *regprev; + register char ch; + + regparse--; /* Look at cur char */ + ret = regnode(EXACTLY); + for ( regprev = 0 ; ; ) { + ch = *regparse++; /* Get current char */ + switch (*regparse) { /* look at next one */ + + default: + regc(ch); /* Add cur to string */ + break; + + case '.': case '[': case '(': + case ')': case '|': case '\n': + case '$': case '^': + case '\0': + /* FIXME, $ and ^ should not always be magic */ + magic: + regc(ch); /* dump cur char */ + goto done; /* and we are done */ + + case '?': case '+': case '*': + if (!regprev) /* If just ch in str, */ + goto magic; /* use it */ + /* End mult-char string one early */ + regparse = regprev; /* Back up parse */ + goto done; + + case '\\': + regc(ch); /* Cur char OK */ + switch (regparse[1]){ /* Look after \ */ + case '\0': + case '<': + case '>': + /* FIXME: Someday handle \1, \2, ... */ + goto done; /* Not quoted */ + default: + /* Backup point is \, scan * point is after it. */ + regprev = regparse; + regparse++; + continue; /* NOT break; */ + } + } + regprev = regparse; /* Set backup point */ + } + done: + regc('\0'); + *flagp |= HASWIDTH; + if (!regprev) /* One char? */ + *flagp |= SIMPLE; + } + break; + } + + return(ret); +} + +//////////////////////////////////////////////////////////////////////////////// +// regexec and friends + +// Work-variable struct for regexec(). + +class CRegExecutor : public CRegProgramAccessor +{ + friend bool regexp::regexec( const char * str ); + + char * reginput; // String-input pointer. + char * regbol; // Beginning of input, for ^ check. + char * * regstartp; // Pointer to startp array. + char * * regendp; // Ditto for endp. + + regexp * prog; +public: + CRegExecutor( regexp * prog, char * string ); +protected: + bool regtry( char * string ); + bool regmatch( char * prog ); + size_t regrepeat( char * node ); +}; + +CRegExecutor::CRegExecutor( regexp * p, char * string ) + : regbol( string ), + regstartp( p->startp ), + regendp( p->endp ), + prog(p) +{ +} + +#ifdef _RE_DEBUG +int regnarrate = 0; +#endif + +// regexec - match a regexp against a string + +bool regexp::regexec( const char * str ) +{ + char * string = (char *)str; // avert const poisoning + + // Be paranoid. + if ( string == NULL ) + { + regerror( REGERR_NULLARG ); + return false; + } + + // Check validity of program. + if (*program != MAGIC) + { + regerror( REGERR_CORRUPTED ); + return false; + } + + // If there is a "must appear" string, look for it. + if ( regmust != NULL && strstr( string, regmust ) == NULL ) + return false; + + CRegExecutor executor( this, string ); + + // Simplest case: anchored match need be tried only once. + if ( reganch ) + return( executor.regtry( string ) ); + + // Messy cases: unanchored match. + if ( regstart != '\0' ) + { + // We know what char it must start with. + for ( char * s = string; s != NULL; s = strchr( s+1 , regstart ) ) + if ( executor.regtry( s) ) + return true; + return false; + } + else + { + // We don't -- general case. + for ( char * s = string; ! executor.regtry( s ); s++ ) + if (*s == '\0') + return false; + } + return true; +} + +// regtry - try match at specific point +bool CRegExecutor::regtry( char * string ) +{ + int i; + char * * stp; + char * * enp; + + reginput = string; + + stp = prog->startp; + enp = prog->endp; + for (i = Regexp::NSUBEXP; i > 0; i--) + { + *stp++ = NULL; + *enp++ = NULL; + } + if ( regmatch( prog->program + 1 ) ) + { + prog->startp[0] = string; + prog->endp[0] = reginput; + return true; + } + else + return false; +} + +// regmatch - main matching routine +// +// Conceptually the strategy is simple: check to see whether the current +// node matches, call self recursively to see whether the rest matches, +// and then act accordingly. In practice we make some effort to avoid +// recursion, in particular by going through "ordinary" nodes (that don't +// need to know whether the rest of the match failed) by a loop instead of +// by recursion. + +bool CRegExecutor::regmatch( char * prog ) +{ + char * scan; // Current node. + char * next; // Next node. + +#ifdef _RE_DEBUG + if (prog != NULL && regnarrate) + fprintf(stderr, "%s(\n", regprop(prog)); +#endif + for (scan = prog; scan != NULL; scan = next) + { +#ifdef _RE_DEBUG + if (regnarrate) + fprintf(stderr, "%s...\n", regprop(scan)); +#endif + next = regnext(scan); + + switch (OP(scan)) + { + case BOL: + if (reginput != regbol) + return false; + break; + case EOL: + if (*reginput != '\0') + return false; + break; + case WORDA: + /* Must be looking at a letter, digit, or _ */ + if ((!isalnum(*reginput)) && *reginput != '_') + return(0); + /* Prev must be BOL or nonword */ + if (reginput > regbol && + (isalnum(reginput[-1]) || reginput[-1] == '_')) + return(0); + break; + case WORDZ: + /* Must be looking at non letter, digit, or _ */ + if (isalnum(*reginput) || *reginput == '_') + return(0); + /* We don't care what the previous char was */ + break; + case ANY: + if (*reginput == '\0') + return false; + reginput++; + break; + case EXACTLY: + { + size_t len; + char * const opnd = OPERAND(scan); + + // Inline the first character, for speed. + if (*opnd != *reginput) + return false; + len = strlen(opnd); + if (len > 1 && strncmp(opnd, reginput, len) != 0) + return false; + reginput += len; + + break; + } + case ANYOF: + if (*reginput == '\0' || + strchr(OPERAND(scan), *reginput) == NULL) + return false; + reginput++; + break; + case ANYBUT: + if (*reginput == '\0' || + strchr(OPERAND(scan), *reginput) != NULL) + return false; + reginput++; + break; + case NOTHING: + break; + case BACK: + break; + case OPEN+1: case OPEN+2: case OPEN+3: + case OPEN+4: case OPEN+5: case OPEN+6: + case OPEN+7: case OPEN+8: case OPEN+9: + case OPEN+10: case OPEN+11: case OPEN+12: + case OPEN+13: case OPEN+14: case OPEN+15: + case OPEN+16: case OPEN+17: case OPEN+18: + case OPEN+19: + { + const int no = OP(scan) - OPEN; + char * const input = reginput; + + if (regmatch(next)) + { + // Don't set startp if some later + // invocation of the same parentheses + // already has. + + if (regstartp[no] == NULL) + regstartp[no] = input; + return true; + } + else + return false; + break; + } + case CLOSE+1: case CLOSE+2: case CLOSE+3: + case CLOSE+4: case CLOSE+5: case CLOSE+6: + case CLOSE+7: case CLOSE+8: case CLOSE+9: + case CLOSE+10: case CLOSE+11: case CLOSE+12: + case CLOSE+13: case CLOSE+14: case CLOSE+15: + case CLOSE+16: case CLOSE+17: case CLOSE+18: + case CLOSE+19: + { + const int no = OP(scan) - CLOSE; + char * const input = reginput; + + if (regmatch(next)) + { + // Don't set endp if some later + // invocation of the same parentheses + // already has. + + if (regendp[no] == NULL) + regendp[no] = input; + return true; + } + else + return false; + break; + } + case BRANCH: + { + char * const save = reginput; + + if (OP(next) != BRANCH) // No choice. + next = OPERAND(scan); // Avoid recursion. + else + { + while (OP(scan) == BRANCH) + { + if (regmatch(OPERAND(scan))) + return true; + reginput = save; + scan = regnext(scan); + } + return false; + // NOTREACHED + } + break; + } + case STAR: case PLUS: + { + const char nextch = (OP(next) == EXACTLY) ? *OPERAND(next) : '\0'; + size_t no; + char * const save = reginput; + const size_t min = (OP(scan) == STAR) ? 0 : 1; + + for (no = regrepeat(OPERAND(scan)) + 1; no > min; no--) + { + reginput = save + no - 1; + // If it could work, try it. + if (nextch == '\0' || *reginput == nextch) + if (regmatch(next)) + return true; + } + return false; + break; + } + case END: + return true; // Success! + break; + default: + regerror( REGERR_CORRUPTION ); + return false; + break; + } + } + + // We get here only if there's trouble -- normally "case END" is + // the terminating point. + + regerror( REGERR_CORRUPTED_POINTERS ); + return false; +} + +// regrepeat - report how many times something simple would match + +size_t CRegExecutor::regrepeat( char * node ) +{ + size_t count; + char * scan; + char ch; + + switch (OP(node)) + { + case ANY: + return(strlen(reginput)); + break; + case EXACTLY: + ch = *OPERAND(node); + count = 0; + for (scan = reginput; *scan == ch; scan++) + count++; + return(count); + break; + case ANYOF: + return(strspn(reginput, OPERAND(node))); + break; + case ANYBUT: + return(strcspn(reginput, OPERAND(node))); + break; + default: // Oh dear. Called inappropriately. + regerror( REGERR_BAD_REGREPEAT ); + return(0); // Best compromise. + break; + } + // NOTREACHED +} + +#ifdef _RE_DEBUG + +// regdump - dump a regexp onto stdout in vaguely comprehensible form + +void regexp::regdump() +{ + char * s; + char op = EXACTLY; // Arbitrary non-END op. + char * next; + + s = program + 1; + while (op != END) + { // While that wasn't END last time... + op = OP(s); + printf("%2d%s", s-program, regprop(s)); // Where, what. + next = regnext(s); + if (next == NULL) // Next ptr. + printf("(0)" ); + else + printf("(%d)", (s-program)+(next-s)); + s += 3; + if (op == ANYOF || op == ANYBUT || op == EXACTLY) + { + // Literal string, where present. + while (*s != '\0') + { + putchar(*s); + s++; + } + s++; + } + putchar('\n'); + } + + // Header fields of interest. + if (regstart != '\0') + printf("start `%c' ", regstart); + if (reganch) + printf("anchored "); + if (regmust != NULL) + printf("must have \"%s\"" , regmust); + printf("\n"); +} + +// regprop - printable representation of opcode + +#define OUTPUT(s) case s: p = #s; break +char * CRegProgramAccessor::regprop( char * op ) +{ + char * p = NULL; + static char buf[50]; + + (void) strcpy(buf, ":" ); + + switch (OP(op)) + { + OUTPUT( BOL ); + OUTPUT( EOL ); + OUTPUT( ANY ); + OUTPUT( ANYOF ); + OUTPUT( ANYBUT ); + OUTPUT( BRANCH ); + OUTPUT( EXACTLY ); + OUTPUT( NOTHING ); + OUTPUT( BACK ); + OUTPUT( END ); + OUTPUT( STAR ); + OUTPUT( PLUS ); + OUTPUT( WORDA ); + OUTPUT( WORDZ ); + case OPEN+1: case OPEN+2: case OPEN+3: + case OPEN+4: case OPEN+5: case OPEN+6: + case OPEN+7: case OPEN+8: case OPEN+9: + case OPEN+10: case OPEN+11: case OPEN+12: + case OPEN+13: case OPEN+14: case OPEN+15: + case OPEN+16: case OPEN+17: case OPEN+18: + case OPEN+19: + sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN); + p = NULL; + break; + case CLOSE+1: case CLOSE+2: case CLOSE+3: + case CLOSE+4: case CLOSE+5: case CLOSE+6: + case CLOSE+7: case CLOSE+8: case CLOSE+9: + case CLOSE+10: case CLOSE+11: case CLOSE+12: + case CLOSE+13: case CLOSE+14: case CLOSE+15: + case CLOSE+16: case CLOSE+17: case CLOSE+18: + case CLOSE+19: + sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE); + p = NULL; + break; + default: + regerror( REGERR_CORRUPTED_OPCODE ); + break; + } + if (p != NULL) + (void) strcat(buf, p); + return(buf); +} +#endif + +/////////////////////////////////////////////////////////////////////////////// + +Regexp::Regexp() + : rc(0), + str(0) +{ +} + +Regexp::Regexp( const char * exp, bool iCase ) + : rc( new regexp( exp, iCase ) ), + str( 0 ) +{ +} + +Regexp::Regexp( const Regexp &r ) + : rc( r.rc ), + m_szError(r.m_szError), + str(r.str) +{ + if ( rc ) + rc->count++; +} + +const Regexp & Regexp::operator=( const Regexp & r ) +{ + if ( this != &r ) + { + if ( rc && rc->count-- == 0 ) + delete rc; + + rc = r.rc; + if ( rc ) + rc->count++; + + str = r.str; + m_szError = r.m_szError; + } + return *this; +} + +Regexp::~Regexp() +{ + if ( rc && rc->count-- == 0 ) + delete rc; +} + +bool Regexp::Match( const char * s ) +{ + ClearErrorString(); + str = s; + bool ret = false; + if ( rc ) + { + // copy on write ! + + if ( rc->count ) + { + rc->count--; + rc = rc->getCopy(); + } + + ret = rc->regexec( s ); + int i = 0; + if ( ret ) + for ( i = 0; i < Regexp::NSUBEXP && rc->startp[i] ; i++ ) + ; + rc->numSubs = i - 1; + } + else + m_szError = CRegErrorHandler::FindErr( REGERR_NO_REGEXP ); + return ret; +} + +string Regexp::GetReplaceString( const char * source ) const +{ + ClearErrorString(); + if ( rc ) + return rc->GetReplaceString( source ); + else + m_szError = CRegErrorHandler::FindErr( REGERR_NO_REGEXP ); + return ""; +} + +int Regexp::SubStrings() const +{ + ClearErrorString(); + int ret = -1; + if ( rc ) + ret = rc->numSubs; + else + m_szError = CRegErrorHandler::FindErr( REGERR_NO_REGEXP ); + return ret; +} + +int Regexp::SubStart( unsigned int i ) const +{ + ClearErrorString(); + int ret = -1; + if ( rc ) + ret = rc->startp[safeIndex(i)] - str; + else + m_szError = CRegErrorHandler::FindErr( REGERR_NO_REGEXP ); + return ret; +} + +int Regexp::SubLength( unsigned int i ) const +{ + ClearErrorString(); + int ret = -1; + if ( rc ) + { + i = safeIndex(i); + ret = rc->endp[i] - rc->startp[i]; + } + else + m_szError = CRegErrorHandler::FindErr( REGERR_NO_REGEXP ); + return ret; +} + +bool Regexp::CompiledOK() const +{ + return rc ? rc->Status() : false; +} + +#ifdef _RE_DEBUG +void Regexp::Dump() +{ + if ( rc ) + rc->regdump(); +#if defined( _DEBUG ) + else + TRACE( "No regexp to dump out\n" ); +#endif +} +#endif + +int Regexp::safeIndex( unsigned int i ) const +{ + return i < Regexp::NSUBEXP ? i : Regexp::NSUBEXP; +} + +const string Regexp::operator[]( unsigned int i ) const +{ + string value; + ClearErrorString(); + assert( rc ); + if ( rc ) + { + int len = SubLength(i); + //char * szbuf = buffer.GetBufferSetLength( len ); + //memcpy( szbuf, rc->startp[i], len * sizeof(char) ); + //buffer.ReleaseBuffer(); + value = std::string ( rc->startp[i], len ); + } + else + { + m_szError = CRegErrorHandler::FindErr( REGERR_NO_REGEXP ); + } + return value; +} + +void regexp::ignoreCase( const char * in, char * out ) +{ + // copy in to out making every top level character a [Aa] set + bool inRange = false; + while( *in ) + { + if ( *in == '[' ) + inRange = true; + if ( *in == ']' ) + inRange = false; + if ( ! inRange && isalpha( *in ) ) + { + *out++ = '['; + *out++ = (char)toupper( *in ); + *out++ = (char)tolower( *in ); + *out++ = ']'; + } + else + *out++ = *in; + in++; + } + *out = 0; +} + +// GetReplaceString - Converts a replace expression to a string +// - perform substitutions after a regexp match +// Returns - The resultant string +string regexp::GetReplaceString( const char* sReplaceExp ) const +{ + string szEmpty( "" ); + + char *src = (char *)sReplaceExp; + //char *buf; + char c; + int no; + size_t len; + + if( sReplaceExp == NULL ) + { + regerror( REGERR_NULL_TO_REGSUB ); + return szEmpty; + } + if ( *program != MAGIC) + { + regerror( REGERR_DAMAGED_REGEXP_REGSUB ); + return szEmpty; + } + + // First compute the length of the string + int replacelen = 0; + while ((c = *src++) != '\0') + { + if (c == '&') + no = 0; + else if (c == '\\' && isdigit(*src)) + no = *src++ - '0'; + else + no = -1; + + if (no < 0) + { + // Ordinary character. + if (c == '\\' && (*src == '\\' || *src == '&')) + c = *src++; + replacelen++; + } + else if (startp[no] != NULL && endp[no] != NULL && + endp[no] > startp[no]) + { + // Get tagged expression + len = endp[no] - startp[no]; + replacelen += len; + } + } + + string szReplace; + + //buf = szReplace.GetBufferSetLength( replacelen ); + + // Now we can create the string + src = (char *)sReplaceExp; + while ((c = *src++) != '\0') + { + if (c == '&') + no = 0; + else if (c == '\\' && isdigit(*src)) + no = *src++ - '0'; + else + no = -1; + + if (no < 0) + { + // Ordinary character. + if (c == '\\' && (*src == '\\' || *src == '&')) + c = *src++; + //*buf++ = c; + szReplace += c ; + } + else if (startp[no] != NULL && endp[no] != NULL && + endp[no] > startp[no]) + { + // Get tagged expression + len = endp[no] - startp[no]; + //strncpy(buf, startp[no], len); + //buf += len; + szReplace.append( startp[no], len ); +// if (len != 0 && *(buf-1) == '\0' )) +// { /* strncpy hit NUL. */ +// regerror( REGERR_DAMAGED_MATCH_STRING ); +// return szEmpty; +// } + } + } + + //szReplace.ReleaseBuffer( replacelen ); + return szReplace; +} + +string Regexp::GetErrorString() const +{ + // make sure that if status == 0 that we have an error string + assert( ( ! CompiledOK() ) ? ( rc ? rc->GetErrorString() : m_szError).length() != 0 : 1 ); + return rc ? rc->GetErrorString() : m_szError ; +} + +void Regexp::ClearErrorString() const +{ + if ( rc ) + rc->ClearErrorString(); + m_szError.erase(); +} + -- cgit v1.1