123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233 |
- #include "regexp.h"
- // TODO: make a little more multi-byte safe
- // regexp match functions
- // A match means the entire string TEXT is used up in matching.
- // In the pattern string:
- // `*' matches any sequence of characters (zero or more)
- // `?' matches any character
- // [SET] matches any character in the specified set,
- // [!SET] or [^SET] matches any character not in the specified set.
- // A set is composed of characters or ranges; a range looks like
- // character hyphen character (as in 0-9 or A-Z). [0-9a-zA-Z_] is the
- // minimal set of characters allowed in the [..] pattern construct.
- // Other characters are allowed (i.e. 8-bit characters) if your system
- // will support them.
- // To suppress the special syntactic significance of any of `[]*?!^-\',
- // and match the character exactly, precede it with a `\'.
/* Result codes produced by Matche() and matche_after_star(). */
enum {
    /* the pattern matched the whole text */
    MATCH_VALID   = 1,
    /* the pattern was used up while text still remained */
    MATCH_END     = 2,
    /* the text was used up while pattern still remained */
    MATCH_ABORT   = 3,
    /* the text character failed a [..] construct */
    MATCH_RANGE   = 4,
    /* the text character differed from a literal pattern character */
    MATCH_LITERAL = 5,
    /* the pattern itself is malformed */
    MATCH_PATTERN = 6
};
/*
 * Pattern validation codes.
 * NOTE(review): none of these are referenced by the functions visible in
 * this section; presumably they serve a pattern checker defined elsewhere.
 */
enum {
    /* pattern is well formed */
    PATTERN_VALID = 0,
    /* escape character ('\') is the last character of the pattern */
    PATTERN_ESC   = -1,
    /* malformed range inside a [..] construct */
    PATTERN_RANGE = -2,
    /* [..] construct has no closing bracket */
    PATTERN_CLOSE = -3,
    /* [..] construct is empty */
    PATTERN_EMPTY = -4
};
- int Matche(const regchar_t *p, const regchar_t *t);
- // TODO: make this multi-byte aware
- int matche_after_star(const regchar_t *p, const regchar_t *t)
- {
- register int match = 0;
- register regchar_t nextp;
- /* pass over existing ? and * in pattern */
- while ( *p == '?' || *p == '*' )
- {
- /* take one char for each ? and + */
- if (*p == '?')
- {
- /* if end of text then no match */
- if (!*t++) return MATCH_ABORT;
- }
- /* move to next char in pattern */
- p++;
- }
- /* if end of pattern we have matched regardless of text left */
- if (!*p) return MATCH_VALID;
- /* get the next character to match which must be a literal or '[' */
- nextp = *p;
- if (nextp == '\\')
- {
- nextp = p[1];
- /* if end of text then we have a bad pattern */
- if (!nextp) return MATCH_PATTERN;
- }
- /* Continue until we run out of text or definite result seen */
- do
- {
- /* a precondition for matching is that the next character
- in the pattern match the next character in the text or that
- the next pattern char is the beginning of a range. Increment
- text pointer as we go here */
- if (nextp == *t || nextp == '[') match = Matche(p, t);
- /* if the end of text is reached then no match */
- if (!*t++) match = MATCH_ABORT;
- }
- while ( match != MATCH_VALID && match != MATCH_ABORT && match != MATCH_PATTERN);
- /* return result */
- return match;
- }
- int Matche(const regchar_t *p, const regchar_t *t)
- {
- regchar_t range_start, range_end; /* start and end in range */
- bool invert; /* is this [..] or [!..] */
- bool member_match; /* have I matched the [..] construct? */
- bool loop; /* should I terminate? */
- for ( ; *p; p++, t++)
- {
- /* if this is the end of the text then this is the end of the match */
- if (!*t)
- {
- return (*p == '*' && *++p == '\0') ? MATCH_VALID : MATCH_ABORT;
- }
- /* determine and react to pattern type */
- switch (*p)
- {
- case '?': /* single any character match */
- break;
- case '*': /* multiple any character match */
- return matche_after_star (p, t);
- /* [..] construct, single member/exclusion character match */
- case '[':
- {
- /* move to beginning of range */
- p++;
- /* check if this is a member match or exclusion match */
- invert = false;
- if (*p == '!' || *p == '^')
- {
- invert = true;
- p++;
- }
- /* if closing bracket here or at range start then we have a malformed pattern */
- if (*p == ']')
- return MATCH_PATTERN;
- member_match = false;
- loop = true;
- while (loop)
- {
- /* if end of construct then loop is done */
- if (*p == ']')
- {
- loop = false;
- continue;
- }
- /* matching a '!', '^', '-', '\' or a ']' */
- if (*p == '\\')
- range_start = range_end = *++p;
- else
- range_start = range_end = *p;
- /* if end of pattern then bad pattern (Missing ']') */
- if (!*p)
- return MATCH_PATTERN;
- /* check for range bar */
- if (*++p == '-')
- {
- /* get the range end */
- range_end = *++p;
- /* if end of pattern or construct then bad pattern */
- if (range_end == '\0' || range_end == ']') return MATCH_PATTERN;
- /* special character range end */
- if (range_end == '\\')
- {
- range_end = *++p;
- /* if end of text then we have a bad pattern */
- if (!range_end) return MATCH_PATTERN;
- }
- /* move just beyond this range */
- p++;
- }
- /* if the text character is in range then match found.
- make sure the range letters have the proper
- relationship to one another before comparison */
- if (range_start < range_end)
- {
- if (*t >= range_start && *t <= range_end)
- {
- member_match = true;
- loop = false;
- }
- }
- else
- {
- if (*t >= range_end && *t <= range_start)
- {
- member_match = true;
- loop = false;
- }
- }
- }
- /* if there was a match in an exclusion set then no match */
- /* if there was no match in a member set then no match */
- if ((invert && member_match) || !(invert || member_match))
- return MATCH_RANGE;
- /* if this is not an exclusion then skip the rest of the [...] construct that already matched. */
- if (member_match)
- {
- while (p && *p != ']')
- {
- /* bad pattern (Missing ']') */
- if (!*p)
- return MATCH_PATTERN;
- /* skip exact match */
- if (*p == '\\')
- {
- p++;
- /* if end of text then we have a bad pattern */
- if (!*p)
- return MATCH_PATTERN;
- }
- /* move to next pattern char */
- p++;
- }
- }
- break;
- }
- case '\\': /* next character is quoted and must match exactly */
- /* move pattern pointer to quoted char and fall through */
- p++;
- /* if end of text then we have a bad pattern */
- if (!*p)
- return MATCH_PATTERN;
- /* must match this character exactly */
- default:
- if (*p != *t)
- return MATCH_LITERAL;
- }
- }
- /* if end of text not reached then the pattern fails */
- if (*t)
- return MATCH_END;
- else return MATCH_VALID;
- }
- bool Match(const regchar_t *match, const regchar_t *string)
- {
- if (!match)
- return true;
- int error_type;
-
- error_type = Matche(match, string);
- return (error_type == MATCH_VALID);
- }
|