regexp.cpp 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233
  1. #include "regexp.h"
  2. // TODO: make a little more multi-byte safe
  3. // regexp match functions
  4. // A match means the entire string TEXT is used up in matching.
  5. // In the pattern string:
  6. // `*' matches any sequence of characters (zero or more)
  7. // `?' matches any character
  8. // [SET] matches any character in the specified set,
  9. // [!SET] or [^SET] matches any character not in the specified set.
  10. // A set is composed of characters or ranges; a range looks like
  11. // character hyphen character (as in 0-9 or A-Z). [0-9a-zA-Z_] is the
  12. // minimal set of characters allowed in the [..] pattern construct.
  13. // Other characters are allowed (ie. 8 bit characters) if your system
  14. // will support them.
  15. // To suppress the special syntactic significance of any of `[]*?!^-\',
  16. // and match the character exactly, precede it with a `\'.
  17. enum {
  18. MATCH_VALID = 1, /* valid match */
  19. MATCH_END, /* premature end of pattern string */
  20. MATCH_ABORT, /* premature end of text string */
  21. MATCH_RANGE, /* match failure on [..] construct */
  22. MATCH_LITERAL, /* match failure on literal match */
  23. MATCH_PATTERN, /* bad pattern */
  24. };
  25. enum {
  26. PATTERN_VALID = 0, /* valid pattern */
  27. PATTERN_ESC = -1, /* literal escape at end of pattern */
  28. PATTERN_RANGE = -2, /* malformed range in [..] construct */
  29. PATTERN_CLOSE = -3, /* no end bracket in [..] construct */
  30. PATTERN_EMPTY = -4, /* [..] contstruct is empty */
  31. };
  32. int Matche(const regchar_t *p, const regchar_t *t);
  33. // TODO: make this multi-byte aware
  34. int matche_after_star(const regchar_t *p, const regchar_t *t)
  35. {
  36. register int match = 0;
  37. register regchar_t nextp;
  38. /* pass over existing ? and * in pattern */
  39. while ( *p == '?' || *p == '*' )
  40. {
  41. /* take one char for each ? and + */
  42. if (*p == '?')
  43. {
  44. /* if end of text then no match */
  45. if (!*t++) return MATCH_ABORT;
  46. }
  47. /* move to next char in pattern */
  48. p++;
  49. }
  50. /* if end of pattern we have matched regardless of text left */
  51. if (!*p) return MATCH_VALID;
  52. /* get the next character to match which must be a literal or '[' */
  53. nextp = *p;
  54. if (nextp == '\\')
  55. {
  56. nextp = p[1];
  57. /* if end of text then we have a bad pattern */
  58. if (!nextp) return MATCH_PATTERN;
  59. }
  60. /* Continue until we run out of text or definite result seen */
  61. do
  62. {
  63. /* a precondition for matching is that the next character
  64. in the pattern match the next character in the text or that
  65. the next pattern char is the beginning of a range. Increment
  66. text pointer as we go here */
  67. if (nextp == *t || nextp == '[') match = Matche(p, t);
  68. /* if the end of text is reached then no match */
  69. if (!*t++) match = MATCH_ABORT;
  70. }
  71. while ( match != MATCH_VALID && match != MATCH_ABORT && match != MATCH_PATTERN);
  72. /* return result */
  73. return match;
  74. }
  75. int Matche(const regchar_t *p, const regchar_t *t)
  76. {
  77. regchar_t range_start, range_end; /* start and end in range */
  78. bool invert; /* is this [..] or [!..] */
  79. bool member_match; /* have I matched the [..] construct? */
  80. bool loop; /* should I terminate? */
  81. for ( ; *p; p++, t++)
  82. {
  83. /* if this is the end of the text then this is the end of the match */
  84. if (!*t)
  85. {
  86. return (*p == '*' && *++p == '\0') ? MATCH_VALID : MATCH_ABORT;
  87. }
  88. /* determine and react to pattern type */
  89. switch (*p)
  90. {
  91. case '?': /* single any character match */
  92. break;
  93. case '*': /* multiple any character match */
  94. return matche_after_star (p, t);
  95. /* [..] construct, single member/exclusion character match */
  96. case '[':
  97. {
  98. /* move to beginning of range */
  99. p++;
  100. /* check if this is a member match or exclusion match */
  101. invert = false;
  102. if (*p == '!' || *p == '^')
  103. {
  104. invert = true;
  105. p++;
  106. }
  107. /* if closing bracket here or at range start then we have a malformed pattern */
  108. if (*p == ']')
  109. return MATCH_PATTERN;
  110. member_match = false;
  111. loop = true;
  112. while (loop)
  113. {
  114. /* if end of construct then loop is done */
  115. if (*p == ']')
  116. {
  117. loop = false;
  118. continue;
  119. }
  120. /* matching a '!', '^', '-', '\' or a ']' */
  121. if (*p == '\\')
  122. range_start = range_end = *++p;
  123. else
  124. range_start = range_end = *p;
  125. /* if end of pattern then bad pattern (Missing ']') */
  126. if (!*p)
  127. return MATCH_PATTERN;
  128. /* check for range bar */
  129. if (*++p == '-')
  130. {
  131. /* get the range end */
  132. range_end = *++p;
  133. /* if end of pattern or construct then bad pattern */
  134. if (range_end == '\0' || range_end == ']') return MATCH_PATTERN;
  135. /* special character range end */
  136. if (range_end == '\\')
  137. {
  138. range_end = *++p;
  139. /* if end of text then we have a bad pattern */
  140. if (!range_end) return MATCH_PATTERN;
  141. }
  142. /* move just beyond this range */
  143. p++;
  144. }
  145. /* if the text character is in range then match found.
  146. make sure the range letters have the proper
  147. relationship to one another before comparison */
  148. if (range_start < range_end)
  149. {
  150. if (*t >= range_start && *t <= range_end)
  151. {
  152. member_match = true;
  153. loop = false;
  154. }
  155. }
  156. else
  157. {
  158. if (*t >= range_end && *t <= range_start)
  159. {
  160. member_match = true;
  161. loop = false;
  162. }
  163. }
  164. }
  165. /* if there was a match in an exclusion set then no match */
  166. /* if there was no match in a member set then no match */
  167. if ((invert && member_match) || !(invert || member_match))
  168. return MATCH_RANGE;
  169. /* if this is not an exclusion then skip the rest of the [...] construct that already matched. */
  170. if (member_match)
  171. {
  172. while (p && *p != ']')
  173. {
  174. /* bad pattern (Missing ']') */
  175. if (!*p)
  176. return MATCH_PATTERN;
  177. /* skip exact match */
  178. if (*p == '\\')
  179. {
  180. p++;
  181. /* if end of text then we have a bad pattern */
  182. if (!*p)
  183. return MATCH_PATTERN;
  184. }
  185. /* move to next pattern char */
  186. p++;
  187. }
  188. }
  189. break;
  190. }
  191. case '\\': /* next character is quoted and must match exactly */
  192. /* move pattern pointer to quoted char and fall through */
  193. p++;
  194. /* if end of text then we have a bad pattern */
  195. if (!*p)
  196. return MATCH_PATTERN;
  197. /* must match this character exactly */
  198. default:
  199. if (*p != *t)
  200. return MATCH_LITERAL;
  201. }
  202. }
  203. /* if end of text not reached then the pattern fails */
  204. if (*t)
  205. return MATCH_END;
  206. else return MATCH_VALID;
  207. }
  208. bool Match(const regchar_t *match, const regchar_t *string)
  209. {
  210. if (!match)
  211. return true;
  212. int error_type;
  213. error_type = Matche(match, string);
  214. return (error_type == MATCH_VALID);
  215. }