1 /* Extended regular expression matching and search library.
2 Copyright (C) 1985, 1989 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
18 /* To test, compile with -Dtest.
19 This Dtestable feature turns this into a self-contained program
20 which reads a pattern, describes how it compiles,
21 then reads a string and searches for it. */
25 /* The `emacs' switch turns on certain special matching commands
26 that make sense only in emacs. */
39 * Define the syntax stuff, so we can do the \<...\> things.
42 #ifndef Sword /* must be non-zero in some of the tests below... */
46 #define SYNTAX(c) re_syntax_table[c]
50 char *re_syntax_table;
54 static char re_syntax_table[256];
65 memset (re_syntax_table, '\0', sizeof re_syntax_table);
67 for (c = 'a'; c <= 'z'; c++)
68 re_syntax_table[c] = Sword;
70 for (c = 'A'; c <= 'Z'; c++)
71 re_syntax_table[c] = Sword;
73 for (c = '0'; c <= '9'; c++)
74 re_syntax_table[c] = Sword;
79 #endif /* SYNTAX_TABLE */
80 #endif /* not emacs */
84 /* Number of failure points to allocate space for initially,
85 when matching. If this number is exceeded, more space is allocated,
86 so it is not a hard limit. */
90 #endif /* NFAILURES */
92 /* width of a byte in bits */
96 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
97 since ours (we hope) works properly with all combinations of
98 machines, compilers, `char' and `unsigned char' argument types.
99 (Per Bothner suggested the basic approach.) */
100 #undef SIGN_EXTEND_CHAR
102 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
103 #else /* not __STDC__ */
104 /* As in Harbison and Steele. */
105 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
108 static int obscure_syntax = 0;
110 /* Specify the precise syntax of regexp for compilation.
111 This provides for compatibility for various utilities
112 which historically have different, incompatible syntaxes.
114 The argument SYNTAX is a bit-mask containing the two bits
115 RE_NO_BK_PARENS and RE_NO_BK_VBAR. */
118 re_set_syntax (syntax)
123 ret = obscure_syntax;
124 obscure_syntax = syntax;
128 /* re_compile_pattern takes a regular-expression string
129 and converts it into a buffer full of byte commands for matching.
131 PATTERN is the address of the pattern string
132 SIZE is the length of it.
133 BUFP is a struct re_pattern_buffer * which points to the info
134 on where to store the byte commands.
135 This structure contains a char * which points to the
136 actual space, which should have been obtained with malloc.
137 re_compile_pattern may use realloc to grow the buffer space.
139 The number of bytes of commands can be found out by looking in
140 the struct re_pattern_buffer that bufp pointed to,
141 after re_compile_pattern returns.
144 #define PATPUSH(ch) (*b++ = (char) (ch))
146 #define PATFETCH(c) \
147 {if (p == pend) goto end_of_pattern; \
148 c = * (unsigned char *) p++; \
149 if (translate) c = translate[c]; }
151 #define PATFETCH_RAW(c) \
152 {if (p == pend) goto end_of_pattern; \
153 c = * (unsigned char *) p++; }
155 #define PATUNFETCH p--
157 /* This is not an arbitrary limit: the arguments which represent offsets
158 into the pattern are two bytes long. So if 2^16 bytes turns out to
159 be too small, many things would have to change. */
160 #define MAX_BUF_SIZE (1 << 16)
163 /* Extend the buffer by twice its current size via realloc and
164 reset the pointers that pointed into the old block to point to the
165 correct places in the new one. If extending the buffer results in it
166 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
167 #define EXTEND_BUFFER \
169 char *old_buffer = bufp->buffer; \
170 if (bufp->allocated == MAX_BUF_SIZE) \
172 bufp->allocated <<= 1; \
173 if (bufp->allocated > MAX_BUF_SIZE) \
174 bufp->allocated = MAX_BUF_SIZE; \
175 bufp->buffer = (char *) realloc (bufp->buffer, bufp->allocated);\
176 if (bufp->buffer == NULL) \
177 goto memory_exhausted; \
178 /* If the buffer moved, move all the pointers into it. */ \
179 if (old_buffer != bufp->buffer) \
181 b = (b - old_buffer) + bufp->buffer; \
182 begalt = (begalt - old_buffer) + bufp->buffer; \
184 fixup_jump = (fixup_jump - old_buffer) + bufp->buffer;\
186 laststart = (laststart - old_buffer) + bufp->buffer; \
188 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
192 static void store_jump (), insert_jump ();
195 re_compile_pattern (pattern, size, bufp)
198 struct re_pattern_buffer *bufp;
200 register char *b = bufp->buffer;
201 register char *p = pattern;
202 char *pend = pattern + size;
203 register unsigned c, c1;
205 unsigned char *translate = (unsigned char *) bufp->translate;
207 /* address of the count-byte of the most recently inserted "exactn" command.
208 This makes it possible to tell whether a new exact-match character
209 can be added to that command or requires a new "exactn" command. */
211 char *pending_exact = 0;
213 /* address of the place where a forward-jump should go
214 to the end of the containing expression.
215 Each alternative of an "or", except the last, ends with a forward-jump
218 char *fixup_jump = 0;
220 /* address of start of the most recently finished expression.
221 This tells postfix * where to find the start of its operand. */
225 /* In processing a repeat, 1 means zero matches is allowed */
229 /* In processing a repeat, 1 means many matches is allowed */
233 /* address of beginning of regexp, or inside of last \( */
237 /* Stack of information saved by \( and restored by \).
238 Four stack elements are pushed by each \(:
239 First, the value of b.
240 Second, the value of fixup_jump.
241 Third, the value of regnum.
242 Fourth, the value of begalt. */
245 int *stackp = stackb;
246 int *stacke = stackb + 40;
249 /* Counts \('s as they are encountered. Remembered for the matching \),
250 where it becomes the "register number" to put in the stop_memory command */
254 bufp->fastmap_accurate = 0;
259 * Initialize the syntax table.
265 if (bufp->allocated == 0)
267 bufp->allocated = 28;
269 /* EXTEND_BUFFER loses when bufp->allocated is 0 */
270 bufp->buffer = (char *) realloc (bufp->buffer, 28);
272 /* Caller did not allocate a buffer. Do it for him */
273 bufp->buffer = (char *) malloc (28);
274 if (!bufp->buffer) goto memory_exhausted;
275 begalt = b = bufp->buffer;
280 if (b - bufp->buffer > bufp->allocated - 10)
281 /* Note that EXTEND_BUFFER clobbers c */
289 if (obscure_syntax & RE_TIGHT_VBAR)
291 if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS) && p != pend)
293 /* Make operand of last vbar end before this `$'. */
295 store_jump (fixup_jump, jump, b);
301 /* $ means succeed if at end of line, but only in special contexts.
302 If randomly in the middle of a pattern, it is a normal character. */
303 if (p == pend || *p == '\n'
304 || (obscure_syntax & RE_CONTEXT_INDEP_OPS)
305 || (obscure_syntax & RE_NO_BK_PARENS
307 : *p == '\\' && p[1] == ')')
308 || (obscure_syntax & RE_NO_BK_VBAR
310 : *p == '\\' && p[1] == '|'))
318 /* ^ means succeed if at beg of line, but only if no preceding pattern. */
320 if (laststart && p[-2] != '\n'
321 && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
323 if (obscure_syntax & RE_TIGHT_VBAR)
326 && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
337 if (obscure_syntax & RE_BK_PLUS_QM)
341 /* If there is no previous pattern, char not special. */
342 if (!laststart && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
344 /* If there is a sequence of repetition chars,
345 collapse it down to equivalent to just one. */
350 zero_times_ok |= c != '+';
351 many_times_ok |= c != '?';
357 else if (!(obscure_syntax & RE_BK_PLUS_QM)
358 && (c == '+' || c == '?'))
360 else if ((obscure_syntax & RE_BK_PLUS_QM)
365 if (!(c1 == '+' || c1 == '?'))
380 /* Star, etc. applied to an empty pattern is equivalent
381 to an empty pattern. */
385 /* Now we know whether 0 matches is allowed,
386 and whether 2 or more matches is allowed. */
389 /* If more than one repetition is allowed,
390 put in a backward jump at the end. */
391 store_jump (b, maybe_finalize_jump, laststart - 3);
394 insert_jump (on_failure_jump, laststart, b + 3, b);
399 /* At least one repetition required: insert before the loop
400 a skip over the initial on-failure-jump instruction */
401 insert_jump (dummy_failure_jump, laststart, laststart + 6, b);
412 while (b - bufp->buffer
413 > bufp->allocated - 3 - (1 << BYTEWIDTH) / BYTEWIDTH)
414 /* Note that EXTEND_BUFFER clobbers c */
419 PATPUSH (charset_not), p++;
424 PATPUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
425 /* Clear the whole map */
426 memset (b, '\0', (1 << BYTEWIDTH) / BYTEWIDTH);
427 /* Read in characters and ranges, setting map bits */
431 if (c == ']' && p != p1 + 1) break;
432 if (*p == '-' && p[1] != ']')
437 b[c / BYTEWIDTH] |= 1 << (c % BYTEWIDTH), c++;
441 b[c / BYTEWIDTH] |= 1 << (c % BYTEWIDTH);
444 /* Discard any bitmap bytes that are all 0 at the end of the map.
445 Decrement the map-length byte too. */
446 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
452 if (! (obscure_syntax & RE_NO_BK_PARENS))
458 if (! (obscure_syntax & RE_NO_BK_PARENS))
464 if (! (obscure_syntax & RE_NEWLINE_OR))
470 if (! (obscure_syntax & RE_NO_BK_VBAR))
476 if (p == pend) goto invalid_pattern;
481 if (obscure_syntax & RE_NO_BK_PARENS)
484 if (stackp == stacke) goto nesting_too_deep;
485 if (regnum < RE_NREGS)
487 PATPUSH (start_memory);
490 *stackp++ = b - bufp->buffer;
491 *stackp++ = fixup_jump ? fixup_jump - bufp->buffer + 1 : 0;
492 *stackp++ = regnum++;
493 *stackp++ = begalt - bufp->buffer;
500 if (obscure_syntax & RE_NO_BK_PARENS)
503 if (stackp == stackb) goto unmatched_close;
504 begalt = *--stackp + bufp->buffer;
506 store_jump (fixup_jump, jump, b);
507 if (stackp[-1] < RE_NREGS)
509 PATPUSH (stop_memory);
510 PATPUSH (stackp[-1]);
515 fixup_jump = *stackp + bufp->buffer - 1;
516 laststart = *--stackp + bufp->buffer;
520 if (obscure_syntax & RE_NO_BK_VBAR)
523 insert_jump (on_failure_jump, begalt, b + 6, b);
527 store_jump (fixup_jump, jump, b);
541 PATPUSH (syntaxspec);
543 PATPUSH (syntax_spec_code[c]);
548 PATPUSH (notsyntaxspec);
550 PATPUSH (syntax_spec_code[c]);
561 PATPUSH (notwordchar);
577 PATPUSH (notwordbound);
600 for (stackt = stackp - 2; stackt > stackb; stackt -= 4)
610 if (obscure_syntax & RE_BK_PLUS_QM)
615 /* You might think it would be useful for \ to mean
616 not to translate; but if we don't translate it
617 it will never match anything. */
618 if (translate) c = translate[c];
625 if (!pending_exact || pending_exact + *pending_exact + 1 != b
626 || *pending_exact == 0177 || *p == '*' || *p == '^'
627 || ((obscure_syntax & RE_BK_PLUS_QM)
628 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
629 : (*p == '+' || *p == '?')))
642 store_jump (fixup_jump, jump, b);
644 if (stackp != stackb) goto unmatched_open;
646 bufp->used = b - bufp->buffer;
650 return "Invalid regular expression";
653 return "Unmatched \\(";
656 return "Unmatched \\)";
659 return "Premature end of regular expression";
662 return "Nesting too deep";
665 return "Regular expression too big";
668 return "Memory exhausted";
671 /* Store where `from' points a jump operation to jump to where `to' points.
672 `opcode' is the opcode to store. */
675 store_jump (from, opcode, to)
680 from[1] = (to - (from + 3)) & 0377;
681 from[2] = (to - (from + 3)) >> 8;
684 /* Open up space at char FROM, and insert there a jump to TO.
685 CURRENT_END gives te end of the storage no in use,
686 so we know how much data to copy up.
687 OP is the opcode of the jump to insert.
689 If you call this function, you must zero out pending_exact. */
692 insert_jump (op, from, to, current_end)
694 char *from, *to, *current_end;
696 register char *pto = current_end + 3;
697 register char *pfrom = current_end;
698 while (pfrom != from)
700 store_jump (from, op, to);
703 /* Given a pattern, compute a fastmap from it.
704 The fastmap records which of the (1 << BYTEWIDTH) possible characters
705 can start a string that matches the pattern.
706 This fastmap is used by re_search to skip quickly over totally implausible text.
708 The caller must supply the address of a (1 << BYTEWIDTH)-byte data area
710 The other components of bufp describe the pattern to be used. */
713 re_compile_fastmap (bufp)
714 struct re_pattern_buffer *bufp;
716 unsigned char *pattern = (unsigned char *) bufp->buffer;
717 int size = bufp->used;
718 register char *fastmap = bufp->fastmap;
719 register unsigned char *p = pattern;
720 register unsigned char *pend = pattern + size;
722 unsigned char *translate = (unsigned char *) bufp->translate;
724 unsigned char *stackb[NFAILURES];
725 unsigned char **stackp = stackb;
727 memset (fastmap, '\0', (1 << BYTEWIDTH));
728 bufp->fastmap_accurate = 1;
729 bufp->can_be_null = 0;
735 bufp->can_be_null = 1;
738 #ifdef SWITCH_ENUM_BUG
739 switch ((int) ((enum regexpcode) *p++))
741 switch ((enum regexpcode) *p++)
746 fastmap[translate[p[1]]] = 1;
765 fastmap[translate['\n']] = 1;
768 if (bufp->can_be_null != 1)
769 bufp->can_be_null = 2;
773 case maybe_finalize_jump:
775 case dummy_failure_jump:
776 bufp->can_be_null = 1;
778 j += SIGN_EXTEND_CHAR (*(char *)p) << 8;
779 p += j + 1; /* The 1 compensates for missing ++ above */
782 /* Jump backward reached implies we just went through
783 the body of a loop and matched nothing.
784 Opcode jumped to should be an on_failure_jump.
785 Just treat it like an ordinary jump.
786 For a * loop, it has pushed its failure point already;
787 if so, discard that as redundant. */
788 if ((enum regexpcode) *p != on_failure_jump)
792 j += SIGN_EXTEND_CHAR (*(char *)p) << 8;
793 p += j + 1; /* The 1 compensates for missing ++ above */
794 if (stackp != stackb && *stackp == p)
798 case on_failure_jump:
800 j += SIGN_EXTEND_CHAR (*(char *)p) << 8;
811 bufp->can_be_null = 1;
814 for (j = 0; j < (1 << BYTEWIDTH); j++)
817 if (bufp->can_be_null)
819 /* Don't return; check the alternative paths
820 so we can set can_be_null if appropriate. */
824 for (j = 0; j < (1 << BYTEWIDTH); j++)
825 if (SYNTAX (j) == Sword)
830 for (j = 0; j < (1 << BYTEWIDTH); j++)
831 if (SYNTAX (j) != Sword)
838 for (j = 0; j < (1 << BYTEWIDTH); j++)
839 if (SYNTAX (j) == (enum syntaxcode) k)
845 for (j = 0; j < (1 << BYTEWIDTH); j++)
846 if (SYNTAX (j) != (enum syntaxcode) k)
852 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
853 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
856 fastmap[translate[j]] = 1;
863 /* Chars beyond end of map must be allowed */
864 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
866 fastmap[translate[j]] = 1;
870 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
871 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
874 fastmap[translate[j]] = 1;
881 /* Get here means we have successfully found the possible starting characters
882 of one path of the pattern. We need not follow this path any farther.
883 Instead, look at the next alternative remembered in the stack. */
884 if (stackp != stackb)
891 /* Like re_search_2, below, but only one string is specified. */
894 re_search (pbufp, string, size, startpos, range, regs)
895 struct re_pattern_buffer *pbufp;
897 int size, startpos, range;
898 struct re_registers *regs;
900 return re_search_2 (pbufp, 0, 0, string, size, startpos, range, regs, size);
903 /* Like re_match_2 but tries first a match starting at index STARTPOS,
904 then at STARTPOS + 1, and so on.
905 RANGE is the number of places to try before giving up.
906 If RANGE is negative, the starting positions tried are
907 STARTPOS, STARTPOS - 1, etc.
908 It is up to the caller to make sure that range is not so large
909 as to take the starting position outside of the input strings.
911 The value returned is the position at which the match was found,
912 or -1 if no match was found,
913 or -2 if error (such as failure stack overflow). */
916 re_search_2 (pbufp, string1, size1, string2, size2, startpos, range, regs, mstop)
917 struct re_pattern_buffer *pbufp;
918 char *string1, *string2;
922 struct re_registers *regs;
925 register char *fastmap = pbufp->fastmap;
926 register unsigned char *translate = (unsigned char *) pbufp->translate;
927 int total = size1 + size2;
930 /* Update the fastmap now if not correct already */
931 if (fastmap && !pbufp->fastmap_accurate)
932 re_compile_fastmap (pbufp);
934 /* Don't waste time in a long search for a pattern
935 that says it is anchored. */
936 if (pbufp->used > 0 && (enum regexpcode) pbufp->buffer[0] == begbuf
947 /* If a fastmap is supplied, skip quickly over characters
948 that cannot possibly be the start of a match.
949 Note, however, that if the pattern can possibly match
950 the null string, we must test it at each starting point
951 so that we take the first null string we get. */
953 if (fastmap && startpos < total && pbufp->can_be_null != 1)
957 register int lim = 0;
958 register unsigned char *p;
960 if (startpos < size1 && startpos + range >= size1)
961 lim = range - (size1 - startpos);
963 p = ((unsigned char *)
964 &(startpos >= size1 ? string2 - size1 : string1)[startpos]);
968 while (range > lim && !fastmap[translate[*p++]])
973 while (range > lim && !fastmap[*p++])
976 startpos += irange - range;
980 register unsigned char c;
981 if (startpos >= size1)
982 c = string2[startpos - size1];
984 c = string1[startpos];
986 if (translate ? !fastmap[translate[c]] : !fastmap[c])
991 if (range >= 0 && startpos == total
992 && fastmap && pbufp->can_be_null == 0)
995 val = re_match_2 (pbufp, string1, size1, string2, size2, startpos, regs, mstop);
1005 #endif /* C_ALLOCA */
1009 if (range > 0) range--, startpos++; else range++, startpos--;
1014 #ifndef emacs /* emacs never uses this */
1016 re_match (pbufp, string, size, pos, regs)
1017 struct re_pattern_buffer *pbufp;
1020 struct re_registers *regs;
1022 return re_match_2 (pbufp, 0, 0, string, size, pos, regs, size);
1026 /* Maximum size of failure stack. Beyond this, overflow is an error. */
1028 int re_max_failures = 2000;
1030 static int memcmp_translate();
1031 /* Match the pattern described by PBUFP
1032 against data which is the virtual concatenation of STRING1 and STRING2.
1033 SIZE1 and SIZE2 are the sizes of the two data strings.
1034 Start the match at position POS.
1035 Do not consider matching past the position MSTOP.
1037 If pbufp->fastmap is nonzero, then it had better be up to date.
1039 The reason that the data to match are specified as two components
1040 which are to be regarded as concatenated
1041 is so this function can be used directly on the contents of an Emacs buffer.
1043 -1 is returned if there is no match. -2 is returned if there is
1044 an error (such as match stack overflow). Otherwise the value is the length
1045 of the substring which was matched. */
1048 re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
1049 struct re_pattern_buffer *pbufp;
1050 unsigned char *string1, *string2;
1053 struct re_registers *regs;
1056 register unsigned char *p = (unsigned char *) pbufp->buffer;
1057 register unsigned char *pend = p + pbufp->used;
1058 /* End of first string */
1059 unsigned char *end1;
1060 /* End of second string */
1061 unsigned char *end2;
1062 /* Pointer just past last char to consider matching */
1063 unsigned char *end_match_1, *end_match_2;
1064 register unsigned char *d, *dend;
1066 unsigned char *translate = (unsigned char *) pbufp->translate;
1068 /* Failure point stack. Each place that can handle a failure further down the line
1069 pushes a failure point on this stack. It consists of two char *'s.
1070 The first one pushed is where to resume scanning the pattern;
1071 the second pushed is where to resume scanning the strings.
1072 If the latter is zero, the failure point is a "dummy".
1073 If a failure happens and the innermost failure point is dormant,
1074 it discards that failure point and tries the next one. */
1076 unsigned char *initial_stack[2 * NFAILURES];
1077 unsigned char **stackb = initial_stack;
1078 unsigned char **stackp = stackb, **stacke = &stackb[2 * NFAILURES];
1080 /* Information on the "contents" of registers.
1081 These are pointers into the input strings; they record
1082 just what was matched (on this attempt) by some part of the pattern.
1083 The start_memory command stores the start of a register's contents
1084 and the stop_memory command stores the end.
1086 At that point, regstart[regnum] points to the first character in the register,
1087 regend[regnum] points to the first character beyond the end of the register,
1088 regstart_seg1[regnum] is true iff regstart[regnum] points into string1,
1089 and regend_seg1[regnum] is true iff regend[regnum] points into string1. */
1091 unsigned char *regstart[RE_NREGS];
1092 unsigned char *regend[RE_NREGS];
1093 unsigned char regstart_seg1[RE_NREGS], regend_seg1[RE_NREGS];
1095 /* Set up pointers to ends of strings.
1096 Don't allow the second string to be empty unless both are empty. */
1104 end1 = string1 + size1;
1105 end2 = string2 + size2;
1107 /* Compute where to stop matching, within the two strings */
1110 end_match_1 = string1 + mstop;
1111 end_match_2 = string2;
1116 end_match_2 = string2 + mstop - size1;
1119 /* Initialize \) text positions to -1
1120 to mark ones that no \( or \) has been seen for. */
1122 for (mcnt = 0; mcnt < sizeof (regend) / sizeof (*regend); mcnt++)
1123 regend[mcnt] = (unsigned char *) -1;
1125 /* `p' scans through the pattern as `d' scans through the data.
1126 `dend' is the end of the input string that `d' points within.
1127 `d' is advanced into the following input string whenever necessary,
1128 but this happens before fetching;
1129 therefore, at the beginning of the loop,
1130 `d' can be pointing at the end of a string,
1131 but it cannot equal string2. */
1134 d = string1 + pos, dend = end_match_1;
1136 d = string2 + pos - size1, dend = end_match_2;
1138 /* Write PREFETCH; just before fetching a character with *d. */
1141 { if (dend == end_match_2) goto fail; /* end of string2 => failure */ \
1142 d = string2; /* end of string1 => advance to string2. */ \
1143 dend = end_match_2; }
1145 /* This loop loops over pattern commands.
1146 It exits by returning from the function if match is complete,
1147 or it drops through if match fails at this starting point in the input data. */
1152 /* End of pattern means we have succeeded! */
1154 /* If caller wants register contents data back, convert it to indices */
1157 regs->start[0] = pos;
1158 if (dend == end_match_1)
1159 regs->end[0] = d - string1;
1161 regs->end[0] = d - string2 + size1;
1162 for (mcnt = 1; mcnt < RE_NREGS; mcnt++)
1164 if (regend[mcnt] == (unsigned char *) -1)
1166 regs->start[mcnt] = -1;
1167 regs->end[mcnt] = -1;
1170 if (regstart_seg1[mcnt])
1171 regs->start[mcnt] = regstart[mcnt] - string1;
1173 regs->start[mcnt] = regstart[mcnt] - string2 + size1;
1174 if (regend_seg1[mcnt])
1175 regs->end[mcnt] = regend[mcnt] - string1;
1177 regs->end[mcnt] = regend[mcnt] - string2 + size1;
1180 if (dend == end_match_1)
1181 return (d - string1 - pos);
1183 return d - string2 + size1 - pos;
1186 /* Otherwise match next pattern command */
1187 #ifdef SWITCH_ENUM_BUG
1188 switch ((int) ((enum regexpcode) *p++))
1190 switch ((enum regexpcode) *p++)
1194 /* \( is represented by a start_memory, \) by a stop_memory.
1195 Both of those commands contain a "register number" argument.
1196 The text matched within the \( and \) is recorded under that number.
1197 Then, \<digit> turns into a `duplicate' command which
1198 is followed by the numeric value of <digit> as the register number. */
1202 regstart_seg1[*p++] = (dend == end_match_1);
1207 regend_seg1[*p++] = (dend == end_match_1);
1212 int regno = *p++; /* Get which register to match against */
1213 register unsigned char *d2, *dend2;
1215 d2 = regstart[regno];
1216 dend2 = ((regstart_seg1[regno] == regend_seg1[regno])
1217 ? regend[regno] : end_match_1);
1220 /* Advance to next segment in register contents, if necessary */
1223 if (dend2 == end_match_2) break;
1224 if (dend2 == regend[regno]) break;
1225 d2 = string2, dend2 = regend[regno]; /* end of string1 => advance to string2. */
1227 /* At end of register contents => success */
1228 if (d2 == dend2) break;
1230 /* Advance to next segment in data being matched, if necessary */
1233 /* mcnt gets # consecutive chars to compare */
1235 if (mcnt > dend2 - d2)
1237 /* Compare that many; failure if mismatch, else skip them. */
1238 if (translate ? memcmp_translate (d, d2, mcnt, translate) : memcmp (d, d2, mcnt))
1240 d += mcnt, d2 += mcnt;
1246 /* fetch a data character */
1248 /* Match anything but a newline. */
1249 if ((translate ? translate[*d++] : *d++) == '\n')
1256 /* Nonzero for charset_not */
1259 if (*(p - 1) == (unsigned char) charset_not)
1262 /* fetch a data character */
1270 if (c < *p * BYTEWIDTH
1271 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
1276 if (!not) goto fail;
1282 if (d == string1 || d[-1] == '\n')
1288 || (d == end1 ? (size2 == 0 || *string2 == '\n') : *d == '\n'))
1292 /* "or" constructs ("|") are handled by starting each alternative
1293 with an on_failure_jump that points to the start of the next alternative.
1294 Each alternative except the last ends with a jump to the joining point.
1295 (Actually, each jump except for the last one really jumps
1296 to the following jump, because tensioning the jumps is a hassle.) */
1298 /* The start of a stupid repeat has an on_failure_jump that points
1299 past the end of the repeat text.
1300 This makes a failure point so that, on failure to match a repetition,
1301 matching restarts past as many repetitions have been found
1302 with no way to fail and look for another one. */
1304 /* A smart repeat is similar but loops back to the on_failure_jump
1305 so that each repetition makes another failure point. */
1307 case on_failure_jump:
1308 if (stackp == stacke)
1310 unsigned char **stackx;
1311 if (stacke - stackb > re_max_failures * 2)
1313 stackx = (unsigned char **) alloca (2 * (stacke - stackb)
1315 memcpy (stackx, stackb, (stacke - stackb) * sizeof (char *));
1316 stackp = stackx + (stackp - stackb);
1317 stacke = stackx + 2 * (stacke - stackb);
1321 mcnt += SIGN_EXTEND_CHAR (*(char *)p) << 8;
1323 *stackp++ = mcnt + p;
1327 /* The end of a smart repeat has an maybe_finalize_jump back.
1328 Change it either to a finalize_jump or an ordinary jump. */
1330 case maybe_finalize_jump:
1332 mcnt += SIGN_EXTEND_CHAR (*(char *)p) << 8;
1335 register unsigned char *p2 = p;
1336 /* Compare what follows with the begining of the repeat.
1337 If we can establish that there is nothing that they would
1338 both match, we can change to finalize_jump */
1340 && (*p2 == (unsigned char) stop_memory
1341 || *p2 == (unsigned char) start_memory))
1344 p[-3] = (unsigned char) finalize_jump;
1345 else if (*p2 == (unsigned char) exactn
1346 || *p2 == (unsigned char) endline)
1348 register int c = *p2 == (unsigned char) endline ? '\n' : p2[2];
1349 register unsigned char *p1 = p + mcnt;
1350 /* p1[0] ... p1[2] are an on_failure_jump.
1351 Examine what follows that */
1352 if (p1[3] == (unsigned char) exactn && p1[5] != c)
1353 p[-3] = (unsigned char) finalize_jump;
1354 else if (p1[3] == (unsigned char) charset
1355 || p1[3] == (unsigned char) charset_not)
1357 int not = p1[3] == (unsigned char) charset_not;
1358 if (c < p1[4] * BYTEWIDTH
1359 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
1361 /* not is 1 if c would match */
1362 /* That means it is not safe to finalize */
1364 p[-3] = (unsigned char) finalize_jump;
1369 if (p[-1] != (unsigned char) finalize_jump)
1371 p[-1] = (unsigned char) jump;
1375 /* The end of a stupid repeat has a finalize-jump
1376 back to the start, where another failure point will be made
1377 which will point after all the repetitions found so far. */
1385 mcnt += SIGN_EXTEND_CHAR (*(char *)p) << 8;
1386 p += mcnt + 1; /* The 1 compensates for missing ++ above */
1389 case dummy_failure_jump:
1390 if (stackp == stacke)
1392 unsigned char **stackx
1393 = (unsigned char **) alloca (2 * (stacke - stackb)
1395 memcpy (stackx, stackb, (stacke - stackb) * sizeof (char *));
1396 stackp = stackx + (stackp - stackb);
1397 stacke = stackx + 2 * (stacke - stackb);
1405 if (d == string1 /* Points to first char */
1406 || d == end2 /* Points to end */
1407 || (d == end1 && size2 == 0)) /* Points to end */
1409 if ((SYNTAX (d[-1]) == Sword)
1410 != (SYNTAX (d == end1 ? *string2 : *d) == Sword))
1415 if (d == string1 /* Points to first char */
1416 || d == end2 /* Points to end */
1417 || (d == end1 && size2 == 0)) /* Points to end */
1419 if ((SYNTAX (d[-1]) == Sword)
1420 != (SYNTAX (d == end1 ? *string2 : *d) == Sword))
1425 if (d == end2 /* Points to end */
1426 || (d == end1 && size2 == 0) /* Points to end */
1427 || SYNTAX (* (d == end1 ? string2 : d)) != Sword) /* Next char not a letter */
1429 if (d == string1 /* Points to first char */
1430 || SYNTAX (d[-1]) != Sword) /* prev char not letter */
1435 if (d == string1 /* Points to first char */
1436 || SYNTAX (d[-1]) != Sword) /* prev char not letter */
1438 if (d == end2 /* Points to end */
1439 || (d == end1 && size2 == 0) /* Points to end */
1440 || SYNTAX (d == end1 ? *string2 : *d) != Sword) /* Next char not a letter */
1446 if (((d - string2 <= (unsigned) size2)
1447 ? d - bf_p2 : d - bf_p1)
1453 if (((d - string2 <= (unsigned) size2)
1454 ? d - bf_p2 : d - bf_p1)
1460 if (((d - string2 <= (unsigned) size2)
1461 ? d - bf_p2 : d - bf_p1)
1474 if (SYNTAX (*d++) != (enum syntaxcode) mcnt) goto fail;
1479 goto matchnotsyntax;
1485 if (SYNTAX (*d++) == (enum syntaxcode) mcnt) goto fail;
1490 if (SYNTAX (*d++) == 0) goto fail;
1495 if (SYNTAX (*d++) != 0) goto fail;
1497 #endif /* not emacs */
1500 if (d == string1) /* Note, d cannot equal string2 */
1501 break; /* unless string1 == string2. */
1505 if (d == end2 || (d == end1 && size2 == 0))
1510 /* Match the next few pattern characters exactly.
1511 mcnt is how many characters to match. */
1518 if (translate[*d++] != *p++) goto fail;
1527 if (*d++ != *p++) goto fail;
1533 continue; /* Successfully matched one pattern command; keep matching */
1535 /* Jump here if any matching operation fails. */
1537 if (stackp != stackb)
1538 /* A restart point is known. Restart there and pop it. */
1541 { /* If innermost failure point is dormant, flush it and keep looking */
1547 if (d >= string1 && d <= end1)
1550 else break; /* Matching at this starting point really fails! */
1552 return -1; /* Failure to match */
1556 memcmp_translate (s1, s2, len, translate)
1557 unsigned char *s1, *s2;
1559 unsigned char *translate;
1561 register unsigned char *p1 = s1, *p2 = s2;
1564 if (translate [*p1++] != translate [*p2++]) return 1;
1570 /* Entry points compatible with bsd4.2 regex library */
1574 static struct re_pattern_buffer re_comp_buf;
1582 if (!re_comp_buf.buffer)
1583 return "No previous regular expression";
1587 if (!re_comp_buf.buffer)
1589 if (!(re_comp_buf.buffer = (char *) malloc (200)))
1590 return "Memory exhausted";
1591 re_comp_buf.allocated = 200;
1592 if (!(re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH)))
1593 return "Memory exhausted";
1595 return re_compile_pattern (s, strlen (s), &re_comp_buf);
1602 int len = strlen (s);
1603 return 0 <= re_search (&re_comp_buf, s, len, 0, len, 0);
1612 /* Indexed by a character, gives the upper case equivalent of the character */
1614 static char upcase[0400] =
1615 { 000, 001, 002, 003, 004, 005, 006, 007,
1616 010, 011, 012, 013, 014, 015, 016, 017,
1617 020, 021, 022, 023, 024, 025, 026, 027,
1618 030, 031, 032, 033, 034, 035, 036, 037,
1619 040, 041, 042, 043, 044, 045, 046, 047,
1620 050, 051, 052, 053, 054, 055, 056, 057,
1621 060, 061, 062, 063, 064, 065, 066, 067,
1622 070, 071, 072, 073, 074, 075, 076, 077,
1623 0100, 0101, 0102, 0103, 0104, 0105, 0106, 0107,
1624 0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117,
1625 0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127,
1626 0130, 0131, 0132, 0133, 0134, 0135, 0136, 0137,
1627 0140, 0101, 0102, 0103, 0104, 0105, 0106, 0107,
1628 0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117,
1629 0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127,
1630 0130, 0131, 0132, 0173, 0174, 0175, 0176, 0177,
1631 0200, 0201, 0202, 0203, 0204, 0205, 0206, 0207,
1632 0210, 0211, 0212, 0213, 0214, 0215, 0216, 0217,
1633 0220, 0221, 0222, 0223, 0224, 0225, 0226, 0227,
1634 0230, 0231, 0232, 0233, 0234, 0235, 0236, 0237,
1635 0240, 0241, 0242, 0243, 0244, 0245, 0246, 0247,
1636 0250, 0251, 0252, 0253, 0254, 0255, 0256, 0257,
1637 0260, 0261, 0262, 0263, 0264, 0265, 0266, 0267,
1638 0270, 0271, 0272, 0273, 0274, 0275, 0276, 0277,
1639 0300, 0301, 0302, 0303, 0304, 0305, 0306, 0307,
1640 0310, 0311, 0312, 0313, 0314, 0315, 0316, 0317,
1641 0320, 0321, 0322, 0323, 0324, 0325, 0326, 0327,
1642 0330, 0331, 0332, 0333, 0334, 0335, 0336, 0337,
1643 0340, 0341, 0342, 0343, 0344, 0345, 0346, 0347,
1644 0350, 0351, 0352, 0353, 0354, 0355, 0356, 0357,
1645 0360, 0361, 0362, 0363, 0364, 0365, 0366, 0367,
1646 0370, 0371, 0372, 0373, 0374, 0375, 0376, 0377
1654 struct re_pattern_buffer buf;
1657 char fastmap[(1 << BYTEWIDTH)];
1659 /* Allow a command argument to specify the style of syntax. */
1661 obscure_syntax = atoi (argv[1]);
1664 buf.buffer = (char *) malloc (buf.allocated);
1665 buf.fastmap = fastmap;
1666 buf.translate = upcase;
1674 re_compile_pattern (pat, strlen(pat), &buf);
1676 for (i = 0; i < buf.used; i++)
1677 printchar (buf.buffer[i]);
1679 putchar_unfiltered ('\n');
1681 printf_unfiltered ("%d allocated, %d used.\n", buf.allocated, buf.used);
1683 re_compile_fastmap (&buf);
1684 printf_unfiltered ("Allowed by fastmap: ");
1685 for (i = 0; i < (1 << BYTEWIDTH); i++)
1686 if (fastmap[i]) printchar (i);
1687 putchar_unfiltered ('\n');
1690 gets (pat); /* Now read the string to match against */
1692 i = re_match (&buf, pat, strlen (pat), 0, 0);
1693 printf_unfiltered ("Match value %d.\n", i);
1699 struct re_pattern_buffer *bufp;
1703 printf_unfiltered ("buf is :\n----------------\n");
1704 for (i = 0; i < bufp->used; i++)
1705 printchar (bufp->buffer[i]);
1707 printf_unfiltered ("\n%d allocated, %d used.\n", bufp->allocated, bufp->used);
1709 printf_unfiltered ("Allowed by fastmap: ");
1710 for (i = 0; i < (1 << BYTEWIDTH); i++)
1711 if (bufp->fastmap[i])
1713 printf_unfiltered ("\nAllowed by translate: ");
1714 if (bufp->translate)
1715 for (i = 0; i < (1 << BYTEWIDTH); i++)
1716 if (bufp->translate[i])
1718 printf_unfiltered ("\nfastmap is%s accurate\n", bufp->fastmap_accurate ? "" : "n't");
1719 printf_unfiltered ("can %s be null\n----------", bufp->can_be_null ? "" : "not");
1726 if (c < 041 || c >= 0177)
1728 putchar_unfiltered ('\\');
1729 putchar_unfiltered (((c >> 6) & 3) + '0');
1730 putchar_unfiltered (((c >> 3) & 7) + '0');
1731 putchar_unfiltered ((c & 7) + '0');
1734 putchar_unfiltered (c);
1740 puts_unfiltered (string);