1 /* Extended regular expression matching and search library.
2 Copyright (C) 1985, 1989 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
18 /* To test, compile with -Dtest.
19 This Dtestable feature turns this into a self-contained program
20 which reads a pattern, describes how it compiles,
21 then reads a string and searches for it. */
25 /* The `emacs' switch turns on certain special matching commands
26 that make sense only in emacs. */
37 #define bcopy(s,d,n) memcpy((d),(s),(n))
38 #define bcmp(s1,s2,n) memcmp((s1),(s2),(n))
39 #define bzero(s,n) memset((s),0,(n))
43 /* Make alloca work the best possible way. */
45 #define alloca __builtin_alloca
53 * Define the syntax stuff, so we can do the \<...\> things.
56 #ifndef Sword /* must be non-zero in some of the tests below... */
60 #define SYNTAX(c) re_syntax_table[c]
64 char *re_syntax_table;
68 static char re_syntax_table[256];
79 bzero (re_syntax_table, sizeof re_syntax_table);
81 for (c = 'a'; c <= 'z'; c++)
82 re_syntax_table[c] = Sword;
84 for (c = 'A'; c <= 'Z'; c++)
85 re_syntax_table[c] = Sword;
87 for (c = '0'; c <= '9'; c++)
88 re_syntax_table[c] = Sword;
93 #endif /* SYNTAX_TABLE */
94 #endif /* not emacs */
98 /* Number of failure points to allocate space for initially,
99 when matching. If this number is exceeded, more space is allocated,
100 so it is not a hard limit. */
104 #endif /* NFAILURES */
106 /* width of a byte in bits */
110 #ifndef SIGN_EXTEND_CHAR
111 #define SIGN_EXTEND_CHAR(x) (x)
114 static int obscure_syntax = 0;
116 /* Specify the precise syntax of regexp for compilation.
117 This provides for compatibility for various utilities
118 which historically have different, incompatible syntaxes.
120 The argument SYNTAX is a bit-mask containing the two bits
121 RE_NO_BK_PARENS and RE_NO_BK_VBAR. */
124 re_set_syntax (syntax)
129 ret = obscure_syntax;
130 obscure_syntax = syntax;
134 /* re_compile_pattern takes a regular-expression string
135 and converts it into a buffer full of byte commands for matching.
137 PATTERN is the address of the pattern string
138 SIZE is the length of it.
139 BUFP is a struct re_pattern_buffer * which points to the info
140 on where to store the byte commands.
141 This structure contains a char * which points to the
142 actual space, which should have been obtained with malloc.
143 re_compile_pattern may use realloc to grow the buffer space.
145 The number of bytes of commands can be found out by looking in
146 the struct re_pattern_buffer that bufp pointed to,
147 after re_compile_pattern returns.
150 #define PATPUSH(ch) (*b++ = (char) (ch))
152 #define PATFETCH(c) \
153 {if (p == pend) goto end_of_pattern; \
154 c = * (unsigned char *) p++; \
155 if (translate) c = translate[c]; }
157 #define PATFETCH_RAW(c) \
158 {if (p == pend) goto end_of_pattern; \
159 c = * (unsigned char *) p++; }
161 #define PATUNFETCH p--
163 #define EXTEND_BUFFER \
164 { char *old_buffer = bufp->buffer; \
165 if (bufp->allocated == (1<<16)) goto too_big; \
166 bufp->allocated *= 2; \
167 if (bufp->allocated > (1<<16)) bufp->allocated = (1<<16); \
168 if (!(bufp->buffer = (char *) realloc (bufp->buffer, bufp->allocated))) \
169 goto memory_exhausted; \
170 c = bufp->buffer - old_buffer; \
178 pending_exact += c; \
181 static void store_jump (), insert_jump ();
184 re_compile_pattern (pattern, size, bufp)
187 struct re_pattern_buffer *bufp;
189 register char *b = bufp->buffer;
190 register char *p = pattern;
191 char *pend = pattern + size;
192 register unsigned c, c1;
194 unsigned char *translate = (unsigned char *) bufp->translate;
196 /* address of the count-byte of the most recently inserted "exactn" command.
197 This makes it possible to tell whether a new exact-match character
198 can be added to that command or requires a new "exactn" command. */
200 char *pending_exact = 0;
202 /* address of the place where a forward-jump should go
203 to the end of the containing expression.
204 Each alternative of an "or", except the last, ends with a forward-jump
207 char *fixup_jump = 0;
209 /* address of start of the most recently finished expression.
210 This tells postfix * where to find the start of its operand. */
214 /* In processing a repeat, 1 means zero matches is allowed */
218 /* In processing a repeat, 1 means many matches is allowed */
222 /* address of beginning of regexp, or inside of last \( */
226 /* Stack of information saved by \( and restored by \).
227 Four stack elements are pushed by each \(:
228 First, the value of b.
229 Second, the value of fixup_jump.
230 Third, the value of regnum.
231 Fourth, the value of begalt. */
234 int *stackp = stackb;
235 int *stacke = stackb + 40;
238 /* Counts \('s as they are encountered. Remembered for the matching \),
239 where it becomes the "register number" to put in the stop_memory command */
243 bufp->fastmap_accurate = 0;
248 * Initialize the syntax table.
254 if (bufp->allocated == 0)
256 bufp->allocated = 28;
258 /* EXTEND_BUFFER loses when bufp->allocated is 0 */
259 bufp->buffer = (char *) realloc (bufp->buffer, 28);
261 /* Caller did not allocate a buffer. Do it for him */
262 bufp->buffer = (char *) malloc (28);
263 if (!bufp->buffer) goto memory_exhausted;
264 begalt = b = bufp->buffer;
269 if (b - bufp->buffer > bufp->allocated - 10)
270 /* Note that EXTEND_BUFFER clobbers c */
278 if (obscure_syntax & RE_TIGHT_VBAR)
280 if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS) && p != pend)
282 /* Make operand of last vbar end before this `$'. */
284 store_jump (fixup_jump, jump, b);
290 /* $ means succeed if at end of line, but only in special contexts.
291 If randomly in the middle of a pattern, it is a normal character. */
292 if (p == pend || *p == '\n'
293 || (obscure_syntax & RE_CONTEXT_INDEP_OPS)
294 || (obscure_syntax & RE_NO_BK_PARENS
296 : *p == '\\' && p[1] == ')')
297 || (obscure_syntax & RE_NO_BK_VBAR
299 : *p == '\\' && p[1] == '|'))
307 /* ^ means succeed if at beg of line, but only if no preceding pattern. */
309 if (laststart && p[-2] != '\n'
310 && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
312 if (obscure_syntax & RE_TIGHT_VBAR)
315 && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
326 if (obscure_syntax & RE_BK_PLUS_QM)
330 /* If there is no previous pattern, char not special. */
331 if (!laststart && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
333 /* If there is a sequence of repetition chars,
334 collapse it down to equivalent to just one. */
339 zero_times_ok |= c != '+';
340 many_times_ok |= c != '?';
346 else if (!(obscure_syntax & RE_BK_PLUS_QM)
347 && (c == '+' || c == '?'))
349 else if ((obscure_syntax & RE_BK_PLUS_QM)
354 if (!(c1 == '+' || c1 == '?'))
369 /* Star, etc. applied to an empty pattern is equivalent
370 to an empty pattern. */
374 /* Now we know whether 0 matches is allowed,
375 and whether 2 or more matches is allowed. */
378 /* If more than one repetition is allowed,
379 put in a backward jump at the end. */
380 store_jump (b, maybe_finalize_jump, laststart - 3);
383 insert_jump (on_failure_jump, laststart, b + 3, b);
388 /* At least one repetition required: insert before the loop
389 a skip over the initial on-failure-jump instruction */
390 insert_jump (dummy_failure_jump, laststart, laststart + 6, b);
401 while (b - bufp->buffer
402 > bufp->allocated - 3 - (1 << BYTEWIDTH) / BYTEWIDTH)
403 /* Note that EXTEND_BUFFER clobbers c */
408 PATPUSH (charset_not), p++;
413 PATPUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
414 /* Clear the whole map */
415 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
416 /* Read in characters and ranges, setting map bits */
420 if (c == ']' && p != p1 + 1) break;
421 if (*p == '-' && p[1] != ']')
426 b[c / BYTEWIDTH] |= 1 << (c % BYTEWIDTH), c++;
430 b[c / BYTEWIDTH] |= 1 << (c % BYTEWIDTH);
433 /* Discard any bitmap bytes that are all 0 at the end of the map.
434 Decrement the map-length byte too. */
435 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
441 if (! (obscure_syntax & RE_NO_BK_PARENS))
447 if (! (obscure_syntax & RE_NO_BK_PARENS))
453 if (! (obscure_syntax & RE_NEWLINE_OR))
459 if (! (obscure_syntax & RE_NO_BK_VBAR))
465 if (p == pend) goto invalid_pattern;
470 if (obscure_syntax & RE_NO_BK_PARENS)
473 if (stackp == stacke) goto nesting_too_deep;
474 if (regnum < RE_NREGS)
476 PATPUSH (start_memory);
479 *stackp++ = b - bufp->buffer;
480 *stackp++ = fixup_jump ? fixup_jump - bufp->buffer + 1 : 0;
481 *stackp++ = regnum++;
482 *stackp++ = begalt - bufp->buffer;
489 if (obscure_syntax & RE_NO_BK_PARENS)
492 if (stackp == stackb) goto unmatched_close;
493 begalt = *--stackp + bufp->buffer;
495 store_jump (fixup_jump, jump, b);
496 if (stackp[-1] < RE_NREGS)
498 PATPUSH (stop_memory);
499 PATPUSH (stackp[-1]);
504 fixup_jump = *stackp + bufp->buffer - 1;
505 laststart = *--stackp + bufp->buffer;
509 if (obscure_syntax & RE_NO_BK_VBAR)
512 insert_jump (on_failure_jump, begalt, b + 6, b);
516 store_jump (fixup_jump, jump, b);
530 PATPUSH (syntaxspec);
532 PATPUSH (syntax_spec_code[c]);
537 PATPUSH (notsyntaxspec);
539 PATPUSH (syntax_spec_code[c]);
550 PATPUSH (notwordchar);
566 PATPUSH (notwordbound);
589 for (stackt = stackp - 2; stackt > stackb; stackt -= 4)
599 if (obscure_syntax & RE_BK_PLUS_QM)
604 /* You might think it would be useful for \ to mean
605 not to translate; but if we don't translate it
606 it will never match anything. */
607 if (translate) c = translate[c];
614 if (!pending_exact || pending_exact + *pending_exact + 1 != b
615 || *pending_exact == 0177 || *p == '*' || *p == '^'
616 || ((obscure_syntax & RE_BK_PLUS_QM)
617 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
618 : (*p == '+' || *p == '?')))
631 store_jump (fixup_jump, jump, b);
633 if (stackp != stackb) goto unmatched_open;
635 bufp->used = b - bufp->buffer;
639 return "Invalid regular expression";
642 return "Unmatched \\(";
645 return "Unmatched \\)";
648 return "Premature end of regular expression";
651 return "Nesting too deep";
654 return "Regular expression too big";
657 return "Memory exhausted";
660 /* Store where `from' points a jump operation to jump to where `to' points.
661 `opcode' is the opcode to store. */
664 store_jump (from, opcode, to)
669 from[1] = (to - (from + 3)) & 0377;
670 from[2] = (to - (from + 3)) >> 8;
673 /* Open up space at char FROM, and insert there a jump to TO.
674 CURRENT_END gives te end of the storage no in use,
675 so we know how much data to copy up.
676 OP is the opcode of the jump to insert.
678 If you call this function, you must zero out pending_exact. */
681 insert_jump (op, from, to, current_end)
683 char *from, *to, *current_end;
685 register char *pto = current_end + 3;
686 register char *pfrom = current_end;
687 while (pfrom != from)
689 store_jump (from, op, to);
692 /* Given a pattern, compute a fastmap from it.
693 The fastmap records which of the (1 << BYTEWIDTH) possible characters
694 can start a string that matches the pattern.
695 This fastmap is used by re_search to skip quickly over totally implausible text.
697 The caller must supply the address of a (1 << BYTEWIDTH)-byte data area
699 The other components of bufp describe the pattern to be used. */
702 re_compile_fastmap (bufp)
703 struct re_pattern_buffer *bufp;
705 unsigned char *pattern = (unsigned char *) bufp->buffer;
706 int size = bufp->used;
707 register char *fastmap = bufp->fastmap;
708 register unsigned char *p = pattern;
709 register unsigned char *pend = pattern + size;
711 unsigned char *translate = (unsigned char *) bufp->translate;
713 unsigned char *stackb[NFAILURES];
714 unsigned char **stackp = stackb;
716 bzero (fastmap, (1 << BYTEWIDTH));
717 bufp->fastmap_accurate = 1;
718 bufp->can_be_null = 0;
724 bufp->can_be_null = 1;
727 #ifdef SWITCH_ENUM_BUG
728 switch ((int) ((enum regexpcode) *p++))
730 switch ((enum regexpcode) *p++)
735 fastmap[translate[p[1]]] = 1;
754 fastmap[translate['\n']] = 1;
757 if (bufp->can_be_null != 1)
758 bufp->can_be_null = 2;
762 case maybe_finalize_jump:
764 case dummy_failure_jump:
765 bufp->can_be_null = 1;
767 j += SIGN_EXTEND_CHAR (*(char *)p) << 8;
768 p += j + 1; /* The 1 compensates for missing ++ above */
771 /* Jump backward reached implies we just went through
772 the body of a loop and matched nothing.
773 Opcode jumped to should be an on_failure_jump.
774 Just treat it like an ordinary jump.
775 For a * loop, it has pushed its failure point already;
776 if so, discard that as redundant. */
777 if ((enum regexpcode) *p != on_failure_jump)
781 j += SIGN_EXTEND_CHAR (*(char *)p) << 8;
782 p += j + 1; /* The 1 compensates for missing ++ above */
783 if (stackp != stackb && *stackp == p)
787 case on_failure_jump:
789 j += SIGN_EXTEND_CHAR (*(char *)p) << 8;
800 bufp->can_be_null = 1;
803 for (j = 0; j < (1 << BYTEWIDTH); j++)
806 if (bufp->can_be_null)
808 /* Don't return; check the alternative paths
809 so we can set can_be_null if appropriate. */
813 for (j = 0; j < (1 << BYTEWIDTH); j++)
814 if (SYNTAX (j) == Sword)
819 for (j = 0; j < (1 << BYTEWIDTH); j++)
820 if (SYNTAX (j) != Sword)
827 for (j = 0; j < (1 << BYTEWIDTH); j++)
828 if (SYNTAX (j) == (enum syntaxcode) k)
834 for (j = 0; j < (1 << BYTEWIDTH); j++)
835 if (SYNTAX (j) != (enum syntaxcode) k)
841 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
842 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
845 fastmap[translate[j]] = 1;
852 /* Chars beyond end of map must be allowed */
853 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
855 fastmap[translate[j]] = 1;
859 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
860 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
863 fastmap[translate[j]] = 1;
870 /* Get here means we have successfully found the possible starting characters
871 of one path of the pattern. We need not follow this path any farther.
872 Instead, look at the next alternative remembered in the stack. */
873 if (stackp != stackb)
880 /* Like re_search_2, below, but only one string is specified. */
883 re_search (pbufp, string, size, startpos, range, regs)
884 struct re_pattern_buffer *pbufp;
886 int size, startpos, range;
887 struct re_registers *regs;
889 return re_search_2 (pbufp, 0, 0, string, size, startpos, range, regs, size);
892 /* Like re_match_2 but tries first a match starting at index STARTPOS,
893 then at STARTPOS + 1, and so on.
894 RANGE is the number of places to try before giving up.
895 If RANGE is negative, the starting positions tried are
896 STARTPOS, STARTPOS - 1, etc.
897 It is up to the caller to make sure that range is not so large
898 as to take the starting position outside of the input strings.
900 The value returned is the position at which the match was found,
901 or -1 if no match was found,
902 or -2 if error (such as failure stack overflow). */
905 re_search_2 (pbufp, string1, size1, string2, size2, startpos, range, regs, mstop)
906 struct re_pattern_buffer *pbufp;
907 char *string1, *string2;
911 struct re_registers *regs;
914 register char *fastmap = pbufp->fastmap;
915 register unsigned char *translate = (unsigned char *) pbufp->translate;
916 int total = size1 + size2;
919 /* Update the fastmap now if not correct already */
920 if (fastmap && !pbufp->fastmap_accurate)
921 re_compile_fastmap (pbufp);
923 /* Don't waste time in a long search for a pattern
924 that says it is anchored. */
925 if (pbufp->used > 0 && (enum regexpcode) pbufp->buffer[0] == begbuf
936 /* If a fastmap is supplied, skip quickly over characters
937 that cannot possibly be the start of a match.
938 Note, however, that if the pattern can possibly match
939 the null string, we must test it at each starting point
940 so that we take the first null string we get. */
942 if (fastmap && startpos < total && pbufp->can_be_null != 1)
946 register int lim = 0;
947 register unsigned char *p;
949 if (startpos < size1 && startpos + range >= size1)
950 lim = range - (size1 - startpos);
952 p = ((unsigned char *)
953 &(startpos >= size1 ? string2 - size1 : string1)[startpos]);
957 while (range > lim && !fastmap[translate[*p++]])
962 while (range > lim && !fastmap[*p++])
965 startpos += irange - range;
969 register unsigned char c;
970 if (startpos >= size1)
971 c = string2[startpos - size1];
973 c = string1[startpos];
975 if (translate ? !fastmap[translate[c]] : !fastmap[c])
980 if (range >= 0 && startpos == total
981 && fastmap && pbufp->can_be_null == 0)
984 val = re_match_2 (pbufp, string1, size1, string2, size2, startpos, regs, mstop);
994 #endif /* C_ALLOCA */
998 if (range > 0) range--, startpos++; else range++, startpos--;
1003 #ifndef emacs /* emacs never uses this */
1005 re_match (pbufp, string, size, pos, regs)
1006 struct re_pattern_buffer *pbufp;
1009 struct re_registers *regs;
1011 return re_match_2 (pbufp, 0, 0, string, size, pos, regs, size);
1015 /* Maximum size of failure stack. Beyond this, overflow is an error. */
1017 int re_max_failures = 2000;
1019 static int bcmp_translate();
1020 /* Match the pattern described by PBUFP
1021 against data which is the virtual concatenation of STRING1 and STRING2.
1022 SIZE1 and SIZE2 are the sizes of the two data strings.
1023 Start the match at position POS.
1024 Do not consider matching past the position MSTOP.
1026 If pbufp->fastmap is nonzero, then it had better be up to date.
1028 The reason that the data to match are specified as two components
1029 which are to be regarded as concatenated
1030 is so this function can be used directly on the contents of an Emacs buffer.
1032 -1 is returned if there is no match. -2 is returned if there is
1033 an error (such as match stack overflow). Otherwise the value is the length
1034 of the substring which was matched. */
1037 re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
1038 struct re_pattern_buffer *pbufp;
1039 unsigned char *string1, *string2;
1042 struct re_registers *regs;
1045 register unsigned char *p = (unsigned char *) pbufp->buffer;
1046 register unsigned char *pend = p + pbufp->used;
1047 /* End of first string */
1048 unsigned char *end1;
1049 /* End of second string */
1050 unsigned char *end2;
1051 /* Pointer just past last char to consider matching */
1052 unsigned char *end_match_1, *end_match_2;
1053 register unsigned char *d, *dend;
1055 unsigned char *translate = (unsigned char *) pbufp->translate;
1057 /* Failure point stack. Each place that can handle a failure further down the line
1058 pushes a failure point on this stack. It consists of two char *'s.
1059 The first one pushed is where to resume scanning the pattern;
1060 the second pushed is where to resume scanning the strings.
1061 If the latter is zero, the failure point is a "dummy".
1062 If a failure happens and the innermost failure point is dormant,
1063 it discards that failure point and tries the next one. */
1065 unsigned char *initial_stack[2 * NFAILURES];
1066 unsigned char **stackb = initial_stack;
1067 unsigned char **stackp = stackb, **stacke = &stackb[2 * NFAILURES];
1069 /* Information on the "contents" of registers.
1070 These are pointers into the input strings; they record
1071 just what was matched (on this attempt) by some part of the pattern.
1072 The start_memory command stores the start of a register's contents
1073 and the stop_memory command stores the end.
1075 At that point, regstart[regnum] points to the first character in the register,
1076 regend[regnum] points to the first character beyond the end of the register,
1077 regstart_seg1[regnum] is true iff regstart[regnum] points into string1,
1078 and regend_seg1[regnum] is true iff regend[regnum] points into string1. */
1080 unsigned char *regstart[RE_NREGS];
1081 unsigned char *regend[RE_NREGS];
1082 unsigned char regstart_seg1[RE_NREGS], regend_seg1[RE_NREGS];
1084 /* Set up pointers to ends of strings.
1085 Don't allow the second string to be empty unless both are empty. */
1093 end1 = string1 + size1;
1094 end2 = string2 + size2;
1096 /* Compute where to stop matching, within the two strings */
1099 end_match_1 = string1 + mstop;
1100 end_match_2 = string2;
1105 end_match_2 = string2 + mstop - size1;
1108 /* Initialize \) text positions to -1
1109 to mark ones that no \( or \) has been seen for. */
1111 for (mcnt = 0; mcnt < sizeof (regend) / sizeof (*regend); mcnt++)
1112 regend[mcnt] = (unsigned char *) -1;
1114 /* `p' scans through the pattern as `d' scans through the data.
1115 `dend' is the end of the input string that `d' points within.
1116 `d' is advanced into the following input string whenever necessary,
1117 but this happens before fetching;
1118 therefore, at the beginning of the loop,
1119 `d' can be pointing at the end of a string,
1120 but it cannot equal string2. */
1123 d = string1 + pos, dend = end_match_1;
1125 d = string2 + pos - size1, dend = end_match_2;
1127 /* Write PREFETCH; just before fetching a character with *d. */
1130 { if (dend == end_match_2) goto fail; /* end of string2 => failure */ \
1131 d = string2; /* end of string1 => advance to string2. */ \
1132 dend = end_match_2; }
1134 /* This loop loops over pattern commands.
1135 It exits by returning from the function if match is complete,
1136 or it drops through if match fails at this starting point in the input data. */
1141 /* End of pattern means we have succeeded! */
1143 /* If caller wants register contents data back, convert it to indices */
1146 regs->start[0] = pos;
1147 if (dend == end_match_1)
1148 regs->end[0] = d - string1;
1150 regs->end[0] = d - string2 + size1;
1151 for (mcnt = 1; mcnt < RE_NREGS; mcnt++)
1153 if (regend[mcnt] == (unsigned char *) -1)
1155 regs->start[mcnt] = -1;
1156 regs->end[mcnt] = -1;
1159 if (regstart_seg1[mcnt])
1160 regs->start[mcnt] = regstart[mcnt] - string1;
1162 regs->start[mcnt] = regstart[mcnt] - string2 + size1;
1163 if (regend_seg1[mcnt])
1164 regs->end[mcnt] = regend[mcnt] - string1;
1166 regs->end[mcnt] = regend[mcnt] - string2 + size1;
1169 if (dend == end_match_1)
1170 return (d - string1 - pos);
1172 return d - string2 + size1 - pos;
1175 /* Otherwise match next pattern command */
1176 #ifdef SWITCH_ENUM_BUG
1177 switch ((int) ((enum regexpcode) *p++))
1179 switch ((enum regexpcode) *p++)
1183 /* \( is represented by a start_memory, \) by a stop_memory.
1184 Both of those commands contain a "register number" argument.
1185 The text matched within the \( and \) is recorded under that number.
1186 Then, \<digit> turns into a `duplicate' command which
1187 is followed by the numeric value of <digit> as the register number. */
1191 regstart_seg1[*p++] = (dend == end_match_1);
1196 regend_seg1[*p++] = (dend == end_match_1);
1201 int regno = *p++; /* Get which register to match against */
1202 register unsigned char *d2, *dend2;
1204 d2 = regstart[regno];
1205 dend2 = ((regstart_seg1[regno] == regend_seg1[regno])
1206 ? regend[regno] : end_match_1);
1209 /* Advance to next segment in register contents, if necessary */
1212 if (dend2 == end_match_2) break;
1213 if (dend2 == regend[regno]) break;
1214 d2 = string2, dend2 = regend[regno]; /* end of string1 => advance to string2. */
1216 /* At end of register contents => success */
1217 if (d2 == dend2) break;
1219 /* Advance to next segment in data being matched, if necessary */
1222 /* mcnt gets # consecutive chars to compare */
1224 if (mcnt > dend2 - d2)
1226 /* Compare that many; failure if mismatch, else skip them. */
1227 if (translate ? bcmp_translate (d, d2, mcnt, translate) : bcmp (d, d2, mcnt))
1229 d += mcnt, d2 += mcnt;
1235 /* fetch a data character */
1237 /* Match anything but a newline. */
1238 if ((translate ? translate[*d++] : *d++) == '\n')
1245 /* Nonzero for charset_not */
1248 if (*(p - 1) == (unsigned char) charset_not)
1251 /* fetch a data character */
1259 if (c < *p * BYTEWIDTH
1260 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
1265 if (!not) goto fail;
1271 if (d == string1 || d[-1] == '\n')
1277 || (d == end1 ? (size2 == 0 || *string2 == '\n') : *d == '\n'))
1281 /* "or" constructs ("|") are handled by starting each alternative
1282 with an on_failure_jump that points to the start of the next alternative.
1283 Each alternative except the last ends with a jump to the joining point.
1284 (Actually, each jump except for the last one really jumps
1285 to the following jump, because tensioning the jumps is a hassle.) */
1287 /* The start of a stupid repeat has an on_failure_jump that points
1288 past the end of the repeat text.
1289 This makes a failure point so that, on failure to match a repetition,
1290 matching restarts past as many repetitions have been found
1291 with no way to fail and look for another one. */
1293 /* A smart repeat is similar but loops back to the on_failure_jump
1294 so that each repetition makes another failure point. */
1296 case on_failure_jump:
1297 if (stackp == stacke)
1299 unsigned char **stackx;
1300 if (stacke - stackb > re_max_failures * 2)
1302 stackx = (unsigned char **) alloca (2 * (stacke - stackb)
1304 bcopy (stackb, stackx, (stacke - stackb) * sizeof (char *));
1305 stackp = stackx + (stackp - stackb);
1306 stacke = stackx + 2 * (stacke - stackb);
1310 mcnt += SIGN_EXTEND_CHAR (*(char *)p) << 8;
1312 *stackp++ = mcnt + p;
1316 /* The end of a smart repeat has an maybe_finalize_jump back.
1317 Change it either to a finalize_jump or an ordinary jump. */
1319 case maybe_finalize_jump:
1321 mcnt += SIGN_EXTEND_CHAR (*(char *)p) << 8;
1324 register unsigned char *p2 = p;
1325 /* Compare what follows with the begining of the repeat.
1326 If we can establish that there is nothing that they would
1327 both match, we can change to finalize_jump */
1329 && (*p2 == (unsigned char) stop_memory
1330 || *p2 == (unsigned char) start_memory))
1333 p[-3] = (unsigned char) finalize_jump;
1334 else if (*p2 == (unsigned char) exactn
1335 || *p2 == (unsigned char) endline)
1337 register int c = *p2 == (unsigned char) endline ? '\n' : p2[2];
1338 register unsigned char *p1 = p + mcnt;
1339 /* p1[0] ... p1[2] are an on_failure_jump.
1340 Examine what follows that */
1341 if (p1[3] == (unsigned char) exactn && p1[5] != c)
1342 p[-3] = (unsigned char) finalize_jump;
1343 else if (p1[3] == (unsigned char) charset
1344 || p1[3] == (unsigned char) charset_not)
1346 int not = p1[3] == (unsigned char) charset_not;
1347 if (c < p1[4] * BYTEWIDTH
1348 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
1350 /* not is 1 if c would match */
1351 /* That means it is not safe to finalize */
1353 p[-3] = (unsigned char) finalize_jump;
1358 if (p[-1] != (unsigned char) finalize_jump)
1360 p[-1] = (unsigned char) jump;
1364 /* The end of a stupid repeat has a finalize-jump
1365 back to the start, where another failure point will be made
1366 which will point after all the repetitions found so far. */
1374 mcnt += SIGN_EXTEND_CHAR (*(char *)p) << 8;
1375 p += mcnt + 1; /* The 1 compensates for missing ++ above */
1378 case dummy_failure_jump:
1379 if (stackp == stacke)
1381 unsigned char **stackx
1382 = (unsigned char **) alloca (2 * (stacke - stackb)
1384 bcopy (stackb, stackx, (stacke - stackb) * sizeof (char *));
1385 stackp = stackx + (stackp - stackb);
1386 stacke = stackx + 2 * (stacke - stackb);
1394 if (d == string1 /* Points to first char */
1395 || d == end2 /* Points to end */
1396 || (d == end1 && size2 == 0)) /* Points to end */
1398 if ((SYNTAX (d[-1]) == Sword)
1399 != (SYNTAX (d == end1 ? *string2 : *d) == Sword))
1404 if (d == string1 /* Points to first char */
1405 || d == end2 /* Points to end */
1406 || (d == end1 && size2 == 0)) /* Points to end */
1408 if ((SYNTAX (d[-1]) == Sword)
1409 != (SYNTAX (d == end1 ? *string2 : *d) == Sword))
1414 if (d == end2 /* Points to end */
1415 || (d == end1 && size2 == 0) /* Points to end */
1416 || SYNTAX (* (d == end1 ? string2 : d)) != Sword) /* Next char not a letter */
1418 if (d == string1 /* Points to first char */
1419 || SYNTAX (d[-1]) != Sword) /* prev char not letter */
1424 if (d == string1 /* Points to first char */
1425 || SYNTAX (d[-1]) != Sword) /* prev char not letter */
1427 if (d == end2 /* Points to end */
1428 || (d == end1 && size2 == 0) /* Points to end */
1429 || SYNTAX (d == end1 ? *string2 : *d) != Sword) /* Next char not a letter */
1435 if (((d - string2 <= (unsigned) size2)
1436 ? d - bf_p2 : d - bf_p1)
1442 if (((d - string2 <= (unsigned) size2)
1443 ? d - bf_p2 : d - bf_p1)
1449 if (((d - string2 <= (unsigned) size2)
1450 ? d - bf_p2 : d - bf_p1)
1463 if (SYNTAX (*d++) != (enum syntaxcode) mcnt) goto fail;
1468 goto matchnotsyntax;
1474 if (SYNTAX (*d++) == (enum syntaxcode) mcnt) goto fail;
1479 if (SYNTAX (*d++) == 0) goto fail;
1484 if (SYNTAX (*d++) != 0) goto fail;
1486 #endif /* not emacs */
1489 if (d == string1) /* Note, d cannot equal string2 */
1490 break; /* unless string1 == string2. */
1494 if (d == end2 || (d == end1 && size2 == 0))
1499 /* Match the next few pattern characters exactly.
1500 mcnt is how many characters to match. */
1507 if (translate[*d++] != *p++) goto fail;
1516 if (*d++ != *p++) goto fail;
1522 continue; /* Successfully matched one pattern command; keep matching */
1524 /* Jump here if any matching operation fails. */
1526 if (stackp != stackb)
1527 /* A restart point is known. Restart there and pop it. */
1530 { /* If innermost failure point is dormant, flush it and keep looking */
1536 if (d >= string1 && d <= end1)
1539 else break; /* Matching at this starting point really fails! */
1541 return -1; /* Failure to match */
1545 bcmp_translate (s1, s2, len, translate)
1546 unsigned char *s1, *s2;
1548 unsigned char *translate;
1550 register unsigned char *p1 = s1, *p2 = s2;
1553 if (translate [*p1++] != translate [*p2++]) return 1;
1559 /* Entry points compatible with bsd4.2 regex library */
1563 static struct re_pattern_buffer re_comp_buf;
1571 if (!re_comp_buf.buffer)
1572 return "No previous regular expression";
1576 if (!re_comp_buf.buffer)
1578 if (!(re_comp_buf.buffer = (char *) malloc (200)))
1579 return "Memory exhausted";
1580 re_comp_buf.allocated = 200;
1581 if (!(re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH)))
1582 return "Memory exhausted";
1584 return re_compile_pattern (s, strlen (s), &re_comp_buf);
1591 int len = strlen (s);
1592 return 0 <= re_search (&re_comp_buf, s, len, 0, len, 0);
1601 /* Indexed by a character, gives the upper case equivalent of the character */
1603 static char upcase[0400] =
1604 { 000, 001, 002, 003, 004, 005, 006, 007,
1605 010, 011, 012, 013, 014, 015, 016, 017,
1606 020, 021, 022, 023, 024, 025, 026, 027,
1607 030, 031, 032, 033, 034, 035, 036, 037,
1608 040, 041, 042, 043, 044, 045, 046, 047,
1609 050, 051, 052, 053, 054, 055, 056, 057,
1610 060, 061, 062, 063, 064, 065, 066, 067,
1611 070, 071, 072, 073, 074, 075, 076, 077,
1612 0100, 0101, 0102, 0103, 0104, 0105, 0106, 0107,
1613 0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117,
1614 0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127,
1615 0130, 0131, 0132, 0133, 0134, 0135, 0136, 0137,
1616 0140, 0101, 0102, 0103, 0104, 0105, 0106, 0107,
1617 0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117,
1618 0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127,
1619 0130, 0131, 0132, 0173, 0174, 0175, 0176, 0177,
1620 0200, 0201, 0202, 0203, 0204, 0205, 0206, 0207,
1621 0210, 0211, 0212, 0213, 0214, 0215, 0216, 0217,
1622 0220, 0221, 0222, 0223, 0224, 0225, 0226, 0227,
1623 0230, 0231, 0232, 0233, 0234, 0235, 0236, 0237,
1624 0240, 0241, 0242, 0243, 0244, 0245, 0246, 0247,
1625 0250, 0251, 0252, 0253, 0254, 0255, 0256, 0257,
1626 0260, 0261, 0262, 0263, 0264, 0265, 0266, 0267,
1627 0270, 0271, 0272, 0273, 0274, 0275, 0276, 0277,
1628 0300, 0301, 0302, 0303, 0304, 0305, 0306, 0307,
1629 0310, 0311, 0312, 0313, 0314, 0315, 0316, 0317,
1630 0320, 0321, 0322, 0323, 0324, 0325, 0326, 0327,
1631 0330, 0331, 0332, 0333, 0334, 0335, 0336, 0337,
1632 0340, 0341, 0342, 0343, 0344, 0345, 0346, 0347,
1633 0350, 0351, 0352, 0353, 0354, 0355, 0356, 0357,
1634 0360, 0361, 0362, 0363, 0364, 0365, 0366, 0367,
1635 0370, 0371, 0372, 0373, 0374, 0375, 0376, 0377
1643 struct re_pattern_buffer buf;
1646 char fastmap[(1 << BYTEWIDTH)];
1648 /* Allow a command argument to specify the style of syntax. */
1650 obscure_syntax = atoi (argv[1]);
1653 buf.buffer = (char *) malloc (buf.allocated);
1654 buf.fastmap = fastmap;
1655 buf.translate = upcase;
1663 re_compile_pattern (pat, strlen(pat), &buf);
1665 for (i = 0; i < buf.used; i++)
1666 printchar (buf.buffer[i]);
1670 printf ("%d allocated, %d used.\n", buf.allocated, buf.used);
1672 re_compile_fastmap (&buf);
1673 printf ("Allowed by fastmap: ");
1674 for (i = 0; i < (1 << BYTEWIDTH); i++)
1675 if (fastmap[i]) printchar (i);
1679 gets (pat); /* Now read the string to match against */
1681 i = re_match (&buf, pat, strlen (pat), 0, 0);
1682 printf ("Match value %d.\n", i);
1688 struct re_pattern_buffer *bufp;
1692 printf ("buf is :\n----------------\n");
1693 for (i = 0; i < bufp->used; i++)
1694 printchar (bufp->buffer[i]);
1696 printf ("\n%d allocated, %d used.\n", bufp->allocated, bufp->used);
1698 printf ("Allowed by fastmap: ");
1699 for (i = 0; i < (1 << BYTEWIDTH); i++)
1700 if (bufp->fastmap[i])
1702 printf ("\nAllowed by translate: ");
1703 if (bufp->translate)
1704 for (i = 0; i < (1 << BYTEWIDTH); i++)
1705 if (bufp->translate[i])
1707 printf ("\nfastmap is%s accurate\n", bufp->fastmap_accurate ? "" : "n't");
1708 printf ("can %s be null\n----------", bufp->can_be_null ? "" : "not");
1715 if (c < 041 || c >= 0177)
1718 putchar (((c >> 6) & 3) + '0');
1719 putchar (((c >> 3) & 7) + '0');
1720 putchar ((c & 7) + '0');