1 /* Extended regular expression matching and search library.
2 Copyright (C) 1985, 1989 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
18 /* To test, compile with -Dtest.
19 This Dtestable feature turns this into a self-contained program
20 which reads a pattern, describes how it compiles,
21 then reads a string and searches for it. */
25 /* The `emacs' switch turns on certain special matching commands
26 that make sense only in emacs. */
37 #define bcopy(s,d,n) memcpy((d),(s),(n))
38 #define bcmp(s1,s2,n) memcmp((s1),(s2),(n))
39 #define bzero(s,n) memset((s),0,(n))
43 /* Make alloca work the best possible way. */
45 #define alloca __builtin_alloca
53 * Define the syntax stuff, so we can do the \<...\> things.
56 #ifndef Sword /* must be non-zero in some of the tests below... */
60 #define SYNTAX(c) re_syntax_table[c]
64 char *re_syntax_table;
68 static char re_syntax_table[256];
79 bzero (re_syntax_table, sizeof re_syntax_table);
81 for (c = 'a'; c <= 'z'; c++)
82 re_syntax_table[c] = Sword;
84 for (c = 'A'; c <= 'Z'; c++)
85 re_syntax_table[c] = Sword;
87 for (c = '0'; c <= '9'; c++)
88 re_syntax_table[c] = Sword;
93 #endif /* SYNTAX_TABLE */
94 #endif /* not emacs */
98 /* Number of failure points to allocate space for initially,
99 when matching. If this number is exceeded, more space is allocated,
100 so it is not a hard limit. */
104 #endif /* NFAILURES */
106 /* width of a byte in bits */
110 #ifndef SIGN_EXTEND_CHAR
111 #define SIGN_EXTEND_CHAR(x) (x)
114 static int obscure_syntax = 0;
116 /* Specify the precise syntax of regexp for compilation.
117 This provides for compatibility for various utilities
118 which historically have different, incompatible syntaxes.
120 The argument SYNTAX is a bit-mask containing the two bits
121 RE_NO_BK_PARENS and RE_NO_BK_VBAR. */
124 re_set_syntax (syntax)
128 ret = obscure_syntax;
129 obscure_syntax = syntax;
133 /* re_compile_pattern takes a regular-expression string
134 and converts it into a buffer full of byte commands for matching.
136 PATTERN is the address of the pattern string
137 SIZE is the length of it.
138 BUFP is a struct re_pattern_buffer * which points to the info
139 on where to store the byte commands.
140 This structure contains a char * which points to the
141 actual space, which should have been obtained with malloc.
142 re_compile_pattern may use realloc to grow the buffer space.
144 The number of bytes of commands can be found out by looking in
145 the struct re_pattern_buffer that bufp pointed to,
146 after re_compile_pattern returns.
149 #define PATPUSH(ch) (*b++ = (char) (ch))
151 #define PATFETCH(c) \
152 {if (p == pend) goto end_of_pattern; \
153 c = * (unsigned char *) p++; \
154 if (translate) c = translate[c]; }
156 #define PATFETCH_RAW(c) \
157 {if (p == pend) goto end_of_pattern; \
158 c = * (unsigned char *) p++; }
160 #define PATUNFETCH p--
162 #define EXTEND_BUFFER \
163 { char *old_buffer = bufp->buffer; \
164 if (bufp->allocated == (1<<16)) goto too_big; \
165 bufp->allocated *= 2; \
166 if (bufp->allocated > (1<<16)) bufp->allocated = (1<<16); \
167 if (!(bufp->buffer = (char *) realloc (bufp->buffer, bufp->allocated))) \
168 goto memory_exhausted; \
169 c = bufp->buffer - old_buffer; \
177 pending_exact += c; \
180 static int store_jump (), insert_jump ();
183 re_compile_pattern (pattern, size, bufp)
186 struct re_pattern_buffer *bufp;
188 register char *b = bufp->buffer;
189 register char *p = pattern;
190 char *pend = pattern + size;
191 register unsigned c, c1;
193 unsigned char *translate = (unsigned char *) bufp->translate;
195 /* address of the count-byte of the most recently inserted "exactn" command.
196 This makes it possible to tell whether a new exact-match character
197 can be added to that command or requires a new "exactn" command. */
199 char *pending_exact = 0;
201 /* address of the place where a forward-jump should go
202 to the end of the containing expression.
203 Each alternative of an "or", except the last, ends with a forward-jump
206 char *fixup_jump = 0;
208 /* address of start of the most recently finished expression.
209 This tells postfix * where to find the start of its operand. */
213 /* In processing a repeat, 1 means zero matches is allowed */
217 /* In processing a repeat, 1 means many matches is allowed */
221 /* address of beginning of regexp, or inside of last \( */
225 /* Stack of information saved by \( and restored by \).
226 Four stack elements are pushed by each \(:
227 First, the value of b.
228 Second, the value of fixup_jump.
229 Third, the value of regnum.
230 Fourth, the value of begalt. */
233 int *stackp = stackb;
234 int *stacke = stackb + 40;
237 /* Counts \('s as they are encountered. Remembered for the matching \),
238 where it becomes the "register number" to put in the stop_memory command */
242 bufp->fastmap_accurate = 0;
247 * Initialize the syntax table.
253 if (bufp->allocated == 0)
255 bufp->allocated = 28;
257 /* EXTEND_BUFFER loses when bufp->allocated is 0 */
258 bufp->buffer = (char *) realloc (bufp->buffer, 28);
260 /* Caller did not allocate a buffer. Do it for him */
261 bufp->buffer = (char *) malloc (28);
262 if (!bufp->buffer) goto memory_exhausted;
263 begalt = b = bufp->buffer;
268 if (b - bufp->buffer > bufp->allocated - 10)
269 /* Note that EXTEND_BUFFER clobbers c */
277 if (obscure_syntax & RE_TIGHT_VBAR)
279 if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS) && p != pend)
281 /* Make operand of last vbar end before this `$'. */
283 store_jump (fixup_jump, jump, b);
289 /* $ means succeed if at end of line, but only in special contexts.
290 If randomly in the middle of a pattern, it is a normal character. */
291 if (p == pend || *p == '\n'
292 || (obscure_syntax & RE_CONTEXT_INDEP_OPS)
293 || (obscure_syntax & RE_NO_BK_PARENS
295 : *p == '\\' && p[1] == ')')
296 || (obscure_syntax & RE_NO_BK_VBAR
298 : *p == '\\' && p[1] == '|'))
306 /* ^ means succeed if at beg of line, but only if no preceding pattern. */
308 if (laststart && p[-2] != '\n'
309 && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
311 if (obscure_syntax & RE_TIGHT_VBAR)
314 && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
325 if (obscure_syntax & RE_BK_PLUS_QM)
329 /* If there is no previous pattern, char not special. */
330 if (!laststart && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
332 /* If there is a sequence of repetition chars,
333 collapse it down to equivalent to just one. */
338 zero_times_ok |= c != '+';
339 many_times_ok |= c != '?';
345 else if (!(obscure_syntax & RE_BK_PLUS_QM)
346 && (c == '+' || c == '?'))
348 else if ((obscure_syntax & RE_BK_PLUS_QM)
353 if (!(c1 == '+' || c1 == '?'))
368 /* Star, etc. applied to an empty pattern is equivalent
369 to an empty pattern. */
373 /* Now we know whether 0 matches is allowed,
374 and whether 2 or more matches is allowed. */
377 /* If more than one repetition is allowed,
378 put in a backward jump at the end. */
379 store_jump (b, maybe_finalize_jump, laststart - 3);
382 insert_jump (on_failure_jump, laststart, b + 3, b);
387 /* At least one repetition required: insert before the loop
388 a skip over the initial on-failure-jump instruction */
389 insert_jump (dummy_failure_jump, laststart, laststart + 6, b);
400 while (b - bufp->buffer
401 > bufp->allocated - 3 - (1 << BYTEWIDTH) / BYTEWIDTH)
402 /* Note that EXTEND_BUFFER clobbers c */
407 PATPUSH (charset_not), p++;
412 PATPUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
413 /* Clear the whole map */
414 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
415 /* Read in characters and ranges, setting map bits */
419 if (c == ']' && p != p1 + 1) break;
420 if (*p == '-' && p[1] != ']')
425 b[c / BYTEWIDTH] |= 1 << (c % BYTEWIDTH), c++;
429 b[c / BYTEWIDTH] |= 1 << (c % BYTEWIDTH);
432 /* Discard any bitmap bytes that are all 0 at the end of the map.
433 Decrement the map-length byte too. */
434 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
440 if (! (obscure_syntax & RE_NO_BK_PARENS))
446 if (! (obscure_syntax & RE_NO_BK_PARENS))
452 if (! (obscure_syntax & RE_NEWLINE_OR))
458 if (! (obscure_syntax & RE_NO_BK_VBAR))
464 if (p == pend) goto invalid_pattern;
469 if (obscure_syntax & RE_NO_BK_PARENS)
472 if (stackp == stacke) goto nesting_too_deep;
473 if (regnum < RE_NREGS)
475 PATPUSH (start_memory);
478 *stackp++ = b - bufp->buffer;
479 *stackp++ = fixup_jump ? fixup_jump - bufp->buffer + 1 : 0;
480 *stackp++ = regnum++;
481 *stackp++ = begalt - bufp->buffer;
488 if (obscure_syntax & RE_NO_BK_PARENS)
491 if (stackp == stackb) goto unmatched_close;
492 begalt = *--stackp + bufp->buffer;
494 store_jump (fixup_jump, jump, b);
495 if (stackp[-1] < RE_NREGS)
497 PATPUSH (stop_memory);
498 PATPUSH (stackp[-1]);
503 fixup_jump = *stackp + bufp->buffer - 1;
504 laststart = *--stackp + bufp->buffer;
508 if (obscure_syntax & RE_NO_BK_VBAR)
511 insert_jump (on_failure_jump, begalt, b + 6, b);
515 store_jump (fixup_jump, jump, b);
529 PATPUSH (syntaxspec);
531 PATPUSH (syntax_spec_code[c]);
536 PATPUSH (notsyntaxspec);
538 PATPUSH (syntax_spec_code[c]);
549 PATPUSH (notwordchar);
565 PATPUSH (notwordbound);
588 for (stackt = stackp - 2; stackt > stackb; stackt -= 4)
598 if (obscure_syntax & RE_BK_PLUS_QM)
603 /* You might think it would be useful for \ to mean
604 not to translate; but if we don't translate it
605 it will never match anything. */
606 if (translate) c = translate[c];
613 if (!pending_exact || pending_exact + *pending_exact + 1 != b
614 || *pending_exact == 0177 || *p == '*' || *p == '^'
615 || ((obscure_syntax & RE_BK_PLUS_QM)
616 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
617 : (*p == '+' || *p == '?')))
630 store_jump (fixup_jump, jump, b);
632 if (stackp != stackb) goto unmatched_open;
634 bufp->used = b - bufp->buffer;
638 return "Invalid regular expression";
641 return "Unmatched \\(";
644 return "Unmatched \\)";
647 return "Premature end of regular expression";
650 return "Nesting too deep";
653 return "Regular expression too big";
656 return "Memory exhausted";
659 /* Store where `from' points a jump operation to jump to where `to' points.
660 `opcode' is the opcode to store. */
663 store_jump (from, opcode, to)
668 from[1] = (to - (from + 3)) & 0377;
669 from[2] = (to - (from + 3)) >> 8;
672 /* Open up space at char FROM, and insert there a jump to TO.
673 CURRENT_END gives te end of the storage no in use,
674 so we know how much data to copy up.
675 OP is the opcode of the jump to insert.
677 If you call this function, you must zero out pending_exact. */
680 insert_jump (op, from, to, current_end)
682 char *from, *to, *current_end;
684 register char *pto = current_end + 3;
685 register char *pfrom = current_end;
686 while (pfrom != from)
688 store_jump (from, op, to);
691 /* Given a pattern, compute a fastmap from it.
692 The fastmap records which of the (1 << BYTEWIDTH) possible characters
693 can start a string that matches the pattern.
694 This fastmap is used by re_search to skip quickly over totally implausible text.
696 The caller must supply the address of a (1 << BYTEWIDTH)-byte data area
698 The other components of bufp describe the pattern to be used. */
701 re_compile_fastmap (bufp)
702 struct re_pattern_buffer *bufp;
704 unsigned char *pattern = (unsigned char *) bufp->buffer;
705 int size = bufp->used;
706 register char *fastmap = bufp->fastmap;
707 register unsigned char *p = pattern;
708 register unsigned char *pend = pattern + size;
710 unsigned char *translate = (unsigned char *) bufp->translate;
712 unsigned char *stackb[NFAILURES];
713 unsigned char **stackp = stackb;
715 bzero (fastmap, (1 << BYTEWIDTH));
716 bufp->fastmap_accurate = 1;
717 bufp->can_be_null = 0;
723 bufp->can_be_null = 1;
726 #ifdef SWITCH_ENUM_BUG
727 switch ((int) ((enum regexpcode) *p++))
729 switch ((enum regexpcode) *p++)
734 fastmap[translate[p[1]]] = 1;
753 fastmap[translate['\n']] = 1;
756 if (bufp->can_be_null != 1)
757 bufp->can_be_null = 2;
761 case maybe_finalize_jump:
763 case dummy_failure_jump:
764 bufp->can_be_null = 1;
766 j += SIGN_EXTEND_CHAR (*(char *)p) << 8;
767 p += j + 1; /* The 1 compensates for missing ++ above */
770 /* Jump backward reached implies we just went through
771 the body of a loop and matched nothing.
772 Opcode jumped to should be an on_failure_jump.
773 Just treat it like an ordinary jump.
774 For a * loop, it has pushed its failure point already;
775 if so, discard that as redundant. */
776 if ((enum regexpcode) *p != on_failure_jump)
780 j += SIGN_EXTEND_CHAR (*(char *)p) << 8;
781 p += j + 1; /* The 1 compensates for missing ++ above */
782 if (stackp != stackb && *stackp == p)
786 case on_failure_jump:
788 j += SIGN_EXTEND_CHAR (*(char *)p) << 8;
799 bufp->can_be_null = 1;
802 for (j = 0; j < (1 << BYTEWIDTH); j++)
805 if (bufp->can_be_null)
807 /* Don't return; check the alternative paths
808 so we can set can_be_null if appropriate. */
812 for (j = 0; j < (1 << BYTEWIDTH); j++)
813 if (SYNTAX (j) == Sword)
818 for (j = 0; j < (1 << BYTEWIDTH); j++)
819 if (SYNTAX (j) != Sword)
826 for (j = 0; j < (1 << BYTEWIDTH); j++)
827 if (SYNTAX (j) == (enum syntaxcode) k)
833 for (j = 0; j < (1 << BYTEWIDTH); j++)
834 if (SYNTAX (j) != (enum syntaxcode) k)
840 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
841 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
844 fastmap[translate[j]] = 1;
851 /* Chars beyond end of map must be allowed */
852 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
854 fastmap[translate[j]] = 1;
858 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
859 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
862 fastmap[translate[j]] = 1;
869 /* Get here means we have successfully found the possible starting characters
870 of one path of the pattern. We need not follow this path any farther.
871 Instead, look at the next alternative remembered in the stack. */
872 if (stackp != stackb)
879 /* Like re_search_2, below, but only one string is specified. */
882 re_search (pbufp, string, size, startpos, range, regs)
883 struct re_pattern_buffer *pbufp;
885 int size, startpos, range;
886 struct re_registers *regs;
888 return re_search_2 (pbufp, 0, 0, string, size, startpos, range, regs, size);
891 /* Like re_match_2 but tries first a match starting at index STARTPOS,
892 then at STARTPOS + 1, and so on.
893 RANGE is the number of places to try before giving up.
894 If RANGE is negative, the starting positions tried are
895 STARTPOS, STARTPOS - 1, etc.
896 It is up to the caller to make sure that range is not so large
897 as to take the starting position outside of the input strings.
899 The value returned is the position at which the match was found,
900 or -1 if no match was found,
901 or -2 if error (such as failure stack overflow). */
904 re_search_2 (pbufp, string1, size1, string2, size2, startpos, range, regs, mstop)
905 struct re_pattern_buffer *pbufp;
906 char *string1, *string2;
910 struct re_registers *regs;
913 register char *fastmap = pbufp->fastmap;
914 register unsigned char *translate = (unsigned char *) pbufp->translate;
915 int total = size1 + size2;
918 /* Update the fastmap now if not correct already */
919 if (fastmap && !pbufp->fastmap_accurate)
920 re_compile_fastmap (pbufp);
922 /* Don't waste time in a long search for a pattern
923 that says it is anchored. */
924 if (pbufp->used > 0 && (enum regexpcode) pbufp->buffer[0] == begbuf
935 /* If a fastmap is supplied, skip quickly over characters
936 that cannot possibly be the start of a match.
937 Note, however, that if the pattern can possibly match
938 the null string, we must test it at each starting point
939 so that we take the first null string we get. */
941 if (fastmap && startpos < total && pbufp->can_be_null != 1)
945 register int lim = 0;
946 register unsigned char *p;
948 if (startpos < size1 && startpos + range >= size1)
949 lim = range - (size1 - startpos);
951 p = ((unsigned char *)
952 &(startpos >= size1 ? string2 - size1 : string1)[startpos]);
956 while (range > lim && !fastmap[translate[*p++]])
961 while (range > lim && !fastmap[*p++])
964 startpos += irange - range;
968 register unsigned char c;
969 if (startpos >= size1)
970 c = string2[startpos - size1];
972 c = string1[startpos];
974 if (translate ? !fastmap[translate[c]] : !fastmap[c])
979 if (range >= 0 && startpos == total
980 && fastmap && pbufp->can_be_null == 0)
983 val = re_match_2 (pbufp, string1, size1, string2, size2, startpos, regs, mstop);
993 #endif /* C_ALLOCA */
997 if (range > 0) range--, startpos++; else range++, startpos--;
1002 #ifndef emacs /* emacs never uses this */
1004 re_match (pbufp, string, size, pos, regs)
1005 struct re_pattern_buffer *pbufp;
1008 struct re_registers *regs;
1010 return re_match_2 (pbufp, 0, 0, string, size, pos, regs, size);
1014 /* Maximum size of failure stack. Beyond this, overflow is an error. */
1016 int re_max_failures = 2000;
1018 static int bcmp_translate();
1019 /* Match the pattern described by PBUFP
1020 against data which is the virtual concatenation of STRING1 and STRING2.
1021 SIZE1 and SIZE2 are the sizes of the two data strings.
1022 Start the match at position POS.
1023 Do not consider matching past the position MSTOP.
1025 If pbufp->fastmap is nonzero, then it had better be up to date.
1027 The reason that the data to match are specified as two components
1028 which are to be regarded as concatenated
1029 is so this function can be used directly on the contents of an Emacs buffer.
1031 -1 is returned if there is no match. -2 is returned if there is
1032 an error (such as match stack overflow). Otherwise the value is the length
1033 of the substring which was matched. */
1036 re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
1037 struct re_pattern_buffer *pbufp;
1038 unsigned char *string1, *string2;
1041 struct re_registers *regs;
1044 register unsigned char *p = (unsigned char *) pbufp->buffer;
1045 register unsigned char *pend = p + pbufp->used;
1046 /* End of first string */
1047 unsigned char *end1;
1048 /* End of second string */
1049 unsigned char *end2;
1050 /* Pointer just past last char to consider matching */
1051 unsigned char *end_match_1, *end_match_2;
1052 register unsigned char *d, *dend;
1054 unsigned char *translate = (unsigned char *) pbufp->translate;
1056 /* Failure point stack. Each place that can handle a failure further down the line
1057 pushes a failure point on this stack. It consists of two char *'s.
1058 The first one pushed is where to resume scanning the pattern;
1059 the second pushed is where to resume scanning the strings.
1060 If the latter is zero, the failure point is a "dummy".
1061 If a failure happens and the innermost failure point is dormant,
1062 it discards that failure point and tries the next one. */
1064 unsigned char *initial_stack[2 * NFAILURES];
1065 unsigned char **stackb = initial_stack;
1066 unsigned char **stackp = stackb, **stacke = &stackb[2 * NFAILURES];
1068 /* Information on the "contents" of registers.
1069 These are pointers into the input strings; they record
1070 just what was matched (on this attempt) by some part of the pattern.
1071 The start_memory command stores the start of a register's contents
1072 and the stop_memory command stores the end.
1074 At that point, regstart[regnum] points to the first character in the register,
1075 regend[regnum] points to the first character beyond the end of the register,
1076 regstart_seg1[regnum] is true iff regstart[regnum] points into string1,
1077 and regend_seg1[regnum] is true iff regend[regnum] points into string1. */
1079 unsigned char *regstart[RE_NREGS];
1080 unsigned char *regend[RE_NREGS];
1081 unsigned char regstart_seg1[RE_NREGS], regend_seg1[RE_NREGS];
1083 /* Set up pointers to ends of strings.
1084 Don't allow the second string to be empty unless both are empty. */
1092 end1 = string1 + size1;
1093 end2 = string2 + size2;
1095 /* Compute where to stop matching, within the two strings */
1098 end_match_1 = string1 + mstop;
1099 end_match_2 = string2;
1104 end_match_2 = string2 + mstop - size1;
1107 /* Initialize \) text positions to -1
1108 to mark ones that no \( or \) has been seen for. */
1110 for (mcnt = 0; mcnt < sizeof (regend) / sizeof (*regend); mcnt++)
1111 regend[mcnt] = (unsigned char *) -1;
1113 /* `p' scans through the pattern as `d' scans through the data.
1114 `dend' is the end of the input string that `d' points within.
1115 `d' is advanced into the following input string whenever necessary,
1116 but this happens before fetching;
1117 therefore, at the beginning of the loop,
1118 `d' can be pointing at the end of a string,
1119 but it cannot equal string2. */
1122 d = string1 + pos, dend = end_match_1;
1124 d = string2 + pos - size1, dend = end_match_2;
1126 /* Write PREFETCH; just before fetching a character with *d. */
1129 { if (dend == end_match_2) goto fail; /* end of string2 => failure */ \
1130 d = string2; /* end of string1 => advance to string2. */ \
1131 dend = end_match_2; }
1133 /* This loop loops over pattern commands.
1134 It exits by returning from the function if match is complete,
1135 or it drops through if match fails at this starting point in the input data. */
1140 /* End of pattern means we have succeeded! */
1142 /* If caller wants register contents data back, convert it to indices */
1145 regs->start[0] = pos;
1146 if (dend == end_match_1)
1147 regs->end[0] = d - string1;
1149 regs->end[0] = d - string2 + size1;
1150 for (mcnt = 1; mcnt < RE_NREGS; mcnt++)
1152 if (regend[mcnt] == (unsigned char *) -1)
1154 regs->start[mcnt] = -1;
1155 regs->end[mcnt] = -1;
1158 if (regstart_seg1[mcnt])
1159 regs->start[mcnt] = regstart[mcnt] - string1;
1161 regs->start[mcnt] = regstart[mcnt] - string2 + size1;
1162 if (regend_seg1[mcnt])
1163 regs->end[mcnt] = regend[mcnt] - string1;
1165 regs->end[mcnt] = regend[mcnt] - string2 + size1;
1168 if (dend == end_match_1)
1169 return (d - string1 - pos);
1171 return d - string2 + size1 - pos;
1174 /* Otherwise match next pattern command */
1175 #ifdef SWITCH_ENUM_BUG
1176 switch ((int) ((enum regexpcode) *p++))
1178 switch ((enum regexpcode) *p++)
1182 /* \( is represented by a start_memory, \) by a stop_memory.
1183 Both of those commands contain a "register number" argument.
1184 The text matched within the \( and \) is recorded under that number.
1185 Then, \<digit> turns into a `duplicate' command which
1186 is followed by the numeric value of <digit> as the register number. */
1190 regstart_seg1[*p++] = (dend == end_match_1);
1195 regend_seg1[*p++] = (dend == end_match_1);
1200 int regno = *p++; /* Get which register to match against */
1201 register unsigned char *d2, *dend2;
1203 d2 = regstart[regno];
1204 dend2 = ((regstart_seg1[regno] == regend_seg1[regno])
1205 ? regend[regno] : end_match_1);
1208 /* Advance to next segment in register contents, if necessary */
1211 if (dend2 == end_match_2) break;
1212 if (dend2 == regend[regno]) break;
1213 d2 = string2, dend2 = regend[regno]; /* end of string1 => advance to string2. */
1215 /* At end of register contents => success */
1216 if (d2 == dend2) break;
1218 /* Advance to next segment in data being matched, if necessary */
1221 /* mcnt gets # consecutive chars to compare */
1223 if (mcnt > dend2 - d2)
1225 /* Compare that many; failure if mismatch, else skip them. */
1226 if (translate ? bcmp_translate (d, d2, mcnt, translate) : bcmp (d, d2, mcnt))
1228 d += mcnt, d2 += mcnt;
1234 /* fetch a data character */
1236 /* Match anything but a newline. */
1237 if ((translate ? translate[*d++] : *d++) == '\n')
1244 /* Nonzero for charset_not */
1247 if (*(p - 1) == (unsigned char) charset_not)
1250 /* fetch a data character */
1258 if (c < *p * BYTEWIDTH
1259 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
1264 if (!not) goto fail;
1270 if (d == string1 || d[-1] == '\n')
1276 || (d == end1 ? (size2 == 0 || *string2 == '\n') : *d == '\n'))
1280 /* "or" constructs ("|") are handled by starting each alternative
1281 with an on_failure_jump that points to the start of the next alternative.
1282 Each alternative except the last ends with a jump to the joining point.
1283 (Actually, each jump except for the last one really jumps
1284 to the following jump, because tensioning the jumps is a hassle.) */
1286 /* The start of a stupid repeat has an on_failure_jump that points
1287 past the end of the repeat text.
1288 This makes a failure point so that, on failure to match a repetition,
1289 matching restarts past as many repetitions have been found
1290 with no way to fail and look for another one. */
1292 /* A smart repeat is similar but loops back to the on_failure_jump
1293 so that each repetition makes another failure point. */
1295 case on_failure_jump:
1296 if (stackp == stacke)
1298 unsigned char **stackx;
1299 if (stacke - stackb > re_max_failures * 2)
1301 stackx = (unsigned char **) alloca (2 * (stacke - stackb)
1303 bcopy (stackb, stackx, (stacke - stackb) * sizeof (char *));
1304 stackp = stackx + (stackp - stackb);
1305 stacke = stackx + 2 * (stacke - stackb);
1309 mcnt += SIGN_EXTEND_CHAR (*(char *)p) << 8;
1311 *stackp++ = mcnt + p;
1315 /* The end of a smart repeat has an maybe_finalize_jump back.
1316 Change it either to a finalize_jump or an ordinary jump. */
1318 case maybe_finalize_jump:
1320 mcnt += SIGN_EXTEND_CHAR (*(char *)p) << 8;
1323 register unsigned char *p2 = p;
1324 /* Compare what follows with the begining of the repeat.
1325 If we can establish that there is nothing that they would
1326 both match, we can change to finalize_jump */
1328 && (*p2 == (unsigned char) stop_memory
1329 || *p2 == (unsigned char) start_memory))
1332 p[-3] = (unsigned char) finalize_jump;
1333 else if (*p2 == (unsigned char) exactn
1334 || *p2 == (unsigned char) endline)
1336 register int c = *p2 == (unsigned char) endline ? '\n' : p2[2];
1337 register unsigned char *p1 = p + mcnt;
1338 /* p1[0] ... p1[2] are an on_failure_jump.
1339 Examine what follows that */
1340 if (p1[3] == (unsigned char) exactn && p1[5] != c)
1341 p[-3] = (unsigned char) finalize_jump;
1342 else if (p1[3] == (unsigned char) charset
1343 || p1[3] == (unsigned char) charset_not)
1345 int not = p1[3] == (unsigned char) charset_not;
1346 if (c < p1[4] * BYTEWIDTH
1347 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
1349 /* not is 1 if c would match */
1350 /* That means it is not safe to finalize */
1352 p[-3] = (unsigned char) finalize_jump;
1357 if (p[-1] != (unsigned char) finalize_jump)
1359 p[-1] = (unsigned char) jump;
1363 /* The end of a stupid repeat has a finalize-jump
1364 back to the start, where another failure point will be made
1365 which will point after all the repetitions found so far. */
1373 mcnt += SIGN_EXTEND_CHAR (*(char *)p) << 8;
1374 p += mcnt + 1; /* The 1 compensates for missing ++ above */
1377 case dummy_failure_jump:
1378 if (stackp == stacke)
1380 unsigned char **stackx
1381 = (unsigned char **) alloca (2 * (stacke - stackb)
1383 bcopy (stackb, stackx, (stacke - stackb) * sizeof (char *));
1384 stackp = stackx + (stackp - stackb);
1385 stacke = stackx + 2 * (stacke - stackb);
1393 if (d == string1 /* Points to first char */
1394 || d == end2 /* Points to end */
1395 || (d == end1 && size2 == 0)) /* Points to end */
1397 if ((SYNTAX (d[-1]) == Sword)
1398 != (SYNTAX (d == end1 ? *string2 : *d) == Sword))
1403 if (d == string1 /* Points to first char */
1404 || d == end2 /* Points to end */
1405 || (d == end1 && size2 == 0)) /* Points to end */
1407 if ((SYNTAX (d[-1]) == Sword)
1408 != (SYNTAX (d == end1 ? *string2 : *d) == Sword))
1413 if (d == end2 /* Points to end */
1414 || (d == end1 && size2 == 0) /* Points to end */
1415 || SYNTAX (* (d == end1 ? string2 : d)) != Sword) /* Next char not a letter */
1417 if (d == string1 /* Points to first char */
1418 || SYNTAX (d[-1]) != Sword) /* prev char not letter */
1423 if (d == string1 /* Points to first char */
1424 || SYNTAX (d[-1]) != Sword) /* prev char not letter */
1426 if (d == end2 /* Points to end */
1427 || (d == end1 && size2 == 0) /* Points to end */
1428 || SYNTAX (d == end1 ? *string2 : *d) != Sword) /* Next char not a letter */
1434 if (((d - string2 <= (unsigned) size2)
1435 ? d - bf_p2 : d - bf_p1)
1441 if (((d - string2 <= (unsigned) size2)
1442 ? d - bf_p2 : d - bf_p1)
1448 if (((d - string2 <= (unsigned) size2)
1449 ? d - bf_p2 : d - bf_p1)
1462 if (SYNTAX (*d++) != (enum syntaxcode) mcnt) goto fail;
1467 goto matchnotsyntax;
1473 if (SYNTAX (*d++) == (enum syntaxcode) mcnt) goto fail;
1478 if (SYNTAX (*d++) == 0) goto fail;
1483 if (SYNTAX (*d++) != 0) goto fail;
1485 #endif /* not emacs */
1488 if (d == string1) /* Note, d cannot equal string2 */
1489 break; /* unless string1 == string2. */
1493 if (d == end2 || (d == end1 && size2 == 0))
1498 /* Match the next few pattern characters exactly.
1499 mcnt is how many characters to match. */
1506 if (translate[*d++] != *p++) goto fail;
1515 if (*d++ != *p++) goto fail;
1521 continue; /* Successfully matched one pattern command; keep matching */
1523 /* Jump here if any matching operation fails. */
1525 if (stackp != stackb)
1526 /* A restart point is known. Restart there and pop it. */
1529 { /* If innermost failure point is dormant, flush it and keep looking */
1535 if (d >= string1 && d <= end1)
1538 else break; /* Matching at this starting point really fails! */
1540 return -1; /* Failure to match */
1544 bcmp_translate (s1, s2, len, translate)
1545 unsigned char *s1, *s2;
1547 unsigned char *translate;
1549 register unsigned char *p1 = s1, *p2 = s2;
1552 if (translate [*p1++] != translate [*p2++]) return 1;
1558 /* Entry points compatible with bsd4.2 regex library */
1562 static struct re_pattern_buffer re_comp_buf;
1570 if (!re_comp_buf.buffer)
1571 return "No previous regular expression";
1575 if (!re_comp_buf.buffer)
1577 if (!(re_comp_buf.buffer = (char *) malloc (200)))
1578 return "Memory exhausted";
1579 re_comp_buf.allocated = 200;
1580 if (!(re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH)))
1581 return "Memory exhausted";
1583 return re_compile_pattern (s, strlen (s), &re_comp_buf);
1590 int len = strlen (s);
1591 return 0 <= re_search (&re_comp_buf, s, len, 0, len, 0);
1600 /* Indexed by a character, gives the upper case equivalent of the character */
1602 static char upcase[0400] =
1603 { 000, 001, 002, 003, 004, 005, 006, 007,
1604 010, 011, 012, 013, 014, 015, 016, 017,
1605 020, 021, 022, 023, 024, 025, 026, 027,
1606 030, 031, 032, 033, 034, 035, 036, 037,
1607 040, 041, 042, 043, 044, 045, 046, 047,
1608 050, 051, 052, 053, 054, 055, 056, 057,
1609 060, 061, 062, 063, 064, 065, 066, 067,
1610 070, 071, 072, 073, 074, 075, 076, 077,
1611 0100, 0101, 0102, 0103, 0104, 0105, 0106, 0107,
1612 0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117,
1613 0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127,
1614 0130, 0131, 0132, 0133, 0134, 0135, 0136, 0137,
1615 0140, 0101, 0102, 0103, 0104, 0105, 0106, 0107,
1616 0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117,
1617 0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127,
1618 0130, 0131, 0132, 0173, 0174, 0175, 0176, 0177,
1619 0200, 0201, 0202, 0203, 0204, 0205, 0206, 0207,
1620 0210, 0211, 0212, 0213, 0214, 0215, 0216, 0217,
1621 0220, 0221, 0222, 0223, 0224, 0225, 0226, 0227,
1622 0230, 0231, 0232, 0233, 0234, 0235, 0236, 0237,
1623 0240, 0241, 0242, 0243, 0244, 0245, 0246, 0247,
1624 0250, 0251, 0252, 0253, 0254, 0255, 0256, 0257,
1625 0260, 0261, 0262, 0263, 0264, 0265, 0266, 0267,
1626 0270, 0271, 0272, 0273, 0274, 0275, 0276, 0277,
1627 0300, 0301, 0302, 0303, 0304, 0305, 0306, 0307,
1628 0310, 0311, 0312, 0313, 0314, 0315, 0316, 0317,
1629 0320, 0321, 0322, 0323, 0324, 0325, 0326, 0327,
1630 0330, 0331, 0332, 0333, 0334, 0335, 0336, 0337,
1631 0340, 0341, 0342, 0343, 0344, 0345, 0346, 0347,
1632 0350, 0351, 0352, 0353, 0354, 0355, 0356, 0357,
1633 0360, 0361, 0362, 0363, 0364, 0365, 0366, 0367,
1634 0370, 0371, 0372, 0373, 0374, 0375, 0376, 0377
1642 struct re_pattern_buffer buf;
1645 char fastmap[(1 << BYTEWIDTH)];
1647 /* Allow a command argument to specify the style of syntax. */
1649 obscure_syntax = atoi (argv[1]);
1652 buf.buffer = (char *) malloc (buf.allocated);
1653 buf.fastmap = fastmap;
1654 buf.translate = upcase;
1662 re_compile_pattern (pat, strlen(pat), &buf);
1664 for (i = 0; i < buf.used; i++)
1665 printchar (buf.buffer[i]);
1669 printf ("%d allocated, %d used.\n", buf.allocated, buf.used);
1671 re_compile_fastmap (&buf);
1672 printf ("Allowed by fastmap: ");
1673 for (i = 0; i < (1 << BYTEWIDTH); i++)
1674 if (fastmap[i]) printchar (i);
1678 gets (pat); /* Now read the string to match against */
1680 i = re_match (&buf, pat, strlen (pat), 0, 0);
1681 printf ("Match value %d.\n", i);
1687 struct re_pattern_buffer *bufp;
1691 printf ("buf is :\n----------------\n");
1692 for (i = 0; i < bufp->used; i++)
1693 printchar (bufp->buffer[i]);
1695 printf ("\n%d allocated, %d used.\n", bufp->allocated, bufp->used);
1697 printf ("Allowed by fastmap: ");
1698 for (i = 0; i < (1 << BYTEWIDTH); i++)
1699 if (bufp->fastmap[i])
1701 printf ("\nAllowed by translate: ");
1702 if (bufp->translate)
1703 for (i = 0; i < (1 << BYTEWIDTH); i++)
1704 if (bufp->translate[i])
1706 printf ("\nfastmap is%s accurate\n", bufp->fastmap_accurate ? "" : "n't");
1707 printf ("can %s be null\n----------", bufp->can_be_null ? "" : "not");
1714 if (c < 041 || c >= 0177)
1717 putchar (((c >> 6) & 3) + '0');
1718 putchar (((c >> 3) & 7) + '0');
1719 putchar ((c & 7) + '0');