1 /* Extended regular expression matching and search library.
2 Copyright (C) 1985, 1989 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
18 /* To test, compile with -Dtest.
19 This Dtestable feature turns this into a self-contained program
20 which reads a pattern, describes how it compiles,
21 then reads a string and searches for it. */
25 /* The `emacs' switch turns on certain special matching commands
26 that make sense only in emacs. */
35 /* Make alloca work the best possible way. */
37 #define alloca __builtin_alloca
45 * Define the syntax stuff, so we can do the \<...\> things.
48 #ifndef Sword /* must be non-zero in some of the tests below... */
52 #define SYNTAX(c) re_syntax_table[c]
56 char *re_syntax_table;
60 static char re_syntax_table[256];
71 memset (re_syntax_table, '\0', sizeof re_syntax_table);
73 for (c = 'a'; c <= 'z'; c++)
74 re_syntax_table[c] = Sword;
76 for (c = 'A'; c <= 'Z'; c++)
77 re_syntax_table[c] = Sword;
79 for (c = '0'; c <= '9'; c++)
80 re_syntax_table[c] = Sword;
85 #endif /* SYNTAX_TABLE */
86 #endif /* not emacs */
90 /* Number of failure points to allocate space for initially,
91 when matching. If this number is exceeded, more space is allocated,
92 so it is not a hard limit. */
96 #endif /* NFAILURES */
98 /* width of a byte in bits */
102 #ifndef SIGN_EXTEND_CHAR
103 #define SIGN_EXTEND_CHAR(x) (x)
106 static int obscure_syntax = 0;
108 /* Specify the precise syntax of regexp for compilation.
109 This provides for compatibility for various utilities
110 which historically have different, incompatible syntaxes.
112 The argument SYNTAX is a bit-mask containing the two bits
113 RE_NO_BK_PARENS and RE_NO_BK_VBAR. */
116 re_set_syntax (syntax)
121 ret = obscure_syntax;
122 obscure_syntax = syntax;
126 /* re_compile_pattern takes a regular-expression string
127 and converts it into a buffer full of byte commands for matching.
129 PATTERN is the address of the pattern string
130 SIZE is the length of it.
131 BUFP is a struct re_pattern_buffer * which points to the info
132 on where to store the byte commands.
133 This structure contains a char * which points to the
134 actual space, which should have been obtained with malloc.
135 re_compile_pattern may use realloc to grow the buffer space.
137 The number of bytes of commands can be found out by looking in
138 the struct re_pattern_buffer that bufp pointed to,
139 after re_compile_pattern returns.
142 #define PATPUSH(ch) (*b++ = (char) (ch))
144 #define PATFETCH(c) \
145 {if (p == pend) goto end_of_pattern; \
146 c = * (unsigned char *) p++; \
147 if (translate) c = translate[c]; }
149 #define PATFETCH_RAW(c) \
150 {if (p == pend) goto end_of_pattern; \
151 c = * (unsigned char *) p++; }
153 #define PATUNFETCH p--
155 #define EXTEND_BUFFER \
156 { char *old_buffer = bufp->buffer; \
157 if (bufp->allocated == (1<<16)) goto too_big; \
158 bufp->allocated *= 2; \
159 if (bufp->allocated > (1<<16)) bufp->allocated = (1<<16); \
160 if (!(bufp->buffer = (char *) realloc (bufp->buffer, bufp->allocated))) \
161 goto memory_exhausted; \
162 c = bufp->buffer - old_buffer; \
170 pending_exact += c; \
173 static void store_jump (), insert_jump ();
176 re_compile_pattern (pattern, size, bufp)
179 struct re_pattern_buffer *bufp;
181 register char *b = bufp->buffer;
182 register char *p = pattern;
183 char *pend = pattern + size;
184 register unsigned c, c1;
186 unsigned char *translate = (unsigned char *) bufp->translate;
188 /* address of the count-byte of the most recently inserted "exactn" command.
189 This makes it possible to tell whether a new exact-match character
190 can be added to that command or requires a new "exactn" command. */
192 char *pending_exact = 0;
194 /* address of the place where a forward-jump should go
195 to the end of the containing expression.
196 Each alternative of an "or", except the last, ends with a forward-jump
199 char *fixup_jump = 0;
201 /* address of start of the most recently finished expression.
202 This tells postfix * where to find the start of its operand. */
206 /* In processing a repeat, 1 means zero matches is allowed */
210 /* In processing a repeat, 1 means many matches is allowed */
214 /* address of beginning of regexp, or inside of last \( */
218 /* Stack of information saved by \( and restored by \).
219 Four stack elements are pushed by each \(:
220 First, the value of b.
221 Second, the value of fixup_jump.
222 Third, the value of regnum.
223 Fourth, the value of begalt. */
226 int *stackp = stackb;
227 int *stacke = stackb + 40;
230 /* Counts \('s as they are encountered. Remembered for the matching \),
231 where it becomes the "register number" to put in the stop_memory command */
235 bufp->fastmap_accurate = 0;
240 * Initialize the syntax table.
246 if (bufp->allocated == 0)
248 bufp->allocated = 28;
250 /* EXTEND_BUFFER loses when bufp->allocated is 0 */
251 bufp->buffer = (char *) realloc (bufp->buffer, 28);
253 /* Caller did not allocate a buffer. Do it for him */
254 bufp->buffer = (char *) malloc (28);
255 if (!bufp->buffer) goto memory_exhausted;
256 begalt = b = bufp->buffer;
261 if (b - bufp->buffer > bufp->allocated - 10)
262 /* Note that EXTEND_BUFFER clobbers c */
270 if (obscure_syntax & RE_TIGHT_VBAR)
272 if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS) && p != pend)
274 /* Make operand of last vbar end before this `$'. */
276 store_jump (fixup_jump, jump, b);
282 /* $ means succeed if at end of line, but only in special contexts.
283 If randomly in the middle of a pattern, it is a normal character. */
284 if (p == pend || *p == '\n'
285 || (obscure_syntax & RE_CONTEXT_INDEP_OPS)
286 || (obscure_syntax & RE_NO_BK_PARENS
288 : *p == '\\' && p[1] == ')')
289 || (obscure_syntax & RE_NO_BK_VBAR
291 : *p == '\\' && p[1] == '|'))
299 /* ^ means succeed if at beg of line, but only if no preceding pattern. */
301 if (laststart && p[-2] != '\n'
302 && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
304 if (obscure_syntax & RE_TIGHT_VBAR)
307 && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
318 if (obscure_syntax & RE_BK_PLUS_QM)
322 /* If there is no previous pattern, char not special. */
323 if (!laststart && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
325 /* If there is a sequence of repetition chars,
326 collapse it down to equivalent to just one. */
331 zero_times_ok |= c != '+';
332 many_times_ok |= c != '?';
338 else if (!(obscure_syntax & RE_BK_PLUS_QM)
339 && (c == '+' || c == '?'))
341 else if ((obscure_syntax & RE_BK_PLUS_QM)
346 if (!(c1 == '+' || c1 == '?'))
361 /* Star, etc. applied to an empty pattern is equivalent
362 to an empty pattern. */
366 /* Now we know whether 0 matches is allowed,
367 and whether 2 or more matches is allowed. */
370 /* If more than one repetition is allowed,
371 put in a backward jump at the end. */
372 store_jump (b, maybe_finalize_jump, laststart - 3);
375 insert_jump (on_failure_jump, laststart, b + 3, b);
380 /* At least one repetition required: insert before the loop
381 a skip over the initial on-failure-jump instruction */
382 insert_jump (dummy_failure_jump, laststart, laststart + 6, b);
393 while (b - bufp->buffer
394 > bufp->allocated - 3 - (1 << BYTEWIDTH) / BYTEWIDTH)
395 /* Note that EXTEND_BUFFER clobbers c */
400 PATPUSH (charset_not), p++;
405 PATPUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
406 /* Clear the whole map */
407 memset (b, '\0', (1 << BYTEWIDTH) / BYTEWIDTH);
408 /* Read in characters and ranges, setting map bits */
412 if (c == ']' && p != p1 + 1) break;
413 if (*p == '-' && p[1] != ']')
418 b[c / BYTEWIDTH] |= 1 << (c % BYTEWIDTH), c++;
422 b[c / BYTEWIDTH] |= 1 << (c % BYTEWIDTH);
425 /* Discard any bitmap bytes that are all 0 at the end of the map.
426 Decrement the map-length byte too. */
427 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
433 if (! (obscure_syntax & RE_NO_BK_PARENS))
439 if (! (obscure_syntax & RE_NO_BK_PARENS))
445 if (! (obscure_syntax & RE_NEWLINE_OR))
451 if (! (obscure_syntax & RE_NO_BK_VBAR))
457 if (p == pend) goto invalid_pattern;
462 if (obscure_syntax & RE_NO_BK_PARENS)
465 if (stackp == stacke) goto nesting_too_deep;
466 if (regnum < RE_NREGS)
468 PATPUSH (start_memory);
471 *stackp++ = b - bufp->buffer;
472 *stackp++ = fixup_jump ? fixup_jump - bufp->buffer + 1 : 0;
473 *stackp++ = regnum++;
474 *stackp++ = begalt - bufp->buffer;
481 if (obscure_syntax & RE_NO_BK_PARENS)
484 if (stackp == stackb) goto unmatched_close;
485 begalt = *--stackp + bufp->buffer;
487 store_jump (fixup_jump, jump, b);
488 if (stackp[-1] < RE_NREGS)
490 PATPUSH (stop_memory);
491 PATPUSH (stackp[-1]);
496 fixup_jump = *stackp + bufp->buffer - 1;
497 laststart = *--stackp + bufp->buffer;
501 if (obscure_syntax & RE_NO_BK_VBAR)
504 insert_jump (on_failure_jump, begalt, b + 6, b);
508 store_jump (fixup_jump, jump, b);
522 PATPUSH (syntaxspec);
524 PATPUSH (syntax_spec_code[c]);
529 PATPUSH (notsyntaxspec);
531 PATPUSH (syntax_spec_code[c]);
542 PATPUSH (notwordchar);
558 PATPUSH (notwordbound);
581 for (stackt = stackp - 2; stackt > stackb; stackt -= 4)
591 if (obscure_syntax & RE_BK_PLUS_QM)
596 /* You might think it would be useful for \ to mean
597 not to translate; but if we don't translate it
598 it will never match anything. */
599 if (translate) c = translate[c];
606 if (!pending_exact || pending_exact + *pending_exact + 1 != b
607 || *pending_exact == 0177 || *p == '*' || *p == '^'
608 || ((obscure_syntax & RE_BK_PLUS_QM)
609 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
610 : (*p == '+' || *p == '?')))
623 store_jump (fixup_jump, jump, b);
625 if (stackp != stackb) goto unmatched_open;
627 bufp->used = b - bufp->buffer;
631 return "Invalid regular expression";
634 return "Unmatched \\(";
637 return "Unmatched \\)";
640 return "Premature end of regular expression";
643 return "Nesting too deep";
646 return "Regular expression too big";
649 return "Memory exhausted";
652 /* Store where `from' points a jump operation to jump to where `to' points.
653 `opcode' is the opcode to store. */
656 store_jump (from, opcode, to)
661 from[1] = (to - (from + 3)) & 0377;
662 from[2] = (to - (from + 3)) >> 8;
665 /* Open up space at char FROM, and insert there a jump to TO.
666 CURRENT_END gives te end of the storage no in use,
667 so we know how much data to copy up.
668 OP is the opcode of the jump to insert.
670 If you call this function, you must zero out pending_exact. */
673 insert_jump (op, from, to, current_end)
675 char *from, *to, *current_end;
677 register char *pto = current_end + 3;
678 register char *pfrom = current_end;
679 while (pfrom != from)
681 store_jump (from, op, to);
684 /* Given a pattern, compute a fastmap from it.
685 The fastmap records which of the (1 << BYTEWIDTH) possible characters
686 can start a string that matches the pattern.
687 This fastmap is used by re_search to skip quickly over totally implausible text.
689 The caller must supply the address of a (1 << BYTEWIDTH)-byte data area
691 The other components of bufp describe the pattern to be used. */
694 re_compile_fastmap (bufp)
695 struct re_pattern_buffer *bufp;
697 unsigned char *pattern = (unsigned char *) bufp->buffer;
698 int size = bufp->used;
699 register char *fastmap = bufp->fastmap;
700 register unsigned char *p = pattern;
701 register unsigned char *pend = pattern + size;
703 unsigned char *translate = (unsigned char *) bufp->translate;
705 unsigned char *stackb[NFAILURES];
706 unsigned char **stackp = stackb;
708 memset (fastmap, '\0', (1 << BYTEWIDTH));
709 bufp->fastmap_accurate = 1;
710 bufp->can_be_null = 0;
716 bufp->can_be_null = 1;
719 #ifdef SWITCH_ENUM_BUG
720 switch ((int) ((enum regexpcode) *p++))
722 switch ((enum regexpcode) *p++)
727 fastmap[translate[p[1]]] = 1;
746 fastmap[translate['\n']] = 1;
749 if (bufp->can_be_null != 1)
750 bufp->can_be_null = 2;
754 case maybe_finalize_jump:
756 case dummy_failure_jump:
757 bufp->can_be_null = 1;
759 j += SIGN_EXTEND_CHAR (*(char *)p) << 8;
760 p += j + 1; /* The 1 compensates for missing ++ above */
763 /* Jump backward reached implies we just went through
764 the body of a loop and matched nothing.
765 Opcode jumped to should be an on_failure_jump.
766 Just treat it like an ordinary jump.
767 For a * loop, it has pushed its failure point already;
768 if so, discard that as redundant. */
769 if ((enum regexpcode) *p != on_failure_jump)
773 j += SIGN_EXTEND_CHAR (*(char *)p) << 8;
774 p += j + 1; /* The 1 compensates for missing ++ above */
775 if (stackp != stackb && *stackp == p)
779 case on_failure_jump:
781 j += SIGN_EXTEND_CHAR (*(char *)p) << 8;
792 bufp->can_be_null = 1;
795 for (j = 0; j < (1 << BYTEWIDTH); j++)
798 if (bufp->can_be_null)
800 /* Don't return; check the alternative paths
801 so we can set can_be_null if appropriate. */
805 for (j = 0; j < (1 << BYTEWIDTH); j++)
806 if (SYNTAX (j) == Sword)
811 for (j = 0; j < (1 << BYTEWIDTH); j++)
812 if (SYNTAX (j) != Sword)
819 for (j = 0; j < (1 << BYTEWIDTH); j++)
820 if (SYNTAX (j) == (enum syntaxcode) k)
826 for (j = 0; j < (1 << BYTEWIDTH); j++)
827 if (SYNTAX (j) != (enum syntaxcode) k)
833 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
834 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
837 fastmap[translate[j]] = 1;
844 /* Chars beyond end of map must be allowed */
845 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
847 fastmap[translate[j]] = 1;
851 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
852 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
855 fastmap[translate[j]] = 1;
862 /* Get here means we have successfully found the possible starting characters
863 of one path of the pattern. We need not follow this path any farther.
864 Instead, look at the next alternative remembered in the stack. */
865 if (stackp != stackb)
872 /* Like re_search_2, below, but only one string is specified. */
875 re_search (pbufp, string, size, startpos, range, regs)
876 struct re_pattern_buffer *pbufp;
878 int size, startpos, range;
879 struct re_registers *regs;
881 return re_search_2 (pbufp, 0, 0, string, size, startpos, range, regs, size);
884 /* Like re_match_2 but tries first a match starting at index STARTPOS,
885 then at STARTPOS + 1, and so on.
886 RANGE is the number of places to try before giving up.
887 If RANGE is negative, the starting positions tried are
888 STARTPOS, STARTPOS - 1, etc.
889 It is up to the caller to make sure that range is not so large
890 as to take the starting position outside of the input strings.
892 The value returned is the position at which the match was found,
893 or -1 if no match was found,
894 or -2 if error (such as failure stack overflow). */
897 re_search_2 (pbufp, string1, size1, string2, size2, startpos, range, regs, mstop)
898 struct re_pattern_buffer *pbufp;
899 char *string1, *string2;
903 struct re_registers *regs;
906 register char *fastmap = pbufp->fastmap;
907 register unsigned char *translate = (unsigned char *) pbufp->translate;
908 int total = size1 + size2;
911 /* Update the fastmap now if not correct already */
912 if (fastmap && !pbufp->fastmap_accurate)
913 re_compile_fastmap (pbufp);
915 /* Don't waste time in a long search for a pattern
916 that says it is anchored. */
917 if (pbufp->used > 0 && (enum regexpcode) pbufp->buffer[0] == begbuf
928 /* If a fastmap is supplied, skip quickly over characters
929 that cannot possibly be the start of a match.
930 Note, however, that if the pattern can possibly match
931 the null string, we must test it at each starting point
932 so that we take the first null string we get. */
934 if (fastmap && startpos < total && pbufp->can_be_null != 1)
938 register int lim = 0;
939 register unsigned char *p;
941 if (startpos < size1 && startpos + range >= size1)
942 lim = range - (size1 - startpos);
944 p = ((unsigned char *)
945 &(startpos >= size1 ? string2 - size1 : string1)[startpos]);
949 while (range > lim && !fastmap[translate[*p++]])
954 while (range > lim && !fastmap[*p++])
957 startpos += irange - range;
961 register unsigned char c;
962 if (startpos >= size1)
963 c = string2[startpos - size1];
965 c = string1[startpos];
967 if (translate ? !fastmap[translate[c]] : !fastmap[c])
972 if (range >= 0 && startpos == total
973 && fastmap && pbufp->can_be_null == 0)
976 val = re_match_2 (pbufp, string1, size1, string2, size2, startpos, regs, mstop);
986 #endif /* C_ALLOCA */
990 if (range > 0) range--, startpos++; else range++, startpos--;
995 #ifndef emacs /* emacs never uses this */
997 re_match (pbufp, string, size, pos, regs)
998 struct re_pattern_buffer *pbufp;
1001 struct re_registers *regs;
1003 return re_match_2 (pbufp, 0, 0, string, size, pos, regs, size);
1007 /* Maximum size of failure stack. Beyond this, overflow is an error. */
1009 int re_max_failures = 2000;
1011 static int memcmp_translate();
1012 /* Match the pattern described by PBUFP
1013 against data which is the virtual concatenation of STRING1 and STRING2.
1014 SIZE1 and SIZE2 are the sizes of the two data strings.
1015 Start the match at position POS.
1016 Do not consider matching past the position MSTOP.
1018 If pbufp->fastmap is nonzero, then it had better be up to date.
1020 The reason that the data to match are specified as two components
1021 which are to be regarded as concatenated
1022 is so this function can be used directly on the contents of an Emacs buffer.
1024 -1 is returned if there is no match. -2 is returned if there is
1025 an error (such as match stack overflow). Otherwise the value is the length
1026 of the substring which was matched. */
1029 re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
1030 struct re_pattern_buffer *pbufp;
1031 unsigned char *string1, *string2;
1034 struct re_registers *regs;
1037 register unsigned char *p = (unsigned char *) pbufp->buffer;
1038 register unsigned char *pend = p + pbufp->used;
1039 /* End of first string */
1040 unsigned char *end1;
1041 /* End of second string */
1042 unsigned char *end2;
1043 /* Pointer just past last char to consider matching */
1044 unsigned char *end_match_1, *end_match_2;
1045 register unsigned char *d, *dend;
1047 unsigned char *translate = (unsigned char *) pbufp->translate;
1049 /* Failure point stack. Each place that can handle a failure further down the line
1050 pushes a failure point on this stack. It consists of two char *'s.
1051 The first one pushed is where to resume scanning the pattern;
1052 the second pushed is where to resume scanning the strings.
1053 If the latter is zero, the failure point is a "dummy".
1054 If a failure happens and the innermost failure point is dormant,
1055 it discards that failure point and tries the next one. */
1057 unsigned char *initial_stack[2 * NFAILURES];
1058 unsigned char **stackb = initial_stack;
1059 unsigned char **stackp = stackb, **stacke = &stackb[2 * NFAILURES];
1061 /* Information on the "contents" of registers.
1062 These are pointers into the input strings; they record
1063 just what was matched (on this attempt) by some part of the pattern.
1064 The start_memory command stores the start of a register's contents
1065 and the stop_memory command stores the end.
1067 At that point, regstart[regnum] points to the first character in the register,
1068 regend[regnum] points to the first character beyond the end of the register,
1069 regstart_seg1[regnum] is true iff regstart[regnum] points into string1,
1070 and regend_seg1[regnum] is true iff regend[regnum] points into string1. */
1072 unsigned char *regstart[RE_NREGS];
1073 unsigned char *regend[RE_NREGS];
1074 unsigned char regstart_seg1[RE_NREGS], regend_seg1[RE_NREGS];
1076 /* Set up pointers to ends of strings.
1077 Don't allow the second string to be empty unless both are empty. */
1085 end1 = string1 + size1;
1086 end2 = string2 + size2;
1088 /* Compute where to stop matching, within the two strings */
1091 end_match_1 = string1 + mstop;
1092 end_match_2 = string2;
1097 end_match_2 = string2 + mstop - size1;
1100 /* Initialize \) text positions to -1
1101 to mark ones that no \( or \) has been seen for. */
1103 for (mcnt = 0; mcnt < sizeof (regend) / sizeof (*regend); mcnt++)
1104 regend[mcnt] = (unsigned char *) -1;
1106 /* `p' scans through the pattern as `d' scans through the data.
1107 `dend' is the end of the input string that `d' points within.
1108 `d' is advanced into the following input string whenever necessary,
1109 but this happens before fetching;
1110 therefore, at the beginning of the loop,
1111 `d' can be pointing at the end of a string,
1112 but it cannot equal string2. */
1115 d = string1 + pos, dend = end_match_1;
1117 d = string2 + pos - size1, dend = end_match_2;
1119 /* Write PREFETCH; just before fetching a character with *d. */
1122 { if (dend == end_match_2) goto fail; /* end of string2 => failure */ \
1123 d = string2; /* end of string1 => advance to string2. */ \
1124 dend = end_match_2; }
1126 /* This loop loops over pattern commands.
1127 It exits by returning from the function if match is complete,
1128 or it drops through if match fails at this starting point in the input data. */
1133 /* End of pattern means we have succeeded! */
1135 /* If caller wants register contents data back, convert it to indices */
1138 regs->start[0] = pos;
1139 if (dend == end_match_1)
1140 regs->end[0] = d - string1;
1142 regs->end[0] = d - string2 + size1;
1143 for (mcnt = 1; mcnt < RE_NREGS; mcnt++)
1145 if (regend[mcnt] == (unsigned char *) -1)
1147 regs->start[mcnt] = -1;
1148 regs->end[mcnt] = -1;
1151 if (regstart_seg1[mcnt])
1152 regs->start[mcnt] = regstart[mcnt] - string1;
1154 regs->start[mcnt] = regstart[mcnt] - string2 + size1;
1155 if (regend_seg1[mcnt])
1156 regs->end[mcnt] = regend[mcnt] - string1;
1158 regs->end[mcnt] = regend[mcnt] - string2 + size1;
1161 if (dend == end_match_1)
1162 return (d - string1 - pos);
1164 return d - string2 + size1 - pos;
1167 /* Otherwise match next pattern command */
1168 #ifdef SWITCH_ENUM_BUG
1169 switch ((int) ((enum regexpcode) *p++))
1171 switch ((enum regexpcode) *p++)
1175 /* \( is represented by a start_memory, \) by a stop_memory.
1176 Both of those commands contain a "register number" argument.
1177 The text matched within the \( and \) is recorded under that number.
1178 Then, \<digit> turns into a `duplicate' command which
1179 is followed by the numeric value of <digit> as the register number. */
1183 regstart_seg1[*p++] = (dend == end_match_1);
1188 regend_seg1[*p++] = (dend == end_match_1);
1193 int regno = *p++; /* Get which register to match against */
1194 register unsigned char *d2, *dend2;
1196 d2 = regstart[regno];
1197 dend2 = ((regstart_seg1[regno] == regend_seg1[regno])
1198 ? regend[regno] : end_match_1);
1201 /* Advance to next segment in register contents, if necessary */
1204 if (dend2 == end_match_2) break;
1205 if (dend2 == regend[regno]) break;
1206 d2 = string2, dend2 = regend[regno]; /* end of string1 => advance to string2. */
1208 /* At end of register contents => success */
1209 if (d2 == dend2) break;
1211 /* Advance to next segment in data being matched, if necessary */
1214 /* mcnt gets # consecutive chars to compare */
1216 if (mcnt > dend2 - d2)
1218 /* Compare that many; failure if mismatch, else skip them. */
1219 if (translate ? memcmp_translate (d, d2, mcnt, translate) : memcmp (d, d2, mcnt))
1221 d += mcnt, d2 += mcnt;
1227 /* fetch a data character */
1229 /* Match anything but a newline. */
1230 if ((translate ? translate[*d++] : *d++) == '\n')
1237 /* Nonzero for charset_not */
1240 if (*(p - 1) == (unsigned char) charset_not)
1243 /* fetch a data character */
1251 if (c < *p * BYTEWIDTH
1252 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
1257 if (!not) goto fail;
1263 if (d == string1 || d[-1] == '\n')
1269 || (d == end1 ? (size2 == 0 || *string2 == '\n') : *d == '\n'))
1273 /* "or" constructs ("|") are handled by starting each alternative
1274 with an on_failure_jump that points to the start of the next alternative.
1275 Each alternative except the last ends with a jump to the joining point.
1276 (Actually, each jump except for the last one really jumps
1277 to the following jump, because tensioning the jumps is a hassle.) */
1279 /* The start of a stupid repeat has an on_failure_jump that points
1280 past the end of the repeat text.
1281 This makes a failure point so that, on failure to match a repetition,
1282 matching restarts past as many repetitions have been found
1283 with no way to fail and look for another one. */
1285 /* A smart repeat is similar but loops back to the on_failure_jump
1286 so that each repetition makes another failure point. */
1288 case on_failure_jump:
1289 if (stackp == stacke)
1291 unsigned char **stackx;
1292 if (stacke - stackb > re_max_failures * 2)
1294 stackx = (unsigned char **) alloca (2 * (stacke - stackb)
1296 memcpy (stackx, stackb, (stacke - stackb) * sizeof (char *));
1297 stackp = stackx + (stackp - stackb);
1298 stacke = stackx + 2 * (stacke - stackb);
1302 mcnt += SIGN_EXTEND_CHAR (*(char *)p) << 8;
1304 *stackp++ = mcnt + p;
1308 /* The end of a smart repeat has an maybe_finalize_jump back.
1309 Change it either to a finalize_jump or an ordinary jump. */
1311 case maybe_finalize_jump:
1313 mcnt += SIGN_EXTEND_CHAR (*(char *)p) << 8;
1316 register unsigned char *p2 = p;
1317 /* Compare what follows with the begining of the repeat.
1318 If we can establish that there is nothing that they would
1319 both match, we can change to finalize_jump */
1321 && (*p2 == (unsigned char) stop_memory
1322 || *p2 == (unsigned char) start_memory))
1325 p[-3] = (unsigned char) finalize_jump;
1326 else if (*p2 == (unsigned char) exactn
1327 || *p2 == (unsigned char) endline)
1329 register int c = *p2 == (unsigned char) endline ? '\n' : p2[2];
1330 register unsigned char *p1 = p + mcnt;
1331 /* p1[0] ... p1[2] are an on_failure_jump.
1332 Examine what follows that */
1333 if (p1[3] == (unsigned char) exactn && p1[5] != c)
1334 p[-3] = (unsigned char) finalize_jump;
1335 else if (p1[3] == (unsigned char) charset
1336 || p1[3] == (unsigned char) charset_not)
1338 int not = p1[3] == (unsigned char) charset_not;
1339 if (c < p1[4] * BYTEWIDTH
1340 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
1342 /* not is 1 if c would match */
1343 /* That means it is not safe to finalize */
1345 p[-3] = (unsigned char) finalize_jump;
1350 if (p[-1] != (unsigned char) finalize_jump)
1352 p[-1] = (unsigned char) jump;
1356 /* The end of a stupid repeat has a finalize-jump
1357 back to the start, where another failure point will be made
1358 which will point after all the repetitions found so far. */
1366 mcnt += SIGN_EXTEND_CHAR (*(char *)p) << 8;
1367 p += mcnt + 1; /* The 1 compensates for missing ++ above */
1370 case dummy_failure_jump:
1371 if (stackp == stacke)
1373 unsigned char **stackx
1374 = (unsigned char **) alloca (2 * (stacke - stackb)
1376 memcpy (stackx, stackb, (stacke - stackb) * sizeof (char *));
1377 stackp = stackx + (stackp - stackb);
1378 stacke = stackx + 2 * (stacke - stackb);
1386 if (d == string1 /* Points to first char */
1387 || d == end2 /* Points to end */
1388 || (d == end1 && size2 == 0)) /* Points to end */
1390 if ((SYNTAX (d[-1]) == Sword)
1391 != (SYNTAX (d == end1 ? *string2 : *d) == Sword))
1396 if (d == string1 /* Points to first char */
1397 || d == end2 /* Points to end */
1398 || (d == end1 && size2 == 0)) /* Points to end */
1400 if ((SYNTAX (d[-1]) == Sword)
1401 != (SYNTAX (d == end1 ? *string2 : *d) == Sword))
1406 if (d == end2 /* Points to end */
1407 || (d == end1 && size2 == 0) /* Points to end */
1408 || SYNTAX (* (d == end1 ? string2 : d)) != Sword) /* Next char not a letter */
1410 if (d == string1 /* Points to first char */
1411 || SYNTAX (d[-1]) != Sword) /* prev char not letter */
1416 if (d == string1 /* Points to first char */
1417 || SYNTAX (d[-1]) != Sword) /* prev char not letter */
1419 if (d == end2 /* Points to end */
1420 || (d == end1 && size2 == 0) /* Points to end */
1421 || SYNTAX (d == end1 ? *string2 : *d) != Sword) /* Next char not a letter */
1427 if (((d - string2 <= (unsigned) size2)
1428 ? d - bf_p2 : d - bf_p1)
1434 if (((d - string2 <= (unsigned) size2)
1435 ? d - bf_p2 : d - bf_p1)
1441 if (((d - string2 <= (unsigned) size2)
1442 ? d - bf_p2 : d - bf_p1)
1455 if (SYNTAX (*d++) != (enum syntaxcode) mcnt) goto fail;
1460 goto matchnotsyntax;
1466 if (SYNTAX (*d++) == (enum syntaxcode) mcnt) goto fail;
1471 if (SYNTAX (*d++) == 0) goto fail;
1476 if (SYNTAX (*d++) != 0) goto fail;
1478 #endif /* not emacs */
1481 if (d == string1) /* Note, d cannot equal string2 */
1482 break; /* unless string1 == string2. */
1486 if (d == end2 || (d == end1 && size2 == 0))
1491 /* Match the next few pattern characters exactly.
1492 mcnt is how many characters to match. */
1499 if (translate[*d++] != *p++) goto fail;
1508 if (*d++ != *p++) goto fail;
1514 continue; /* Successfully matched one pattern command; keep matching */
1516 /* Jump here if any matching operation fails. */
1518 if (stackp != stackb)
1519 /* A restart point is known. Restart there and pop it. */
1522 { /* If innermost failure point is dormant, flush it and keep looking */
1528 if (d >= string1 && d <= end1)
1531 else break; /* Matching at this starting point really fails! */
1533 return -1; /* Failure to match */
1537 memcmp_translate (s1, s2, len, translate)
1538 unsigned char *s1, *s2;
1540 unsigned char *translate;
1542 register unsigned char *p1 = s1, *p2 = s2;
1545 if (translate [*p1++] != translate [*p2++]) return 1;
1551 /* Entry points compatible with bsd4.2 regex library */
1555 static struct re_pattern_buffer re_comp_buf;
1563 if (!re_comp_buf.buffer)
1564 return "No previous regular expression";
1568 if (!re_comp_buf.buffer)
1570 if (!(re_comp_buf.buffer = (char *) malloc (200)))
1571 return "Memory exhausted";
1572 re_comp_buf.allocated = 200;
1573 if (!(re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH)))
1574 return "Memory exhausted";
1576 return re_compile_pattern (s, strlen (s), &re_comp_buf);
1583 int len = strlen (s);
1584 return 0 <= re_search (&re_comp_buf, s, len, 0, len, 0);
1593 /* Indexed by a character, gives the upper case equivalent of the character */
1595 static char upcase[0400] =
1596 { 000, 001, 002, 003, 004, 005, 006, 007,
1597 010, 011, 012, 013, 014, 015, 016, 017,
1598 020, 021, 022, 023, 024, 025, 026, 027,
1599 030, 031, 032, 033, 034, 035, 036, 037,
1600 040, 041, 042, 043, 044, 045, 046, 047,
1601 050, 051, 052, 053, 054, 055, 056, 057,
1602 060, 061, 062, 063, 064, 065, 066, 067,
1603 070, 071, 072, 073, 074, 075, 076, 077,
1604 0100, 0101, 0102, 0103, 0104, 0105, 0106, 0107,
1605 0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117,
1606 0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127,
1607 0130, 0131, 0132, 0133, 0134, 0135, 0136, 0137,
1608 0140, 0101, 0102, 0103, 0104, 0105, 0106, 0107,
1609 0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117,
1610 0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127,
1611 0130, 0131, 0132, 0173, 0174, 0175, 0176, 0177,
1612 0200, 0201, 0202, 0203, 0204, 0205, 0206, 0207,
1613 0210, 0211, 0212, 0213, 0214, 0215, 0216, 0217,
1614 0220, 0221, 0222, 0223, 0224, 0225, 0226, 0227,
1615 0230, 0231, 0232, 0233, 0234, 0235, 0236, 0237,
1616 0240, 0241, 0242, 0243, 0244, 0245, 0246, 0247,
1617 0250, 0251, 0252, 0253, 0254, 0255, 0256, 0257,
1618 0260, 0261, 0262, 0263, 0264, 0265, 0266, 0267,
1619 0270, 0271, 0272, 0273, 0274, 0275, 0276, 0277,
1620 0300, 0301, 0302, 0303, 0304, 0305, 0306, 0307,
1621 0310, 0311, 0312, 0313, 0314, 0315, 0316, 0317,
1622 0320, 0321, 0322, 0323, 0324, 0325, 0326, 0327,
1623 0330, 0331, 0332, 0333, 0334, 0335, 0336, 0337,
1624 0340, 0341, 0342, 0343, 0344, 0345, 0346, 0347,
1625 0350, 0351, 0352, 0353, 0354, 0355, 0356, 0357,
1626 0360, 0361, 0362, 0363, 0364, 0365, 0366, 0367,
1627 0370, 0371, 0372, 0373, 0374, 0375, 0376, 0377
1635 struct re_pattern_buffer buf;
1638 char fastmap[(1 << BYTEWIDTH)];
1640 /* Allow a command argument to specify the style of syntax. */
1642 obscure_syntax = atoi (argv[1]);
1645 buf.buffer = (char *) malloc (buf.allocated);
1646 buf.fastmap = fastmap;
1647 buf.translate = upcase;
1655 re_compile_pattern (pat, strlen(pat), &buf);
1657 for (i = 0; i < buf.used; i++)
1658 printchar (buf.buffer[i]);
1662 printf ("%d allocated, %d used.\n", buf.allocated, buf.used);
1664 re_compile_fastmap (&buf);
1665 printf ("Allowed by fastmap: ");
1666 for (i = 0; i < (1 << BYTEWIDTH); i++)
1667 if (fastmap[i]) printchar (i);
1671 gets (pat); /* Now read the string to match against */
1673 i = re_match (&buf, pat, strlen (pat), 0, 0);
1674 printf ("Match value %d.\n", i);
1680 struct re_pattern_buffer *bufp;
1684 printf ("buf is :\n----------------\n");
1685 for (i = 0; i < bufp->used; i++)
1686 printchar (bufp->buffer[i]);
1688 printf ("\n%d allocated, %d used.\n", bufp->allocated, bufp->used);
1690 printf ("Allowed by fastmap: ");
1691 for (i = 0; i < (1 << BYTEWIDTH); i++)
1692 if (bufp->fastmap[i])
1694 printf ("\nAllowed by translate: ");
1695 if (bufp->translate)
1696 for (i = 0; i < (1 << BYTEWIDTH); i++)
1697 if (bufp->translate[i])
1699 printf ("\nfastmap is%s accurate\n", bufp->fastmap_accurate ? "" : "n't");
1700 printf ("can %s be null\n----------", bufp->can_be_null ? "" : "not");
1707 if (c < 041 || c >= 0177)
1710 putchar (((c >> 6) & 3) + '0');
1711 putchar (((c >> 3) & 7) + '0');
1712 putchar ((c & 7) + '0');