1 /* Extended regular expression matching and search library.
2 Copyright (C) 1985, 1989 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
18 /* To test, compile with -Dtest.
19 This Dtestable feature turns this into a self-contained program
20 which reads a pattern, describes how it compiles,
21 then reads a string and searches for it. */
25 /* The `emacs' switch turns on certain special matching commands
26 that make sense only in emacs. */
35 #define bcopy(s,d,n) memcpy((d),(s),(n))
36 #define bcmp(s1,s2,n) memcmp((s1),(s2),(n))
37 #define bzero(s,n) memset((s),0,(n))
39 /* Make alloca work the best possible way. */
41 #define alloca __builtin_alloca
49 * Define the syntax stuff, so we can do the \<...\> things.
52 #ifndef Sword /* must be non-zero in some of the tests below... */
56 #define SYNTAX(c) re_syntax_table[c]
60 char *re_syntax_table;
64 static char re_syntax_table[256];
75 bzero (re_syntax_table, sizeof re_syntax_table);
77 for (c = 'a'; c <= 'z'; c++)
78 re_syntax_table[c] = Sword;
80 for (c = 'A'; c <= 'Z'; c++)
81 re_syntax_table[c] = Sword;
83 for (c = '0'; c <= '9'; c++)
84 re_syntax_table[c] = Sword;
89 #endif /* SYNTAX_TABLE */
90 #endif /* not emacs */
94 /* Number of failure points to allocate space for initially,
95 when matching. If this number is exceeded, more space is allocated,
96 so it is not a hard limit. */
100 #endif /* NFAILURES */
102 /* width of a byte in bits */
106 #ifndef SIGN_EXTEND_CHAR
107 #define SIGN_EXTEND_CHAR(x) (x)
110 static int obscure_syntax = 0;
112 /* Specify the precise syntax of regexp for compilation.
113 This provides for compatibility for various utilities
114 which historically have different, incompatible syntaxes.
116 The argument SYNTAX is a bit-mask containing the two bits
117 RE_NO_BK_PARENS and RE_NO_BK_VBAR. */
120 re_set_syntax (syntax)
125 ret = obscure_syntax;
126 obscure_syntax = syntax;
130 /* re_compile_pattern takes a regular-expression string
131 and converts it into a buffer full of byte commands for matching.
133 PATTERN is the address of the pattern string
134 SIZE is the length of it.
135 BUFP is a struct re_pattern_buffer * which points to the info
136 on where to store the byte commands.
137 This structure contains a char * which points to the
138 actual space, which should have been obtained with malloc.
139 re_compile_pattern may use realloc to grow the buffer space.
141 The number of bytes of commands can be found out by looking in
142 the struct re_pattern_buffer that bufp pointed to,
143 after re_compile_pattern returns.
146 #define PATPUSH(ch) (*b++ = (char) (ch))
148 #define PATFETCH(c) \
149 {if (p == pend) goto end_of_pattern; \
150 c = * (unsigned char *) p++; \
151 if (translate) c = translate[c]; }
153 #define PATFETCH_RAW(c) \
154 {if (p == pend) goto end_of_pattern; \
155 c = * (unsigned char *) p++; }
157 #define PATUNFETCH p--
159 #define EXTEND_BUFFER \
160 { char *old_buffer = bufp->buffer; \
161 if (bufp->allocated == (1<<16)) goto too_big; \
162 bufp->allocated *= 2; \
163 if (bufp->allocated > (1<<16)) bufp->allocated = (1<<16); \
164 if (!(bufp->buffer = (char *) realloc (bufp->buffer, bufp->allocated))) \
165 goto memory_exhausted; \
166 c = bufp->buffer - old_buffer; \
174 pending_exact += c; \
177 static void store_jump (), insert_jump ();
180 re_compile_pattern (pattern, size, bufp)
183 struct re_pattern_buffer *bufp;
185 register char *b = bufp->buffer;
186 register char *p = pattern;
187 char *pend = pattern + size;
188 register unsigned c, c1;
190 unsigned char *translate = (unsigned char *) bufp->translate;
192 /* address of the count-byte of the most recently inserted "exactn" command.
193 This makes it possible to tell whether a new exact-match character
194 can be added to that command or requires a new "exactn" command. */
196 char *pending_exact = 0;
198 /* address of the place where a forward-jump should go
199 to the end of the containing expression.
200 Each alternative of an "or", except the last, ends with a forward-jump
203 char *fixup_jump = 0;
205 /* address of start of the most recently finished expression.
206 This tells postfix * where to find the start of its operand. */
210 /* In processing a repeat, 1 means zero matches is allowed */
214 /* In processing a repeat, 1 means many matches is allowed */
218 /* address of beginning of regexp, or inside of last \( */
222 /* Stack of information saved by \( and restored by \).
223 Four stack elements are pushed by each \(:
224 First, the value of b.
225 Second, the value of fixup_jump.
226 Third, the value of regnum.
227 Fourth, the value of begalt. */
230 int *stackp = stackb;
231 int *stacke = stackb + 40;
234 /* Counts \('s as they are encountered. Remembered for the matching \),
235 where it becomes the "register number" to put in the stop_memory command */
239 bufp->fastmap_accurate = 0;
244 * Initialize the syntax table.
250 if (bufp->allocated == 0)
252 bufp->allocated = 28;
254 /* EXTEND_BUFFER loses when bufp->allocated is 0 */
255 bufp->buffer = (char *) realloc (bufp->buffer, 28);
257 /* Caller did not allocate a buffer. Do it for him */
258 bufp->buffer = (char *) malloc (28);
259 if (!bufp->buffer) goto memory_exhausted;
260 begalt = b = bufp->buffer;
265 if (b - bufp->buffer > bufp->allocated - 10)
266 /* Note that EXTEND_BUFFER clobbers c */
274 if (obscure_syntax & RE_TIGHT_VBAR)
276 if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS) && p != pend)
278 /* Make operand of last vbar end before this `$'. */
280 store_jump (fixup_jump, jump, b);
286 /* $ means succeed if at end of line, but only in special contexts.
287 If randomly in the middle of a pattern, it is a normal character. */
288 if (p == pend || *p == '\n'
289 || (obscure_syntax & RE_CONTEXT_INDEP_OPS)
290 || (obscure_syntax & RE_NO_BK_PARENS
292 : *p == '\\' && p[1] == ')')
293 || (obscure_syntax & RE_NO_BK_VBAR
295 : *p == '\\' && p[1] == '|'))
303 /* ^ means succeed if at beg of line, but only if no preceding pattern. */
305 if (laststart && p[-2] != '\n'
306 && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
308 if (obscure_syntax & RE_TIGHT_VBAR)
311 && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
322 if (obscure_syntax & RE_BK_PLUS_QM)
326 /* If there is no previous pattern, char not special. */
327 if (!laststart && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
329 /* If there is a sequence of repetition chars,
330 collapse it down to equivalent to just one. */
335 zero_times_ok |= c != '+';
336 many_times_ok |= c != '?';
342 else if (!(obscure_syntax & RE_BK_PLUS_QM)
343 && (c == '+' || c == '?'))
345 else if ((obscure_syntax & RE_BK_PLUS_QM)
350 if (!(c1 == '+' || c1 == '?'))
365 /* Star, etc. applied to an empty pattern is equivalent
366 to an empty pattern. */
370 /* Now we know whether 0 matches is allowed,
371 and whether 2 or more matches is allowed. */
374 /* If more than one repetition is allowed,
375 put in a backward jump at the end. */
376 store_jump (b, maybe_finalize_jump, laststart - 3);
379 insert_jump (on_failure_jump, laststart, b + 3, b);
384 /* At least one repetition required: insert before the loop
385 a skip over the initial on-failure-jump instruction */
386 insert_jump (dummy_failure_jump, laststart, laststart + 6, b);
397 while (b - bufp->buffer
398 > bufp->allocated - 3 - (1 << BYTEWIDTH) / BYTEWIDTH)
399 /* Note that EXTEND_BUFFER clobbers c */
404 PATPUSH (charset_not), p++;
409 PATPUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
410 /* Clear the whole map */
411 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
412 /* Read in characters and ranges, setting map bits */
416 if (c == ']' && p != p1 + 1) break;
417 if (*p == '-' && p[1] != ']')
422 b[c / BYTEWIDTH] |= 1 << (c % BYTEWIDTH), c++;
426 b[c / BYTEWIDTH] |= 1 << (c % BYTEWIDTH);
429 /* Discard any bitmap bytes that are all 0 at the end of the map.
430 Decrement the map-length byte too. */
431 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
437 if (! (obscure_syntax & RE_NO_BK_PARENS))
443 if (! (obscure_syntax & RE_NO_BK_PARENS))
449 if (! (obscure_syntax & RE_NEWLINE_OR))
455 if (! (obscure_syntax & RE_NO_BK_VBAR))
461 if (p == pend) goto invalid_pattern;
466 if (obscure_syntax & RE_NO_BK_PARENS)
469 if (stackp == stacke) goto nesting_too_deep;
470 if (regnum < RE_NREGS)
472 PATPUSH (start_memory);
475 *stackp++ = b - bufp->buffer;
476 *stackp++ = fixup_jump ? fixup_jump - bufp->buffer + 1 : 0;
477 *stackp++ = regnum++;
478 *stackp++ = begalt - bufp->buffer;
485 if (obscure_syntax & RE_NO_BK_PARENS)
488 if (stackp == stackb) goto unmatched_close;
489 begalt = *--stackp + bufp->buffer;
491 store_jump (fixup_jump, jump, b);
492 if (stackp[-1] < RE_NREGS)
494 PATPUSH (stop_memory);
495 PATPUSH (stackp[-1]);
500 fixup_jump = *stackp + bufp->buffer - 1;
501 laststart = *--stackp + bufp->buffer;
505 if (obscure_syntax & RE_NO_BK_VBAR)
508 insert_jump (on_failure_jump, begalt, b + 6, b);
512 store_jump (fixup_jump, jump, b);
526 PATPUSH (syntaxspec);
528 PATPUSH (syntax_spec_code[c]);
533 PATPUSH (notsyntaxspec);
535 PATPUSH (syntax_spec_code[c]);
546 PATPUSH (notwordchar);
562 PATPUSH (notwordbound);
585 for (stackt = stackp - 2; stackt > stackb; stackt -= 4)
595 if (obscure_syntax & RE_BK_PLUS_QM)
600 /* You might think it would be useful for \ to mean
601 not to translate; but if we don't translate it
602 it will never match anything. */
603 if (translate) c = translate[c];
610 if (!pending_exact || pending_exact + *pending_exact + 1 != b
611 || *pending_exact == 0177 || *p == '*' || *p == '^'
612 || ((obscure_syntax & RE_BK_PLUS_QM)
613 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
614 : (*p == '+' || *p == '?')))
627 store_jump (fixup_jump, jump, b);
629 if (stackp != stackb) goto unmatched_open;
631 bufp->used = b - bufp->buffer;
635 return "Invalid regular expression";
638 return "Unmatched \\(";
641 return "Unmatched \\)";
644 return "Premature end of regular expression";
647 return "Nesting too deep";
650 return "Regular expression too big";
653 return "Memory exhausted";
656 /* Store where `from' points a jump operation to jump to where `to' points.
657 `opcode' is the opcode to store. */
660 store_jump (from, opcode, to)
665 from[1] = (to - (from + 3)) & 0377;
666 from[2] = (to - (from + 3)) >> 8;
669 /* Open up space at char FROM, and insert there a jump to TO.
670 CURRENT_END gives te end of the storage no in use,
671 so we know how much data to copy up.
672 OP is the opcode of the jump to insert.
674 If you call this function, you must zero out pending_exact. */
677 insert_jump (op, from, to, current_end)
679 char *from, *to, *current_end;
681 register char *pto = current_end + 3;
682 register char *pfrom = current_end;
683 while (pfrom != from)
685 store_jump (from, op, to);
688 /* Given a pattern, compute a fastmap from it.
689 The fastmap records which of the (1 << BYTEWIDTH) possible characters
690 can start a string that matches the pattern.
691 This fastmap is used by re_search to skip quickly over totally implausible text.
693 The caller must supply the address of a (1 << BYTEWIDTH)-byte data area
695 The other components of bufp describe the pattern to be used. */
698 re_compile_fastmap (bufp)
699 struct re_pattern_buffer *bufp;
701 unsigned char *pattern = (unsigned char *) bufp->buffer;
702 int size = bufp->used;
703 register char *fastmap = bufp->fastmap;
704 register unsigned char *p = pattern;
705 register unsigned char *pend = pattern + size;
707 unsigned char *translate = (unsigned char *) bufp->translate;
709 unsigned char *stackb[NFAILURES];
710 unsigned char **stackp = stackb;
712 bzero (fastmap, (1 << BYTEWIDTH));
713 bufp->fastmap_accurate = 1;
714 bufp->can_be_null = 0;
720 bufp->can_be_null = 1;
723 #ifdef SWITCH_ENUM_BUG
724 switch ((int) ((enum regexpcode) *p++))
726 switch ((enum regexpcode) *p++)
731 fastmap[translate[p[1]]] = 1;
750 fastmap[translate['\n']] = 1;
753 if (bufp->can_be_null != 1)
754 bufp->can_be_null = 2;
758 case maybe_finalize_jump:
760 case dummy_failure_jump:
761 bufp->can_be_null = 1;
763 j += SIGN_EXTEND_CHAR (*(char *)p) << 8;
764 p += j + 1; /* The 1 compensates for missing ++ above */
767 /* Jump backward reached implies we just went through
768 the body of a loop and matched nothing.
769 Opcode jumped to should be an on_failure_jump.
770 Just treat it like an ordinary jump.
771 For a * loop, it has pushed its failure point already;
772 if so, discard that as redundant. */
773 if ((enum regexpcode) *p != on_failure_jump)
777 j += SIGN_EXTEND_CHAR (*(char *)p) << 8;
778 p += j + 1; /* The 1 compensates for missing ++ above */
779 if (stackp != stackb && *stackp == p)
783 case on_failure_jump:
785 j += SIGN_EXTEND_CHAR (*(char *)p) << 8;
796 bufp->can_be_null = 1;
799 for (j = 0; j < (1 << BYTEWIDTH); j++)
802 if (bufp->can_be_null)
804 /* Don't return; check the alternative paths
805 so we can set can_be_null if appropriate. */
809 for (j = 0; j < (1 << BYTEWIDTH); j++)
810 if (SYNTAX (j) == Sword)
815 for (j = 0; j < (1 << BYTEWIDTH); j++)
816 if (SYNTAX (j) != Sword)
823 for (j = 0; j < (1 << BYTEWIDTH); j++)
824 if (SYNTAX (j) == (enum syntaxcode) k)
830 for (j = 0; j < (1 << BYTEWIDTH); j++)
831 if (SYNTAX (j) != (enum syntaxcode) k)
837 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
838 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
841 fastmap[translate[j]] = 1;
848 /* Chars beyond end of map must be allowed */
849 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
851 fastmap[translate[j]] = 1;
855 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
856 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
859 fastmap[translate[j]] = 1;
866 /* Get here means we have successfully found the possible starting characters
867 of one path of the pattern. We need not follow this path any farther.
868 Instead, look at the next alternative remembered in the stack. */
869 if (stackp != stackb)
876 /* Like re_search_2, below, but only one string is specified. */
879 re_search (pbufp, string, size, startpos, range, regs)
880 struct re_pattern_buffer *pbufp;
882 int size, startpos, range;
883 struct re_registers *regs;
885 return re_search_2 (pbufp, 0, 0, string, size, startpos, range, regs, size);
888 /* Like re_match_2 but tries first a match starting at index STARTPOS,
889 then at STARTPOS + 1, and so on.
890 RANGE is the number of places to try before giving up.
891 If RANGE is negative, the starting positions tried are
892 STARTPOS, STARTPOS - 1, etc.
893 It is up to the caller to make sure that range is not so large
894 as to take the starting position outside of the input strings.
896 The value returned is the position at which the match was found,
897 or -1 if no match was found,
898 or -2 if error (such as failure stack overflow). */
901 re_search_2 (pbufp, string1, size1, string2, size2, startpos, range, regs, mstop)
902 struct re_pattern_buffer *pbufp;
903 char *string1, *string2;
907 struct re_registers *regs;
910 register char *fastmap = pbufp->fastmap;
911 register unsigned char *translate = (unsigned char *) pbufp->translate;
912 int total = size1 + size2;
915 /* Update the fastmap now if not correct already */
916 if (fastmap && !pbufp->fastmap_accurate)
917 re_compile_fastmap (pbufp);
919 /* Don't waste time in a long search for a pattern
920 that says it is anchored. */
921 if (pbufp->used > 0 && (enum regexpcode) pbufp->buffer[0] == begbuf
932 /* If a fastmap is supplied, skip quickly over characters
933 that cannot possibly be the start of a match.
934 Note, however, that if the pattern can possibly match
935 the null string, we must test it at each starting point
936 so that we take the first null string we get. */
938 if (fastmap && startpos < total && pbufp->can_be_null != 1)
942 register int lim = 0;
943 register unsigned char *p;
945 if (startpos < size1 && startpos + range >= size1)
946 lim = range - (size1 - startpos);
948 p = ((unsigned char *)
949 &(startpos >= size1 ? string2 - size1 : string1)[startpos]);
953 while (range > lim && !fastmap[translate[*p++]])
958 while (range > lim && !fastmap[*p++])
961 startpos += irange - range;
965 register unsigned char c;
966 if (startpos >= size1)
967 c = string2[startpos - size1];
969 c = string1[startpos];
971 if (translate ? !fastmap[translate[c]] : !fastmap[c])
976 if (range >= 0 && startpos == total
977 && fastmap && pbufp->can_be_null == 0)
980 val = re_match_2 (pbufp, string1, size1, string2, size2, startpos, regs, mstop);
990 #endif /* C_ALLOCA */
994 if (range > 0) range--, startpos++; else range++, startpos--;
999 #ifndef emacs /* emacs never uses this */
1001 re_match (pbufp, string, size, pos, regs)
1002 struct re_pattern_buffer *pbufp;
1005 struct re_registers *regs;
1007 return re_match_2 (pbufp, 0, 0, string, size, pos, regs, size);
1011 /* Maximum size of failure stack. Beyond this, overflow is an error. */
1013 int re_max_failures = 2000;
1015 static int bcmp_translate();
1016 /* Match the pattern described by PBUFP
1017 against data which is the virtual concatenation of STRING1 and STRING2.
1018 SIZE1 and SIZE2 are the sizes of the two data strings.
1019 Start the match at position POS.
1020 Do not consider matching past the position MSTOP.
1022 If pbufp->fastmap is nonzero, then it had better be up to date.
1024 The reason that the data to match are specified as two components
1025 which are to be regarded as concatenated
1026 is so this function can be used directly on the contents of an Emacs buffer.
1028 -1 is returned if there is no match. -2 is returned if there is
1029 an error (such as match stack overflow). Otherwise the value is the length
1030 of the substring which was matched. */
1033 re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
1034 struct re_pattern_buffer *pbufp;
1035 unsigned char *string1, *string2;
1038 struct re_registers *regs;
1041 register unsigned char *p = (unsigned char *) pbufp->buffer;
1042 register unsigned char *pend = p + pbufp->used;
1043 /* End of first string */
1044 unsigned char *end1;
1045 /* End of second string */
1046 unsigned char *end2;
1047 /* Pointer just past last char to consider matching */
1048 unsigned char *end_match_1, *end_match_2;
1049 register unsigned char *d, *dend;
1051 unsigned char *translate = (unsigned char *) pbufp->translate;
1053 /* Failure point stack. Each place that can handle a failure further down the line
1054 pushes a failure point on this stack. It consists of two char *'s.
1055 The first one pushed is where to resume scanning the pattern;
1056 the second pushed is where to resume scanning the strings.
1057 If the latter is zero, the failure point is a "dummy".
1058 If a failure happens and the innermost failure point is dormant,
1059 it discards that failure point and tries the next one. */
1061 unsigned char *initial_stack[2 * NFAILURES];
1062 unsigned char **stackb = initial_stack;
1063 unsigned char **stackp = stackb, **stacke = &stackb[2 * NFAILURES];
1065 /* Information on the "contents" of registers.
1066 These are pointers into the input strings; they record
1067 just what was matched (on this attempt) by some part of the pattern.
1068 The start_memory command stores the start of a register's contents
1069 and the stop_memory command stores the end.
1071 At that point, regstart[regnum] points to the first character in the register,
1072 regend[regnum] points to the first character beyond the end of the register,
1073 regstart_seg1[regnum] is true iff regstart[regnum] points into string1,
1074 and regend_seg1[regnum] is true iff regend[regnum] points into string1. */
1076 unsigned char *regstart[RE_NREGS];
1077 unsigned char *regend[RE_NREGS];
1078 unsigned char regstart_seg1[RE_NREGS], regend_seg1[RE_NREGS];
1080 /* Set up pointers to ends of strings.
1081 Don't allow the second string to be empty unless both are empty. */
1089 end1 = string1 + size1;
1090 end2 = string2 + size2;
1092 /* Compute where to stop matching, within the two strings */
1095 end_match_1 = string1 + mstop;
1096 end_match_2 = string2;
1101 end_match_2 = string2 + mstop - size1;
1104 /* Initialize \) text positions to -1
1105 to mark ones that no \( or \) has been seen for. */
1107 for (mcnt = 0; mcnt < sizeof (regend) / sizeof (*regend); mcnt++)
1108 regend[mcnt] = (unsigned char *) -1;
1110 /* `p' scans through the pattern as `d' scans through the data.
1111 `dend' is the end of the input string that `d' points within.
1112 `d' is advanced into the following input string whenever necessary,
1113 but this happens before fetching;
1114 therefore, at the beginning of the loop,
1115 `d' can be pointing at the end of a string,
1116 but it cannot equal string2. */
1119 d = string1 + pos, dend = end_match_1;
1121 d = string2 + pos - size1, dend = end_match_2;
1123 /* Write PREFETCH; just before fetching a character with *d. */
1126 { if (dend == end_match_2) goto fail; /* end of string2 => failure */ \
1127 d = string2; /* end of string1 => advance to string2. */ \
1128 dend = end_match_2; }
1130 /* This loop loops over pattern commands.
1131 It exits by returning from the function if match is complete,
1132 or it drops through if match fails at this starting point in the input data. */
1137 /* End of pattern means we have succeeded! */
1139 /* If caller wants register contents data back, convert it to indices */
1142 regs->start[0] = pos;
1143 if (dend == end_match_1)
1144 regs->end[0] = d - string1;
1146 regs->end[0] = d - string2 + size1;
1147 for (mcnt = 1; mcnt < RE_NREGS; mcnt++)
1149 if (regend[mcnt] == (unsigned char *) -1)
1151 regs->start[mcnt] = -1;
1152 regs->end[mcnt] = -1;
1155 if (regstart_seg1[mcnt])
1156 regs->start[mcnt] = regstart[mcnt] - string1;
1158 regs->start[mcnt] = regstart[mcnt] - string2 + size1;
1159 if (regend_seg1[mcnt])
1160 regs->end[mcnt] = regend[mcnt] - string1;
1162 regs->end[mcnt] = regend[mcnt] - string2 + size1;
1165 if (dend == end_match_1)
1166 return (d - string1 - pos);
1168 return d - string2 + size1 - pos;
1171 /* Otherwise match next pattern command */
1172 #ifdef SWITCH_ENUM_BUG
1173 switch ((int) ((enum regexpcode) *p++))
1175 switch ((enum regexpcode) *p++)
1179 /* \( is represented by a start_memory, \) by a stop_memory.
1180 Both of those commands contain a "register number" argument.
1181 The text matched within the \( and \) is recorded under that number.
1182 Then, \<digit> turns into a `duplicate' command which
1183 is followed by the numeric value of <digit> as the register number. */
1187 regstart_seg1[*p++] = (dend == end_match_1);
1192 regend_seg1[*p++] = (dend == end_match_1);
1197 int regno = *p++; /* Get which register to match against */
1198 register unsigned char *d2, *dend2;
1200 d2 = regstart[regno];
1201 dend2 = ((regstart_seg1[regno] == regend_seg1[regno])
1202 ? regend[regno] : end_match_1);
1205 /* Advance to next segment in register contents, if necessary */
1208 if (dend2 == end_match_2) break;
1209 if (dend2 == regend[regno]) break;
1210 d2 = string2, dend2 = regend[regno]; /* end of string1 => advance to string2. */
1212 /* At end of register contents => success */
1213 if (d2 == dend2) break;
1215 /* Advance to next segment in data being matched, if necessary */
1218 /* mcnt gets # consecutive chars to compare */
1220 if (mcnt > dend2 - d2)
1222 /* Compare that many; failure if mismatch, else skip them. */
1223 if (translate ? bcmp_translate (d, d2, mcnt, translate) : bcmp (d, d2, mcnt))
1225 d += mcnt, d2 += mcnt;
1231 /* fetch a data character */
1233 /* Match anything but a newline. */
1234 if ((translate ? translate[*d++] : *d++) == '\n')
1241 /* Nonzero for charset_not */
1244 if (*(p - 1) == (unsigned char) charset_not)
1247 /* fetch a data character */
1255 if (c < *p * BYTEWIDTH
1256 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
1261 if (!not) goto fail;
1267 if (d == string1 || d[-1] == '\n')
1273 || (d == end1 ? (size2 == 0 || *string2 == '\n') : *d == '\n'))
1277 /* "or" constructs ("|") are handled by starting each alternative
1278 with an on_failure_jump that points to the start of the next alternative.
1279 Each alternative except the last ends with a jump to the joining point.
1280 (Actually, each jump except for the last one really jumps
1281 to the following jump, because tensioning the jumps is a hassle.) */
1283 /* The start of a stupid repeat has an on_failure_jump that points
1284 past the end of the repeat text.
1285 This makes a failure point so that, on failure to match a repetition,
1286 matching restarts past as many repetitions have been found
1287 with no way to fail and look for another one. */
1289 /* A smart repeat is similar but loops back to the on_failure_jump
1290 so that each repetition makes another failure point. */
1292 case on_failure_jump:
1293 if (stackp == stacke)
1295 unsigned char **stackx;
1296 if (stacke - stackb > re_max_failures * 2)
1298 stackx = (unsigned char **) alloca (2 * (stacke - stackb)
1300 bcopy (stackb, stackx, (stacke - stackb) * sizeof (char *));
1301 stackp = stackx + (stackp - stackb);
1302 stacke = stackx + 2 * (stacke - stackb);
1306 mcnt += SIGN_EXTEND_CHAR (*(char *)p) << 8;
1308 *stackp++ = mcnt + p;
1312 /* The end of a smart repeat has an maybe_finalize_jump back.
1313 Change it either to a finalize_jump or an ordinary jump. */
1315 case maybe_finalize_jump:
1317 mcnt += SIGN_EXTEND_CHAR (*(char *)p) << 8;
1320 register unsigned char *p2 = p;
1321 /* Compare what follows with the begining of the repeat.
1322 If we can establish that there is nothing that they would
1323 both match, we can change to finalize_jump */
1325 && (*p2 == (unsigned char) stop_memory
1326 || *p2 == (unsigned char) start_memory))
1329 p[-3] = (unsigned char) finalize_jump;
1330 else if (*p2 == (unsigned char) exactn
1331 || *p2 == (unsigned char) endline)
1333 register int c = *p2 == (unsigned char) endline ? '\n' : p2[2];
1334 register unsigned char *p1 = p + mcnt;
1335 /* p1[0] ... p1[2] are an on_failure_jump.
1336 Examine what follows that */
1337 if (p1[3] == (unsigned char) exactn && p1[5] != c)
1338 p[-3] = (unsigned char) finalize_jump;
1339 else if (p1[3] == (unsigned char) charset
1340 || p1[3] == (unsigned char) charset_not)
1342 int not = p1[3] == (unsigned char) charset_not;
1343 if (c < p1[4] * BYTEWIDTH
1344 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
1346 /* not is 1 if c would match */
1347 /* That means it is not safe to finalize */
1349 p[-3] = (unsigned char) finalize_jump;
1354 if (p[-1] != (unsigned char) finalize_jump)
1356 p[-1] = (unsigned char) jump;
1360 /* The end of a stupid repeat has a finalize-jump
1361 back to the start, where another failure point will be made
1362 which will point after all the repetitions found so far. */
1370 mcnt += SIGN_EXTEND_CHAR (*(char *)p) << 8;
1371 p += mcnt + 1; /* The 1 compensates for missing ++ above */
1374 case dummy_failure_jump:
1375 if (stackp == stacke)
1377 unsigned char **stackx
1378 = (unsigned char **) alloca (2 * (stacke - stackb)
1380 bcopy (stackb, stackx, (stacke - stackb) * sizeof (char *));
1381 stackp = stackx + (stackp - stackb);
1382 stacke = stackx + 2 * (stacke - stackb);
1390 if (d == string1 /* Points to first char */
1391 || d == end2 /* Points to end */
1392 || (d == end1 && size2 == 0)) /* Points to end */
1394 if ((SYNTAX (d[-1]) == Sword)
1395 != (SYNTAX (d == end1 ? *string2 : *d) == Sword))
1400 if (d == string1 /* Points to first char */
1401 || d == end2 /* Points to end */
1402 || (d == end1 && size2 == 0)) /* Points to end */
1404 if ((SYNTAX (d[-1]) == Sword)
1405 != (SYNTAX (d == end1 ? *string2 : *d) == Sword))
1410 if (d == end2 /* Points to end */
1411 || (d == end1 && size2 == 0) /* Points to end */
1412 || SYNTAX (* (d == end1 ? string2 : d)) != Sword) /* Next char not a letter */
1414 if (d == string1 /* Points to first char */
1415 || SYNTAX (d[-1]) != Sword) /* prev char not letter */
1420 if (d == string1 /* Points to first char */
1421 || SYNTAX (d[-1]) != Sword) /* prev char not letter */
1423 if (d == end2 /* Points to end */
1424 || (d == end1 && size2 == 0) /* Points to end */
1425 || SYNTAX (d == end1 ? *string2 : *d) != Sword) /* Next char not a letter */
1431 if (((d - string2 <= (unsigned) size2)
1432 ? d - bf_p2 : d - bf_p1)
1438 if (((d - string2 <= (unsigned) size2)
1439 ? d - bf_p2 : d - bf_p1)
1445 if (((d - string2 <= (unsigned) size2)
1446 ? d - bf_p2 : d - bf_p1)
1459 if (SYNTAX (*d++) != (enum syntaxcode) mcnt) goto fail;
1464 goto matchnotsyntax;
1470 if (SYNTAX (*d++) == (enum syntaxcode) mcnt) goto fail;
1475 if (SYNTAX (*d++) == 0) goto fail;
1480 if (SYNTAX (*d++) != 0) goto fail;
1482 #endif /* not emacs */
1485 if (d == string1) /* Note, d cannot equal string2 */
1486 break; /* unless string1 == string2. */
1490 if (d == end2 || (d == end1 && size2 == 0))
1495 /* Match the next few pattern characters exactly.
1496 mcnt is how many characters to match. */
1503 if (translate[*d++] != *p++) goto fail;
1512 if (*d++ != *p++) goto fail;
1518 continue; /* Successfully matched one pattern command; keep matching */
1520 /* Jump here if any matching operation fails. */
1522 if (stackp != stackb)
1523 /* A restart point is known. Restart there and pop it. */
1526 { /* If innermost failure point is dormant, flush it and keep looking */
1532 if (d >= string1 && d <= end1)
1535 else break; /* Matching at this starting point really fails! */
1537 return -1; /* Failure to match */
1541 bcmp_translate (s1, s2, len, translate)
1542 unsigned char *s1, *s2;
1544 unsigned char *translate;
1546 register unsigned char *p1 = s1, *p2 = s2;
1549 if (translate [*p1++] != translate [*p2++]) return 1;
1555 /* Entry points compatible with bsd4.2 regex library */
1559 static struct re_pattern_buffer re_comp_buf;
1567 if (!re_comp_buf.buffer)
1568 return "No previous regular expression";
1572 if (!re_comp_buf.buffer)
1574 if (!(re_comp_buf.buffer = (char *) malloc (200)))
1575 return "Memory exhausted";
1576 re_comp_buf.allocated = 200;
1577 if (!(re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH)))
1578 return "Memory exhausted";
1580 return re_compile_pattern (s, strlen (s), &re_comp_buf);
1587 int len = strlen (s);
1588 return 0 <= re_search (&re_comp_buf, s, len, 0, len, 0);
1597 /* Indexed by a character, gives the upper case equivalent of the character */
1599 static char upcase[0400] =
1600 { 000, 001, 002, 003, 004, 005, 006, 007,
1601 010, 011, 012, 013, 014, 015, 016, 017,
1602 020, 021, 022, 023, 024, 025, 026, 027,
1603 030, 031, 032, 033, 034, 035, 036, 037,
1604 040, 041, 042, 043, 044, 045, 046, 047,
1605 050, 051, 052, 053, 054, 055, 056, 057,
1606 060, 061, 062, 063, 064, 065, 066, 067,
1607 070, 071, 072, 073, 074, 075, 076, 077,
1608 0100, 0101, 0102, 0103, 0104, 0105, 0106, 0107,
1609 0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117,
1610 0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127,
1611 0130, 0131, 0132, 0133, 0134, 0135, 0136, 0137,
1612 0140, 0101, 0102, 0103, 0104, 0105, 0106, 0107,
1613 0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117,
1614 0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127,
1615 0130, 0131, 0132, 0173, 0174, 0175, 0176, 0177,
1616 0200, 0201, 0202, 0203, 0204, 0205, 0206, 0207,
1617 0210, 0211, 0212, 0213, 0214, 0215, 0216, 0217,
1618 0220, 0221, 0222, 0223, 0224, 0225, 0226, 0227,
1619 0230, 0231, 0232, 0233, 0234, 0235, 0236, 0237,
1620 0240, 0241, 0242, 0243, 0244, 0245, 0246, 0247,
1621 0250, 0251, 0252, 0253, 0254, 0255, 0256, 0257,
1622 0260, 0261, 0262, 0263, 0264, 0265, 0266, 0267,
1623 0270, 0271, 0272, 0273, 0274, 0275, 0276, 0277,
1624 0300, 0301, 0302, 0303, 0304, 0305, 0306, 0307,
1625 0310, 0311, 0312, 0313, 0314, 0315, 0316, 0317,
1626 0320, 0321, 0322, 0323, 0324, 0325, 0326, 0327,
1627 0330, 0331, 0332, 0333, 0334, 0335, 0336, 0337,
1628 0340, 0341, 0342, 0343, 0344, 0345, 0346, 0347,
1629 0350, 0351, 0352, 0353, 0354, 0355, 0356, 0357,
1630 0360, 0361, 0362, 0363, 0364, 0365, 0366, 0367,
1631 0370, 0371, 0372, 0373, 0374, 0375, 0376, 0377
1639 struct re_pattern_buffer buf;
1642 char fastmap[(1 << BYTEWIDTH)];
1644 /* Allow a command argument to specify the style of syntax. */
1646 obscure_syntax = atoi (argv[1]);
1649 buf.buffer = (char *) malloc (buf.allocated);
1650 buf.fastmap = fastmap;
1651 buf.translate = upcase;
1659 re_compile_pattern (pat, strlen(pat), &buf);
1661 for (i = 0; i < buf.used; i++)
1662 printchar (buf.buffer[i]);
1666 printf ("%d allocated, %d used.\n", buf.allocated, buf.used);
1668 re_compile_fastmap (&buf);
1669 printf ("Allowed by fastmap: ");
1670 for (i = 0; i < (1 << BYTEWIDTH); i++)
1671 if (fastmap[i]) printchar (i);
1675 gets (pat); /* Now read the string to match against */
1677 i = re_match (&buf, pat, strlen (pat), 0, 0);
1678 printf ("Match value %d.\n", i);
1684 struct re_pattern_buffer *bufp;
1688 printf ("buf is :\n----------------\n");
1689 for (i = 0; i < bufp->used; i++)
1690 printchar (bufp->buffer[i]);
1692 printf ("\n%d allocated, %d used.\n", bufp->allocated, bufp->used);
1694 printf ("Allowed by fastmap: ");
1695 for (i = 0; i < (1 << BYTEWIDTH); i++)
1696 if (bufp->fastmap[i])
1698 printf ("\nAllowed by translate: ");
1699 if (bufp->translate)
1700 for (i = 0; i < (1 << BYTEWIDTH); i++)
1701 if (bufp->translate[i])
1703 printf ("\nfastmap is%s accurate\n", bufp->fastmap_accurate ? "" : "n't");
1704 printf ("can %s be null\n----------", bufp->can_be_null ? "" : "not");
1711 if (c < 041 || c >= 0177)
1714 putchar (((c >> 6) & 3) + '0');
1715 putchar (((c >> 3) & 7) + '0');
1716 putchar ((c & 7) + '0');