1 /* Extended regular expression matching and search library.
2 Copyright (C) 1985, 1989 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
18 /* To test, compile with -Dtest.
19 This Dtestable feature turns this into a self-contained program
20 which reads a pattern, describes how it compiles,
21 then reads a string and searches for it. */
25 /* The `emacs' switch turns on certain special matching commands
26 that make sense only in emacs. */
39 * Define the syntax stuff, so we can do the \<...\> things.
42 #ifndef Sword /* must be non-zero in some of the tests below... */
46 #define SYNTAX(c) re_syntax_table[c]
50 char *re_syntax_table;
54 static char re_syntax_table[256];
65 memset (re_syntax_table, '\0', sizeof re_syntax_table);
67 for (c = 'a'; c <= 'z'; c++)
68 re_syntax_table[c] = Sword;
70 for (c = 'A'; c <= 'Z'; c++)
71 re_syntax_table[c] = Sword;
73 for (c = '0'; c <= '9'; c++)
74 re_syntax_table[c] = Sword;
79 #endif /* SYNTAX_TABLE */
80 #endif /* not emacs */
84 /* Number of failure points to allocate space for initially,
85 when matching. If this number is exceeded, more space is allocated,
86 so it is not a hard limit. */
90 #endif /* NFAILURES */
92 /* width of a byte in bits */
96 #ifndef SIGN_EXTEND_CHAR
97 #define SIGN_EXTEND_CHAR(x) (x)
100 static int obscure_syntax = 0;
102 /* Specify the precise syntax of regexp for compilation.
103 This provides for compatibility for various utilities
104 which historically have different, incompatible syntaxes.
106 The argument SYNTAX is a bit-mask containing the two bits
107 RE_NO_BK_PARENS and RE_NO_BK_VBAR. */
110 re_set_syntax (syntax)
115 ret = obscure_syntax;
116 obscure_syntax = syntax;
120 /* re_compile_pattern takes a regular-expression string
121 and converts it into a buffer full of byte commands for matching.
123 PATTERN is the address of the pattern string
124 SIZE is the length of it.
125 BUFP is a struct re_pattern_buffer * which points to the info
126 on where to store the byte commands.
127 This structure contains a char * which points to the
128 actual space, which should have been obtained with malloc.
129 re_compile_pattern may use realloc to grow the buffer space.
131 The number of bytes of commands can be found out by looking in
132 the struct re_pattern_buffer that bufp pointed to,
133 after re_compile_pattern returns.
136 #define PATPUSH(ch) (*b++ = (char) (ch))
138 #define PATFETCH(c) \
139 {if (p == pend) goto end_of_pattern; \
140 c = * (unsigned char *) p++; \
141 if (translate) c = translate[c]; }
143 #define PATFETCH_RAW(c) \
144 {if (p == pend) goto end_of_pattern; \
145 c = * (unsigned char *) p++; }
147 #define PATUNFETCH p--
149 /* This is not an arbitrary limit: the arguments which represent offsets
150 into the pattern are two bytes long. So if 2^16 bytes turns out to
151 be too small, many things would have to change. */
152 #define MAX_BUF_SIZE (1 << 16)
155 /* Extend the buffer by twice its current size via realloc and
156 reset the pointers that pointed into the old block to point to the
157 correct places in the new one. If extending the buffer results in it
158 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
159 #define EXTEND_BUFFER \
161 char *old_buffer = bufp->buffer; \
162 if (bufp->allocated == MAX_BUF_SIZE) \
164 bufp->allocated <<= 1; \
165 if (bufp->allocated > MAX_BUF_SIZE) \
166 bufp->allocated = MAX_BUF_SIZE; \
167 bufp->buffer = (char *) realloc (bufp->buffer, bufp->allocated);\
168 if (bufp->buffer == NULL) \
169 goto memory_exhausted; \
170 /* If the buffer moved, move all the pointers into it. */ \
171 if (old_buffer != bufp->buffer) \
173 b = (b - old_buffer) + bufp->buffer; \
174 begalt = (begalt - old_buffer) + bufp->buffer; \
176 fixup_jump = (fixup_jump - old_buffer) + bufp->buffer;\
178 laststart = (laststart - old_buffer) + bufp->buffer; \
180 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
184 static void store_jump (), insert_jump ();
187 re_compile_pattern (pattern, size, bufp)
190 struct re_pattern_buffer *bufp;
192 register char *b = bufp->buffer;
193 register char *p = pattern;
194 char *pend = pattern + size;
195 register unsigned c, c1;
197 unsigned char *translate = (unsigned char *) bufp->translate;
199 /* address of the count-byte of the most recently inserted "exactn" command.
200 This makes it possible to tell whether a new exact-match character
201 can be added to that command or requires a new "exactn" command. */
203 char *pending_exact = 0;
205 /* address of the place where a forward-jump should go
206 to the end of the containing expression.
207 Each alternative of an "or", except the last, ends with a forward-jump
210 char *fixup_jump = 0;
212 /* address of start of the most recently finished expression.
213 This tells postfix * where to find the start of its operand. */
217 /* In processing a repeat, 1 means zero matches is allowed */
221 /* In processing a repeat, 1 means many matches is allowed */
225 /* address of beginning of regexp, or inside of last \( */
229 /* Stack of information saved by \( and restored by \).
230 Four stack elements are pushed by each \(:
231 First, the value of b.
232 Second, the value of fixup_jump.
233 Third, the value of regnum.
234 Fourth, the value of begalt. */
237 int *stackp = stackb;
238 int *stacke = stackb + 40;
241 /* Counts \('s as they are encountered. Remembered for the matching \),
242 where it becomes the "register number" to put in the stop_memory command */
246 bufp->fastmap_accurate = 0;
251 * Initialize the syntax table.
257 if (bufp->allocated == 0)
259 bufp->allocated = 28;
261 /* EXTEND_BUFFER loses when bufp->allocated is 0 */
262 bufp->buffer = (char *) realloc (bufp->buffer, 28);
264 /* Caller did not allocate a buffer. Do it for him */
265 bufp->buffer = (char *) malloc (28);
266 if (!bufp->buffer) goto memory_exhausted;
267 begalt = b = bufp->buffer;
272 if (b - bufp->buffer > bufp->allocated - 10)
273 /* Note that EXTEND_BUFFER clobbers c */
281 if (obscure_syntax & RE_TIGHT_VBAR)
283 if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS) && p != pend)
285 /* Make operand of last vbar end before this `$'. */
287 store_jump (fixup_jump, jump, b);
293 /* $ means succeed if at end of line, but only in special contexts.
294 If randomly in the middle of a pattern, it is a normal character. */
295 if (p == pend || *p == '\n'
296 || (obscure_syntax & RE_CONTEXT_INDEP_OPS)
297 || (obscure_syntax & RE_NO_BK_PARENS
299 : *p == '\\' && p[1] == ')')
300 || (obscure_syntax & RE_NO_BK_VBAR
302 : *p == '\\' && p[1] == '|'))
310 /* ^ means succeed if at beg of line, but only if no preceding pattern. */
312 if (laststart && p[-2] != '\n'
313 && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
315 if (obscure_syntax & RE_TIGHT_VBAR)
318 && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
329 if (obscure_syntax & RE_BK_PLUS_QM)
333 /* If there is no previous pattern, char not special. */
334 if (!laststart && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
336 /* If there is a sequence of repetition chars,
337 collapse it down to equivalent to just one. */
342 zero_times_ok |= c != '+';
343 many_times_ok |= c != '?';
349 else if (!(obscure_syntax & RE_BK_PLUS_QM)
350 && (c == '+' || c == '?'))
352 else if ((obscure_syntax & RE_BK_PLUS_QM)
357 if (!(c1 == '+' || c1 == '?'))
372 /* Star, etc. applied to an empty pattern is equivalent
373 to an empty pattern. */
377 /* Now we know whether 0 matches is allowed,
378 and whether 2 or more matches is allowed. */
381 /* If more than one repetition is allowed,
382 put in a backward jump at the end. */
383 store_jump (b, maybe_finalize_jump, laststart - 3);
386 insert_jump (on_failure_jump, laststart, b + 3, b);
391 /* At least one repetition required: insert before the loop
392 a skip over the initial on-failure-jump instruction */
393 insert_jump (dummy_failure_jump, laststart, laststart + 6, b);
404 while (b - bufp->buffer
405 > bufp->allocated - 3 - (1 << BYTEWIDTH) / BYTEWIDTH)
406 /* Note that EXTEND_BUFFER clobbers c */
411 PATPUSH (charset_not), p++;
416 PATPUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
417 /* Clear the whole map */
418 memset (b, '\0', (1 << BYTEWIDTH) / BYTEWIDTH);
419 /* Read in characters and ranges, setting map bits */
423 if (c == ']' && p != p1 + 1) break;
424 if (*p == '-' && p[1] != ']')
429 b[c / BYTEWIDTH] |= 1 << (c % BYTEWIDTH), c++;
433 b[c / BYTEWIDTH] |= 1 << (c % BYTEWIDTH);
436 /* Discard any bitmap bytes that are all 0 at the end of the map.
437 Decrement the map-length byte too. */
438 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
444 if (! (obscure_syntax & RE_NO_BK_PARENS))
450 if (! (obscure_syntax & RE_NO_BK_PARENS))
456 if (! (obscure_syntax & RE_NEWLINE_OR))
462 if (! (obscure_syntax & RE_NO_BK_VBAR))
468 if (p == pend) goto invalid_pattern;
473 if (obscure_syntax & RE_NO_BK_PARENS)
476 if (stackp == stacke) goto nesting_too_deep;
477 if (regnum < RE_NREGS)
479 PATPUSH (start_memory);
482 *stackp++ = b - bufp->buffer;
483 *stackp++ = fixup_jump ? fixup_jump - bufp->buffer + 1 : 0;
484 *stackp++ = regnum++;
485 *stackp++ = begalt - bufp->buffer;
492 if (obscure_syntax & RE_NO_BK_PARENS)
495 if (stackp == stackb) goto unmatched_close;
496 begalt = *--stackp + bufp->buffer;
498 store_jump (fixup_jump, jump, b);
499 if (stackp[-1] < RE_NREGS)
501 PATPUSH (stop_memory);
502 PATPUSH (stackp[-1]);
507 fixup_jump = *stackp + bufp->buffer - 1;
508 laststart = *--stackp + bufp->buffer;
512 if (obscure_syntax & RE_NO_BK_VBAR)
515 insert_jump (on_failure_jump, begalt, b + 6, b);
519 store_jump (fixup_jump, jump, b);
533 PATPUSH (syntaxspec);
535 PATPUSH (syntax_spec_code[c]);
540 PATPUSH (notsyntaxspec);
542 PATPUSH (syntax_spec_code[c]);
553 PATPUSH (notwordchar);
569 PATPUSH (notwordbound);
592 for (stackt = stackp - 2; stackt > stackb; stackt -= 4)
602 if (obscure_syntax & RE_BK_PLUS_QM)
607 /* You might think it would be useful for \ to mean
608 not to translate; but if we don't translate it
609 it will never match anything. */
610 if (translate) c = translate[c];
617 if (!pending_exact || pending_exact + *pending_exact + 1 != b
618 || *pending_exact == 0177 || *p == '*' || *p == '^'
619 || ((obscure_syntax & RE_BK_PLUS_QM)
620 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
621 : (*p == '+' || *p == '?')))
634 store_jump (fixup_jump, jump, b);
636 if (stackp != stackb) goto unmatched_open;
638 bufp->used = b - bufp->buffer;
642 return "Invalid regular expression";
645 return "Unmatched \\(";
648 return "Unmatched \\)";
651 return "Premature end of regular expression";
654 return "Nesting too deep";
657 return "Regular expression too big";
660 return "Memory exhausted";
663 /* Store where `from' points a jump operation to jump to where `to' points.
664 `opcode' is the opcode to store. */
667 store_jump (from, opcode, to)
672 from[1] = (to - (from + 3)) & 0377;
673 from[2] = (to - (from + 3)) >> 8;
676 /* Open up space at char FROM, and insert there a jump to TO.
677 CURRENT_END gives te end of the storage no in use,
678 so we know how much data to copy up.
679 OP is the opcode of the jump to insert.
681 If you call this function, you must zero out pending_exact. */
684 insert_jump (op, from, to, current_end)
686 char *from, *to, *current_end;
688 register char *pto = current_end + 3;
689 register char *pfrom = current_end;
690 while (pfrom != from)
692 store_jump (from, op, to);
695 /* Given a pattern, compute a fastmap from it.
696 The fastmap records which of the (1 << BYTEWIDTH) possible characters
697 can start a string that matches the pattern.
698 This fastmap is used by re_search to skip quickly over totally implausible text.
700 The caller must supply the address of a (1 << BYTEWIDTH)-byte data area
702 The other components of bufp describe the pattern to be used. */
705 re_compile_fastmap (bufp)
706 struct re_pattern_buffer *bufp;
708 unsigned char *pattern = (unsigned char *) bufp->buffer;
709 int size = bufp->used;
710 register char *fastmap = bufp->fastmap;
711 register unsigned char *p = pattern;
712 register unsigned char *pend = pattern + size;
714 unsigned char *translate = (unsigned char *) bufp->translate;
716 unsigned char *stackb[NFAILURES];
717 unsigned char **stackp = stackb;
719 memset (fastmap, '\0', (1 << BYTEWIDTH));
720 bufp->fastmap_accurate = 1;
721 bufp->can_be_null = 0;
727 bufp->can_be_null = 1;
730 #ifdef SWITCH_ENUM_BUG
731 switch ((int) ((enum regexpcode) *p++))
733 switch ((enum regexpcode) *p++)
738 fastmap[translate[p[1]]] = 1;
757 fastmap[translate['\n']] = 1;
760 if (bufp->can_be_null != 1)
761 bufp->can_be_null = 2;
765 case maybe_finalize_jump:
767 case dummy_failure_jump:
768 bufp->can_be_null = 1;
770 j += SIGN_EXTEND_CHAR (*(char *)p) << 8;
771 p += j + 1; /* The 1 compensates for missing ++ above */
774 /* Jump backward reached implies we just went through
775 the body of a loop and matched nothing.
776 Opcode jumped to should be an on_failure_jump.
777 Just treat it like an ordinary jump.
778 For a * loop, it has pushed its failure point already;
779 if so, discard that as redundant. */
780 if ((enum regexpcode) *p != on_failure_jump)
784 j += SIGN_EXTEND_CHAR (*(char *)p) << 8;
785 p += j + 1; /* The 1 compensates for missing ++ above */
786 if (stackp != stackb && *stackp == p)
790 case on_failure_jump:
792 j += SIGN_EXTEND_CHAR (*(char *)p) << 8;
803 bufp->can_be_null = 1;
806 for (j = 0; j < (1 << BYTEWIDTH); j++)
809 if (bufp->can_be_null)
811 /* Don't return; check the alternative paths
812 so we can set can_be_null if appropriate. */
816 for (j = 0; j < (1 << BYTEWIDTH); j++)
817 if (SYNTAX (j) == Sword)
822 for (j = 0; j < (1 << BYTEWIDTH); j++)
823 if (SYNTAX (j) != Sword)
830 for (j = 0; j < (1 << BYTEWIDTH); j++)
831 if (SYNTAX (j) == (enum syntaxcode) k)
837 for (j = 0; j < (1 << BYTEWIDTH); j++)
838 if (SYNTAX (j) != (enum syntaxcode) k)
844 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
845 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
848 fastmap[translate[j]] = 1;
855 /* Chars beyond end of map must be allowed */
856 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
858 fastmap[translate[j]] = 1;
862 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
863 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
866 fastmap[translate[j]] = 1;
873 /* Get here means we have successfully found the possible starting characters
874 of one path of the pattern. We need not follow this path any farther.
875 Instead, look at the next alternative remembered in the stack. */
876 if (stackp != stackb)
883 /* Like re_search_2, below, but only one string is specified. */
886 re_search (pbufp, string, size, startpos, range, regs)
887 struct re_pattern_buffer *pbufp;
889 int size, startpos, range;
890 struct re_registers *regs;
892 return re_search_2 (pbufp, 0, 0, string, size, startpos, range, regs, size);
895 /* Like re_match_2 but tries first a match starting at index STARTPOS,
896 then at STARTPOS + 1, and so on.
897 RANGE is the number of places to try before giving up.
898 If RANGE is negative, the starting positions tried are
899 STARTPOS, STARTPOS - 1, etc.
900 It is up to the caller to make sure that range is not so large
901 as to take the starting position outside of the input strings.
903 The value returned is the position at which the match was found,
904 or -1 if no match was found,
905 or -2 if error (such as failure stack overflow). */
908 re_search_2 (pbufp, string1, size1, string2, size2, startpos, range, regs, mstop)
909 struct re_pattern_buffer *pbufp;
910 char *string1, *string2;
914 struct re_registers *regs;
917 register char *fastmap = pbufp->fastmap;
918 register unsigned char *translate = (unsigned char *) pbufp->translate;
919 int total = size1 + size2;
922 /* Update the fastmap now if not correct already */
923 if (fastmap && !pbufp->fastmap_accurate)
924 re_compile_fastmap (pbufp);
926 /* Don't waste time in a long search for a pattern
927 that says it is anchored. */
928 if (pbufp->used > 0 && (enum regexpcode) pbufp->buffer[0] == begbuf
939 /* If a fastmap is supplied, skip quickly over characters
940 that cannot possibly be the start of a match.
941 Note, however, that if the pattern can possibly match
942 the null string, we must test it at each starting point
943 so that we take the first null string we get. */
945 if (fastmap && startpos < total && pbufp->can_be_null != 1)
949 register int lim = 0;
950 register unsigned char *p;
952 if (startpos < size1 && startpos + range >= size1)
953 lim = range - (size1 - startpos);
955 p = ((unsigned char *)
956 &(startpos >= size1 ? string2 - size1 : string1)[startpos]);
960 while (range > lim && !fastmap[translate[*p++]])
965 while (range > lim && !fastmap[*p++])
968 startpos += irange - range;
972 register unsigned char c;
973 if (startpos >= size1)
974 c = string2[startpos - size1];
976 c = string1[startpos];
978 if (translate ? !fastmap[translate[c]] : !fastmap[c])
983 if (range >= 0 && startpos == total
984 && fastmap && pbufp->can_be_null == 0)
987 val = re_match_2 (pbufp, string1, size1, string2, size2, startpos, regs, mstop);
997 #endif /* C_ALLOCA */
1001 if (range > 0) range--, startpos++; else range++, startpos--;
1006 #ifndef emacs /* emacs never uses this */
1008 re_match (pbufp, string, size, pos, regs)
1009 struct re_pattern_buffer *pbufp;
1012 struct re_registers *regs;
1014 return re_match_2 (pbufp, 0, 0, string, size, pos, regs, size);
1018 /* Maximum size of failure stack. Beyond this, overflow is an error. */
1020 int re_max_failures = 2000;
1022 static int memcmp_translate();
1023 /* Match the pattern described by PBUFP
1024 against data which is the virtual concatenation of STRING1 and STRING2.
1025 SIZE1 and SIZE2 are the sizes of the two data strings.
1026 Start the match at position POS.
1027 Do not consider matching past the position MSTOP.
1029 If pbufp->fastmap is nonzero, then it had better be up to date.
1031 The reason that the data to match are specified as two components
1032 which are to be regarded as concatenated
1033 is so this function can be used directly on the contents of an Emacs buffer.
1035 -1 is returned if there is no match. -2 is returned if there is
1036 an error (such as match stack overflow). Otherwise the value is the length
1037 of the substring which was matched. */
1040 re_match_2 (pbufp, string1, size1, string2, size2, pos, regs, mstop)
1041 struct re_pattern_buffer *pbufp;
1042 unsigned char *string1, *string2;
1045 struct re_registers *regs;
1048 register unsigned char *p = (unsigned char *) pbufp->buffer;
1049 register unsigned char *pend = p + pbufp->used;
1050 /* End of first string */
1051 unsigned char *end1;
1052 /* End of second string */
1053 unsigned char *end2;
1054 /* Pointer just past last char to consider matching */
1055 unsigned char *end_match_1, *end_match_2;
1056 register unsigned char *d, *dend;
1058 unsigned char *translate = (unsigned char *) pbufp->translate;
1060 /* Failure point stack. Each place that can handle a failure further down the line
1061 pushes a failure point on this stack. It consists of two char *'s.
1062 The first one pushed is where to resume scanning the pattern;
1063 the second pushed is where to resume scanning the strings.
1064 If the latter is zero, the failure point is a "dummy".
1065 If a failure happens and the innermost failure point is dormant,
1066 it discards that failure point and tries the next one. */
1068 unsigned char *initial_stack[2 * NFAILURES];
1069 unsigned char **stackb = initial_stack;
1070 unsigned char **stackp = stackb, **stacke = &stackb[2 * NFAILURES];
1072 /* Information on the "contents" of registers.
1073 These are pointers into the input strings; they record
1074 just what was matched (on this attempt) by some part of the pattern.
1075 The start_memory command stores the start of a register's contents
1076 and the stop_memory command stores the end.
1078 At that point, regstart[regnum] points to the first character in the register,
1079 regend[regnum] points to the first character beyond the end of the register,
1080 regstart_seg1[regnum] is true iff regstart[regnum] points into string1,
1081 and regend_seg1[regnum] is true iff regend[regnum] points into string1. */
1083 unsigned char *regstart[RE_NREGS];
1084 unsigned char *regend[RE_NREGS];
1085 unsigned char regstart_seg1[RE_NREGS], regend_seg1[RE_NREGS];
1087 /* Set up pointers to ends of strings.
1088 Don't allow the second string to be empty unless both are empty. */
1096 end1 = string1 + size1;
1097 end2 = string2 + size2;
1099 /* Compute where to stop matching, within the two strings */
1102 end_match_1 = string1 + mstop;
1103 end_match_2 = string2;
1108 end_match_2 = string2 + mstop - size1;
1111 /* Initialize \) text positions to -1
1112 to mark ones that no \( or \) has been seen for. */
1114 for (mcnt = 0; mcnt < sizeof (regend) / sizeof (*regend); mcnt++)
1115 regend[mcnt] = (unsigned char *) -1;
1117 /* `p' scans through the pattern as `d' scans through the data.
1118 `dend' is the end of the input string that `d' points within.
1119 `d' is advanced into the following input string whenever necessary,
1120 but this happens before fetching;
1121 therefore, at the beginning of the loop,
1122 `d' can be pointing at the end of a string,
1123 but it cannot equal string2. */
1126 d = string1 + pos, dend = end_match_1;
1128 d = string2 + pos - size1, dend = end_match_2;
1130 /* Write PREFETCH; just before fetching a character with *d. */
1133 { if (dend == end_match_2) goto fail; /* end of string2 => failure */ \
1134 d = string2; /* end of string1 => advance to string2. */ \
1135 dend = end_match_2; }
1137 /* This loop loops over pattern commands.
1138 It exits by returning from the function if match is complete,
1139 or it drops through if match fails at this starting point in the input data. */
1144 /* End of pattern means we have succeeded! */
1146 /* If caller wants register contents data back, convert it to indices */
1149 regs->start[0] = pos;
1150 if (dend == end_match_1)
1151 regs->end[0] = d - string1;
1153 regs->end[0] = d - string2 + size1;
1154 for (mcnt = 1; mcnt < RE_NREGS; mcnt++)
1156 if (regend[mcnt] == (unsigned char *) -1)
1158 regs->start[mcnt] = -1;
1159 regs->end[mcnt] = -1;
1162 if (regstart_seg1[mcnt])
1163 regs->start[mcnt] = regstart[mcnt] - string1;
1165 regs->start[mcnt] = regstart[mcnt] - string2 + size1;
1166 if (regend_seg1[mcnt])
1167 regs->end[mcnt] = regend[mcnt] - string1;
1169 regs->end[mcnt] = regend[mcnt] - string2 + size1;
1172 if (dend == end_match_1)
1173 return (d - string1 - pos);
1175 return d - string2 + size1 - pos;
1178 /* Otherwise match next pattern command */
1179 #ifdef SWITCH_ENUM_BUG
1180 switch ((int) ((enum regexpcode) *p++))
1182 switch ((enum regexpcode) *p++)
1186 /* \( is represented by a start_memory, \) by a stop_memory.
1187 Both of those commands contain a "register number" argument.
1188 The text matched within the \( and \) is recorded under that number.
1189 Then, \<digit> turns into a `duplicate' command which
1190 is followed by the numeric value of <digit> as the register number. */
1194 regstart_seg1[*p++] = (dend == end_match_1);
1199 regend_seg1[*p++] = (dend == end_match_1);
1204 int regno = *p++; /* Get which register to match against */
1205 register unsigned char *d2, *dend2;
1207 d2 = regstart[regno];
1208 dend2 = ((regstart_seg1[regno] == regend_seg1[regno])
1209 ? regend[regno] : end_match_1);
1212 /* Advance to next segment in register contents, if necessary */
1215 if (dend2 == end_match_2) break;
1216 if (dend2 == regend[regno]) break;
1217 d2 = string2, dend2 = regend[regno]; /* end of string1 => advance to string2. */
1219 /* At end of register contents => success */
1220 if (d2 == dend2) break;
1222 /* Advance to next segment in data being matched, if necessary */
1225 /* mcnt gets # consecutive chars to compare */
1227 if (mcnt > dend2 - d2)
1229 /* Compare that many; failure if mismatch, else skip them. */
1230 if (translate ? memcmp_translate (d, d2, mcnt, translate) : memcmp (d, d2, mcnt))
1232 d += mcnt, d2 += mcnt;
1238 /* fetch a data character */
1240 /* Match anything but a newline. */
1241 if ((translate ? translate[*d++] : *d++) == '\n')
1248 /* Nonzero for charset_not */
1251 if (*(p - 1) == (unsigned char) charset_not)
1254 /* fetch a data character */
1262 if (c < *p * BYTEWIDTH
1263 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
1268 if (!not) goto fail;
1274 if (d == string1 || d[-1] == '\n')
1280 || (d == end1 ? (size2 == 0 || *string2 == '\n') : *d == '\n'))
1284 /* "or" constructs ("|") are handled by starting each alternative
1285 with an on_failure_jump that points to the start of the next alternative.
1286 Each alternative except the last ends with a jump to the joining point.
1287 (Actually, each jump except for the last one really jumps
1288 to the following jump, because tensioning the jumps is a hassle.) */
1290 /* The start of a stupid repeat has an on_failure_jump that points
1291 past the end of the repeat text.
1292 This makes a failure point so that, on failure to match a repetition,
1293 matching restarts past as many repetitions have been found
1294 with no way to fail and look for another one. */
1296 /* A smart repeat is similar but loops back to the on_failure_jump
1297 so that each repetition makes another failure point. */
1299 case on_failure_jump:
1300 if (stackp == stacke)
1302 unsigned char **stackx;
1303 if (stacke - stackb > re_max_failures * 2)
1305 stackx = (unsigned char **) alloca (2 * (stacke - stackb)
1307 memcpy (stackx, stackb, (stacke - stackb) * sizeof (char *));
1308 stackp = stackx + (stackp - stackb);
1309 stacke = stackx + 2 * (stacke - stackb);
1313 mcnt += SIGN_EXTEND_CHAR (*(char *)p) << 8;
1315 *stackp++ = mcnt + p;
1319 /* The end of a smart repeat has an maybe_finalize_jump back.
1320 Change it either to a finalize_jump or an ordinary jump. */
1322 case maybe_finalize_jump:
1324 mcnt += SIGN_EXTEND_CHAR (*(char *)p) << 8;
1327 register unsigned char *p2 = p;
1328 /* Compare what follows with the begining of the repeat.
1329 If we can establish that there is nothing that they would
1330 both match, we can change to finalize_jump */
1332 && (*p2 == (unsigned char) stop_memory
1333 || *p2 == (unsigned char) start_memory))
1336 p[-3] = (unsigned char) finalize_jump;
1337 else if (*p2 == (unsigned char) exactn
1338 || *p2 == (unsigned char) endline)
1340 register int c = *p2 == (unsigned char) endline ? '\n' : p2[2];
1341 register unsigned char *p1 = p + mcnt;
1342 /* p1[0] ... p1[2] are an on_failure_jump.
1343 Examine what follows that */
1344 if (p1[3] == (unsigned char) exactn && p1[5] != c)
1345 p[-3] = (unsigned char) finalize_jump;
1346 else if (p1[3] == (unsigned char) charset
1347 || p1[3] == (unsigned char) charset_not)
1349 int not = p1[3] == (unsigned char) charset_not;
1350 if (c < p1[4] * BYTEWIDTH
1351 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
1353 /* not is 1 if c would match */
1354 /* That means it is not safe to finalize */
1356 p[-3] = (unsigned char) finalize_jump;
1361 if (p[-1] != (unsigned char) finalize_jump)
1363 p[-1] = (unsigned char) jump;
1367 /* The end of a stupid repeat has a finalize-jump
1368 back to the start, where another failure point will be made
1369 which will point after all the repetitions found so far. */
1377 mcnt += SIGN_EXTEND_CHAR (*(char *)p) << 8;
1378 p += mcnt + 1; /* The 1 compensates for missing ++ above */
1381 case dummy_failure_jump:
1382 if (stackp == stacke)
1384 unsigned char **stackx
1385 = (unsigned char **) alloca (2 * (stacke - stackb)
1387 memcpy (stackx, stackb, (stacke - stackb) * sizeof (char *));
1388 stackp = stackx + (stackp - stackb);
1389 stacke = stackx + 2 * (stacke - stackb);
1397 if (d == string1 /* Points to first char */
1398 || d == end2 /* Points to end */
1399 || (d == end1 && size2 == 0)) /* Points to end */
1401 if ((SYNTAX (d[-1]) == Sword)
1402 != (SYNTAX (d == end1 ? *string2 : *d) == Sword))
1407 if (d == string1 /* Points to first char */
1408 || d == end2 /* Points to end */
1409 || (d == end1 && size2 == 0)) /* Points to end */
1411 if ((SYNTAX (d[-1]) == Sword)
1412 != (SYNTAX (d == end1 ? *string2 : *d) == Sword))
1417 if (d == end2 /* Points to end */
1418 || (d == end1 && size2 == 0) /* Points to end */
1419 || SYNTAX (* (d == end1 ? string2 : d)) != Sword) /* Next char not a letter */
1421 if (d == string1 /* Points to first char */
1422 || SYNTAX (d[-1]) != Sword) /* prev char not letter */
1427 if (d == string1 /* Points to first char */
1428 || SYNTAX (d[-1]) != Sword) /* prev char not letter */
1430 if (d == end2 /* Points to end */
1431 || (d == end1 && size2 == 0) /* Points to end */
1432 || SYNTAX (d == end1 ? *string2 : *d) != Sword) /* Next char not a letter */
1438 if (((d - string2 <= (unsigned) size2)
1439 ? d - bf_p2 : d - bf_p1)
1445 if (((d - string2 <= (unsigned) size2)
1446 ? d - bf_p2 : d - bf_p1)
1452 if (((d - string2 <= (unsigned) size2)
1453 ? d - bf_p2 : d - bf_p1)
1466 if (SYNTAX (*d++) != (enum syntaxcode) mcnt) goto fail;
1471 goto matchnotsyntax;
1477 if (SYNTAX (*d++) == (enum syntaxcode) mcnt) goto fail;
1482 if (SYNTAX (*d++) == 0) goto fail;
1487 if (SYNTAX (*d++) != 0) goto fail;
1489 #endif /* not emacs */
1492 if (d == string1) /* Note, d cannot equal string2 */
1493 break; /* unless string1 == string2. */
1497 if (d == end2 || (d == end1 && size2 == 0))
1502 /* Match the next few pattern characters exactly.
1503 mcnt is how many characters to match. */
1510 if (translate[*d++] != *p++) goto fail;
1519 if (*d++ != *p++) goto fail;
1525 continue; /* Successfully matched one pattern command; keep matching */
1527 /* Jump here if any matching operation fails. */
1529 if (stackp != stackb)
1530 /* A restart point is known. Restart there and pop it. */
1533 { /* If innermost failure point is dormant, flush it and keep looking */
1539 if (d >= string1 && d <= end1)
1542 else break; /* Matching at this starting point really fails! */
1544 return -1; /* Failure to match */
1548 memcmp_translate (s1, s2, len, translate)
1549 unsigned char *s1, *s2;
1551 unsigned char *translate;
1553 register unsigned char *p1 = s1, *p2 = s2;
1556 if (translate [*p1++] != translate [*p2++]) return 1;
1562 /* Entry points compatible with bsd4.2 regex library */
1566 static struct re_pattern_buffer re_comp_buf;
1574 if (!re_comp_buf.buffer)
1575 return "No previous regular expression";
1579 if (!re_comp_buf.buffer)
1581 if (!(re_comp_buf.buffer = (char *) malloc (200)))
1582 return "Memory exhausted";
1583 re_comp_buf.allocated = 200;
1584 if (!(re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH)))
1585 return "Memory exhausted";
1587 return re_compile_pattern (s, strlen (s), &re_comp_buf);
1594 int len = strlen (s);
1595 return 0 <= re_search (&re_comp_buf, s, len, 0, len, 0);
1604 /* Indexed by a character, gives the upper case equivalent of the character */
1606 static char upcase[0400] =
1607 { 000, 001, 002, 003, 004, 005, 006, 007,
1608 010, 011, 012, 013, 014, 015, 016, 017,
1609 020, 021, 022, 023, 024, 025, 026, 027,
1610 030, 031, 032, 033, 034, 035, 036, 037,
1611 040, 041, 042, 043, 044, 045, 046, 047,
1612 050, 051, 052, 053, 054, 055, 056, 057,
1613 060, 061, 062, 063, 064, 065, 066, 067,
1614 070, 071, 072, 073, 074, 075, 076, 077,
1615 0100, 0101, 0102, 0103, 0104, 0105, 0106, 0107,
1616 0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117,
1617 0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127,
1618 0130, 0131, 0132, 0133, 0134, 0135, 0136, 0137,
1619 0140, 0101, 0102, 0103, 0104, 0105, 0106, 0107,
1620 0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117,
1621 0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127,
1622 0130, 0131, 0132, 0173, 0174, 0175, 0176, 0177,
1623 0200, 0201, 0202, 0203, 0204, 0205, 0206, 0207,
1624 0210, 0211, 0212, 0213, 0214, 0215, 0216, 0217,
1625 0220, 0221, 0222, 0223, 0224, 0225, 0226, 0227,
1626 0230, 0231, 0232, 0233, 0234, 0235, 0236, 0237,
1627 0240, 0241, 0242, 0243, 0244, 0245, 0246, 0247,
1628 0250, 0251, 0252, 0253, 0254, 0255, 0256, 0257,
1629 0260, 0261, 0262, 0263, 0264, 0265, 0266, 0267,
1630 0270, 0271, 0272, 0273, 0274, 0275, 0276, 0277,
1631 0300, 0301, 0302, 0303, 0304, 0305, 0306, 0307,
1632 0310, 0311, 0312, 0313, 0314, 0315, 0316, 0317,
1633 0320, 0321, 0322, 0323, 0324, 0325, 0326, 0327,
1634 0330, 0331, 0332, 0333, 0334, 0335, 0336, 0337,
1635 0340, 0341, 0342, 0343, 0344, 0345, 0346, 0347,
1636 0350, 0351, 0352, 0353, 0354, 0355, 0356, 0357,
1637 0360, 0361, 0362, 0363, 0364, 0365, 0366, 0367,
1638 0370, 0371, 0372, 0373, 0374, 0375, 0376, 0377
1646 struct re_pattern_buffer buf;
1649 char fastmap[(1 << BYTEWIDTH)];
1651 /* Allow a command argument to specify the style of syntax. */
1653 obscure_syntax = atoi (argv[1]);
1656 buf.buffer = (char *) malloc (buf.allocated);
1657 buf.fastmap = fastmap;
1658 buf.translate = upcase;
1666 re_compile_pattern (pat, strlen(pat), &buf);
1668 for (i = 0; i < buf.used; i++)
1669 printchar (buf.buffer[i]);
1671 putchar_unfiltered ('\n');
1673 printf_unfiltered ("%d allocated, %d used.\n", buf.allocated, buf.used);
1675 re_compile_fastmap (&buf);
1676 printf_unfiltered ("Allowed by fastmap: ");
1677 for (i = 0; i < (1 << BYTEWIDTH); i++)
1678 if (fastmap[i]) printchar (i);
1679 putchar_unfiltered ('\n');
1682 gets (pat); /* Now read the string to match against */
1684 i = re_match (&buf, pat, strlen (pat), 0, 0);
1685 printf_unfiltered ("Match value %d.\n", i);
1691 struct re_pattern_buffer *bufp;
1695 printf_unfiltered ("buf is :\n----------------\n");
1696 for (i = 0; i < bufp->used; i++)
1697 printchar (bufp->buffer[i]);
1699 printf_unfiltered ("\n%d allocated, %d used.\n", bufp->allocated, bufp->used);
1701 printf_unfiltered ("Allowed by fastmap: ");
1702 for (i = 0; i < (1 << BYTEWIDTH); i++)
1703 if (bufp->fastmap[i])
1705 printf_unfiltered ("\nAllowed by translate: ");
1706 if (bufp->translate)
1707 for (i = 0; i < (1 << BYTEWIDTH); i++)
1708 if (bufp->translate[i])
1710 printf_unfiltered ("\nfastmap is%s accurate\n", bufp->fastmap_accurate ? "" : "n't");
1711 printf_unfiltered ("can %s be null\n----------", bufp->can_be_null ? "" : "not");
1718 if (c < 041 || c >= 0177)
1720 putchar_unfiltered ('\\');
1721 putchar_unfiltered (((c >> 6) & 3) + '0');
1722 putchar_unfiltered (((c >> 3) & 7) + '0');
1723 putchar_unfiltered ((c & 7) + '0');
1726 putchar_unfiltered (c);
1732 puts_unfiltered (string);