1 /* This is the Assembler Pre-Processor
2 Copyright (C) 1987, 1990, 1991, 1992, 1994 Free Software Foundation, Inc.
4 This file is part of GAS, the GNU Assembler.
6 GAS is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 GAS is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GAS; see the file COPYING. If not, write to
18 the Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
20 /* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90 */
21 /* App, the assembler pre-processor. This pre-processor strips out excess
22 spaces, turns single-quoted characters into a decimal constant, and turns
23 # <number> <filename> <garbage> into a .line <number>\n.file <filename>
24 pair. This needs better error-handling. */
27 #include "as.h" /* For BAD_CASE() only */
31 #define const /* empty */
36 static const char symbol_chars[] =
37 "$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
/* Character classes.  Each entry of the lex[] table holds one of these
   values for the character that indexes it; a character with no class
   assigned is handled as an ordinary character. */
39 #define LEX_IS_SYMBOL_COMPONENT 1
40 #define LEX_IS_WHITESPACE 2
41 #define LEX_IS_LINE_SEPARATOR 3
42 #define LEX_IS_COMMENT_START 4
43 #define LEX_IS_LINE_COMMENT_START 5
44 #define LEX_IS_TWOCHAR_COMMENT_1ST 6
45 #define LEX_IS_TWOCHAR_COMMENT_2ND 7
46 #define LEX_IS_STRINGQUOTE 8
47 #define LEX_IS_COLON 9
48 #define LEX_IS_NEWLINE 10
49 #define LEX_IS_ONECHAR_QUOTE 11
/* Predicates that test the class recorded in lex[] for character C.
   C is used directly as an array index, so callers must not pass EOF or
   any other value outside the table (the visible call sites test for
   EOF before using these). */
50 #define IS_SYMBOL_COMPONENT(c) (lex[c] == LEX_IS_SYMBOL_COMPONENT)
51 #define IS_WHITESPACE(c) (lex[c] == LEX_IS_WHITESPACE)
52 #define IS_LINE_SEPARATOR(c) (lex[c] == LEX_IS_LINE_SEPARATOR)
53 #define IS_COMMENT(c) (lex[c] == LEX_IS_COMMENT_START)
54 #define IS_LINE_COMMENT(c) (lex[c] == LEX_IS_LINE_COMMENT_START)
55 #define IS_NEWLINE(c) (lex[c] == LEX_IS_NEWLINE)
57 static int process_escape PARAMS ((int));
59 /* FIXME-soon: The entire lexer/parser thingy should be
60 built statically at compile time rather than dynamically
61 each and every time the assembler is run. xoxorich. */
68 lex[' '] = LEX_IS_WHITESPACE;
69 lex['\t'] = LEX_IS_WHITESPACE;
70 lex['\n'] = LEX_IS_NEWLINE;
71 lex[';'] = LEX_IS_LINE_SEPARATOR;
72 lex[':'] = LEX_IS_COLON;
76 lex['"'] = LEX_IS_STRINGQUOTE;
79 lex['\''] = LEX_IS_ONECHAR_QUOTE;
82 #ifdef SINGLE_QUOTE_STRINGS
83 lex['\''] = LEX_IS_STRINGQUOTE;
87 /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop
88 in state 5 of do_scrub_chars must be changed. */
90 /* Note that these override the previous defaults, e.g. if ';' is a
91 comment char, then it isn't a line separator. */
92 for (p = symbol_chars; *p; ++p)
94 lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
95 } /* declare symbol characters */
97 for (p = comment_chars; *p; p++)
99 lex[(unsigned char) *p] = LEX_IS_COMMENT_START;
100 } /* declare comment chars */
102 for (p = line_comment_chars; *p; p++)
104 lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START;
105 } /* declare line comment chars */
107 for (p = line_separator_chars; *p; p++)
109 lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR;
110 } /* declare line separators */
112 /* Only allow slash-star comments if slash is not in use */
115 lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;
117 /* FIXME-soon. This is a bad hack but otherwise, we can't do
118 c-style comments when '/' is a line comment char. xoxorich. */
121 lex['*'] = LEX_IS_TWOCHAR_COMMENT_2ND;
126 lex['\''] = LEX_IS_STRINGQUOTE;
127 lex[';'] = LEX_IS_COMMENT_START;
128 lex['*'] = LEX_IS_LINE_COMMENT_START;
129 /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but
130 then it can't be used in an expression. */
131 lex['!'] = LEX_IS_LINE_COMMENT_START;
133 } /* do_scrub_begin() */
135 /* Saved state of the scrubber */
/* These variables persist between calls to do_scrub_chars and are the
   ones copied into and out of struct app_save when the scrubber state is
   saved and restored. */
137 static int old_state; /* State to return to after a string, a flushed comment, or out_string has been dealt with. */
138 static char *out_string; /* Text still to be emitted while in state -1; finished when it points at '\0'. */
139 static char out_buf[20]; /* Holds the decimal text produced for a one-character quote; out_string is pointed at it. */
140 static int add_newlines; /* NOTE(review): presumably the number of newlines swallowed inside comments that still have to be written out -- confirm against the newline handling. */
141 static char *saved_input; /* Heap copy of input not yet consumed when the output buffer filled up; NULL when nothing is saved. */
142 static int saved_input_len; /* Number of bytes held in saved_input. */
144 /* Data structure for saving the state of app across #include's. Note that
145 app is called asynchronously to the parsing of the .include's, so our
146 state at the time .include is interpreted is completely unrelated.
147 That's why we have to save it all. */
154 char out_buf[sizeof (out_buf)];
163 register struct app_save *saved;
165 saved = (struct app_save *) xmalloc (sizeof (*saved));
166 saved->state = state;
167 saved->old_state = old_state;
168 saved->out_string = out_string;
169 memcpy (saved->out_buf, out_buf, sizeof (out_buf));
170 saved->add_newlines = add_newlines;
171 saved->saved_input = saved_input;
172 saved->saved_input_len = saved_input_len;
174 /* do_scrub_begin() is not useful, just wastes time. */
179 return (char *) saved;
186 register struct app_save *saved = (struct app_save *) arg;
188 /* There is no do_scrub_end (). */
189 state = saved->state;
190 old_state = saved->old_state;
191 out_string = saved->out_string;
192 memcpy (out_buf, saved->out_buf, sizeof (out_buf));
193 add_newlines = saved->add_newlines;
194 saved_input = saved->saved_input;
195 saved_input_len = saved->saved_input_len;
200 /* @@ This assumes that \n &c are the same on host and target. This is not
227 /* This function is called to process input characters. The GET
228 parameter is used to retrieve more input characters. GET should
229 set its parameter to point to a buffer, and return the length of
230 the buffer; it should return 0 at end of file. The scrubbed output
231 characters are put into the buffer starting at TOSTART; the TOSTART
232 buffer is TOLEN bytes in length. The function returns the number
233 of scrubbed characters put into TOSTART. This will be TOLEN unless
234 end of file was seen. This function is arranged as a state
235 machine, and saves its state so that it may return at any point.
236 This is the way the old code used to work. */
239 do_scrub_chars (get, tostart, tolen)
240 int (*get) PARAMS ((char **));
245 char *toend = tostart + tolen;
249 register int ch, ch2 = 0;
250 int not_cpp_line = 0;
252 /*State 0: beginning of normal line
253 1: After first whitespace on line (flush more white)
254 2: After first non-white (opcode) on line (keep 1white)
255 3: after second white on line (into operands) (flush white)
256 4: after putting out a .line, put out digits
257 5: parsing a string, then go to old-state
258 6: putting out \ escape in a quoted string.
259 7: After putting out a .appfile, put out string.
260 8: After putting out a .appfile string, flush until newline.
261 9: After seeing symbol char in state 3 (keep 1white after symchar)
262 10: After seeing whitespace in state 9 (keep white before symchar)
263 11: After seeing a symbol character in state 0 (eg a label definition)
264 -1: output string in out_string and go to the state in old_state
265 -2: flush text until a '*' '/' is seen, then go to state old_state
268 /* I added states 9 and 10 because the MIPS ECOFF assembler uses
269 constructs like ``.loc 1 20''. This was turning into ``.loc
270 120''. States 9 and 10 ensure that a space is never dropped in
271 between characters which could appear in an identifier. Ian
274 I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
275 correctly on the PA (and any other target where colons are optional).
278 /* This macro gets the next input character. */
283 : ((saved_input != NULL \
284 ? (free (saved_input), \
285 saved_input = NULL, \
288 fromlen = (*get) (&from), \
289 fromend = from + fromlen, \
294 /* This macro pushes a character back on the input stream. */
296 #define UNGET(uch) (*--from = (uch))
298 /* This macro puts a character into the output buffer. If this
299 character fills the output buffer, this macro jumps to the label
300 TOFULL. We use this rather ugly approach because we need to
301 handle two different termination conditions: EOF on the input
302 stream, and a full output buffer. It would be simpler if we
303 always read in the entire input stream before processing it, but
304 I don't want to make such a significant change to the assembler's
316 if (saved_input != NULL)
319 fromend = from + saved_input_len;
323 fromlen = (*get) (&from);
326 fromend = from + fromlen;
331 /* The cases in this switch end with continue, in order to
332 branch back to the top of this while loop and generate the
333 next output character in the appropriate state. */
338 if (*out_string == '\0')
355 as_warn ("end of file in comment");
364 while ((ch = GET ()) == '*')
369 as_warn ("end of file in comment");
387 else if (ch >= '0' && ch <= '9')
391 while (ch != EOF && IS_WHITESPACE (ch))
396 out_string = "\n\t.appfile ";
403 while (ch != EOF && ch != '\n')
412 /* We are going to copy everything up to a quote character,
413 with special handling for a backslash. We try to
414 optimize the copying in the simple case without using the
415 GET and PUT macros. */
420 for (s = from; s < fromend; s++)
423 /* This condition must be changed if the type of any
424 other character can be LEX_IS_STRINGQUOTE. */
432 if (len > toend - to)
436 memcpy (to, from, len);
445 as_warn ("end of file in string: inserted '\"'");
450 else if (lex[ch] == LEX_IS_STRINGQUOTE)
455 #ifndef NO_STRING_ESCAPES
462 else if (flag_mri && ch == '\n')
464 /* Just quietly terminate the string. This permits lines like
465 bne label loop if we haven't reach end yet
482 /* Handle strings broken across lines, by turning '\n' into
509 #if defined(IGNORE_NONSTANDARD_ESCAPES) | defined(ONLY_STANDARD_ESCAPES)
511 as_warn ("Unknown escape '\\%c' in string: Ignored", ch);
513 #else /* ONLY_STANDARD_ESCAPES */
515 /* Accept \x as x for any x */
517 #endif /* ONLY_STANDARD_ESCAPES */
520 as_warn ("End of file in string: '\"' inserted");
539 while (ch != '\n' && ch != EOF);
547 /* OK, we are somewhere in states 0 through 4 or 9 through 11 */
556 as_warn ("end of file not at end of a line; newline inserted");
565 case LEX_IS_WHITESPACE:
568 /* Preserve a single whitespace character at the
569 beginning of a line. */
578 while (ch != EOF && IS_WHITESPACE (ch));
583 || (state == 0 && IS_LINE_COMMENT (ch))
585 || IS_LINE_SEPARATOR (ch))
587 /* cpp never outputs a leading space before the #, so
588 try to avoid being confused. */
592 /* In MRI mode, we keep these spaces. */
600 /* If we're in state 2 or 11, we've seen a non-white
601 character followed by whitespace. If the next character
602 is ':', this is whitespace after a label name which we
603 normally must ignore. In MRI mode, though, spaces are
604 not permitted between the label and the colon. */
605 if ((state == 2 || state == 11)
606 && lex[ch] == LEX_IS_COLON
618 goto recycle; /* Punted leading sp */
620 /* We can arrive here if we leave a leading whitespace
621 character at the beginning of a line. */
627 /* Optimize common case by skipping UNGET/GET. */
628 PUT (' '); /* Sp after opco */
637 /* In MRI mode, we keep these spaces. */
642 goto recycle; /* Sp in operands */
647 /* In MRI mode, we keep these spaces. */
653 state = 10; /* Sp after symbol char */
658 PUT (' '); /* Sp after label definition. */
665 case LEX_IS_TWOCHAR_COMMENT_1ST:
667 if (ch2 != EOF && lex[ch2] == LEX_IS_TWOCHAR_COMMENT_2ND)
674 if (ch2 != EOF && IS_NEWLINE (ch2))
678 (lex[ch2] != LEX_IS_TWOCHAR_COMMENT_2ND));
681 (lex[ch2] == LEX_IS_TWOCHAR_COMMENT_2ND))
687 || lex[ch2] == LEX_IS_TWOCHAR_COMMENT_1ST)
692 as_warn ("end of file in multiline comment");
701 if (state == 9 || state == 10)
707 case LEX_IS_STRINGQUOTE:
710 /* Preserve the whitespace in foo "bar" */
715 /* PUT didn't jump out. We could just break, but we
716 know what will happen, so optimize a bit. */
729 case LEX_IS_ONECHAR_QUOTE:
732 /* Preserve the whitespace in foo 'b' */
741 as_warn ("end of file after a one-character quote; \\0 inserted");
749 as_warn ("end of file in escape character");
753 ch = process_escape (ch);
755 sprintf (out_buf, "%d", (int) (unsigned char) ch);
757 /* None of these 'x constants for us. We want 'x'. */
758 if ((ch = GET ()) != '\'')
760 #ifdef REQUIRE_CHAR_CLOSE_QUOTE
761 as_warn ("Missing close quote: (assumed)");
767 if (strlen (out_buf) == 1)
777 out_string = out_buf;
783 if (state == 9 || state == 10)
791 /* Roll out a bunch of newlines from inside comments, etc. */
797 /* fall thru into... */
799 case LEX_IS_LINE_SEPARATOR:
804 case LEX_IS_LINE_COMMENT_START:
805 if (state == 0) /* Only comment at start of line. */
807 /* FIXME-someday: The two character comment stuff was
808 badly thought out. On i386, we want '/' as line
809 comment start AND we want C style comments. Hence
810 this hack. The whole lexical process should be
811 reworked. xoxorich. */
833 while (ch != EOF && IS_WHITESPACE (ch));
836 as_warn ("end of file in comment; newline inserted");
840 if (ch < '0' || ch > '9' || not_cpp_line)
842 /* Non-numerics: Eat whole comment line */
843 while (ch != EOF && !IS_NEWLINE (ch))
846 as_warn ("EOF in Comment: Newline inserted");
851 /* Numerics begin comment. Perhaps CPP `# 123 "filename"' */
855 out_string = "\t.appline ";
860 /* We have a line comment character which is not at the
861 start of a line. If this is also a normal comment
862 character, fall through. Otherwise treat it as a default
864 if (strchr (comment_chars, ch) == NULL
866 || (ch != '!' && ch != '*')))
869 && (ch == '!' || ch == '*')
874 case LEX_IS_COMMENT_START:
879 while (ch != EOF && !IS_NEWLINE (ch));
881 as_warn ("end of file in comment; newline inserted");
886 case LEX_IS_SYMBOL_COMPONENT:
889 /* This is a symbol character following another symbol
890 character, with whitespace in between. We skipped
891 the whitespace earlier, so output it now. */
901 /* This is a common case. Quickly copy CH and all the
902 following symbol component or normal characters. */
908 for (s = from; s < fromend; s++)
915 && type != LEX_IS_SYMBOL_COMPONENT)
920 /* Handle the last character normally, for
925 if (len > (toend - to) - 1)
926 len = (toend - to) - 1;
932 memcpy (to, from, len);
940 case 8: *to++ = *from++;
941 case 7: *to++ = *from++;
942 case 6: *to++ = *from++;
943 case 5: *to++ = *from++;
944 case 4: *to++ = *from++;
945 case 3: *to++ = *from++;
946 case 2: *to++ = *from++;
947 case 1: *to++ = *from++;
957 /* Some relatively `normal' character. */
960 state = 11; /* Now seeing label definition */
964 state = 2; /* Ditto */
968 if (lex[ch] != LEX_IS_SYMBOL_COMPONENT)
971 else if (state == 10)
983 /* We have reached the end of the input. */
987 /* The output buffer is full. Save any input we have not yet
993 save = (char *) xmalloc (fromend - from);
994 memcpy (save, from, fromend - from);
995 if (saved_input != NULL)
998 saved_input_len = fromend - from;
1002 if (saved_input != NULL)
1008 return to - tostart;