/Users/alexjokela/projects/lattice/src/lexer.c
Line | Count | Source |
1 | | #include "lexer.h" |
2 | | #include <stdlib.h> |
3 | | #include <string.h> |
4 | | #include <stdio.h> |
5 | | #include <ctype.h> |
6 | | |
7 | 2.89k | Lexer lexer_new(const char *source) { |
8 | 2.89k | Lexer lex; |
9 | 2.89k | lex.source = source; |
10 | 2.89k | lex.len = strlen(source); |
11 | 2.89k | lex.pos = 0; |
12 | 2.89k | lex.line = 1; |
13 | 2.89k | lex.col = 1; |
14 | 2.89k | return lex; |
15 | 2.89k | } |
16 | | |
17 | 7.57M | static char lex_peek(const Lexer *lex) { |
18 | 7.57M | if (lex->pos >= lex->len) return '\0'; |
19 | 7.56M | return lex->source[lex->pos]; |
20 | 7.57M | } |
21 | | |
22 | 46.7k | static char lex_peek_ahead(const Lexer *lex, size_t offset) { |
23 | 46.7k | size_t idx = lex->pos + offset; |
24 | 46.7k | if (idx >= lex->len) return '\0'; |
25 | 46.7k | return lex->source[idx]; |
26 | 46.7k | } |
27 | | |
28 | 3.06M | static char lex_advance(Lexer *lex) { |
29 | 3.06M | if (lex->pos >= lex->len) return '\0'; |
30 | 3.06M | char ch = lex->source[lex->pos++]; |
31 | 3.06M | if (ch == '\n') { |
32 | 104k | lex->line++; |
33 | 104k | lex->col = 1; |
34 | 2.96M | } else { |
35 | 2.96M | lex->col++; |
36 | 2.96M | } |
37 | 3.06M | return ch; |
38 | 3.06M | } |
39 | | |
40 | 872k | static void skip_whitespace_and_comments(Lexer *lex) { |
41 | 900k | for (;;) { |
42 | | /* Skip whitespace */ |
43 | 1.58M | while (lex->pos < lex->len && isspace((unsigned char)lex_peek(lex))) { |
44 | 686k | lex_advance(lex); |
45 | 686k | } |
46 | | /* Line comment */ |
47 | 900k | if (lex_peek(lex) == '/' && lex_peek_ahead(lex, 1) == '/') { |
48 | 1.22M | while (lex->pos < lex->len && lex_peek(lex) != '\n') { |
49 | 1.19M | lex_advance(lex); |
50 | 1.19M | } |
51 | 27.4k | continue; |
52 | 27.4k | } |
53 | | /* Block comment (nestable) */ |
54 | 872k | if (lex_peek(lex) == '/' && lex_peek_ahead(lex, 1) == '*') { |
55 | 0 | lex_advance(lex); /* / */ |
56 | 0 | lex_advance(lex); /* * */ |
57 | 0 | int depth = 1; |
58 | 0 | while (depth > 0 && lex->pos < lex->len) { |
59 | 0 | char c = lex_advance(lex); |
60 | 0 | if (c == '/' && lex_peek(lex) == '*') { |
61 | 0 | lex_advance(lex); |
62 | 0 | depth++; |
63 | 0 | } else if (c == '*' && lex_peek(lex) == '/') { |
64 | 0 | lex_advance(lex); |
65 | 0 | depth--; |
66 | 0 | } |
67 | 0 | } |
68 | 0 | continue; |
69 | 0 | } |
70 | 872k | break; |
71 | 872k | } |
72 | 872k | } |
73 | | |
74 | 184k | static char *read_ident(Lexer *lex) { |
75 | 184k | size_t start = lex->pos; |
76 | 946k | while (lex->pos < lex->len && (isalnum((unsigned char)lex_peek(lex)) || lex_peek(lex) == '_')) { |
77 | 762k | lex_advance(lex); |
78 | 762k | } |
79 | 184k | size_t len = lex->pos - start; |
80 | 184k | char *s = malloc(len + 1); |
81 | 184k | memcpy(s, lex->source + start, len); |
82 | 184k | s[len] = '\0'; |
83 | 184k | return s; |
84 | 184k | } |
85 | | |
86 | 184k | static TokenType keyword_lookup(const char *ident) { |
87 | 184k | if (strcmp(ident, "flux") == 0) return TOK_FLUX; |
88 | 181k | if (strcmp(ident, "fix") == 0) return TOK_FIX; |
89 | 181k | if (strcmp(ident, "let") == 0) return TOK_LET; |
90 | 171k | if (strcmp(ident, "freeze") == 0) return TOK_FREEZE; |
91 | 170k | if (strcmp(ident, "thaw") == 0) return TOK_THAW; |
92 | 170k | if (strcmp(ident, "forge") == 0) return TOK_FORGE; |
93 | 170k | if (strcmp(ident, "fn") == 0) return TOK_FN; |
94 | 162k | if (strcmp(ident, "struct") == 0) return TOK_STRUCT; |
95 | 162k | if (strcmp(ident, "if") == 0) return TOK_IF; |
96 | 155k | if (strcmp(ident, "else") == 0) return TOK_ELSE; |
97 | 154k | if (strcmp(ident, "for") == 0) return TOK_FOR; |
98 | 153k | if (strcmp(ident, "in") == 0) return TOK_IN; |
99 | 152k | if (strcmp(ident, "while") == 0) return TOK_WHILE; |
100 | 152k | if (strcmp(ident, "loop") == 0) return TOK_LOOP; |
101 | 151k | if (strcmp(ident, "return") == 0) return TOK_RETURN; |
102 | 142k | if (strcmp(ident, "break") == 0) return TOK_BREAK; |
103 | 142k | if (strcmp(ident, "continue") == 0) return TOK_CONTINUE; |
104 | 142k | if (strcmp(ident, "spawn") == 0) return TOK_SPAWN; |
105 | 142k | if (strcmp(ident, "true") == 0) return TOK_TRUE; |
106 | 141k | if (strcmp(ident, "false") == 0) return TOK_FALSE; |
107 | 139k | if (strcmp(ident, "nil") == 0) return TOK_NIL; |
108 | 138k | if (strcmp(ident, "clone") == 0) return TOK_CLONE; |
109 | 138k | if (strcmp(ident, "anneal") == 0) return TOK_ANNEAL; |
110 | 138k | if (strcmp(ident, "print") == 0) return TOK_PRINT; |
111 | 135k | if (strcmp(ident, "try") == 0) return TOK_TRY; |
112 | 135k | if (strcmp(ident, "catch") == 0) return TOK_CATCH; |
113 | 134k | if (strcmp(ident, "scope") == 0) return TOK_SCOPE; |
114 | 134k | if (strcmp(ident, "test") == 0) return TOK_TEST; |
115 | 134k | if (strcmp(ident, "match") == 0) return TOK_MATCH; |
116 | 134k | if (strcmp(ident, "enum") == 0) return TOK_ENUM; |
117 | 134k | if (strcmp(ident, "import") == 0) return TOK_IMPORT; |
118 | 134k | if (strcmp(ident, "from") == 0) return TOK_FROM; |
119 | 134k | if (strcmp(ident, "as") == 0) return TOK_AS; |
120 | 134k | if (strcmp(ident, "crystallize") == 0) return TOK_CRYSTALLIZE; |
121 | 134k | if (strcmp(ident, "sublimate") == 0) return TOK_SUBLIMATE; |
122 | 134k | if (strcmp(ident, "defer") == 0) return TOK_DEFER; |
123 | 134k | if (strcmp(ident, "trait") == 0) return TOK_TRAIT; |
124 | 134k | if (strcmp(ident, "impl") == 0) return TOK_IMPL; |
125 | 134k | if (strcmp(ident, "export") == 0) return TOK_EXPORT; |
126 | 134k | return TOK_IDENT; |
127 | 134k | } |
128 | | |
129 | | /* Forward declarations for mutual recursion (string interpolation) */ |
130 | | static bool lex_string_or_interp(Lexer *lex, LatVec *tokens, char **err); |
131 | | static bool lex_triple_quote_string(Lexer *lex, LatVec *tokens, char **err); |
132 | | static bool lex_one(Lexer *lex, LatVec *tokens, char **err); |
133 | | |
134 | 416k | static bool next_token(Lexer *lex, Token *out, char **err) { |
135 | 416k | size_t line = lex->line; |
136 | 416k | size_t col = lex->col; |
137 | 416k | char ch = lex_peek(lex); |
138 | | |
139 | | /* Mode directive: #mode */ |
140 | 416k | if (ch == '#') { |
141 | 9 | lex_advance(lex); |
142 | 9 | char *word = read_ident(lex); |
143 | 9 | if (strcmp(word, "mode") != 0) { |
144 | 0 | *err = NULL; |
145 | 0 | (void)asprintf(err, "%zu:%zu: unexpected directive '#%s'", line, col, word); |
146 | 0 | free(word); |
147 | 0 | return false; |
148 | 0 | } |
149 | 9 | free(word); |
150 | 9 | skip_whitespace_and_comments(lex); |
151 | 9 | char *mode = read_ident(lex); |
152 | 9 | if (strcmp(mode, "casual") != 0 && strcmp(mode, "strict") != 0) { |
153 | 0 | *err = NULL; |
154 | 0 | (void)asprintf(err, "%zu:%zu: expected 'casual' or 'strict' after #mode, got '%s'", line, col, mode); |
155 | 0 | free(mode); |
156 | 0 | return false; |
157 | 0 | } |
158 | 9 | *out = token_str(TOK_MODE_DIRECTIVE, mode, line, col); |
159 | 9 | return true; |
160 | 9 | } |
161 | | |
162 | | /* String literals are handled by lex_string_or_interp() via lex_one() */ |
163 | | |
164 | | /* Number literal */ |
165 | 416k | if (isdigit((unsigned char)ch)) { |
166 | 10.5k | size_t start = lex->pos; |
167 | 10.5k | bool is_float = false; |
168 | 22.8k | while (lex->pos < lex->len && isdigit((unsigned char)lex_peek(lex))) { |
169 | 12.2k | lex_advance(lex); |
170 | 12.2k | } |
171 | 10.5k | if (lex_peek(lex) == '.' && isdigit((unsigned char)lex_peek_ahead(lex, 1))) { |
172 | 324 | is_float = true; |
173 | 324 | lex_advance(lex); /* '.' */ |
174 | 843 | while (lex->pos < lex->len && isdigit((unsigned char)lex_peek(lex))) { |
175 | 519 | lex_advance(lex); |
176 | 519 | } |
177 | 324 | } |
178 | 10.5k | size_t num_len = lex->pos - start; |
179 | 10.5k | char *num_str = malloc(num_len + 1); |
180 | 10.5k | memcpy(num_str, lex->source + start, num_len); |
181 | 10.5k | num_str[num_len] = '\0'; |
182 | 10.5k | if (is_float) { |
183 | 324 | double val = strtod(num_str, NULL); |
184 | 324 | free(num_str); |
185 | 324 | *out = token_float(val, line, col); |
186 | 10.2k | } else { |
187 | 10.2k | int64_t val = strtoll(num_str, NULL, 10); |
188 | 10.2k | free(num_str); |
189 | 10.2k | *out = token_int(val, line, col); |
190 | 10.2k | } |
191 | 10.5k | return true; |
192 | 10.5k | } |
193 | | |
194 | | /* Identifiers and keywords */ |
195 | 406k | if (isalpha((unsigned char)ch) || ch == '_') { |
196 | 184k | char *ident = read_ident(lex); |
197 | 184k | TokenType type = keyword_lookup(ident); |
198 | 184k | if (type != TOK_IDENT) { |
199 | 50.5k | free(ident); |
200 | 50.5k | *out = token_simple(type, line, col); |
201 | 134k | } else { |
202 | 134k | *out = token_str(TOK_IDENT, ident, line, col); |
203 | 134k | } |
204 | 184k | return true; |
205 | 184k | } |
206 | | |
207 | | /* Operators and punctuation */ |
208 | 221k | lex_advance(lex); |
209 | 221k | switch (ch) { |
210 | 30 | case '~': *out = token_simple(TOK_TILDE, line, col); return true; |
211 | 3.57k | case '+': |
212 | 3.57k | if (lex_peek(lex) == '=') { lex_advance(lex); *out = token_simple(TOK_PLUS_EQ, line, col); } |
213 | 3.51k | else { *out = token_simple(TOK_PLUS, line, col); } |
214 | 3.57k | return true; |
215 | 30 | case '%': |
216 | 30 | if (lex_peek(lex) == '=') { lex_advance(lex); *out = token_simple(TOK_PERCENT_EQ, line, col); } |
217 | 27 | else { *out = token_simple(TOK_PERCENT, line, col); } |
218 | 30 | return true; |
219 | 45.0k | case '(': *out = token_simple(TOK_LPAREN, line, col); return true; |
220 | 45.0k | case ')': *out = token_simple(TOK_RPAREN, line, col); return true; |
221 | 21.1k | case '{': *out = token_simple(TOK_LBRACE, line, col); return true; |
222 | 21.1k | case '}': *out = token_simple(TOK_RBRACE, line, col); return true; |
223 | 4.97k | case '[': *out = token_simple(TOK_LBRACKET, line, col); return true; |
224 | 4.97k | case ']': *out = token_simple(TOK_RBRACKET, line, col); return true; |
225 | 16.6k | case ',': *out = token_simple(TOK_COMMA, line, col); return true; |
226 | 24 | case ';': *out = token_simple(TOK_SEMICOLON, line, col); return true; |
227 | 42 | case '/': |
228 | 42 | if (lex_peek(lex) == '=') { lex_advance(lex); *out = token_simple(TOK_SLASH_EQ, line, col); } |
229 | 39 | else { *out = token_simple(TOK_SLASH, line, col); } |
230 | 42 | return true; |
231 | 126 | case '*': |
232 | 126 | if (lex_peek(lex) == '=') { lex_advance(lex); *out = token_simple(TOK_STAR_EQ, line, col); } |
233 | 123 | else { *out = token_simple(TOK_STAR, line, col); } |
234 | 126 | return true; |
235 | 108 | case '&': |
236 | 108 | if (lex_peek(lex) == '&') { lex_advance(lex); *out = token_simple(TOK_AND, line, col); } |
237 | 15 | else if (lex_peek(lex) == '=') { lex_advance(lex); *out = token_simple(TOK_AMP_EQ, line, col); } |
238 | 12 | else { *out = token_simple(TOK_AMPERSAND, line, col); } |
239 | 108 | return true; |
240 | 3.86k | case '|': |
241 | 3.86k | if (lex_peek(lex) == '|') { lex_advance(lex); *out = token_simple(TOK_OR, line, col); } |
242 | 3.85k | else if (lex_peek(lex) == '=') { lex_advance(lex); *out = token_simple(TOK_PIPE_EQ, line, col); } |
243 | 3.85k | else { *out = token_simple(TOK_PIPE, line, col); } |
244 | 3.86k | return true; |
245 | 12 | case '^': |
246 | 12 | if (lex_peek(lex) == '=') { lex_advance(lex); *out = token_simple(TOK_CARET_EQ, line, col); } |
247 | 9 | else { *out = token_simple(TOK_CARET, line, col); } |
248 | 12 | return true; |
249 | 19.1k | case '=': |
250 | 19.1k | if (lex_peek(lex) == '=') { lex_advance(lex); *out = token_simple(TOK_EQEQ, line, col); } |
251 | 16.3k | else if (lex_peek(lex) == '>') { lex_advance(lex); *out = token_simple(TOK_FATARROW, line, col); } |
252 | 16.2k | else { *out = token_simple(TOK_EQ, line, col); } |
253 | 19.1k | return true; |
254 | 1.36k | case '!': |
255 | 1.36k | if (lex_peek(lex) == '=') { lex_advance(lex); *out = token_simple(TOK_BANGEQ, line, col); } |
256 | 561 | else { *out = token_simple(TOK_BANG, line, col); } |
257 | 1.36k | return true; |
258 | 807 | case '<': |
259 | 807 | if (lex_peek(lex) == '<') { |
260 | 9 | lex_advance(lex); |
261 | 9 | if (lex_peek(lex) == '=') { lex_advance(lex); *out = token_simple(TOK_LSHIFT_EQ, line, col); } |
262 | 6 | else { *out = token_simple(TOK_LSHIFT, line, col); } |
263 | 9 | } |
264 | 798 | else if (lex_peek(lex) == '=') { lex_advance(lex); *out = token_simple(TOK_LTEQ, line, col); } |
265 | 576 | else { *out = token_simple(TOK_LT, line, col); } |
266 | 807 | return true; |
267 | 1.03k | case '>': |
268 | 1.03k | if (lex_peek(lex) == '>') { |
269 | 6 | lex_advance(lex); |
270 | 6 | if (lex_peek(lex) == '=') { lex_advance(lex); *out = token_simple(TOK_RSHIFT_EQ, line, col); } |
271 | 3 | else { *out = token_simple(TOK_RSHIFT, line, col); } |
272 | 6 | } |
273 | 1.02k | else if (lex_peek(lex) == '=') { lex_advance(lex); *out = token_simple(TOK_GTEQ, line, col); } |
274 | 639 | else { *out = token_simple(TOK_GT, line, col); } |
275 | 1.03k | return true; |
276 | 5.04k | case '-': |
277 | 5.04k | if (lex_peek(lex) == '>') { lex_advance(lex); *out = token_simple(TOK_ARROW, line, col); } |
278 | 393 | else if (lex_peek(lex) == '=') { lex_advance(lex); *out = token_simple(TOK_MINUS_EQ, line, col); } |
279 | 390 | else { *out = token_simple(TOK_MINUS, line, col); } |
280 | 5.04k | return true; |
281 | 16.3k | case '.': |
282 | 16.3k | if (lex_peek(lex) == '.') { |
283 | 456 | lex_advance(lex); |
284 | 456 | if (lex_peek(lex) == '.') { lex_advance(lex); *out = token_simple(TOK_DOTDOTDOT, line, col); } |
285 | 48 | else { *out = token_simple(TOK_DOTDOT, line, col); } |
286 | 456 | } |
287 | 15.9k | else { *out = token_simple(TOK_DOT, line, col); } |
288 | 16.3k | return true; |
289 | 10.7k | case ':': |
290 | 10.7k | if (lex_peek(lex) == ':') { lex_advance(lex); *out = token_simple(TOK_COLONCOLON, line, col); } |
291 | 9.11k | else { *out = token_simple(TOK_COLON, line, col); } |
292 | 10.7k | return true; |
293 | 66 | case '?': |
294 | 66 | if (lex_peek(lex) == '?') { lex_advance(lex); *out = token_simple(TOK_QUESTION_QUESTION, line, col); } |
295 | 48 | else if (lex_peek(lex) == '.') { lex_advance(lex); *out = token_simple(TOK_QUESTION_DOT, line, col); } |
296 | 21 | else if (lex_peek(lex) == '[') { lex_advance(lex); *out = token_simple(TOK_QUESTION_LBRACKET, line, col); } |
297 | 18 | else { *out = token_simple(TOK_QUESTION, line, col); } |
298 | 66 | return true; |
299 | 0 | default: |
300 | 0 | *err = NULL; |
301 | 0 | (void)asprintf(err, "%zu:%zu: unexpected character '%c'", line, col, ch); |
302 | 0 | return false; |
303 | 221k | } |
304 | 221k | } |
305 | | |
306 | | /* Helper: scan escape sequence inside a string, appending to buf. |
307 | | * On entry, the backslash has already been consumed. Returns false on error. */ |
308 | | static bool lex_string_escape(Lexer *lex, char **buf, size_t *buf_len, |
309 | 708 | size_t *buf_cap, size_t line, size_t col, char **err) { |
310 | 708 | if (lex->pos >= lex->len) { |
311 | 0 | *err = NULL; |
312 | 0 | (void)asprintf(err, "%zu:%zu: unterminated string escape", line, col); |
313 | 0 | return false; |
314 | 0 | } |
315 | 708 | char esc = lex_advance(lex); |
316 | 708 | char c; |
317 | 708 | switch (esc) { |
318 | 228 | case 'n': c = '\n'; break; |
319 | 60 | case 't': c = '\t'; break; |
320 | 33 | case 'r': c = '\r'; break; |
321 | 3 | case '0': c = '\0'; break; |
322 | 93 | case '\\': c = '\\'; break; |
323 | 198 | case '"': c = '"'; break; |
324 | 3 | case '\'': c = '\''; break; |
325 | 3 | case '$': c = '$'; break; |
326 | 9 | case 'x': { |
327 | 9 | if (lex->pos + 1 >= lex->len) { |
328 | 0 | *err = NULL; |
329 | 0 | (void)asprintf(err, "%zu:%zu: incomplete \\x escape", line, col); |
330 | 0 | return false; |
331 | 0 | } |
332 | 9 | char h1 = lex_advance(lex); |
333 | 9 | char h2 = lex_advance(lex); |
334 | 9 | int d1 = -1, d2 = -1; |
335 | 9 | if (h1 >= '0' && h1 <= '9') d1 = h1 - '0'; |
336 | 3 | else if (h1 >= 'a' && h1 <= 'f') d1 = h1 - 'a' + 10; |
337 | 3 | else if (h1 >= 'A' && h1 <= 'F') d1 = h1 - 'A' + 10; |
338 | 9 | if (h2 >= '0' && h2 <= '9') d2 = h2 - '0'; |
339 | 6 | else if (h2 >= 'a' && h2 <= 'f') d2 = h2 - 'a' + 10; |
340 | 3 | else if (h2 >= 'A' && h2 <= 'F') d2 = h2 - 'A' + 10; |
341 | 9 | if (d1 < 0 || d2 < 0) { |
342 | 3 | *err = NULL; |
343 | 3 | (void)asprintf(err, "%zu:%zu: invalid hex escape '\\x%c%c'", line, col, h1, h2); |
344 | 3 | return false; |
345 | 3 | } |
346 | 6 | c = (char)((d1 << 4) | d2); |
347 | 6 | break; |
348 | 9 | } |
349 | 78 | default: c = esc; break; |
350 | 708 | } |
351 | 705 | if (*buf_len + 1 >= *buf_cap) { |
352 | 0 | *buf_cap *= 2; |
353 | 0 | *buf = realloc(*buf, *buf_cap); |
354 | 0 | } |
355 | 705 | (*buf)[(*buf_len)++] = c; |
356 | 705 | return true; |
357 | 708 | } |
358 | | |
359 | | /* Scan a string literal, handling interpolation with ${...}. |
360 | | * On entry, lex is positioned at the opening '"'. |
361 | | * Pushes TOK_STRING_LIT (no interpolation) or |
362 | | * TOK_INTERP_START / expression tokens / TOK_INTERP_MID / ... / TOK_INTERP_END. */ |
363 | 18.1k | static bool lex_string_or_interp(Lexer *lex, LatVec *tokens, char **err) { |
364 | 18.1k | size_t line = lex->line; |
365 | 18.1k | size_t col = lex->col; |
366 | 18.1k | lex_advance(lex); /* consume opening " */ |
367 | | |
368 | 18.1k | bool has_interp = false; |
369 | 18.1k | size_t buf_cap = 64; |
370 | 18.1k | size_t buf_len = 0; |
371 | 18.1k | char *buf = malloc(buf_cap); |
372 | | |
373 | 155k | for (;;) { |
374 | 155k | if (lex->pos >= lex->len) { |
375 | 0 | free(buf); |
376 | 0 | *err = NULL; |
377 | 0 | (void)asprintf(err, "%zu:%zu: unterminated string literal", line, col); |
378 | 0 | return false; |
379 | 0 | } |
380 | | |
381 | | /* Check for interpolation: ${ */ |
382 | 155k | if (lex_peek(lex) == '$' && lex_peek_ahead(lex, 1) == '{') { |
383 | | /* Emit accumulated text as INTERP_START or INTERP_MID */ |
384 | 69 | buf[buf_len] = '\0'; |
385 | 69 | TokenType seg_type = has_interp ? TOK_INTERP_MID : TOK_INTERP_START; |
386 | 69 | Token seg = token_str(seg_type, buf, line, col); |
387 | 69 | lat_vec_push(tokens, &seg); |
388 | 69 | has_interp = true; |
389 | | |
390 | 69 | lex_advance(lex); /* consume $ */ |
391 | 69 | lex_advance(lex); /* consume { */ |
392 | | |
393 | | /* Lex expression tokens until brace depth returns to 0 */ |
394 | 69 | int depth = 1; |
395 | 198 | while (depth > 0) { |
396 | 198 | skip_whitespace_and_comments(lex); |
397 | 198 | if (lex->pos >= lex->len) { |
398 | 0 | *err = NULL; |
399 | 0 | (void)asprintf(err, "%zu:%zu: unterminated string interpolation", line, col); |
400 | 0 | return false; |
401 | 0 | } |
402 | | /* End of interpolation */ |
403 | 198 | if (lex_peek(lex) == '}' && depth == 1) { |
404 | 69 | lex_advance(lex); /* consume closing } */ |
405 | 69 | break; |
406 | 69 | } |
407 | | /* Lex one token (handles nested strings with interpolation) */ |
408 | 129 | size_t before = tokens->len; |
409 | 129 | if (!lex_one(lex, tokens, err)) return false; |
410 | | /* Track brace depth */ |
411 | 129 | if (tokens->len > before) { |
412 | 129 | Token *last = lat_vec_get(tokens, tokens->len - 1); |
413 | 129 | if (last->type == TOK_LBRACE) depth++; |
414 | 129 | else if (last->type == TOK_RBRACE) depth--; |
415 | 129 | } |
416 | 129 | } |
417 | | |
418 | | /* Reset buffer for next string segment */ |
419 | 69 | buf_cap = 64; |
420 | 69 | buf_len = 0; |
421 | 69 | buf = malloc(buf_cap); |
422 | 69 | continue; |
423 | 69 | } |
424 | | |
425 | | /* Check for end of string */ |
426 | 155k | if (lex_peek(lex) == '"') { |
427 | 18.1k | lex_advance(lex); /* consume closing " */ |
428 | 18.1k | buf[buf_len] = '\0'; |
429 | 18.1k | if (has_interp) { |
430 | 42 | Token seg = token_str(TOK_INTERP_END, buf, line, col); |
431 | 42 | lat_vec_push(tokens, &seg); |
432 | 18.1k | } else { |
433 | 18.1k | Token seg = token_str(TOK_STRING_LIT, buf, line, col); |
434 | 18.1k | lat_vec_push(tokens, &seg); |
435 | 18.1k | } |
436 | 18.1k | return true; |
437 | 18.1k | } |
438 | | |
439 | | /* Check for escape sequence */ |
440 | 137k | if (lex_peek(lex) == '\\') { |
441 | 699 | lex_advance(lex); /* consume backslash */ |
442 | 699 | if (!lex_string_escape(lex, &buf, &buf_len, &buf_cap, line, col, err)) { |
443 | 3 | free(buf); |
444 | 3 | return false; |
445 | 3 | } |
446 | 696 | continue; |
447 | 699 | } |
448 | | |
449 | | /* Regular character */ |
450 | 136k | char c = lex_advance(lex); |
451 | 136k | if (buf_len + 1 >= buf_cap) { |
452 | 9 | buf_cap *= 2; |
453 | 9 | buf = realloc(buf, buf_cap); |
454 | 9 | } |
455 | 136k | buf[buf_len++] = c; |
456 | 136k | } |
457 | 18.1k | } |
458 | | |
459 | | /* Scan a single-quoted string literal (no interpolation). |
460 | | * On entry, lex is positioned at the opening '\''. |
461 | | * Pushes a TOK_STRING_LIT token. */ |
462 | 30 | static bool lex_single_quote_string(Lexer *lex, LatVec *tokens, char **err) { |
463 | 30 | size_t line = lex->line; |
464 | 30 | size_t col = lex->col; |
465 | 30 | lex_advance(lex); /* consume opening ' */ |
466 | | |
467 | 30 | size_t buf_cap = 64; |
468 | 30 | size_t buf_len = 0; |
469 | 30 | char *buf = malloc(buf_cap); |
470 | | |
471 | 408 | for (;;) { |
472 | 408 | if (lex->pos >= lex->len) { |
473 | 0 | free(buf); |
474 | 0 | *err = NULL; |
475 | 0 | (void)asprintf(err, "%zu:%zu: unterminated string literal", line, col); |
476 | 0 | return false; |
477 | 0 | } |
478 | | |
479 | | /* End of string */ |
480 | 408 | if (lex_peek(lex) == '\'') { |
481 | 30 | lex_advance(lex); /* consume closing ' */ |
482 | 30 | buf[buf_len] = '\0'; |
483 | 30 | Token seg = token_str(TOK_STRING_LIT, buf, line, col); |
484 | 30 | lat_vec_push(tokens, &seg); |
485 | 30 | return true; |
486 | 30 | } |
487 | | |
488 | | /* Escape sequence */ |
489 | 378 | if (lex_peek(lex) == '\\') { |
490 | 9 | lex_advance(lex); /* consume backslash */ |
491 | 9 | if (!lex_string_escape(lex, &buf, &buf_len, &buf_cap, line, col, err)) { |
492 | 0 | free(buf); |
493 | 0 | return false; |
494 | 0 | } |
495 | 9 | continue; |
496 | 9 | } |
497 | | |
498 | | /* Regular character */ |
499 | 369 | char c = lex_advance(lex); |
500 | 369 | if (buf_len + 1 >= buf_cap) { |
501 | 0 | buf_cap *= 2; |
502 | 0 | buf = realloc(buf, buf_cap); |
503 | 0 | } |
504 | 369 | buf[buf_len++] = c; |
505 | 369 | } |
506 | 30 | } |
507 | | |
/* Dedent the raw body of a triple-quoted string based on the indentation of
 * the closing delimiter.
 *
 * If the text after the last newline in `raw` consists only of spaces and
 * tabs (i.e. the closing """ sits on its own line), then:
 *   - that final line and the line terminator before it ("\n" or "\r\n")
 *     are dropped from the result, and
 *   - its width (space = 1, tab = 4) is the dedent level: up to that much
 *     leading whitespace is removed from the start of every remaining line.
 *     A tab is removed whole even when that exceeds the remaining budget.
 * Otherwise the content is returned unchanged.
 *
 *   raw, raw_len  input bytes (need not be NUL-terminated within raw_len)
 *   out_len       receives the length of the result, excluding the NUL
 *
 * Returns a new heap-allocated, NUL-terminated string that the caller must
 * free, or NULL (with *out_len = 0) if memory cannot be allocated. */
static char *dedent_triple_string(const char *raw, size_t raw_len, size_t *out_len) {
    size_t closing_indent = 0;
    size_t content_end = raw_len;

    /* Walk back to the start of the last line; 0 means "no newline found". */
    size_t last_line = raw_len;
    while (last_line > 0 && raw[last_line - 1] != '\n') {
        last_line--;
    }

    if (last_line > 0) {
        /* Measure the final line, provided it is whitespace only. */
        bool all_ws = true;
        size_t ws_count = 0;
        for (size_t i = last_line; i < raw_len; i++) {
            if (raw[i] == ' ') ws_count++;
            else if (raw[i] == '\t') ws_count += 4;
            else { all_ws = false; break; }
        }
        if (all_ws) {
            closing_indent = ws_count;
            content_end = last_line - 1; /* exclude trailing \n + whitespace */
            /* For CRLF input the terminator is "\r\n": drop the '\r' too,
             * so both line-ending styles yield the same string value. */
            if (content_end > 0 && raw[content_end - 1] == '\r') {
                content_end--;
            }
        }
    }

    /* Dedenting only ever removes bytes, so content_end + 1 always fits. */
    char *result = malloc(content_end + 1);
    if (result == NULL) {
        *out_len = 0;
        return NULL;
    }

    size_t result_len = 0;
    if (closing_indent == 0) {
        /* No dedenting needed */
        memcpy(result, raw, content_end);
        result_len = content_end;
    } else {
        size_t i = 0;
        bool at_line_start = true;
        while (i < content_end) {
            if (at_line_start) {
                size_t skipped = 0;
                while (i < content_end && skipped < closing_indent) {
                    if (raw[i] == ' ') { skipped++; i++; }
                    else if (raw[i] == '\t') { skipped += 4; i++; }
                    else break;
                }
                at_line_start = false;
                if (i >= content_end) break;
            }
            char c = raw[i++];
            if (c == '\n') at_line_start = true;
            result[result_len++] = c;
        }
    }

    result[result_len] = '\0';
    *out_len = result_len;
    return result;
}
580 | | |
581 | | /* Scan a triple-quoted string literal with optional interpolation and dedenting. |
582 | | * On entry, lex is positioned at the first '"' of opening """. |
583 | | * Supports ${...} interpolation like double-quoted strings. |
584 | | * Dedents based on closing """ indentation. */ |
585 | 27 | static bool lex_triple_quote_string(Lexer *lex, LatVec *tokens, char **err) { |
586 | 27 | size_t line = lex->line; |
587 | 27 | size_t col = lex->col; |
588 | | |
589 | | /* Consume opening """ */ |
590 | 27 | lex_advance(lex); lex_advance(lex); lex_advance(lex); |
591 | | |
592 | | /* Skip optional newline immediately after """ */ |
593 | 27 | if (lex_peek(lex) == '\n') { |
594 | 15 | lex_advance(lex); |
595 | 15 | } else if (lex_peek(lex) == '\r' && lex_peek_ahead(lex, 1) == '\n') { |
596 | 0 | lex_advance(lex); lex_advance(lex); |
597 | 0 | } |
598 | | |
599 | | /* Collect raw content until closing """ */ |
600 | 27 | size_t raw_cap = 256; |
601 | 27 | size_t raw_len = 0; |
602 | 27 | char *raw = malloc(raw_cap); |
603 | | |
604 | 735 | for (;;) { |
605 | 735 | if (lex->pos >= lex->len) { |
606 | 0 | free(raw); |
607 | 0 | *err = NULL; |
608 | 0 | (void)asprintf(err, "%zu:%zu: unterminated triple-quoted string", line, col); |
609 | 0 | return false; |
610 | 0 | } |
611 | 735 | if (lex_peek(lex) == '"' && lex_peek_ahead(lex, 1) == '"' && |
612 | 735 | lex_peek_ahead(lex, 2) == '"') { |
613 | 27 | lex_advance(lex); lex_advance(lex); lex_advance(lex); |
614 | 27 | break; |
615 | 27 | } |
616 | 708 | char c = lex_advance(lex); |
617 | 708 | if (raw_len + 1 >= raw_cap) { |
618 | 0 | raw_cap *= 2; |
619 | 0 | raw = realloc(raw, raw_cap); |
620 | 0 | } |
621 | 708 | raw[raw_len++] = c; |
622 | 708 | } |
623 | 27 | raw[raw_len] = '\0'; |
624 | | |
625 | | /* Dedent */ |
626 | 27 | size_t dedented_len = 0; |
627 | 27 | char *dedented = dedent_triple_string(raw, raw_len, &dedented_len); |
628 | 27 | free(raw); |
629 | | |
630 | | /* Process dedented content for escapes and interpolation */ |
631 | 27 | bool has_interp = false; |
632 | 27 | size_t buf_cap = 64; |
633 | 27 | size_t buf_len = 0; |
634 | 27 | char *buf = malloc(buf_cap); |
635 | 27 | size_t pos = 0; |
636 | | |
637 | 489 | while (pos < dedented_len) { |
638 | | /* Check for interpolation ${ */ |
639 | 462 | if (dedented[pos] == '$' && pos + 1 < dedented_len && dedented[pos + 1] == '{') { |
640 | 9 | buf[buf_len] = '\0'; |
641 | 9 | TokenType seg_type = has_interp ? TOK_INTERP_MID : TOK_INTERP_START; |
642 | 9 | Token seg = token_str(seg_type, buf, line, col); |
643 | 9 | lat_vec_push(tokens, &seg); |
644 | 9 | has_interp = true; |
645 | 9 | pos += 2; /* skip ${ */ |
646 | | |
647 | | /* Extract expression text by finding matching } */ |
648 | 9 | int depth = 1; |
649 | 9 | size_t expr_start = pos; |
650 | 51 | while (pos < dedented_len && depth > 0) { |
651 | 51 | char ec = dedented[pos]; |
652 | 51 | if (ec == '{') { depth++; } |
653 | 51 | else if (ec == '}') { depth--; if (depth == 0) break; } |
654 | 42 | else if (ec == '"') { |
655 | 0 | pos++; |
656 | 0 | while (pos < dedented_len && dedented[pos] != '"') { |
657 | 0 | if (dedented[pos] == '\\') pos++; |
658 | 0 | pos++; |
659 | 0 | } |
660 | 42 | } else if (ec == '\'') { |
661 | 0 | pos++; |
662 | 0 | while (pos < dedented_len && dedented[pos] != '\'') { |
663 | 0 | if (dedented[pos] == '\\') pos++; |
664 | 0 | pos++; |
665 | 0 | } |
666 | 0 | } |
667 | 42 | pos++; |
668 | 42 | } |
669 | | |
670 | 9 | if (depth != 0) { |
671 | 0 | free(buf); free(dedented); |
672 | 0 | *err = NULL; |
673 | 0 | (void)asprintf(err, "%zu:%zu: unterminated interpolation in triple-quoted string", |
674 | 0 | line, col); |
675 | 0 | return false; |
676 | 0 | } |
677 | | |
678 | | /* Lex expression tokens via a sub-lexer */ |
679 | 9 | size_t expr_len = pos - expr_start; |
680 | 9 | char *expr_src = malloc(expr_len + 1); |
681 | 9 | memcpy(expr_src, dedented + expr_start, expr_len); |
682 | 9 | expr_src[expr_len] = '\0'; |
683 | | |
684 | 9 | Lexer expr_lex = lexer_new(expr_src); |
685 | 24 | for (;;) { |
686 | 24 | skip_whitespace_and_comments(&expr_lex); |
687 | 24 | if (expr_lex.pos >= expr_lex.len) break; |
688 | 15 | if (!lex_one(&expr_lex, tokens, err)) { |
689 | 0 | free(expr_src); free(buf); free(dedented); |
690 | 0 | return false; |
691 | 0 | } |
692 | 15 | } |
693 | 9 | free(expr_src); |
694 | 9 | pos++; /* skip closing } */ |
695 | | |
696 | | /* Reset buffer for next segment */ |
697 | 9 | buf_cap = 64; |
698 | 9 | buf_len = 0; |
699 | 9 | buf = malloc(buf_cap); |
700 | 9 | continue; |
701 | 9 | } |
702 | | |
703 | | /* Check for escape sequence */ |
704 | 453 | if (dedented[pos] == '\\' && pos + 1 < dedented_len) { |
705 | 3 | pos++; /* skip backslash */ |
706 | 3 | char esc = dedented[pos++]; |
707 | 3 | char c; |
708 | 3 | switch (esc) { |
709 | 0 | case 'n': c = '\n'; break; |
710 | 3 | case 't': c = '\t'; break; |
711 | 0 | case 'r': c = '\r'; break; |
712 | 0 | case '0': c = '\0'; break; |
713 | 0 | case '\\': c = '\\'; break; |
714 | 0 | case '"': c = '"'; break; |
715 | 0 | case '\'': c = '\''; break; |
716 | 0 | case '$': c = '$'; break; |
717 | 0 | case 'x': { |
718 | 0 | if (pos + 1 >= dedented_len) { |
719 | 0 | free(buf); free(dedented); |
720 | 0 | *err = NULL; |
721 | 0 | (void)asprintf(err, "%zu:%zu: incomplete \\x escape in triple-quoted string", |
722 | 0 | line, col); |
723 | 0 | return false; |
724 | 0 | } |
725 | 0 | char h1 = dedented[pos++]; |
726 | 0 | char h2 = dedented[pos++]; |
727 | 0 | int d1 = -1, d2 = -1; |
728 | 0 | if (h1 >= '0' && h1 <= '9') d1 = h1 - '0'; |
729 | 0 | else if (h1 >= 'a' && h1 <= 'f') d1 = h1 - 'a' + 10; |
730 | 0 | else if (h1 >= 'A' && h1 <= 'F') d1 = h1 - 'A' + 10; |
731 | 0 | if (h2 >= '0' && h2 <= '9') d2 = h2 - '0'; |
732 | 0 | else if (h2 >= 'a' && h2 <= 'f') d2 = h2 - 'a' + 10; |
733 | 0 | else if (h2 >= 'A' && h2 <= 'F') d2 = h2 - 'A' + 10; |
734 | 0 | if (d1 < 0 || d2 < 0) { |
735 | 0 | free(buf); free(dedented); |
736 | 0 | *err = NULL; |
737 | 0 | (void)asprintf(err, "%zu:%zu: invalid hex escape in triple-quoted string", |
738 | 0 | line, col); |
739 | 0 | return false; |
740 | 0 | } |
741 | 0 | c = (char)((d1 << 4) | d2); |
742 | 0 | break; |
743 | 0 | } |
744 | 0 | default: c = esc; break; |
745 | 3 | } |
746 | 3 | if (buf_len + 1 >= buf_cap) { |
747 | 0 | buf_cap *= 2; |
748 | 0 | buf = realloc(buf, buf_cap); |
749 | 0 | } |
750 | 3 | buf[buf_len++] = c; |
751 | 3 | continue; |
752 | 3 | } |
753 | | |
754 | | /* Regular character */ |
755 | 450 | char c = dedented[pos++]; |
756 | 450 | if (buf_len + 1 >= buf_cap) { |
757 | 0 | buf_cap *= 2; |
758 | 0 | buf = realloc(buf, buf_cap); |
759 | 0 | } |
760 | 450 | buf[buf_len++] = c; |
761 | 450 | } |
762 | | |
763 | | /* Emit final token */ |
764 | 27 | buf[buf_len] = '\0'; |
765 | 27 | if (has_interp) { |
766 | 6 | Token seg = token_str(TOK_INTERP_END, buf, line, col); |
767 | 6 | lat_vec_push(tokens, &seg); |
768 | 21 | } else { |
769 | 21 | Token seg = token_str(TOK_STRING_LIT, buf, line, col); |
770 | 21 | lat_vec_push(tokens, &seg); |
771 | 21 | } |
772 | 27 | free(dedented); |
773 | 27 | return true; |
774 | 27 | } |
775 | | |
776 | | /* Lex one token (or multiple for interpolated strings) and push to tokens. */ |
777 | 434k | static bool lex_one(Lexer *lex, LatVec *tokens, char **err) { |
778 | 434k | skip_whitespace_and_comments(lex); |
779 | 434k | if (lex_peek(lex) == '"') { |
780 | 18.2k | if (lex_peek_ahead(lex, 1) == '"' && lex_peek_ahead(lex, 2) == '"') { |
781 | 27 | return lex_triple_quote_string(lex, tokens, err); |
782 | 27 | } |
783 | 18.1k | return lex_string_or_interp(lex, tokens, err); |
784 | 18.2k | } |
785 | 416k | if (lex_peek(lex) == '\'') { |
786 | 30 | return lex_single_quote_string(lex, tokens, err); |
787 | 30 | } |
788 | 416k | Token tok; |
789 | 416k | if (!next_token(lex, &tok, err)) return false; |
790 | 416k | lat_vec_push(tokens, &tok); |
791 | 416k | return true; |
792 | 416k | } |
793 | | |
794 | 2.88k | LatVec lexer_tokenize(Lexer *lex, char **err) { |
795 | 2.88k | LatVec tokens = lat_vec_new(sizeof(Token)); |
796 | 2.88k | *err = NULL; |
797 | | |
798 | 437k | for (;;) { |
799 | 437k | skip_whitespace_and_comments(lex); |
800 | 437k | if (lex->pos >= lex->len) { |
801 | 2.88k | Token eof = token_simple(TOK_EOF, lex->line, lex->col); |
802 | 2.88k | lat_vec_push(&tokens, &eof); |
803 | 2.88k | break; |
804 | 2.88k | } |
805 | 434k | if (!lex_one(lex, &tokens, err)) { |
806 | | /* Free tokens on error */ |
807 | 24 | for (size_t i = 0; i < tokens.len; i++) { |
808 | 21 | token_free(lat_vec_get(&tokens, i)); |
809 | 21 | } |
810 | 3 | lat_vec_free(&tokens); |
811 | 3 | return lat_vec_new(sizeof(Token)); |
812 | 3 | } |
813 | 434k | } |
814 | | |
815 | 2.88k | return tokens; |
816 | 2.88k | } |