Coverage Report

Created: 2026-02-23 20:32

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/Users/alexjokela/projects/lattice/src/lexer.c
Line
Count
Source
1
#include "lexer.h"
2
#include <stdlib.h>
3
#include <string.h>
4
#include <stdio.h>
5
#include <ctype.h>
6
7
2.89k
Lexer lexer_new(const char *source) {
8
2.89k
    Lexer lex;
9
2.89k
    lex.source = source;
10
2.89k
    lex.len = strlen(source);
11
2.89k
    lex.pos = 0;
12
2.89k
    lex.line = 1;
13
2.89k
    lex.col = 1;
14
2.89k
    return lex;
15
2.89k
}
16
17
7.57M
static char lex_peek(const Lexer *lex) {
18
7.57M
    if (lex->pos >= lex->len) return '\0';
19
7.56M
    return lex->source[lex->pos];
20
7.57M
}
21
22
46.7k
static char lex_peek_ahead(const Lexer *lex, size_t offset) {
23
46.7k
    size_t idx = lex->pos + offset;
24
46.7k
    if (idx >= lex->len) return '\0';
25
46.7k
    return lex->source[idx];
26
46.7k
}
27
28
3.06M
/* Consume and return the character under the cursor, keeping the
 * line/column counters in step: a newline starts a new line at column 1,
 * any other character moves one column right.
 * Returns '\0' (and consumes nothing) at end of input. */
static char lex_advance(Lexer *lex) {
    if (lex->pos >= lex->len) {
        return '\0';
    }

    char consumed = lex->source[lex->pos];
    lex->pos += 1;

    if (consumed != '\n') {
        lex->col += 1;
        return consumed;
    }

    lex->line += 1;
    lex->col = 1;
    return consumed;
}
39
40
872k
/* Move the cursor past any mix of whitespace, `//` line comments and
 * nestable block comments, leaving it on the next significant character
 * or at end of input.
 * NOTE(review): a block comment that is never closed is consumed up to
 * end of input and no error is reported from here. */
static void skip_whitespace_and_comments(Lexer *lex) {
    for (;;) {
        while (lex->pos < lex->len && isspace((unsigned char)lex_peek(lex))) {
            lex_advance(lex);
        }

        /* Both comment forms start with '/'; anything else ends the skip. */
        if (lex_peek(lex) != '/') {
            return;
        }

        char second = lex_peek_ahead(lex, 1);
        if (second == '/') {
            /* Line comment: stop on the newline itself; the whitespace
             * pass of the next iteration consumes it. */
            while (lex->pos < lex->len && lex_peek(lex) != '\n') {
                lex_advance(lex);
            }
        } else if (second == '*') {
            lex_advance(lex); /* '/' */
            lex_advance(lex); /* '*' */
            int nesting = 1;
            while (nesting > 0 && lex->pos < lex->len) {
                char cur = lex_advance(lex);
                char nxt = lex_peek(lex);
                if (cur == '/' && nxt == '*') {
                    lex_advance(lex);
                    nesting++;
                } else if (cur == '*' && nxt == '/') {
                    lex_advance(lex);
                    nesting--;
                }
            }
        } else {
            /* A lone '/' is an operator, not a comment. */
            return;
        }
    }
}
73
74
184k
/* Consume the longest run of [A-Za-z0-9_] characters at the cursor and
 * return it as a freshly malloc'd NUL-terminated string (possibly empty).
 * The caller owns the returned buffer. */
static char *read_ident(Lexer *lex) {
    const size_t first = lex->pos;

    for (;;) {
        if (lex->pos >= lex->len) {
            break;
        }
        char ch = lex_peek(lex);
        if (!isalnum((unsigned char)ch) && ch != '_') {
            break;
        }
        lex_advance(lex);
    }

    const size_t count = lex->pos - first;
    char *copy = malloc(count + 1);
    memcpy(copy, lex->source + first, count);
    copy[count] = '\0';
    return copy;
}
85
86
184k
static TokenType keyword_lookup(const char *ident) {
87
184k
    if (strcmp(ident, "flux") == 0)     return TOK_FLUX;
88
181k
    if (strcmp(ident, "fix") == 0)      return TOK_FIX;
89
181k
    if (strcmp(ident, "let") == 0)      return TOK_LET;
90
171k
    if (strcmp(ident, "freeze") == 0)   return TOK_FREEZE;
91
170k
    if (strcmp(ident, "thaw") == 0)     return TOK_THAW;
92
170k
    if (strcmp(ident, "forge") == 0)    return TOK_FORGE;
93
170k
    if (strcmp(ident, "fn") == 0)       return TOK_FN;
94
162k
    if (strcmp(ident, "struct") == 0)   return TOK_STRUCT;
95
162k
    if (strcmp(ident, "if") == 0)       return TOK_IF;
96
155k
    if (strcmp(ident, "else") == 0)     return TOK_ELSE;
97
154k
    if (strcmp(ident, "for") == 0)      return TOK_FOR;
98
153k
    if (strcmp(ident, "in") == 0)       return TOK_IN;
99
152k
    if (strcmp(ident, "while") == 0)    return TOK_WHILE;
100
152k
    if (strcmp(ident, "loop") == 0)     return TOK_LOOP;
101
151k
    if (strcmp(ident, "return") == 0)   return TOK_RETURN;
102
142k
    if (strcmp(ident, "break") == 0)    return TOK_BREAK;
103
142k
    if (strcmp(ident, "continue") == 0) return TOK_CONTINUE;
104
142k
    if (strcmp(ident, "spawn") == 0)    return TOK_SPAWN;
105
142k
    if (strcmp(ident, "true") == 0)     return TOK_TRUE;
106
141k
    if (strcmp(ident, "false") == 0)    return TOK_FALSE;
107
139k
    if (strcmp(ident, "nil") == 0)      return TOK_NIL;
108
138k
    if (strcmp(ident, "clone") == 0)    return TOK_CLONE;
109
138k
    if (strcmp(ident, "anneal") == 0)   return TOK_ANNEAL;
110
138k
    if (strcmp(ident, "print") == 0)    return TOK_PRINT;
111
135k
    if (strcmp(ident, "try") == 0)      return TOK_TRY;
112
135k
    if (strcmp(ident, "catch") == 0)    return TOK_CATCH;
113
134k
    if (strcmp(ident, "scope") == 0)    return TOK_SCOPE;
114
134k
    if (strcmp(ident, "test") == 0)     return TOK_TEST;
115
134k
    if (strcmp(ident, "match") == 0)    return TOK_MATCH;
116
134k
    if (strcmp(ident, "enum") == 0)     return TOK_ENUM;
117
134k
    if (strcmp(ident, "import") == 0)   return TOK_IMPORT;
118
134k
    if (strcmp(ident, "from") == 0)     return TOK_FROM;
119
134k
    if (strcmp(ident, "as") == 0)       return TOK_AS;
120
134k
    if (strcmp(ident, "crystallize") == 0) return TOK_CRYSTALLIZE;
121
134k
    if (strcmp(ident, "sublimate") == 0) return TOK_SUBLIMATE;
122
134k
    if (strcmp(ident, "defer") == 0) return TOK_DEFER;
123
134k
    if (strcmp(ident, "trait") == 0) return TOK_TRAIT;
124
134k
    if (strcmp(ident, "impl") == 0) return TOK_IMPL;
125
134k
    if (strcmp(ident, "export") == 0) return TOK_EXPORT;
126
134k
    return TOK_IDENT;
127
134k
}
128
129
/* Forward declarations for mutual recursion (string interpolation) */
130
static bool lex_string_or_interp(Lexer *lex, LatVec *tokens, char **err);
131
static bool lex_triple_quote_string(Lexer *lex, LatVec *tokens, char **err);
132
static bool lex_one(Lexer *lex, LatVec *tokens, char **err);
133
134
416k
static bool next_token(Lexer *lex, Token *out, char **err) {
135
416k
    size_t line = lex->line;
136
416k
    size_t col = lex->col;
137
416k
    char ch = lex_peek(lex);
138
139
    /* Mode directive: #mode */
140
416k
    if (ch == '#') {
141
9
        lex_advance(lex);
142
9
        char *word = read_ident(lex);
143
9
        if (strcmp(word, "mode") != 0) {
144
0
            *err = NULL;
145
0
            (void)asprintf(err, "%zu:%zu: unexpected directive '#%s'", line, col, word);
146
0
            free(word);
147
0
            return false;
148
0
        }
149
9
        free(word);
150
9
        skip_whitespace_and_comments(lex);
151
9
        char *mode = read_ident(lex);
152
9
        if (strcmp(mode, "casual") != 0 && strcmp(mode, "strict") != 0) {
153
0
            *err = NULL;
154
0
            (void)asprintf(err, "%zu:%zu: expected 'casual' or 'strict' after #mode, got '%s'", line, col, mode);
155
0
            free(mode);
156
0
            return false;
157
0
        }
158
9
        *out = token_str(TOK_MODE_DIRECTIVE, mode, line, col);
159
9
        return true;
160
9
    }
161
162
    /* String literals are handled by lex_string_or_interp() via lex_one() */
163
164
    /* Number literal */
165
416k
    if (isdigit((unsigned char)ch)) {
166
10.5k
        size_t start = lex->pos;
167
10.5k
        bool is_float = false;
168
22.8k
        while (lex->pos < lex->len && isdigit((unsigned char)lex_peek(lex))) {
169
12.2k
            lex_advance(lex);
170
12.2k
        }
171
10.5k
        if (lex_peek(lex) == '.' && isdigit((unsigned char)lex_peek_ahead(lex, 1))) {
172
324
            is_float = true;
173
324
            lex_advance(lex); /* '.' */
174
843
            while (lex->pos < lex->len && isdigit((unsigned char)lex_peek(lex))) {
175
519
                lex_advance(lex);
176
519
            }
177
324
        }
178
10.5k
        size_t num_len = lex->pos - start;
179
10.5k
        char *num_str = malloc(num_len + 1);
180
10.5k
        memcpy(num_str, lex->source + start, num_len);
181
10.5k
        num_str[num_len] = '\0';
182
10.5k
        if (is_float) {
183
324
            double val = strtod(num_str, NULL);
184
324
            free(num_str);
185
324
            *out = token_float(val, line, col);
186
10.2k
        } else {
187
10.2k
            int64_t val = strtoll(num_str, NULL, 10);
188
10.2k
            free(num_str);
189
10.2k
            *out = token_int(val, line, col);
190
10.2k
        }
191
10.5k
        return true;
192
10.5k
    }
193
194
    /* Identifiers and keywords */
195
406k
    if (isalpha((unsigned char)ch) || ch == '_') {
196
184k
        char *ident = read_ident(lex);
197
184k
        TokenType type = keyword_lookup(ident);
198
184k
        if (type != TOK_IDENT) {
199
50.5k
            free(ident);
200
50.5k
            *out = token_simple(type, line, col);
201
134k
        } else {
202
134k
            *out = token_str(TOK_IDENT, ident, line, col);
203
134k
        }
204
184k
        return true;
205
184k
    }
206
207
    /* Operators and punctuation */
208
221k
    lex_advance(lex);
209
221k
    switch (ch) {
210
30
        case '~': *out = token_simple(TOK_TILDE, line, col); return true;
211
3.57k
        case '+':
212
3.57k
            if (lex_peek(lex) == '=') { lex_advance(lex); *out = token_simple(TOK_PLUS_EQ, line, col); }
213
3.51k
            else { *out = token_simple(TOK_PLUS, line, col); }
214
3.57k
            return true;
215
30
        case '%':
216
30
            if (lex_peek(lex) == '=') { lex_advance(lex); *out = token_simple(TOK_PERCENT_EQ, line, col); }
217
27
            else { *out = token_simple(TOK_PERCENT, line, col); }
218
30
            return true;
219
45.0k
        case '(': *out = token_simple(TOK_LPAREN, line, col); return true;
220
45.0k
        case ')': *out = token_simple(TOK_RPAREN, line, col); return true;
221
21.1k
        case '{': *out = token_simple(TOK_LBRACE, line, col); return true;
222
21.1k
        case '}': *out = token_simple(TOK_RBRACE, line, col); return true;
223
4.97k
        case '[': *out = token_simple(TOK_LBRACKET, line, col); return true;
224
4.97k
        case ']': *out = token_simple(TOK_RBRACKET, line, col); return true;
225
16.6k
        case ',': *out = token_simple(TOK_COMMA, line, col); return true;
226
24
        case ';': *out = token_simple(TOK_SEMICOLON, line, col); return true;
227
42
        case '/':
228
42
            if (lex_peek(lex) == '=') { lex_advance(lex); *out = token_simple(TOK_SLASH_EQ, line, col); }
229
39
            else { *out = token_simple(TOK_SLASH, line, col); }
230
42
            return true;
231
126
        case '*':
232
126
            if (lex_peek(lex) == '=') { lex_advance(lex); *out = token_simple(TOK_STAR_EQ, line, col); }
233
123
            else { *out = token_simple(TOK_STAR, line, col); }
234
126
            return true;
235
108
        case '&':
236
108
            if (lex_peek(lex) == '&') { lex_advance(lex); *out = token_simple(TOK_AND, line, col); }
237
15
            else if (lex_peek(lex) == '=') { lex_advance(lex); *out = token_simple(TOK_AMP_EQ, line, col); }
238
12
            else { *out = token_simple(TOK_AMPERSAND, line, col); }
239
108
            return true;
240
3.86k
        case '|':
241
3.86k
            if (lex_peek(lex) == '|') { lex_advance(lex); *out = token_simple(TOK_OR, line, col); }
242
3.85k
            else if (lex_peek(lex) == '=') { lex_advance(lex); *out = token_simple(TOK_PIPE_EQ, line, col); }
243
3.85k
            else { *out = token_simple(TOK_PIPE, line, col); }
244
3.86k
            return true;
245
12
        case '^':
246
12
            if (lex_peek(lex) == '=') { lex_advance(lex); *out = token_simple(TOK_CARET_EQ, line, col); }
247
9
            else { *out = token_simple(TOK_CARET, line, col); }
248
12
            return true;
249
19.1k
        case '=':
250
19.1k
            if (lex_peek(lex) == '=') { lex_advance(lex); *out = token_simple(TOK_EQEQ, line, col); }
251
16.3k
            else if (lex_peek(lex) == '>') { lex_advance(lex); *out = token_simple(TOK_FATARROW, line, col); }
252
16.2k
            else { *out = token_simple(TOK_EQ, line, col); }
253
19.1k
            return true;
254
1.36k
        case '!':
255
1.36k
            if (lex_peek(lex) == '=') { lex_advance(lex); *out = token_simple(TOK_BANGEQ, line, col); }
256
561
            else { *out = token_simple(TOK_BANG, line, col); }
257
1.36k
            return true;
258
807
        case '<':
259
807
            if (lex_peek(lex) == '<') {
260
9
                lex_advance(lex);
261
9
                if (lex_peek(lex) == '=') { lex_advance(lex); *out = token_simple(TOK_LSHIFT_EQ, line, col); }
262
6
                else { *out = token_simple(TOK_LSHIFT, line, col); }
263
9
            }
264
798
            else if (lex_peek(lex) == '=') { lex_advance(lex); *out = token_simple(TOK_LTEQ, line, col); }
265
576
            else { *out = token_simple(TOK_LT, line, col); }
266
807
            return true;
267
1.03k
        case '>':
268
1.03k
            if (lex_peek(lex) == '>') {
269
6
                lex_advance(lex);
270
6
                if (lex_peek(lex) == '=') { lex_advance(lex); *out = token_simple(TOK_RSHIFT_EQ, line, col); }
271
3
                else { *out = token_simple(TOK_RSHIFT, line, col); }
272
6
            }
273
1.02k
            else if (lex_peek(lex) == '=') { lex_advance(lex); *out = token_simple(TOK_GTEQ, line, col); }
274
639
            else { *out = token_simple(TOK_GT, line, col); }
275
1.03k
            return true;
276
5.04k
        case '-':
277
5.04k
            if (lex_peek(lex) == '>') { lex_advance(lex); *out = token_simple(TOK_ARROW, line, col); }
278
393
            else if (lex_peek(lex) == '=') { lex_advance(lex); *out = token_simple(TOK_MINUS_EQ, line, col); }
279
390
            else { *out = token_simple(TOK_MINUS, line, col); }
280
5.04k
            return true;
281
16.3k
        case '.':
282
16.3k
            if (lex_peek(lex) == '.') {
283
456
                lex_advance(lex);
284
456
                if (lex_peek(lex) == '.') { lex_advance(lex); *out = token_simple(TOK_DOTDOTDOT, line, col); }
285
48
                else { *out = token_simple(TOK_DOTDOT, line, col); }
286
456
            }
287
15.9k
            else { *out = token_simple(TOK_DOT, line, col); }
288
16.3k
            return true;
289
10.7k
        case ':':
290
10.7k
            if (lex_peek(lex) == ':') { lex_advance(lex); *out = token_simple(TOK_COLONCOLON, line, col); }
291
9.11k
            else { *out = token_simple(TOK_COLON, line, col); }
292
10.7k
            return true;
293
66
        case '?':
294
66
            if (lex_peek(lex) == '?') { lex_advance(lex); *out = token_simple(TOK_QUESTION_QUESTION, line, col); }
295
48
            else if (lex_peek(lex) == '.') { lex_advance(lex); *out = token_simple(TOK_QUESTION_DOT, line, col); }
296
21
            else if (lex_peek(lex) == '[') { lex_advance(lex); *out = token_simple(TOK_QUESTION_LBRACKET, line, col); }
297
18
            else { *out = token_simple(TOK_QUESTION, line, col); }
298
66
            return true;
299
0
        default:
300
0
            *err = NULL;
301
0
            (void)asprintf(err, "%zu:%zu: unexpected character '%c'", line, col, ch);
302
0
            return false;
303
221k
    }
304
221k
}
305
306
/* Helper: scan escape sequence inside a string, appending to buf.
307
 * On entry, the backslash has already been consumed. Returns false on error. */
308
static bool lex_string_escape(Lexer *lex, char **buf, size_t *buf_len,
309
708
                               size_t *buf_cap, size_t line, size_t col, char **err) {
310
708
    if (lex->pos >= lex->len) {
311
0
        *err = NULL;
312
0
        (void)asprintf(err, "%zu:%zu: unterminated string escape", line, col);
313
0
        return false;
314
0
    }
315
708
    char esc = lex_advance(lex);
316
708
    char c;
317
708
    switch (esc) {
318
228
        case 'n':  c = '\n'; break;
319
60
        case 't':  c = '\t'; break;
320
33
        case 'r':  c = '\r'; break;
321
3
        case '0':  c = '\0'; break;
322
93
        case '\\': c = '\\'; break;
323
198
        case '"':  c = '"';  break;
324
3
        case '\'': c = '\''; break;
325
3
        case '$':  c = '$';  break;
326
9
        case 'x': {
327
9
            if (lex->pos + 1 >= lex->len) {
328
0
                *err = NULL;
329
0
                (void)asprintf(err, "%zu:%zu: incomplete \\x escape", line, col);
330
0
                return false;
331
0
            }
332
9
            char h1 = lex_advance(lex);
333
9
            char h2 = lex_advance(lex);
334
9
            int d1 = -1, d2 = -1;
335
9
            if (h1 >= '0' && h1 <= '9') d1 = h1 - '0';
336
3
            else if (h1 >= 'a' && h1 <= 'f') d1 = h1 - 'a' + 10;
337
3
            else if (h1 >= 'A' && h1 <= 'F') d1 = h1 - 'A' + 10;
338
9
            if (h2 >= '0' && h2 <= '9') d2 = h2 - '0';
339
6
            else if (h2 >= 'a' && h2 <= 'f') d2 = h2 - 'a' + 10;
340
3
            else if (h2 >= 'A' && h2 <= 'F') d2 = h2 - 'A' + 10;
341
9
            if (d1 < 0 || d2 < 0) {
342
3
                *err = NULL;
343
3
                (void)asprintf(err, "%zu:%zu: invalid hex escape '\\x%c%c'", line, col, h1, h2);
344
3
                return false;
345
3
            }
346
6
            c = (char)((d1 << 4) | d2);
347
6
            break;
348
9
        }
349
78
        default: c = esc; break;
350
708
    }
351
705
    if (*buf_len + 1 >= *buf_cap) {
352
0
        *buf_cap *= 2;
353
0
        *buf = realloc(*buf, *buf_cap);
354
0
    }
355
705
    (*buf)[(*buf_len)++] = c;
356
705
    return true;
357
708
}
358
359
/* Scan a string literal, handling interpolation with ${...}.
360
 * On entry, lex is positioned at the opening '"'.
361
 * Pushes TOK_STRING_LIT (no interpolation) or
362
 * TOK_INTERP_START / expression tokens / TOK_INTERP_MID / ... / TOK_INTERP_END. */
363
18.1k
static bool lex_string_or_interp(Lexer *lex, LatVec *tokens, char **err) {
364
18.1k
    size_t line = lex->line;
365
18.1k
    size_t col = lex->col;
366
18.1k
    lex_advance(lex); /* consume opening " */
367
368
18.1k
    bool has_interp = false;
369
18.1k
    size_t buf_cap = 64;
370
18.1k
    size_t buf_len = 0;
371
18.1k
    char *buf = malloc(buf_cap);
372
373
155k
    for (;;) {
374
155k
        if (lex->pos >= lex->len) {
375
0
            free(buf);
376
0
            *err = NULL;
377
0
            (void)asprintf(err, "%zu:%zu: unterminated string literal", line, col);
378
0
            return false;
379
0
        }
380
381
        /* Check for interpolation: ${ */
382
155k
        if (lex_peek(lex) == '$' && lex_peek_ahead(lex, 1) == '{') {
383
            /* Emit accumulated text as INTERP_START or INTERP_MID */
384
69
            buf[buf_len] = '\0';
385
69
            TokenType seg_type = has_interp ? TOK_INTERP_MID : TOK_INTERP_START;
386
69
            Token seg = token_str(seg_type, buf, line, col);
387
69
            lat_vec_push(tokens, &seg);
388
69
            has_interp = true;
389
390
69
            lex_advance(lex); /* consume $ */
391
69
            lex_advance(lex); /* consume { */
392
393
            /* Lex expression tokens until brace depth returns to 0 */
394
69
            int depth = 1;
395
198
            while (depth > 0) {
396
198
                skip_whitespace_and_comments(lex);
397
198
                if (lex->pos >= lex->len) {
398
0
                    *err = NULL;
399
0
                    (void)asprintf(err, "%zu:%zu: unterminated string interpolation", line, col);
400
0
                    return false;
401
0
                }
402
                /* End of interpolation */
403
198
                if (lex_peek(lex) == '}' && depth == 1) {
404
69
                    lex_advance(lex); /* consume closing } */
405
69
                    break;
406
69
                }
407
                /* Lex one token (handles nested strings with interpolation) */
408
129
                size_t before = tokens->len;
409
129
                if (!lex_one(lex, tokens, err)) return false;
410
                /* Track brace depth */
411
129
                if (tokens->len > before) {
412
129
                    Token *last = lat_vec_get(tokens, tokens->len - 1);
413
129
                    if (last->type == TOK_LBRACE) depth++;
414
129
                    else if (last->type == TOK_RBRACE) depth--;
415
129
                }
416
129
            }
417
418
            /* Reset buffer for next string segment */
419
69
            buf_cap = 64;
420
69
            buf_len = 0;
421
69
            buf = malloc(buf_cap);
422
69
            continue;
423
69
        }
424
425
        /* Check for end of string */
426
155k
        if (lex_peek(lex) == '"') {
427
18.1k
            lex_advance(lex); /* consume closing " */
428
18.1k
            buf[buf_len] = '\0';
429
18.1k
            if (has_interp) {
430
42
                Token seg = token_str(TOK_INTERP_END, buf, line, col);
431
42
                lat_vec_push(tokens, &seg);
432
18.1k
            } else {
433
18.1k
                Token seg = token_str(TOK_STRING_LIT, buf, line, col);
434
18.1k
                lat_vec_push(tokens, &seg);
435
18.1k
            }
436
18.1k
            return true;
437
18.1k
        }
438
439
        /* Check for escape sequence */
440
137k
        if (lex_peek(lex) == '\\') {
441
699
            lex_advance(lex); /* consume backslash */
442
699
            if (!lex_string_escape(lex, &buf, &buf_len, &buf_cap, line, col, err)) {
443
3
                free(buf);
444
3
                return false;
445
3
            }
446
696
            continue;
447
699
        }
448
449
        /* Regular character */
450
136k
        char c = lex_advance(lex);
451
136k
        if (buf_len + 1 >= buf_cap) {
452
9
            buf_cap *= 2;
453
9
            buf = realloc(buf, buf_cap);
454
9
        }
455
136k
        buf[buf_len++] = c;
456
136k
    }
457
18.1k
}
458
459
/* Scan a single-quoted string literal (no interpolation).
460
 * On entry, lex is positioned at the opening '\''.
461
 * Pushes a TOK_STRING_LIT token. */
462
30
static bool lex_single_quote_string(Lexer *lex, LatVec *tokens, char **err) {
463
30
    size_t line = lex->line;
464
30
    size_t col = lex->col;
465
30
    lex_advance(lex); /* consume opening ' */
466
467
30
    size_t buf_cap = 64;
468
30
    size_t buf_len = 0;
469
30
    char *buf = malloc(buf_cap);
470
471
408
    for (;;) {
472
408
        if (lex->pos >= lex->len) {
473
0
            free(buf);
474
0
            *err = NULL;
475
0
            (void)asprintf(err, "%zu:%zu: unterminated string literal", line, col);
476
0
            return false;
477
0
        }
478
479
        /* End of string */
480
408
        if (lex_peek(lex) == '\'') {
481
30
            lex_advance(lex); /* consume closing ' */
482
30
            buf[buf_len] = '\0';
483
30
            Token seg = token_str(TOK_STRING_LIT, buf, line, col);
484
30
            lat_vec_push(tokens, &seg);
485
30
            return true;
486
30
        }
487
488
        /* Escape sequence */
489
378
        if (lex_peek(lex) == '\\') {
490
9
            lex_advance(lex); /* consume backslash */
491
9
            if (!lex_string_escape(lex, &buf, &buf_len, &buf_cap, line, col, err)) {
492
0
                free(buf);
493
0
                return false;
494
0
            }
495
9
            continue;
496
9
        }
497
498
        /* Regular character */
499
369
        char c = lex_advance(lex);
500
369
        if (buf_len + 1 >= buf_cap) {
501
0
            buf_cap *= 2;
502
0
            buf = realloc(buf, buf_cap);
503
0
        }
504
369
        buf[buf_len++] = c;
505
369
    }
506
30
}
507
508
/* Dedent the raw body of a triple-quoted string.
 *
 * `raw` holds `raw_len` bytes of content between the opening and closing
 * triple quotes. If the text after the last newline consists only of
 * spaces and tabs (i.e. the closing quotes sit on their own line), that
 * final newline and the whitespace are dropped, and the whitespace width
 * (space = 1, tab = 4) becomes the dedent level: up to that much leading
 * whitespace is stripped from every line. Otherwise the content is
 * returned unchanged.
 *
 * Returns a new heap-allocated, NUL-terminated string and stores its
 * length in *out_len. The caller must free the result.
 *
 * NOTE(review): a tab is removed whole even when its width of 4 exceeds
 * the remaining indent budget, so a line may lose more than the closing
 * indent. Confirm this is intended. */
static char *dedent_triple_string(const char *raw, size_t raw_len, size_t *out_len) {
    size_t closing_indent = 0;
    size_t content_end = raw_len;

    /* `tail` ends up one past the last newline, or 0 if there is none. */
    size_t tail = raw_len;
    while (tail > 0 && raw[tail - 1] != '\n') {
        tail--;
    }

    if (tail > 0) {
        /* raw[tail .. raw_len) is whatever precedes the closing quotes on
         * their line; it only sets the indent if it is pure whitespace. */
        bool all_ws = true;
        size_t width = 0;
        for (size_t i = tail; i < raw_len; i++) {
            if (raw[i] == ' ') {
                width++;
            } else if (raw[i] == '\t') {
                width += 4;
            } else {
                all_ws = false;
                break;
            }
        }
        if (all_ws) {
            closing_indent = width;
            content_end = tail - 1; /* drop the final newline and the indent */
        }
    }

    /* Dedenting only removes bytes, so content_end + 1 is always enough
     * room for the result and its terminator; no growth is needed. */
    char *result = malloc(content_end + 1);

    if (closing_indent == 0) {
        memcpy(result, raw, content_end);
        result[content_end] = '\0';
        *out_len = content_end;
        return result;
    }

    size_t result_len = 0;
    size_t i = 0;
    bool at_line_start = true;

    while (i < content_end) {
        if (at_line_start) {
            /* Strip leading whitespace until the indent budget is used up
             * or a non-whitespace character is reached. */
            size_t skipped = 0;
            while (i < content_end && skipped < closing_indent) {
                if (raw[i] == ' ') {
                    skipped++;
                } else if (raw[i] == '\t') {
                    skipped += 4;
                } else {
                    break;
                }
                i++;
            }
            at_line_start = false;
            if (i >= content_end) {
                break;
            }
        }

        char c = raw[i++];
        if (c == '\n') {
            at_line_start = true;
        }
        result[result_len++] = c;
    }

    result[result_len] = '\0';
    *out_len = result_len;
    return result;
}
580
581
/* Scan a triple-quoted string literal with optional interpolation and dedenting.
582
 * On entry, lex is positioned at the first '"' of opening """.
583
 * Supports ${...} interpolation like double-quoted strings.
584
 * Dedents based on closing """ indentation. */
585
27
static bool lex_triple_quote_string(Lexer *lex, LatVec *tokens, char **err) {
586
27
    size_t line = lex->line;
587
27
    size_t col = lex->col;
588
589
    /* Consume opening """ */
590
27
    lex_advance(lex); lex_advance(lex); lex_advance(lex);
591
592
    /* Skip optional newline immediately after """ */
593
27
    if (lex_peek(lex) == '\n') {
594
15
        lex_advance(lex);
595
15
    } else if (lex_peek(lex) == '\r' && lex_peek_ahead(lex, 1) == '\n') {
596
0
        lex_advance(lex); lex_advance(lex);
597
0
    }
598
599
    /* Collect raw content until closing """ */
600
27
    size_t raw_cap = 256;
601
27
    size_t raw_len = 0;
602
27
    char *raw = malloc(raw_cap);
603
604
735
    for (;;) {
605
735
        if (lex->pos >= lex->len) {
606
0
            free(raw);
607
0
            *err = NULL;
608
0
            (void)asprintf(err, "%zu:%zu: unterminated triple-quoted string", line, col);
609
0
            return false;
610
0
        }
611
735
        if (lex_peek(lex) == '"' && lex_peek_ahead(lex, 1) == '"' &&
612
735
            lex_peek_ahead(lex, 2) == '"') {
613
27
            lex_advance(lex); lex_advance(lex); lex_advance(lex);
614
27
            break;
615
27
        }
616
708
        char c = lex_advance(lex);
617
708
        if (raw_len + 1 >= raw_cap) {
618
0
            raw_cap *= 2;
619
0
            raw = realloc(raw, raw_cap);
620
0
        }
621
708
        raw[raw_len++] = c;
622
708
    }
623
27
    raw[raw_len] = '\0';
624
625
    /* Dedent */
626
27
    size_t dedented_len = 0;
627
27
    char *dedented = dedent_triple_string(raw, raw_len, &dedented_len);
628
27
    free(raw);
629
630
    /* Process dedented content for escapes and interpolation */
631
27
    bool has_interp = false;
632
27
    size_t buf_cap = 64;
633
27
    size_t buf_len = 0;
634
27
    char *buf = malloc(buf_cap);
635
27
    size_t pos = 0;
636
637
489
    while (pos < dedented_len) {
638
        /* Check for interpolation ${ */
639
462
        if (dedented[pos] == '$' && pos + 1 < dedented_len && dedented[pos + 1] == '{') {
640
9
            buf[buf_len] = '\0';
641
9
            TokenType seg_type = has_interp ? TOK_INTERP_MID : TOK_INTERP_START;
642
9
            Token seg = token_str(seg_type, buf, line, col);
643
9
            lat_vec_push(tokens, &seg);
644
9
            has_interp = true;
645
9
            pos += 2; /* skip ${ */
646
647
            /* Extract expression text by finding matching } */
648
9
            int depth = 1;
649
9
            size_t expr_start = pos;
650
51
            while (pos < dedented_len && depth > 0) {
651
51
                char ec = dedented[pos];
652
51
                if (ec == '{') { depth++; }
653
51
                else if (ec == '}') { depth--; if (depth == 0) break; }
654
42
                else if (ec == '"') {
655
0
                    pos++;
656
0
                    while (pos < dedented_len && dedented[pos] != '"') {
657
0
                        if (dedented[pos] == '\\') pos++;
658
0
                        pos++;
659
0
                    }
660
42
                } else if (ec == '\'') {
661
0
                    pos++;
662
0
                    while (pos < dedented_len && dedented[pos] != '\'') {
663
0
                        if (dedented[pos] == '\\') pos++;
664
0
                        pos++;
665
0
                    }
666
0
                }
667
42
                pos++;
668
42
            }
669
670
9
            if (depth != 0) {
671
0
                free(buf); free(dedented);
672
0
                *err = NULL;
673
0
                (void)asprintf(err, "%zu:%zu: unterminated interpolation in triple-quoted string",
674
0
                               line, col);
675
0
                return false;
676
0
            }
677
678
            /* Lex expression tokens via a sub-lexer */
679
9
            size_t expr_len = pos - expr_start;
680
9
            char *expr_src = malloc(expr_len + 1);
681
9
            memcpy(expr_src, dedented + expr_start, expr_len);
682
9
            expr_src[expr_len] = '\0';
683
684
9
            Lexer expr_lex = lexer_new(expr_src);
685
24
            for (;;) {
686
24
                skip_whitespace_and_comments(&expr_lex);
687
24
                if (expr_lex.pos >= expr_lex.len) break;
688
15
                if (!lex_one(&expr_lex, tokens, err)) {
689
0
                    free(expr_src); free(buf); free(dedented);
690
0
                    return false;
691
0
                }
692
15
            }
693
9
            free(expr_src);
694
9
            pos++; /* skip closing } */
695
696
            /* Reset buffer for next segment */
697
9
            buf_cap = 64;
698
9
            buf_len = 0;
699
9
            buf = malloc(buf_cap);
700
9
            continue;
701
9
        }
702
703
        /* Check for escape sequence */
704
453
        if (dedented[pos] == '\\' && pos + 1 < dedented_len) {
705
3
            pos++; /* skip backslash */
706
3
            char esc = dedented[pos++];
707
3
            char c;
708
3
            switch (esc) {
709
0
                case 'n':  c = '\n'; break;
710
3
                case 't':  c = '\t'; break;
711
0
                case 'r':  c = '\r'; break;
712
0
                case '0':  c = '\0'; break;
713
0
                case '\\': c = '\\'; break;
714
0
                case '"':  c = '"';  break;
715
0
                case '\'': c = '\''; break;
716
0
                case '$':  c = '$';  break;
717
0
                case 'x': {
718
0
                    if (pos + 1 >= dedented_len) {
719
0
                        free(buf); free(dedented);
720
0
                        *err = NULL;
721
0
                        (void)asprintf(err, "%zu:%zu: incomplete \\x escape in triple-quoted string",
722
0
                                       line, col);
723
0
                        return false;
724
0
                    }
725
0
                    char h1 = dedented[pos++];
726
0
                    char h2 = dedented[pos++];
727
0
                    int d1 = -1, d2 = -1;
728
0
                    if (h1 >= '0' && h1 <= '9') d1 = h1 - '0';
729
0
                    else if (h1 >= 'a' && h1 <= 'f') d1 = h1 - 'a' + 10;
730
0
                    else if (h1 >= 'A' && h1 <= 'F') d1 = h1 - 'A' + 10;
731
0
                    if (h2 >= '0' && h2 <= '9') d2 = h2 - '0';
732
0
                    else if (h2 >= 'a' && h2 <= 'f') d2 = h2 - 'a' + 10;
733
0
                    else if (h2 >= 'A' && h2 <= 'F') d2 = h2 - 'A' + 10;
734
0
                    if (d1 < 0 || d2 < 0) {
735
0
                        free(buf); free(dedented);
736
0
                        *err = NULL;
737
0
                        (void)asprintf(err, "%zu:%zu: invalid hex escape in triple-quoted string",
738
0
                                       line, col);
739
0
                        return false;
740
0
                    }
741
0
                    c = (char)((d1 << 4) | d2);
742
0
                    break;
743
0
                }
744
0
                default: c = esc; break;
745
3
            }
746
3
            if (buf_len + 1 >= buf_cap) {
747
0
                buf_cap *= 2;
748
0
                buf = realloc(buf, buf_cap);
749
0
            }
750
3
            buf[buf_len++] = c;
751
3
            continue;
752
3
        }
753
754
        /* Regular character */
755
450
        char c = dedented[pos++];
756
450
        if (buf_len + 1 >= buf_cap) {
757
0
            buf_cap *= 2;
758
0
            buf = realloc(buf, buf_cap);
759
0
        }
760
450
        buf[buf_len++] = c;
761
450
    }
762
763
    /* Emit final token */
764
27
    buf[buf_len] = '\0';
765
27
    if (has_interp) {
766
6
        Token seg = token_str(TOK_INTERP_END, buf, line, col);
767
6
        lat_vec_push(tokens, &seg);
768
21
    } else {
769
21
        Token seg = token_str(TOK_STRING_LIT, buf, line, col);
770
21
        lat_vec_push(tokens, &seg);
771
21
    }
772
27
    free(dedented);
773
27
    return true;
774
27
}
775
776
/* Lex one token (or multiple for interpolated strings) and push to tokens. */
777
434k
static bool lex_one(Lexer *lex, LatVec *tokens, char **err) {
778
434k
    skip_whitespace_and_comments(lex);
779
434k
    if (lex_peek(lex) == '"') {
780
18.2k
        if (lex_peek_ahead(lex, 1) == '"' && lex_peek_ahead(lex, 2) == '"') {
781
27
            return lex_triple_quote_string(lex, tokens, err);
782
27
        }
783
18.1k
        return lex_string_or_interp(lex, tokens, err);
784
18.2k
    }
785
416k
    if (lex_peek(lex) == '\'') {
786
30
        return lex_single_quote_string(lex, tokens, err);
787
30
    }
788
416k
    Token tok;
789
416k
    if (!next_token(lex, &tok, err)) return false;
790
416k
    lat_vec_push(tokens, &tok);
791
416k
    return true;
792
416k
}
793
794
2.88k
/* Tokenize the whole input held by `lex`.
 *
 * On success returns a vector of Token ending with TOK_EOF and leaves
 * *err as NULL. On the first lexing error, every token produced so far is
 * released with token_free(), the vector is freed, *err holds the message
 * set by the failing scanner, and a new empty vector is returned. */
LatVec lexer_tokenize(Lexer *lex, char **err) {
    LatVec tokens = lat_vec_new(sizeof(Token));
    *err = NULL;

    skip_whitespace_and_comments(lex);
    while (lex->pos < lex->len) {
        if (!lex_one(lex, &tokens, err)) {
            /* Release everything gathered so far before bailing out. */
            for (size_t i = 0; i < tokens.len; i++) {
                token_free(lat_vec_get(&tokens, i));
            }
            lat_vec_free(&tokens);
            return lat_vec_new(sizeof(Token));
        }
        skip_whitespace_and_comments(lex);
    }

    Token eof = token_simple(TOK_EOF, lex->line, lex->col);
    lat_vec_push(&tokens, &eof);
    return tokens;
}