nginx_lint_parser/
lexer_rowan.rs

1//! Rowan-compatible lexer for nginx configuration files.
2//!
3//! Produces a flat sequence of `(SyntaxKind, &str)` pairs where every byte
4//! of the input is covered (whitespace and newlines are explicit tokens).
5//! This is the input expected by the rowan-based [`parser`](crate::parser).
6
7use crate::syntax_kind::SyntaxKind;
8
9/// Tokenise `source` into a lossless sequence of `(SyntaxKind, text)` pairs.
10///
11/// Every byte of the input is represented exactly once, so
12/// `tokens.iter().map(|(_, t)| *t).collect::<String>() == source` always holds.
13pub fn tokenize(source: &str) -> Vec<(SyntaxKind, &str)> {
14    let mut lexer = RowanLexer::new(source);
15    lexer.tokenize_all()
16}
17
/// Internal lexer state.
struct RowanLexer<'a> {
    // Full input text; every emitted token text is a slice of this buffer.
    source: &'a str,
    // Current byte offset into `source` (always on a char boundary).
    pos: usize,
    // Tokens accumulated so far, in source order.
    tokens: Vec<(SyntaxKind, &'a str)>,
}
24
25impl<'a> RowanLexer<'a> {
26    fn new(source: &'a str) -> Self {
27        Self {
28            source,
29            pos: 0,
30            tokens: Vec::new(),
31        }
32    }
33
34    fn remaining(&self) -> &'a str {
35        &self.source[self.pos..]
36    }
37
38    fn peek(&self) -> Option<char> {
39        self.remaining().chars().next()
40    }
41
42    /// Peek at the character at offset `n` from current position.
43    fn peek_at(&self, n: usize) -> Option<char> {
44        self.remaining().chars().nth(n)
45    }
46
47    fn at_end(&self) -> bool {
48        self.pos >= self.source.len()
49    }
50
51    /// Advance by one character (its UTF-8 byte length) and return it.
52    fn advance_char(&mut self) -> Option<char> {
53        let ch = self.peek()?;
54        self.pos += ch.len_utf8();
55        Some(ch)
56    }
57
58    fn emit(&mut self, kind: SyntaxKind, start: usize) {
59        let text = &self.source[start..self.pos];
60        if !text.is_empty() {
61            self.tokens.push((kind, text));
62        }
63    }
64
65    fn tokenize_all(&mut self) -> Vec<(SyntaxKind, &'a str)> {
66        // Track whether the previous non-whitespace token was whitespace-preceded
67        // for comment detection (# is only a comment after whitespace or at line start).
68        let mut at_line_start = true;
69
70        while !self.at_end() {
71            let start = self.pos;
72            let ch = self.peek().unwrap();
73
74            match ch {
75                '\n' => {
76                    self.advance_char();
77                    self.emit(SyntaxKind::NEWLINE, start);
78                    at_line_start = true;
79                }
80                ' ' | '\t' => {
81                    self.eat_whitespace();
82                    self.emit(SyntaxKind::WHITESPACE, start);
83                    // at_line_start stays as-is (whitespace doesn't reset it)
84                }
85                '#' if at_line_start || self.preceded_by_whitespace() => {
86                    self.eat_comment();
87                    self.emit(SyntaxKind::COMMENT, start);
88                    at_line_start = false;
89                }
90                ';' => {
91                    self.advance_char();
92                    self.emit(SyntaxKind::SEMICOLON, start);
93                    at_line_start = false;
94                }
95                '{' => {
96                    self.advance_char();
97                    self.emit(SyntaxKind::L_BRACE, start);
98                    at_line_start = false;
99                }
100                '}' => {
101                    self.advance_char();
102                    self.emit(SyntaxKind::R_BRACE, start);
103                    at_line_start = false;
104                }
105                '"' => {
106                    self.eat_double_quoted_string();
107                    self.emit(SyntaxKind::DOUBLE_QUOTED_STRING, start);
108                    at_line_start = false;
109                }
110                '\'' => {
111                    self.eat_single_quoted_string();
112                    self.emit(SyntaxKind::SINGLE_QUOTED_STRING, start);
113                    at_line_start = false;
114                }
115                '$' => {
116                    self.eat_variable();
117                    self.emit(SyntaxKind::VARIABLE, start);
118                    at_line_start = false;
119                }
120                _ if is_ident_start(ch) => {
121                    // Could be IDENT or ARGUMENT (identifiers that continue
122                    // with argument-chars like / or . become ARGUMENT).
123                    self.eat_ident_or_argument();
124                    let text = &self.source[start..self.pos];
125                    let kind = if text
126                        .chars()
127                        .all(|c| is_ident_continue(c) || is_ident_start(c))
128                    {
129                        SyntaxKind::IDENT
130                    } else {
131                        SyntaxKind::ARGUMENT
132                    };
133                    self.tokens.push((kind, text));
134                    at_line_start = false;
135                }
136                _ if is_argument_char(ch) => {
137                    self.eat_argument(ch);
138                    self.emit(SyntaxKind::ARGUMENT, start);
139                    at_line_start = false;
140                }
141                _ => {
142                    // Unknown character — emit as ERROR token.
143                    self.advance_char();
144                    self.emit(SyntaxKind::ERROR, start);
145                    at_line_start = false;
146                }
147            }
148        }
149
150        std::mem::take(&mut self.tokens)
151    }
152
153    // ── whitespace / comment ────────────────────────────────────────
154
155    fn eat_whitespace(&mut self) {
156        while let Some(ch) = self.peek() {
157            if ch == ' ' || ch == '\t' {
158                self.advance_char();
159            } else {
160                break;
161            }
162        }
163    }
164
165    fn eat_comment(&mut self) {
166        // Consume '#' and everything until (but not including) '\n'.
167        while let Some(ch) = self.peek() {
168            if ch == '\n' {
169                break;
170            }
171            self.advance_char();
172        }
173    }
174
175    /// Check if the immediately preceding token was whitespace or we are at
176    /// the beginning of a line.
177    fn preceded_by_whitespace(&self) -> bool {
178        matches!(
179            self.tokens.last(),
180            Some((SyntaxKind::WHITESPACE, _)) | Some((SyntaxKind::NEWLINE, _)) | None
181        )
182    }
183
184    // ── strings ─────────────────────────────────────────────────────
185
186    fn eat_double_quoted_string(&mut self) {
187        // Opening quote
188        self.advance_char(); // "
189        loop {
190            match self.peek() {
191                None => break, // Unterminated at EOF
192                Some('\\') => {
193                    self.advance_char(); // backslash
194                    self.advance_char(); // escaped char (if any)
195                }
196                Some('"') => {
197                    self.advance_char(); // closing quote
198                    break;
199                }
200                Some(_) => {
201                    self.advance_char();
202                }
203            }
204        }
205    }
206
207    fn eat_single_quoted_string(&mut self) {
208        self.advance_char(); // opening '
209        loop {
210            match self.peek() {
211                None => break, // Unterminated at EOF
212                Some('\\') => {
213                    self.advance_char();
214                    self.advance_char();
215                }
216                Some('\'') => {
217                    self.advance_char();
218                    break;
219                }
220                Some(_) => {
221                    self.advance_char();
222                }
223            }
224        }
225    }
226
227    // ── variable ────────────────────────────────────────────────────
228
229    fn eat_variable(&mut self) {
230        self.advance_char(); // '$'
231        if self.peek() == Some('{') {
232            // ${var} syntax
233            self.advance_char(); // '{'
234            while let Some(ch) = self.peek() {
235                if ch == '}' {
236                    self.advance_char();
237                    break;
238                }
239                self.advance_char();
240            }
241        } else {
242            // $var syntax
243            while let Some(ch) = self.peek() {
244                if ch.is_alphanumeric() || ch == '_' {
245                    self.advance_char();
246                } else {
247                    break;
248                }
249            }
250        }
251    }
252
253    // ── identifier / argument ───────────────────────────────────────
254
255    fn eat_ident_or_argument(&mut self) {
256        // Read identifier characters first
257        while let Some(ch) = self.peek() {
258            if is_ident_continue(ch) || is_ident_start(ch) {
259                self.advance_char();
260            } else {
261                break;
262            }
263        }
264        // Continue reading argument characters if present
265        self.eat_argument_continuation();
266    }
267
268    fn eat_argument(&mut self, _first: char) {
269        self.advance_char();
270        self.eat_argument_continuation();
271    }
272
273    /// Continue reading argument characters including regex quantifiers
274    /// like `{8,}` and escaped braces like `\{` and `\}`.
275    fn eat_argument_continuation(&mut self) {
276        while let Some(ch) = self.peek() {
277            if is_argument_char(ch) || is_ident_continue(ch) || is_ident_start(ch) {
278                // Check for escaped brace
279                if ch == '\\' && matches!(self.peek_at(1), Some('{') | Some('}')) {
280                    self.advance_char(); // '\'
281                    self.advance_char(); // '{' or '}'
282                    continue;
283                }
284                self.advance_char();
285            } else if ch == '{' {
286                // Check for regex quantifier
287                if let Some(len) = self.peek_regex_quantifier() {
288                    for _ in 0..len {
289                        self.advance_char();
290                    }
291                } else {
292                    break;
293                }
294            } else if ch == '$' {
295                // Regex end anchor vs variable
296                if self.is_regex_end_anchor() {
297                    self.advance_char();
298                } else {
299                    break;
300                }
301            } else {
302                break;
303            }
304        }
305    }
306
307    /// Check if `$` at current position is a regex end anchor rather than
308    /// a variable reference.
309    fn is_regex_end_anchor(&self) -> bool {
310        let remaining = self.remaining();
311        let mut chars = remaining.chars();
312        if chars.next() != Some('$') {
313            return false;
314        }
315        match chars.next() {
316            None => true,
317            Some(c) if c.is_whitespace() => true,
318            Some('{') => false, // ${var}
319            Some(c) if c.is_alphanumeric() => false,
320            Some('_') => false,
321            _ => true,
322        }
323    }
324
325    /// Look ahead at a potential regex quantifier like `{8}`, `{1,3}`,
326    /// `{8,}`.  Returns the byte-length if found, `None` otherwise.
327    fn peek_regex_quantifier(&self) -> Option<usize> {
328        let remaining = self.remaining();
329        if !remaining.starts_with('{') {
330            return None;
331        }
332        let mut chars = remaining.char_indices().peekable();
333        chars.next(); // '{'
334
335        // Must have at least one digit
336        match chars.peek() {
337            Some((_, ch)) if ch.is_ascii_digit() => {
338                chars.next();
339            }
340            _ => return None,
341        }
342        // More digits
343        while let Some(&(_, ch)) = chars.peek() {
344            if ch.is_ascii_digit() {
345                chars.next();
346            } else {
347                break;
348            }
349        }
350        match chars.peek() {
351            Some(&(_, '}')) => {
352                // Byte length from '{' up to and including '}'
353                chars.next();
354                let end_offset = chars.peek().map(|(i, _)| *i).unwrap_or(remaining.len());
355                Some(end_offset)
356            }
357            Some(&(_, ',')) => {
358                chars.next();
359                while let Some(&(_, ch)) = chars.peek() {
360                    if ch.is_ascii_digit() {
361                        chars.next();
362                    } else {
363                        break;
364                    }
365                }
366                if chars.peek().map(|(_, ch)| *ch) == Some('}') {
367                    chars.next();
368                    let end_offset = chars.peek().map(|(i, _)| *i).unwrap_or(remaining.len());
369                    Some(end_offset)
370                } else {
371                    None
372                }
373            }
374            _ => None,
375        }
376    }
377}
378
379// ── Character classification (mirrors existing lexer.rs) ────────────────
380
/// First character of a bare identifier: `_` or any alphabetic character.
fn is_ident_start(ch: char) -> bool {
    if ch == '_' {
        return true;
    }
    ch.is_alphabetic()
}
384
/// Non-leading identifier character: alphanumeric, `_`, or `-`.
fn is_ident_continue(ch: char) -> bool {
    matches!(ch, '_' | '-') || ch.is_alphanumeric()
}
388
/// Character allowed in an unquoted argument token: anything that is not
/// whitespace and not one of the structural characters `;` `{` `}` `"` `'` `$`.
fn is_argument_char(ch: char) -> bool {
    match ch {
        ';' | '{' | '}' | '"' | '\'' | '$' => false,
        other => !other.is_whitespace(),
    }
}
392
393// ── Tests ───────────────────────────────────────────────────────────────
394
#[cfg(test)]
mod tests {
    use super::*;

    /// Helper: collect just the kinds.
    fn kinds(source: &str) -> Vec<SyntaxKind> {
        tokenize(source).into_iter().map(|(k, _)| k).collect()
    }

    /// The concatenation of all token texts must equal the original source.
    fn assert_lossless(source: &str) {
        let tokens = tokenize(source);
        let reconstructed: String = tokens.iter().map(|(_, t)| *t).collect();
        assert_eq!(reconstructed, source, "lossless round-trip failed");
    }

    #[test]
    fn empty_input() {
        assert_eq!(tokenize(""), vec![]);
    }

    #[test]
    fn simple_directive() {
        let tokens = tokenize("listen 80;");
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "listen"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "80"),
                (SyntaxKind::SEMICOLON, ";"),
            ]
        );
    }

    #[test]
    fn block_directive() {
        let tokens = tokenize("http { }");
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "http"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::L_BRACE, "{"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::R_BRACE, "}"),
            ]
        );
    }

    #[test]
    fn double_quoted_string() {
        let tokens = tokenize(r#"return 200 "hello world";"#);
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "return"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "200"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::DOUBLE_QUOTED_STRING, "\"hello world\""),
                (SyntaxKind::SEMICOLON, ";"),
            ]
        );
    }

    #[test]
    fn single_quoted_string() {
        let tokens = tokenize("return 200 'hello world';");
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "return"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "200"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::SINGLE_QUOTED_STRING, "'hello world'"),
                (SyntaxKind::SEMICOLON, ";"),
            ]
        );
    }

    #[test]
    fn variable() {
        let tokens = tokenize("set $var value;");
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "set"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::VARIABLE, "$var"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::IDENT, "value"),
                (SyntaxKind::SEMICOLON, ";"),
            ]
        );
    }

    #[test]
    fn variable_braces() {
        let tokens = tokenize("return 200 ${request_uri};");
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "return"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "200"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::VARIABLE, "${request_uri}"),
                (SyntaxKind::SEMICOLON, ";"),
            ]
        );
    }

    #[test]
    fn comment() {
        let tokens = tokenize("# this is a comment\nlisten 80;");
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::COMMENT, "# this is a comment"),
                (SyntaxKind::NEWLINE, "\n"),
                (SyntaxKind::IDENT, "listen"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "80"),
                (SyntaxKind::SEMICOLON, ";"),
            ]
        );
    }

    #[test]
    fn path_argument() {
        let tokens = tokenize("root /var/www/html;");
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "root"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "/var/www/html"),
                (SyntaxKind::SEMICOLON, ";"),
            ]
        );
    }

    #[test]
    fn newlines_and_whitespace() {
        let source = "http {\n    listen 80;\n}";
        assert_lossless(source);
        let tokens = tokenize(source);
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "http"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::L_BRACE, "{"),
                (SyntaxKind::NEWLINE, "\n"),
                (SyntaxKind::WHITESPACE, "    "),
                (SyntaxKind::IDENT, "listen"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "80"),
                (SyntaxKind::SEMICOLON, ";"),
                (SyntaxKind::NEWLINE, "\n"),
                (SyntaxKind::R_BRACE, "}"),
            ]
        );
    }

    #[test]
    fn regex_quantifier() {
        let tokens = tokenize(r"location ~ ^/[a-z]{8}$ {");
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "location"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "~"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "^/[a-z]{8}$"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::L_BRACE, "{"),
            ]
        );
    }

    #[test]
    fn regex_quantifier_range() {
        let tokens = tokenize(r"location ~ ^/[0-9]{1,3}$ {");
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "location"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "~"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "^/[0-9]{1,3}$"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::L_BRACE, "{"),
            ]
        );
    }

    #[test]
    fn escaped_braces_in_regex() {
        let tokens = tokenize(r"location ~ ^/nested/\{[a-z]+\}$ {");
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "location"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "~"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, r"^/nested/\{[a-z]+\}$"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::L_BRACE, "{"),
            ]
        );
    }

    #[test]
    fn hash_in_argument() {
        let tokens = tokenize("location ~* foo#bar {");
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "location"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "~*"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "foo#bar"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::L_BRACE, "{"),
            ]
        );
    }

    #[test]
    fn hash_comment_after_whitespace() {
        let tokens = tokenize("listen 80; # this is a comment");
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "listen"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "80"),
                (SyntaxKind::SEMICOLON, ";"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::COMMENT, "# this is a comment"),
            ]
        );
    }

    #[test]
    fn escape_in_double_quoted_string() {
        let tokens = tokenize(r#"return 200 "hello\nworld";"#);
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "return"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "200"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::DOUBLE_QUOTED_STRING, r#""hello\nworld""#),
                (SyntaxKind::SEMICOLON, ";"),
            ]
        );
    }

    #[test]
    fn lossless_complex_config() {
        let source = r#"http {
    # Main server
    server {
        listen 80;
        server_name example.com;
        location / {
            proxy_pass http://backend;
        }
    }
}
"#;
        assert_lossless(source);
    }

    #[test]
    fn lossless_utf8() {
        let source = "# これは日本語コメント\nlisten 80;\n";
        assert_lossless(source);
    }

    #[test]
    fn glob_pattern() {
        let tokens = tokenize("include /etc/nginx/conf.d/*.conf;");
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "include"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "/etc/nginx/conf.d/*.conf"),
                (SyntaxKind::SEMICOLON, ";"),
            ]
        );
    }

    #[test]
    fn extension_directive() {
        let tokens = tokenize(r#"more_set_headers "Server: Custom";"#);
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "more_set_headers"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::DOUBLE_QUOTED_STRING, "\"Server: Custom\""),
                (SyntaxKind::SEMICOLON, ";"),
            ]
        );
    }

    #[test]
    fn hash_in_regex_pattern() {
        let tokens = tokenize(r"location ~* (?:#.*#|\.bak)$ {");
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "location"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "~*"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, r"(?:#.*#|\.bak)$"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::L_BRACE, "{"),
            ]
        );
    }

    #[test]
    fn ident_classification() {
        // Pure identifiers should be IDENT
        assert_eq!(kinds("server_name"), vec![SyntaxKind::IDENT]);
        // Identifiers with argument chars become ARGUMENT
        let toks = tokenize("text/plain");
        assert_eq!(toks, vec![(SyntaxKind::ARGUMENT, "text/plain")]);
    }
}