Skip to main content

nginx_lint_parser/
lexer_rowan.rs

1//! Rowan-compatible lexer for nginx configuration files.
2//!
3//! Produces a flat sequence of `(SyntaxKind, &str)` pairs where every byte
4//! of the input is covered (whitespace and newlines are explicit tokens).
5//! This is the input expected by the rowan-based [`parser`](crate::parser).
6
7use crate::syntax_kind::SyntaxKind;
8
9/// Tokenise `source` into a lossless sequence of `(SyntaxKind, text)` pairs.
10///
11/// Every byte of the input is represented exactly once, so
12/// `tokens.iter().map(|(_, t)| *t).collect::<String>() == source` always holds.
13pub fn tokenize(source: &str) -> Vec<(SyntaxKind, &str)> {
14    let mut lexer = RowanLexer::new(source);
15    lexer.tokenize_all()
16}
17
18/// Internal lexer state.
19struct RowanLexer<'a> {
20    source: &'a str,
21    pos: usize,
22    tokens: Vec<(SyntaxKind, &'a str)>,
23}
24
25impl<'a> RowanLexer<'a> {
26    fn new(source: &'a str) -> Self {
27        Self {
28            source,
29            pos: 0,
30            tokens: Vec::new(),
31        }
32    }
33
34    fn remaining(&self) -> &'a str {
35        &self.source[self.pos..]
36    }
37
38    fn peek(&self) -> Option<char> {
39        self.remaining().chars().next()
40    }
41
42    /// Peek at the character at offset `n` from current position.
43    fn peek_at(&self, n: usize) -> Option<char> {
44        self.remaining().chars().nth(n)
45    }
46
47    fn at_end(&self) -> bool {
48        self.pos >= self.source.len()
49    }
50
51    /// Advance by one character (its UTF-8 byte length) and return it.
52    fn advance_char(&mut self) -> Option<char> {
53        let ch = self.peek()?;
54        self.pos += ch.len_utf8();
55        Some(ch)
56    }
57
58    fn emit(&mut self, kind: SyntaxKind, start: usize) {
59        let text = &self.source[start..self.pos];
60        if !text.is_empty() {
61            self.tokens.push((kind, text));
62        }
63    }
64
65    fn tokenize_all(&mut self) -> Vec<(SyntaxKind, &'a str)> {
66        // Track whether the previous non-whitespace token was whitespace-preceded
67        // for comment detection (# is only a comment after whitespace or at line start).
68        let mut at_line_start = true;
69
70        while !self.at_end() {
71            let start = self.pos;
72            let ch = self.peek().unwrap();
73
74            match ch {
75                '\n' => {
76                    self.advance_char();
77                    self.emit(SyntaxKind::NEWLINE, start);
78                    at_line_start = true;
79                }
80                ' ' | '\t' => {
81                    self.eat_whitespace();
82                    self.emit(SyntaxKind::WHITESPACE, start);
83                    // at_line_start stays as-is (whitespace doesn't reset it)
84                }
85                '#' if at_line_start || self.preceded_by_whitespace() => {
86                    self.eat_comment();
87                    self.emit(SyntaxKind::COMMENT, start);
88                    at_line_start = false;
89                }
90                ';' => {
91                    self.advance_char();
92                    self.emit(SyntaxKind::SEMICOLON, start);
93                    at_line_start = false;
94                }
95                '{' => {
96                    self.advance_char();
97                    self.emit(SyntaxKind::L_BRACE, start);
98                    at_line_start = false;
99                }
100                '}' => {
101                    self.advance_char();
102                    self.emit(SyntaxKind::R_BRACE, start);
103                    at_line_start = false;
104                }
105                '"' => {
106                    self.eat_double_quoted_string();
107                    self.emit(SyntaxKind::DOUBLE_QUOTED_STRING, start);
108                    at_line_start = false;
109                }
110                '\'' => {
111                    self.eat_single_quoted_string();
112                    self.emit(SyntaxKind::SINGLE_QUOTED_STRING, start);
113                    at_line_start = false;
114                }
115                '$' => {
116                    self.eat_variable();
117                    self.emit(SyntaxKind::VARIABLE, start);
118                    at_line_start = false;
119                }
120                _ if is_ident_start(ch) => {
121                    // Could be IDENT or ARGUMENT (identifiers that continue
122                    // with argument-chars like / or . become ARGUMENT).
123                    self.eat_ident_or_argument();
124                    let text = &self.source[start..self.pos];
125                    let kind = if text
126                        .chars()
127                        .all(|c| is_ident_continue(c) || is_ident_start(c))
128                    {
129                        SyntaxKind::IDENT
130                    } else {
131                        SyntaxKind::ARGUMENT
132                    };
133                    self.tokens.push((kind, text));
134                    at_line_start = false;
135                }
136                _ if is_argument_char(ch) => {
137                    self.eat_argument(ch);
138                    self.emit(SyntaxKind::ARGUMENT, start);
139                    at_line_start = false;
140                }
141                _ => {
142                    // Unknown character — emit as ERROR token.
143                    self.advance_char();
144                    self.emit(SyntaxKind::ERROR, start);
145                    at_line_start = false;
146                }
147            }
148        }
149
150        std::mem::take(&mut self.tokens)
151    }
152
153    // ── whitespace / comment ────────────────────────────────────────
154
155    fn eat_whitespace(&mut self) {
156        while let Some(ch) = self.peek() {
157            if ch == ' ' || ch == '\t' {
158                self.advance_char();
159            } else {
160                break;
161            }
162        }
163    }
164
165    fn eat_comment(&mut self) {
166        // Consume '#' and everything until (but not including) '\n'.
167        while let Some(ch) = self.peek() {
168            if ch == '\n' {
169                break;
170            }
171            self.advance_char();
172        }
173    }
174
175    /// Check if the immediately preceding token was whitespace or we are at
176    /// the beginning of a line.
177    fn preceded_by_whitespace(&self) -> bool {
178        matches!(
179            self.tokens.last(),
180            Some((SyntaxKind::WHITESPACE, _)) | Some((SyntaxKind::NEWLINE, _)) | None
181        )
182    }
183
184    // ── strings ─────────────────────────────────────────────────────
185
186    fn eat_double_quoted_string(&mut self) {
187        // Opening quote
188        self.advance_char(); // "
189        loop {
190            match self.peek() {
191                None => break, // Unterminated at EOF
192                Some('\\') => {
193                    self.advance_char(); // backslash
194                    self.advance_char(); // escaped char (if any)
195                }
196                Some('"') => {
197                    self.advance_char(); // closing quote
198                    break;
199                }
200                Some(_) => {
201                    self.advance_char();
202                }
203            }
204        }
205    }
206
207    fn eat_single_quoted_string(&mut self) {
208        self.advance_char(); // opening '
209        loop {
210            match self.peek() {
211                None => break, // Unterminated at EOF
212                Some('\\') => {
213                    self.advance_char();
214                    self.advance_char();
215                }
216                Some('\'') => {
217                    self.advance_char();
218                    break;
219                }
220                Some(_) => {
221                    self.advance_char();
222                }
223            }
224        }
225    }
226
227    // ── variable ────────────────────────────────────────────────────
228
229    fn eat_variable(&mut self) {
230        self.advance_char(); // '$'
231        if self.peek() == Some('{') {
232            // ${var} syntax
233            self.advance_char(); // '{'
234            while let Some(ch) = self.peek() {
235                if ch == '}' {
236                    self.advance_char();
237                    break;
238                }
239                self.advance_char();
240            }
241        } else if matches!(self.peek(), Some('1'..='9')) {
242            // Positional capture `$1`..`$9`: nginx reads exactly one digit,
243            // so `$1redirect` is the capture `$1` followed by literal
244            // `redirect`, and `$12` is `$1` then `2`. Consume just the digit.
245            self.advance_char();
246        } else {
247            // $var syntax
248            while let Some(ch) = self.peek() {
249                if ch.is_alphanumeric() || ch == '_' {
250                    self.advance_char();
251                } else {
252                    break;
253                }
254            }
255        }
256    }
257
258    // ── identifier / argument ───────────────────────────────────────
259
260    fn eat_ident_or_argument(&mut self) {
261        // Read identifier characters first
262        while let Some(ch) = self.peek() {
263            if is_ident_continue(ch) || is_ident_start(ch) {
264                self.advance_char();
265            } else {
266                break;
267            }
268        }
269        // Continue reading argument characters if present
270        self.eat_argument_continuation();
271    }
272
273    fn eat_argument(&mut self, _first: char) {
274        self.advance_char();
275        self.eat_argument_continuation();
276    }
277
278    /// Continue reading argument characters including regex quantifiers
279    /// like `{8,}` and escaped braces like `\{` and `\}`.
280    fn eat_argument_continuation(&mut self) {
281        while let Some(ch) = self.peek() {
282            if is_argument_char(ch) || is_ident_continue(ch) || is_ident_start(ch) {
283                // Check for escaped brace
284                if ch == '\\' && matches!(self.peek_at(1), Some('{') | Some('}')) {
285                    self.advance_char(); // '\'
286                    self.advance_char(); // '{' or '}'
287                    continue;
288                }
289                self.advance_char();
290            } else if ch == '{' {
291                // Check for regex quantifier
292                if let Some(len) = self.peek_regex_quantifier() {
293                    for _ in 0..len {
294                        self.advance_char();
295                    }
296                } else {
297                    break;
298                }
299            } else if ch == '$' {
300                // Regex end anchor vs variable
301                if self.is_regex_end_anchor() {
302                    self.advance_char();
303                } else {
304                    break;
305                }
306            } else {
307                break;
308            }
309        }
310    }
311
312    /// Check if `$` at current position is a regex end anchor rather than
313    /// a variable reference.
314    fn is_regex_end_anchor(&self) -> bool {
315        let remaining = self.remaining();
316        let mut chars = remaining.chars();
317        if chars.next() != Some('$') {
318            return false;
319        }
320        match chars.next() {
321            None => true,
322            Some(c) if c.is_whitespace() => true,
323            Some('{') => false, // ${var}
324            Some(c) if c.is_alphanumeric() => false,
325            Some('_') => false,
326            _ => true,
327        }
328    }
329
330    /// Look ahead at a potential regex quantifier like `{8}`, `{1,3}`,
331    /// `{8,}`.  Returns the byte-length if found, `None` otherwise.
332    fn peek_regex_quantifier(&self) -> Option<usize> {
333        let remaining = self.remaining();
334        if !remaining.starts_with('{') {
335            return None;
336        }
337        let mut chars = remaining.char_indices().peekable();
338        chars.next(); // '{'
339
340        // Must have at least one digit
341        match chars.peek() {
342            Some((_, ch)) if ch.is_ascii_digit() => {
343                chars.next();
344            }
345            _ => return None,
346        }
347        // More digits
348        while let Some(&(_, ch)) = chars.peek() {
349            if ch.is_ascii_digit() {
350                chars.next();
351            } else {
352                break;
353            }
354        }
355        match chars.peek() {
356            Some(&(_, '}')) => {
357                // Byte length from '{' up to and including '}'
358                chars.next();
359                let end_offset = chars.peek().map(|(i, _)| *i).unwrap_or(remaining.len());
360                Some(end_offset)
361            }
362            Some(&(_, ',')) => {
363                chars.next();
364                while let Some(&(_, ch)) = chars.peek() {
365                    if ch.is_ascii_digit() {
366                        chars.next();
367                    } else {
368                        break;
369                    }
370                }
371                if chars.peek().map(|(_, ch)| *ch) == Some('}') {
372                    chars.next();
373                    let end_offset = chars.peek().map(|(i, _)| *i).unwrap_or(remaining.len());
374                    Some(end_offset)
375                } else {
376                    None
377                }
378            }
379            _ => None,
380        }
381    }
382}
383
384// ── Character classification (mirrors existing lexer.rs) ────────────────
385
/// Whether `ch` may begin an identifier: `_` or any alphabetic character
/// (Unicode-aware, per `char::is_alphabetic`).
fn is_ident_start(ch: char) -> bool {
    matches!(ch, '_') || ch.is_alphabetic()
}
389
/// Whether `ch` may appear inside an identifier: `_`, `-`, or any
/// alphanumeric character (Unicode-aware, per `char::is_alphanumeric`).
fn is_ident_continue(ch: char) -> bool {
    matches!(ch, '_' | '-') || ch.is_alphanumeric()
}
393
/// Whether `ch` may appear in an unquoted argument: anything that is not
/// whitespace and not one of the structural characters `;`, `{`, `}`, `"`,
/// `'`, `$`.
fn is_argument_char(ch: char) -> bool {
    match ch {
        ';' | '{' | '}' | '"' | '\'' | '$' => false,
        other => !other.is_whitespace(),
    }
}
397
398// ── Tests ───────────────────────────────────────────────────────────────
399
#[cfg(test)]
mod tests {
    use super::*;

    /// A single expected token: its kind and exact source text.
    type Tok<'a> = (SyntaxKind, &'a str);

    // ── Expected-token shorthands ───────────────────────────────────

    /// A single space of whitespace.
    const SP: Tok<'static> = (SyntaxKind::WHITESPACE, " ");
    /// A line feed.
    const NL: Tok<'static> = (SyntaxKind::NEWLINE, "\n");
    /// The directive terminator `;`.
    const SEMI: Tok<'static> = (SyntaxKind::SEMICOLON, ";");
    /// The block opener `{`.
    const OPEN: Tok<'static> = (SyntaxKind::L_BRACE, "{");
    /// The block closer `}`.
    const CLOSE: Tok<'static> = (SyntaxKind::R_BRACE, "}");

    fn ident(text: &str) -> Tok<'_> {
        (SyntaxKind::IDENT, text)
    }

    fn arg(text: &str) -> Tok<'_> {
        (SyntaxKind::ARGUMENT, text)
    }

    fn var(text: &str) -> Tok<'_> {
        (SyntaxKind::VARIABLE, text)
    }

    fn ws(text: &str) -> Tok<'_> {
        (SyntaxKind::WHITESPACE, text)
    }

    fn dq(text: &str) -> Tok<'_> {
        (SyntaxKind::DOUBLE_QUOTED_STRING, text)
    }

    fn sq(text: &str) -> Tok<'_> {
        (SyntaxKind::SINGLE_QUOTED_STRING, text)
    }

    fn cmt(text: &str) -> Tok<'_> {
        (SyntaxKind::COMMENT, text)
    }

    // ── Assertion helpers ───────────────────────────────────────────

    /// Assert that `source` lexes to exactly `expected`.
    #[track_caller]
    fn expect_tokens(source: &str, expected: &[Tok<'_>]) {
        assert_eq!(tokenize(source).as_slice(), expected);
    }

    /// Helper: collect just the kinds.
    fn kinds(source: &str) -> Vec<SyntaxKind> {
        tokenize(source).into_iter().map(|(kind, _)| kind).collect()
    }

    /// The concatenation of all token texts must equal the original source.
    #[track_caller]
    fn assert_lossless(source: &str) {
        let reconstructed = tokenize(source)
            .iter()
            .map(|&(_, text)| text)
            .collect::<String>();
        assert_eq!(reconstructed, source, "lossless round-trip failed");
    }

    // ── Basics ──────────────────────────────────────────────────────

    #[test]
    fn empty_input() {
        assert!(tokenize("").is_empty());
    }

    #[test]
    fn simple_directive() {
        expect_tokens("listen 80;", &[ident("listen"), SP, arg("80"), SEMI]);
    }

    #[test]
    fn block_directive() {
        expect_tokens("http { }", &[ident("http"), SP, OPEN, SP, CLOSE]);
    }

    #[test]
    fn double_quoted_string() {
        expect_tokens(
            r#"return 200 "hello world";"#,
            &[
                ident("return"),
                SP,
                arg("200"),
                SP,
                dq("\"hello world\""),
                SEMI,
            ],
        );
    }

    #[test]
    fn single_quoted_string() {
        expect_tokens(
            "return 200 'hello world';",
            &[
                ident("return"),
                SP,
                arg("200"),
                SP,
                sq("'hello world'"),
                SEMI,
            ],
        );
    }

    // ── Variables ───────────────────────────────────────────────────

    #[test]
    fn variable() {
        expect_tokens(
            "set $var value;",
            &[ident("set"), SP, var("$var"), SP, ident("value"), SEMI],
        );
    }

    #[test]
    fn variable_braces() {
        expect_tokens(
            "return 200 ${request_uri};",
            &[
                ident("return"),
                SP,
                arg("200"),
                SP,
                var("${request_uri}"),
                SEMI,
            ],
        );
    }

    #[test]
    fn positional_capture_stops_at_one_digit() {
        // A positional capture is `$` plus a single digit 1-9, so
        // `$1redirect` must lex as the capture `$1` followed by the literal
        // `redirect` rather than as one variable named `1redirect`.
        expect_tokens(
            "set $x $1redirect;",
            &[
                ident("set"),
                SP,
                var("$x"),
                SP,
                var("$1"),
                ident("redirect"),
                SEMI,
            ],
        );
    }

    #[test]
    fn positional_capture_followed_by_digit() {
        // `$12` is the capture `$1` and then a literal `2`; multi-digit
        // names need the brace form (`${12}`).
        expect_tokens(
            "set $x $12;",
            &[ident("set"), SP, var("$x"), SP, var("$1"), arg("2"), SEMI],
        );
    }

    #[test]
    fn positional_capture_then_digit_and_letters() {
        // Only the first digit belongs to the capture; `2redirect` is the
        // plain text that follows it.
        expect_tokens(
            "set $x $12redirect;",
            &[
                ident("set"),
                SP,
                var("$x"),
                SP,
                var("$1"),
                arg("2redirect"),
                SEMI,
            ],
        );
    }

    #[test]
    fn brace_capture_keeps_multiple_digits() {
        // The brace form is not split: `${12}` is a single variable token.
        expect_tokens(
            "set $x ${12};",
            &[ident("set"), SP, var("$x"), SP, var("${12}"), SEMI],
        );
    }

    #[test]
    fn letter_first_variable_with_digits_unaffected() {
        // The single-digit rule applies to digit-first names only; a name
        // that merely contains digits (`$arg_1`) stays one token.
        expect_tokens(
            "set $x $arg_1;",
            &[ident("set"), SP, var("$x"), SP, var("$arg_1"), SEMI],
        );
    }

    #[test]
    fn dollar_nine_is_top_of_capture_range() {
        // `$9` is the upper end of the single-digit range: `$9foo` splits
        // into the capture `$9` and the literal `foo`.
        expect_tokens(
            "set $x $9foo;",
            &[
                ident("set"),
                SP,
                var("$x"),
                SP,
                var("$9"),
                ident("foo"),
                SEMI,
            ],
        );
    }

    #[test]
    fn dollar_zero_is_not_a_single_digit_capture() {
        // `$0` lies outside `$1`..`$9`, so it follows the ordinary greedy
        // variable-name path and `$0redirect` stays one variable token.
        expect_tokens(
            "set $x $0redirect;",
            &[ident("set"), SP, var("$x"), SP, var("$0redirect"), SEMI],
        );
    }

    // ── Comments, paths, layout ─────────────────────────────────────

    #[test]
    fn comment() {
        expect_tokens(
            "# this is a comment\nlisten 80;",
            &[
                cmt("# this is a comment"),
                NL,
                ident("listen"),
                SP,
                arg("80"),
                SEMI,
            ],
        );
    }

    #[test]
    fn path_argument() {
        expect_tokens(
            "root /var/www/html;",
            &[ident("root"), SP, arg("/var/www/html"), SEMI],
        );
    }

    #[test]
    fn newlines_and_whitespace() {
        let source = "http {\n    listen 80;\n}";
        assert_lossless(source);
        expect_tokens(
            source,
            &[
                ident("http"),
                SP,
                OPEN,
                NL,
                ws("    "),
                ident("listen"),
                SP,
                arg("80"),
                SEMI,
                NL,
                CLOSE,
            ],
        );
    }

    // ── Regex arguments ─────────────────────────────────────────────

    #[test]
    fn regex_quantifier() {
        expect_tokens(
            r"location ~ ^/[a-z]{8}$ {",
            &[
                ident("location"),
                SP,
                arg("~"),
                SP,
                arg("^/[a-z]{8}$"),
                SP,
                OPEN,
            ],
        );
    }

    #[test]
    fn regex_quantifier_range() {
        expect_tokens(
            r"location ~ ^/[0-9]{1,3}$ {",
            &[
                ident("location"),
                SP,
                arg("~"),
                SP,
                arg("^/[0-9]{1,3}$"),
                SP,
                OPEN,
            ],
        );
    }

    #[test]
    fn escaped_braces_in_regex() {
        expect_tokens(
            r"location ~ ^/nested/\{[a-z]+\}$ {",
            &[
                ident("location"),
                SP,
                arg("~"),
                SP,
                arg(r"^/nested/\{[a-z]+\}$"),
                SP,
                OPEN,
            ],
        );
    }

    #[test]
    fn hash_in_argument() {
        expect_tokens(
            "location ~* foo#bar {",
            &[
                ident("location"),
                SP,
                arg("~*"),
                SP,
                arg("foo#bar"),
                SP,
                OPEN,
            ],
        );
    }

    #[test]
    fn hash_comment_after_whitespace() {
        expect_tokens(
            "listen 80; # this is a comment",
            &[
                ident("listen"),
                SP,
                arg("80"),
                SEMI,
                SP,
                cmt("# this is a comment"),
            ],
        );
    }

    #[test]
    fn escape_in_double_quoted_string() {
        expect_tokens(
            r#"return 200 "hello\nworld";"#,
            &[
                ident("return"),
                SP,
                arg("200"),
                SP,
                dq(r#""hello\nworld""#),
                SEMI,
            ],
        );
    }

    // ── Lossless round-trips ────────────────────────────────────────

    #[test]
    fn lossless_complex_config() {
        let source = r#"http {
    # Main server
    server {
        listen 80;
        server_name example.com;
        location / {
            proxy_pass http://backend;
        }
    }
}
"#;
        assert_lossless(source);
    }

    #[test]
    fn lossless_utf8() {
        assert_lossless("# これは日本語コメント\nlisten 80;\n");
    }

    // ── Miscellaneous ───────────────────────────────────────────────

    #[test]
    fn glob_pattern() {
        expect_tokens(
            "include /etc/nginx/conf.d/*.conf;",
            &[ident("include"), SP, arg("/etc/nginx/conf.d/*.conf"), SEMI],
        );
    }

    #[test]
    fn extension_directive() {
        expect_tokens(
            r#"more_set_headers "Server: Custom";"#,
            &[
                ident("more_set_headers"),
                SP,
                dq("\"Server: Custom\""),
                SEMI,
            ],
        );
    }

    #[test]
    fn hash_in_regex_pattern() {
        expect_tokens(
            r"location ~* (?:#.*#|\.bak)$ {",
            &[
                ident("location"),
                SP,
                arg("~*"),
                SP,
                arg(r"(?:#.*#|\.bak)$"),
                SP,
                OPEN,
            ],
        );
    }

    #[test]
    fn ident_classification() {
        // Lexing a full directive must simply not panic.
        let _ = tokenize("server_name example;");
        // A token made only of identifier characters is an IDENT ...
        assert_eq!(kinds("server_name"), vec![SyntaxKind::IDENT]);
        // ... but one that picks up argument characters is an ARGUMENT.
        expect_tokens("text/plain", &[arg("text/plain")]);
    }
}