1use crate::syntax_kind::SyntaxKind;
8
9pub fn tokenize(source: &str) -> Vec<(SyntaxKind, &str)> {
14 let mut lexer = RowanLexer::new(source);
15 lexer.tokenize_all()
16}
17
/// Internal lexer state: a byte cursor over the input plus the tokens
/// produced so far. All emitted token texts borrow from `source`.
struct RowanLexer<'a> {
    // Full input text; every token slice points into it.
    source: &'a str,
    // Current byte offset into `source`; always kept on a char boundary.
    pos: usize,
    // Tokens accumulated so far, taken out by `tokenize_all`.
    tokens: Vec<(SyntaxKind, &'a str)>,
}
24
25impl<'a> RowanLexer<'a> {
26 fn new(source: &'a str) -> Self {
27 Self {
28 source,
29 pos: 0,
30 tokens: Vec::new(),
31 }
32 }
33
34 fn remaining(&self) -> &'a str {
35 &self.source[self.pos..]
36 }
37
38 fn peek(&self) -> Option<char> {
39 self.remaining().chars().next()
40 }
41
42 fn peek_at(&self, n: usize) -> Option<char> {
44 self.remaining().chars().nth(n)
45 }
46
47 fn at_end(&self) -> bool {
48 self.pos >= self.source.len()
49 }
50
51 fn advance_char(&mut self) -> Option<char> {
53 let ch = self.peek()?;
54 self.pos += ch.len_utf8();
55 Some(ch)
56 }
57
58 fn emit(&mut self, kind: SyntaxKind, start: usize) {
59 let text = &self.source[start..self.pos];
60 if !text.is_empty() {
61 self.tokens.push((kind, text));
62 }
63 }
64
65 fn tokenize_all(&mut self) -> Vec<(SyntaxKind, &'a str)> {
66 let mut at_line_start = true;
69
70 while !self.at_end() {
71 let start = self.pos;
72 let ch = self.peek().unwrap();
73
74 match ch {
75 '\n' => {
76 self.advance_char();
77 self.emit(SyntaxKind::NEWLINE, start);
78 at_line_start = true;
79 }
80 ' ' | '\t' => {
81 self.eat_whitespace();
82 self.emit(SyntaxKind::WHITESPACE, start);
83 }
85 '#' if at_line_start || self.preceded_by_whitespace() => {
86 self.eat_comment();
87 self.emit(SyntaxKind::COMMENT, start);
88 at_line_start = false;
89 }
90 ';' => {
91 self.advance_char();
92 self.emit(SyntaxKind::SEMICOLON, start);
93 at_line_start = false;
94 }
95 '{' => {
96 self.advance_char();
97 self.emit(SyntaxKind::L_BRACE, start);
98 at_line_start = false;
99 }
100 '}' => {
101 self.advance_char();
102 self.emit(SyntaxKind::R_BRACE, start);
103 at_line_start = false;
104 }
105 '"' => {
106 self.eat_double_quoted_string();
107 self.emit(SyntaxKind::DOUBLE_QUOTED_STRING, start);
108 at_line_start = false;
109 }
110 '\'' => {
111 self.eat_single_quoted_string();
112 self.emit(SyntaxKind::SINGLE_QUOTED_STRING, start);
113 at_line_start = false;
114 }
115 '$' => {
116 self.eat_variable();
117 self.emit(SyntaxKind::VARIABLE, start);
118 at_line_start = false;
119 }
120 _ if is_ident_start(ch) => {
121 self.eat_ident_or_argument();
124 let text = &self.source[start..self.pos];
125 let kind = if text
126 .chars()
127 .all(|c| is_ident_continue(c) || is_ident_start(c))
128 {
129 SyntaxKind::IDENT
130 } else {
131 SyntaxKind::ARGUMENT
132 };
133 self.tokens.push((kind, text));
134 at_line_start = false;
135 }
136 _ if is_argument_char(ch) => {
137 self.eat_argument(ch);
138 self.emit(SyntaxKind::ARGUMENT, start);
139 at_line_start = false;
140 }
141 _ => {
142 self.advance_char();
144 self.emit(SyntaxKind::ERROR, start);
145 at_line_start = false;
146 }
147 }
148 }
149
150 std::mem::take(&mut self.tokens)
151 }
152
153 fn eat_whitespace(&mut self) {
156 while let Some(ch) = self.peek() {
157 if ch == ' ' || ch == '\t' {
158 self.advance_char();
159 } else {
160 break;
161 }
162 }
163 }
164
165 fn eat_comment(&mut self) {
166 while let Some(ch) = self.peek() {
168 if ch == '\n' {
169 break;
170 }
171 self.advance_char();
172 }
173 }
174
175 fn preceded_by_whitespace(&self) -> bool {
178 matches!(
179 self.tokens.last(),
180 Some((SyntaxKind::WHITESPACE, _)) | Some((SyntaxKind::NEWLINE, _)) | None
181 )
182 }
183
184 fn eat_double_quoted_string(&mut self) {
187 self.advance_char(); loop {
190 match self.peek() {
191 None => break, Some('\\') => {
193 self.advance_char(); self.advance_char(); }
196 Some('"') => {
197 self.advance_char(); break;
199 }
200 Some(_) => {
201 self.advance_char();
202 }
203 }
204 }
205 }
206
207 fn eat_single_quoted_string(&mut self) {
208 self.advance_char(); loop {
210 match self.peek() {
211 None => break, Some('\\') => {
213 self.advance_char();
214 self.advance_char();
215 }
216 Some('\'') => {
217 self.advance_char();
218 break;
219 }
220 Some(_) => {
221 self.advance_char();
222 }
223 }
224 }
225 }
226
227 fn eat_variable(&mut self) {
230 self.advance_char(); if self.peek() == Some('{') {
232 self.advance_char(); while let Some(ch) = self.peek() {
235 if ch == '}' {
236 self.advance_char();
237 break;
238 }
239 self.advance_char();
240 }
241 } else {
242 while let Some(ch) = self.peek() {
244 if ch.is_alphanumeric() || ch == '_' {
245 self.advance_char();
246 } else {
247 break;
248 }
249 }
250 }
251 }
252
253 fn eat_ident_or_argument(&mut self) {
256 while let Some(ch) = self.peek() {
258 if is_ident_continue(ch) || is_ident_start(ch) {
259 self.advance_char();
260 } else {
261 break;
262 }
263 }
264 self.eat_argument_continuation();
266 }
267
268 fn eat_argument(&mut self, _first: char) {
269 self.advance_char();
270 self.eat_argument_continuation();
271 }
272
273 fn eat_argument_continuation(&mut self) {
276 while let Some(ch) = self.peek() {
277 if is_argument_char(ch) || is_ident_continue(ch) || is_ident_start(ch) {
278 if ch == '\\' && matches!(self.peek_at(1), Some('{') | Some('}')) {
280 self.advance_char(); self.advance_char(); continue;
283 }
284 self.advance_char();
285 } else if ch == '{' {
286 if let Some(len) = self.peek_regex_quantifier() {
288 for _ in 0..len {
289 self.advance_char();
290 }
291 } else {
292 break;
293 }
294 } else if ch == '$' {
295 if self.is_regex_end_anchor() {
297 self.advance_char();
298 } else {
299 break;
300 }
301 } else {
302 break;
303 }
304 }
305 }
306
307 fn is_regex_end_anchor(&self) -> bool {
310 let remaining = self.remaining();
311 let mut chars = remaining.chars();
312 if chars.next() != Some('$') {
313 return false;
314 }
315 match chars.next() {
316 None => true,
317 Some(c) if c.is_whitespace() => true,
318 Some('{') => false, Some(c) if c.is_alphanumeric() => false,
320 Some('_') => false,
321 _ => true,
322 }
323 }
324
325 fn peek_regex_quantifier(&self) -> Option<usize> {
328 let remaining = self.remaining();
329 if !remaining.starts_with('{') {
330 return None;
331 }
332 let mut chars = remaining.char_indices().peekable();
333 chars.next(); match chars.peek() {
337 Some((_, ch)) if ch.is_ascii_digit() => {
338 chars.next();
339 }
340 _ => return None,
341 }
342 while let Some(&(_, ch)) = chars.peek() {
344 if ch.is_ascii_digit() {
345 chars.next();
346 } else {
347 break;
348 }
349 }
350 match chars.peek() {
351 Some(&(_, '}')) => {
352 chars.next();
354 let end_offset = chars.peek().map(|(i, _)| *i).unwrap_or(remaining.len());
355 Some(end_offset)
356 }
357 Some(&(_, ',')) => {
358 chars.next();
359 while let Some(&(_, ch)) = chars.peek() {
360 if ch.is_ascii_digit() {
361 chars.next();
362 } else {
363 break;
364 }
365 }
366 if chars.peek().map(|(_, ch)| *ch) == Some('}') {
367 chars.next();
368 let end_offset = chars.peek().map(|(i, _)| *i).unwrap_or(remaining.len());
369 Some(end_offset)
370 } else {
371 None
372 }
373 }
374 _ => None,
375 }
376 }
377}
378
/// True for characters that may begin a directive identifier: any
/// alphabetic character (Unicode) or `_`.
fn is_ident_start(ch: char) -> bool {
    ch == '_' || ch.is_alphabetic()
}
384
/// True for characters allowed inside an identifier: alphanumerics
/// (Unicode), `_`, and `-` (nginx directive names may contain dashes).
fn is_ident_continue(ch: char) -> bool {
    matches!(ch, '_' | '-') || ch.is_alphanumeric()
}
388
/// True for characters that can appear in a bare (unquoted) argument:
/// anything except whitespace and the structurally significant
/// characters `;`, `{`, `}`, `"`, `'`, and `$`.
fn is_argument_char(ch: char) -> bool {
    if ch.is_whitespace() {
        return false;
    }
    !matches!(ch, ';' | '{' | '}' | '"' | '\'' | '$')
}
392
#[cfg(test)]
mod tests {
    use super::*;

    /// Convenience: token kinds only, texts discarded.
    fn kinds(source: &str) -> Vec<SyntaxKind> {
        tokenize(source).into_iter().map(|(k, _)| k).collect()
    }

    /// Asserts the lexer is lossless: concatenating all token texts
    /// must reproduce the input byte-for-byte.
    fn assert_lossless(source: &str) {
        let tokens = tokenize(source);
        let reconstructed: String = tokens.iter().map(|(_, t)| *t).collect();
        assert_eq!(reconstructed, source, "lossless round-trip failed");
    }

    #[test]
    fn empty_input() {
        assert_eq!(tokenize(""), vec![]);
    }

    #[test]
    fn simple_directive() {
        let tokens = tokenize("listen 80;");
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "listen"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "80"),
                (SyntaxKind::SEMICOLON, ";"),
            ]
        );
    }

    #[test]
    fn block_directive() {
        let tokens = tokenize("http { }");
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "http"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::L_BRACE, "{"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::R_BRACE, "}"),
            ]
        );
    }

    #[test]
    fn double_quoted_string() {
        let tokens = tokenize(r#"return 200 "hello world";"#);
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "return"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "200"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::DOUBLE_QUOTED_STRING, "\"hello world\""),
                (SyntaxKind::SEMICOLON, ";"),
            ]
        );
    }

    #[test]
    fn single_quoted_string() {
        let tokens = tokenize("return 200 'hello world';");
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "return"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "200"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::SINGLE_QUOTED_STRING, "'hello world'"),
                (SyntaxKind::SEMICOLON, ";"),
            ]
        );
    }

    #[test]
    fn variable() {
        let tokens = tokenize("set $var value;");
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "set"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::VARIABLE, "$var"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::IDENT, "value"),
                (SyntaxKind::SEMICOLON, ";"),
            ]
        );
    }

    #[test]
    fn variable_braces() {
        let tokens = tokenize("return 200 ${request_uri};");
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "return"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "200"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::VARIABLE, "${request_uri}"),
                (SyntaxKind::SEMICOLON, ";"),
            ]
        );
    }

    #[test]
    fn comment() {
        let tokens = tokenize("# this is a comment\nlisten 80;");
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::COMMENT, "# this is a comment"),
                (SyntaxKind::NEWLINE, "\n"),
                (SyntaxKind::IDENT, "listen"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "80"),
                (SyntaxKind::SEMICOLON, ";"),
            ]
        );
    }

    #[test]
    fn path_argument() {
        let tokens = tokenize("root /var/www/html;");
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "root"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "/var/www/html"),
                (SyntaxKind::SEMICOLON, ";"),
            ]
        );
    }

    #[test]
    fn newlines_and_whitespace() {
        let source = "http {\n    listen 80;\n}";
        assert_lossless(source);
        let tokens = tokenize(source);
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "http"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::L_BRACE, "{"),
                (SyntaxKind::NEWLINE, "\n"),
                (SyntaxKind::WHITESPACE, "    "),
                (SyntaxKind::IDENT, "listen"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "80"),
                (SyntaxKind::SEMICOLON, ";"),
                (SyntaxKind::NEWLINE, "\n"),
                (SyntaxKind::R_BRACE, "}"),
            ]
        );
    }

    // Regex quantifiers must not be mistaken for block-opening braces.
    #[test]
    fn regex_quantifier() {
        let tokens = tokenize(r"location ~ ^/[a-z]{8}$ {");
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "location"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "~"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "^/[a-z]{8}$"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::L_BRACE, "{"),
            ]
        );
    }

    #[test]
    fn regex_quantifier_range() {
        let tokens = tokenize(r"location ~ ^/[0-9]{1,3}$ {");
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "location"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "~"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "^/[0-9]{1,3}$"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::L_BRACE, "{"),
            ]
        );
    }

    // `\{` and `\}` stay inside the argument token.
    #[test]
    fn escaped_braces_in_regex() {
        let tokens = tokenize(r"location ~ ^/nested/\{[a-z]+\}$ {");
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "location"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "~"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, r"^/nested/\{[a-z]+\}$"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::L_BRACE, "{"),
            ]
        );
    }

    // `#` mid-token is literal, not a comment opener.
    #[test]
    fn hash_in_argument() {
        let tokens = tokenize("location ~* foo#bar {");
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "location"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "~*"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "foo#bar"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::L_BRACE, "{"),
            ]
        );
    }

    #[test]
    fn hash_comment_after_whitespace() {
        let tokens = tokenize("listen 80; # this is a comment");
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "listen"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "80"),
                (SyntaxKind::SEMICOLON, ";"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::COMMENT, "# this is a comment"),
            ]
        );
    }

    // Escapes are preserved verbatim inside the string token.
    #[test]
    fn escape_in_double_quoted_string() {
        let tokens = tokenize(r#"return 200 "hello\nworld";"#);
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "return"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "200"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::DOUBLE_QUOTED_STRING, r#""hello\nworld""#),
                (SyntaxKind::SEMICOLON, ";"),
            ]
        );
    }

    #[test]
    fn lossless_complex_config() {
        let source = r#"http {
    # Main server
    server {
        listen 80;
        server_name example.com;
        location / {
            proxy_pass http://backend;
        }
    }
}
"#;
        assert_lossless(source);
    }

    // Multi-byte UTF-8 content must round-trip without splitting chars.
    #[test]
    fn lossless_utf8() {
        let source = "# これは日本語コメント\nlisten 80;\n";
        assert_lossless(source);
    }

    #[test]
    fn glob_pattern() {
        let tokens = tokenize("include /etc/nginx/conf.d/*.conf;");
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "include"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "/etc/nginx/conf.d/*.conf"),
                (SyntaxKind::SEMICOLON, ";"),
            ]
        );
    }

    // Third-party module directives lex like core directives.
    #[test]
    fn extension_directive() {
        let tokens = tokenize(r#"more_set_headers "Server: Custom";"#);
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "more_set_headers"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::DOUBLE_QUOTED_STRING, "\"Server: Custom\""),
                (SyntaxKind::SEMICOLON, ";"),
            ]
        );
    }

    #[test]
    fn hash_in_regex_pattern() {
        let tokens = tokenize(r"location ~* (?:#.*#|\.bak)$ {");
        assert_eq!(
            tokens,
            vec![
                (SyntaxKind::IDENT, "location"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, "~*"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::ARGUMENT, r"(?:#.*#|\.bak)$"),
                (SyntaxKind::WHITESPACE, " "),
                (SyntaxKind::L_BRACE, "{"),
            ]
        );
    }

    // IDENT requires every character to be an identifier character;
    // otherwise the token is demoted to ARGUMENT.
    #[test]
    fn ident_classification() {
        let _tokens = tokenize("server_name example;");
        assert_eq!(kinds("server_name"), vec![SyntaxKind::IDENT]);
        let toks = tokenize("text/plain");
        assert_eq!(toks, vec![(SyntaxKind::ARGUMENT, "text/plain")]);
    }
}
738}