From 82c6657bf4e9ee9d0a955cbaf3b384b7321de547 Mon Sep 17 00:00:00 2001
From: Eyal Leshem
Date: Tue, 21 Oct 2025 15:26:23 +0300
Subject: [PATCH] Prepare tokenizer for using borrowed strings instead of
 allocations.

Key points for this commit:

- The Peekable iterator alone isn't sufficient for producing string
  slices: we need the byte indexes (start/end) to build a slice, so the
  current byte position is now tracked in the State struct. (Note: in
  the long term we could potentially remove Peekable and iterate from
  the current byte position alone.)
- Added internal functions that return slices of the original query
  instead of allocating strings, and wrapped them so the existing
  functions still return String for compatibility. The idea is to keep
  the commit small and reviewable, without changing the Token struct or
  the parser.
---
 src/tokenizer.rs | 141 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 110 insertions(+), 31 deletions(-)

diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 54a158c1f..a7f90d9b5 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -743,8 +743,12 @@ impl std::error::Error for TokenizerError {}
 
 struct State<'a> {
     peekable: Peekable<Chars<'a>>,
+    /// Reference to the original source string being tokenized
+    source: &'a str,
     pub line: u64,
     pub col: u64,
+    /// Byte position in the source string
+    pub byte_pos: usize,
 }
 
 impl State<'_> {
@@ -759,6 +763,8 @@
             } else {
                 self.col += 1;
             }
+            // Update byte position (characters can be multi-byte in UTF-8)
+            self.byte_pos += s.len_utf8();
             Some(s)
         }
     }
@@ -769,6 +775,12 @@
         self.peekable.peek()
     }
 
+    /// return the character after the next one (lookahead by 2) without advancing the stream
+    pub fn peek_next(&self) -> Option<char> {
+        // Use source and byte_pos instead of cloning the peekable iterator
+        self.source[self.byte_pos..].chars().nth(1)
+    }
+
     pub fn location(&self) -> Location {
         Location {
             line: self.line,
@@ -893,8 +905,10 @@ impl<'a> Tokenizer<'a> {
     ) -> Result<(), TokenizerError> {
         let mut state = State {
             peekable: self.query.chars().peekable(),
+            source: self.query,
             line: 1,
             col: 1,
+            byte_pos: 0,
         };
 
         let mut location = state.location();
@@ -912,18 +926,21 @@
     fn tokenize_identifier_or_keyword(
         &self,
         ch: impl IntoIterator<Item = char>,
-        chars: &mut State,
+        chars: &mut State<'a>,
     ) -> Result<Option<Token>, TokenizerError> {
         chars.next(); // consume the first char
-        let ch: String = ch.into_iter().collect();
-        let word = self.tokenize_word(ch, chars);
+        // Calculate the total byte length without allocating a String
+        let consumed_byte_len: usize = ch.into_iter().map(|c| c.len_utf8()).sum();
+        let word = self.tokenize_word(consumed_byte_len, chars);
 
         // TODO: implement parsing of exponent here
         if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
             let mut inner_state = State {
                 peekable: word.chars().peekable(),
+                source: &word,
                 line: 0,
                 col: 0,
+                byte_pos: 0,
             };
             let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
             let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
@@ -937,7 +954,7 @@
     /// Get the next token or return None
     fn next_token(
         &self,
-        chars: &mut State,
+        chars: &mut State<'a>,
         prev_token: Option<&Token>,
     ) -> Result<Option<Token>, TokenizerError> {
         match chars.peek() {
@@ -988,7 +1005,7 @@
                         }
                         _ => {
                             // regular identifier starting with an "b" or "B"
-                            let s = self.tokenize_word(b, chars);
+                            let s = self.tokenize_word(b.len_utf8(), chars);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }
@@ -1015,7 +1032,7 @@ impl<'a> Tokenizer<'a> {
                         ),
                         _ => {
                             // regular identifier starting with an "r" or "R"
-                            let s = self.tokenize_word(b, chars);
+                            let s = self.tokenize_word(b.len_utf8(), chars);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }
@@ -1034,7 +1051,7 @@ impl<'a> Tokenizer<'a> {
                         }
                         _ => {
                             // regular identifier starting with an "N"
-                            let s = self.tokenize_word(n, chars);
+                            let s = self.tokenize_word(n.len_utf8(), chars);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }
@@ -1051,7 +1068,7 @@ impl<'a> Tokenizer<'a> {
                         }
                         _ => {
                             // regular identifier starting with an "E" or "e"
-                            let s = self.tokenize_word(x, chars);
+                            let s = self.tokenize_word(x.len_utf8(), chars);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }
@@ -1070,7 +1087,7 @@ impl<'a> Tokenizer<'a> {
                         }
                     }
                     // regular identifier starting with an "U" or "u"
-                    let s = self.tokenize_word(x, chars);
+                    let s = self.tokenize_word(x.len_utf8(), chars);
                     Ok(Some(Token::make_word(&s, None)))
                 }
                 // The spec only allows an uppercase 'X' to introduce a hex
@@ -1085,7 +1102,7 @@ impl<'a> Tokenizer<'a> {
                         }
                         _ => {
                             // regular identifier starting with an "X"
-                            let s = self.tokenize_word(x, chars);
+                            let s = self.tokenize_word(x.len_utf8(), chars);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }
@@ -1876,13 +1893,26 @@ impl<'a> Tokenizer<'a> {
         comment
     }
 
-    /// Tokenize an identifier or keyword, after the first char is already consumed.
-    fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
-        let mut s = first_chars.into();
-        s.push_str(&peeking_take_while(chars, |ch| {
-            self.dialect.is_identifier_part(ch)
-        }));
-        s
+    /// Tokenize an identifier or keyword, after the first char(s) have already been consumed.
+    /// `consumed_byte_len` is the byte length of the consumed character(s).
+    fn tokenize_word(&self, consumed_byte_len: usize, chars: &mut State<'a>) -> String {
+        // Calculate where the first character started
+        let first_char_byte_pos = chars.byte_pos - consumed_byte_len;
+
+        // Use the zero-copy version and convert to String
+        self.tokenize_word_borrowed(first_char_byte_pos, chars)
+            .to_string()
+    }
+
+    /// Tokenize an identifier or keyword, returning a borrowed slice of the source.
+    /// The byte position of the first character (before it was consumed) must be provided.
+    /// The returned slice has the same lifetime as the State's source.
+    fn tokenize_word_borrowed(&self, first_char_byte_pos: usize, chars: &mut State<'a>) -> &'a str {
+        // Consume the rest of the word
+        borrow_slice_until(chars, |ch| self.dialect.is_identifier_part(ch));
+
+        // Return a slice from the first char to the current position
+        &chars.source[first_char_byte_pos..chars.byte_pos]
     }
 
     /// Read a quoted identifier
@@ -2176,35 +2206,82 @@ impl<'a> Tokenizer<'a> {
 /// Read from `chars` until `predicate` returns `false` or EOF is hit.
 /// Return the characters read as String, and keep the first non-matching
 /// char available as `chars.next()`.
-fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
-    let mut s = String::new();
+fn peeking_take_while(chars: &mut State, predicate: impl FnMut(char) -> bool) -> String {
+    borrow_slice_until(chars, predicate).to_string()
+}
+
+/// Borrow a slice of the original source until `predicate` returns `false` or EOF is hit.
+///
+/// # Arguments
+/// * `chars` - The character iterator state (holds a reference to the original source)
+/// * `predicate` - Returns true while characters should keep being consumed
+///
+/// # Returns
+/// A borrowed slice of the source string containing the matched characters
+fn borrow_slice_until<'a>(
+    chars: &mut State<'a>,
+    mut predicate: impl FnMut(char) -> bool,
+) -> &'a str {
+    // Record the starting byte position
+    let start_pos = chars.byte_pos;
+
+    // Consume characters while the predicate holds
     while let Some(&ch) = chars.peek() {
         if predicate(ch) {
-            chars.next(); // consume
-            s.push(ch);
+            chars.next(); // consume (this updates byte_pos)
         } else {
             break;
         }
     }
-    s
+
+    // Record the ending byte position
+    let end_pos = chars.byte_pos;
+
+    // Return the slice from the original source
+    &chars.source[start_pos..end_pos]
 }
 
-/// Same as peeking_take_while, but also passes the next character to the predicate.
-fn peeking_next_take_while(
-    chars: &mut State,
+/// Borrow a slice of the original source until `predicate` returns `false` or EOF is hit.
+/// This version also passes the next character to the predicate for lookahead.
+/// It is the zero-copy counterpart of `peeking_next_take_while`.
+///
+/// # Arguments
+/// * `chars` - The character iterator state (holds a reference to the original source)
+/// * `predicate` - Returns true while characters should keep being consumed.
+///   Takes the current char and an optional next char for lookahead.
+///
+/// # Returns
+/// A borrowed slice of the source string containing the matched characters
+fn borrow_slice_until_next<'a>(
+    chars: &mut State<'a>,
     mut predicate: impl FnMut(char, Option<char>) -> bool,
-) -> String {
-    let mut s = String::new();
+) -> &'a str {
+    // Record the starting byte position
+    let start_pos = chars.byte_pos;
+
+    // Consume characters while the predicate holds
     while let Some(&ch) = chars.peek() {
-        let next_char = chars.peekable.clone().nth(1);
+        let next_char = chars.peek_next();
         if predicate(ch, next_char) {
-            chars.next(); // consume
-            s.push(ch);
+            chars.next(); // consume (this updates byte_pos)
         } else {
             break;
         }
     }
-    s
+
+    // Record the ending byte position
+    let end_pos = chars.byte_pos;
+
+    // Return the slice from the original source
+    &chars.source[start_pos..end_pos]
+}
+
+/// Same as peeking_take_while, but also passes the next character to the predicate.
+fn peeking_next_take_while(
+    chars: &mut State,
+    predicate: impl FnMut(char, Option<char>) -> bool,
+) -> String {
+    borrow_slice_until_next(chars, predicate).to_string()
 }
 
 fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
@@ -3496,8 +3573,10 @@ mod tests {
         let s = format!("'{s}'");
         let mut state = State {
             peekable: s.chars().peekable(),
+            source: &s,
             line: 0,
             col: 0,
+            byte_pos: 0,
         };
 
         assert_eq!(
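
A minimal, self-contained sketch of the byte-position technique this patch
relies on; MiniState and take_while_borrowed are illustrative names only and
do not appear in the patch. It shows why byte_pos must advance by len_utf8()
rather than by 1: slice boundaries have to land on UTF-8 character
boundaries, or indexing the source would panic on multi-byte input.

    // Illustrative sketch only; not part of the sqlparser codebase.
    use std::iter::Peekable;
    use std::str::Chars;

    struct MiniState<'a> {
        peekable: Peekable<Chars<'a>>,
        source: &'a str,
        byte_pos: usize,
    }

    impl<'a> MiniState<'a> {
        fn new(source: &'a str) -> Self {
            MiniState {
                peekable: source.chars().peekable(),
                source,
                byte_pos: 0,
            }
        }

        fn next(&mut self) -> Option<char> {
            let ch = self.peekable.next()?;
            // Advance by the character's UTF-8 width, not by 1, so that
            // byte_pos always stays on a character boundary of `source`.
            self.byte_pos += ch.len_utf8();
            Some(ch)
        }
    }

    /// Zero-copy analogue of peeking_take_while: consume matching chars and
    /// return a slice of the original source instead of a new String.
    fn take_while_borrowed<'a>(
        state: &mut MiniState<'a>,
        mut predicate: impl FnMut(char) -> bool,
    ) -> &'a str {
        let start = state.byte_pos;
        while let Some(&ch) = state.peekable.peek() {
            if !predicate(ch) {
                break;
            }
            state.next(); // updates byte_pos
        }
        &state.source[start..state.byte_pos]
    }

    fn main() {
        let mut state = MiniState::new("héllo world");
        let word = take_while_borrowed(&mut state, |c| c.is_alphanumeric());
        assert_eq!(word, "héllo"); // borrowed from the input, no allocation
        println!("{word}");
    }

Tracking a byte offset alongside the Peekable iterator is what lets the
follow-up work return &str tokens without a second pass over the input.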