@@ -743,8 +743,12 @@ impl std::error::Error for TokenizerError {}
743743
struct State<'a> {
    /// Character iterator over `source`, supporting one-char lookahead.
    peekable: Peekable<Chars<'a>>,
    /// The original source string being tokenized; paired with `byte_pos`
    /// it lets helpers hand out zero-copy slices of already-scanned text.
    source: &'a str,
    /// Current line number (1-based while tokenizing a query).
    pub line: u64,
    /// Current column number on the current line.
    pub col: u64,
    /// Byte offset into `source` of the next unconsumed character
    /// (kept in sync with the iterator by `State::next`).
    pub byte_pos: usize,
}
749753
750754impl State < ' _ > {
@@ -759,6 +763,8 @@ impl State<'_> {
759763 } else {
760764 self . col += 1 ;
761765 }
766+ // Update byte position (characters can be multi-byte in UTF-8)
767+ self . byte_pos += s. len_utf8 ( ) ;
762768 Some ( s)
763769 }
764770 }
@@ -769,6 +775,12 @@ impl State<'_> {
769775 self . peekable . peek ( )
770776 }
771777
778+ /// return the character after the next character (lookahead by 2) without advancing the stream
779+ pub fn peek_next ( & self ) -> Option < char > {
780+ // Use the source and byte_pos instead of cloning the peekable iterator
781+ self . source [ self . byte_pos ..] . chars ( ) . nth ( 1 )
782+ }
783+
772784 pub fn location ( & self ) -> Location {
773785 Location {
774786 line : self . line ,
@@ -893,8 +905,10 @@ impl<'a> Tokenizer<'a> {
893905 ) -> Result < ( ) , TokenizerError > {
894906 let mut state = State {
895907 peekable : self . query . chars ( ) . peekable ( ) ,
908+ source : self . query ,
896909 line : 1 ,
897910 col : 1 ,
911+ byte_pos : 0 ,
898912 } ;
899913
900914 let mut location = state. location ( ) ;
@@ -912,18 +926,21 @@ impl<'a> Tokenizer<'a> {
912926 fn tokenize_identifier_or_keyword (
913927 & self ,
914928 ch : impl IntoIterator < Item = char > ,
915- chars : & mut State ,
929+ chars : & mut State < ' a > ,
916930 ) -> Result < Option < Token > , TokenizerError > {
917931 chars. next ( ) ; // consume the first char
918- let ch: String = ch. into_iter ( ) . collect ( ) ;
919- let word = self . tokenize_word ( ch, chars) ;
932+ // Calculate total byte length without allocating a String
933+ let consumed_byte_len: usize = ch. into_iter ( ) . map ( |c| c. len_utf8 ( ) ) . sum ( ) ;
934+ let word = self . tokenize_word ( consumed_byte_len, chars) ;
920935
921936 // TODO: implement parsing of exponent here
922937 if word. chars ( ) . all ( |x| x. is_ascii_digit ( ) || x == '.' ) {
923938 let mut inner_state = State {
924939 peekable : word. chars ( ) . peekable ( ) ,
940+ source : & word,
925941 line : 0 ,
926942 col : 0 ,
943+ byte_pos : 0 ,
927944 } ;
928945 let mut s = peeking_take_while ( & mut inner_state, |ch| matches ! ( ch, '0' ..='9' | '.' ) ) ;
929946 let s2 = peeking_take_while ( chars, |ch| matches ! ( ch, '0' ..='9' | '.' ) ) ;
@@ -937,7 +954,7 @@ impl<'a> Tokenizer<'a> {
937954 /// Get the next token or return None
938955 fn next_token (
939956 & self ,
940- chars : & mut State ,
957+ chars : & mut State < ' a > ,
941958 prev_token : Option < & Token > ,
942959 ) -> Result < Option < Token > , TokenizerError > {
943960 match chars. peek ( ) {
@@ -988,7 +1005,7 @@ impl<'a> Tokenizer<'a> {
9881005 }
9891006 _ => {
9901007 // regular identifier starting with an "b" or "B"
991- let s = self . tokenize_word ( b, chars) ;
1008+ let s = self . tokenize_word ( b. len_utf8 ( ) , chars) ;
9921009 Ok ( Some ( Token :: make_word ( & s, None ) ) )
9931010 }
9941011 }
@@ -1015,7 +1032,7 @@ impl<'a> Tokenizer<'a> {
10151032 ) ,
10161033 _ => {
10171034 // regular identifier starting with an "r" or "R"
1018- let s = self . tokenize_word ( b, chars) ;
1035+ let s = self . tokenize_word ( b. len_utf8 ( ) , chars) ;
10191036 Ok ( Some ( Token :: make_word ( & s, None ) ) )
10201037 }
10211038 }
@@ -1034,7 +1051,7 @@ impl<'a> Tokenizer<'a> {
10341051 }
10351052 _ => {
10361053 // regular identifier starting with an "N"
1037- let s = self . tokenize_word ( n, chars) ;
1054+ let s = self . tokenize_word ( n. len_utf8 ( ) , chars) ;
10381055 Ok ( Some ( Token :: make_word ( & s, None ) ) )
10391056 }
10401057 }
@@ -1051,7 +1068,7 @@ impl<'a> Tokenizer<'a> {
10511068 }
10521069 _ => {
10531070 // regular identifier starting with an "E" or "e"
1054- let s = self . tokenize_word ( x, chars) ;
1071+ let s = self . tokenize_word ( x. len_utf8 ( ) , chars) ;
10551072 Ok ( Some ( Token :: make_word ( & s, None ) ) )
10561073 }
10571074 }
@@ -1070,7 +1087,7 @@ impl<'a> Tokenizer<'a> {
10701087 }
10711088 }
10721089 // regular identifier starting with an "U" or "u"
1073- let s = self . tokenize_word ( x, chars) ;
1090+ let s = self . tokenize_word ( x. len_utf8 ( ) , chars) ;
10741091 Ok ( Some ( Token :: make_word ( & s, None ) ) )
10751092 }
10761093 // The spec only allows an uppercase 'X' to introduce a hex
@@ -1085,7 +1102,7 @@ impl<'a> Tokenizer<'a> {
10851102 }
10861103 _ => {
10871104 // regular identifier starting with an "X"
1088- let s = self . tokenize_word ( x, chars) ;
1105+ let s = self . tokenize_word ( x. len_utf8 ( ) , chars) ;
10891106 Ok ( Some ( Token :: make_word ( & s, None ) ) )
10901107 }
10911108 }
@@ -1876,13 +1893,26 @@ impl<'a> Tokenizer<'a> {
18761893 comment
18771894 }
18781895
1879- /// Tokenize an identifier or keyword, after the first char is already consumed.
1880- fn tokenize_word ( & self , first_chars : impl Into < String > , chars : & mut State ) -> String {
1881- let mut s = first_chars. into ( ) ;
1882- s. push_str ( & peeking_take_while ( chars, |ch| {
1883- self . dialect . is_identifier_part ( ch)
1884- } ) ) ;
1885- s
1896+ /// Tokenize an identifier or keyword, after the first char(s) have already been consumed.
1897+ /// `consumed_byte_len` is the byte length of the consumed character(s).
1898+ fn tokenize_word ( & self , consumed_byte_len : usize , chars : & mut State < ' a > ) -> String {
1899+ // Calculate where the first character started
1900+ let first_char_byte_pos = chars. byte_pos - consumed_byte_len;
1901+
1902+ // Use the zero-copy version and convert to String
1903+ self . tokenize_word_borrowed ( first_char_byte_pos, chars)
1904+ . to_string ( )
1905+ }
1906+
1907+ /// Tokenize an identifier or keyword, returning a borrowed slice when possible.
1908+ /// The first character position must be provided (before it was consumed).
1909+ /// Returns a slice with the same lifetime as the State's source.
1910+ fn tokenize_word_borrowed ( & self , first_char_byte_pos : usize , chars : & mut State < ' a > ) -> & ' a str {
1911+ // Consume the rest of the word
1912+ borrow_slice_until ( chars, |ch| self . dialect . is_identifier_part ( ch) ) ;
1913+
1914+ // Return a slice from the first char to the current position
1915+ & chars. source [ first_char_byte_pos..chars. byte_pos ]
18861916 }
18871917
18881918 /// Read a quoted identifier
@@ -2176,35 +2206,82 @@ impl<'a> Tokenizer<'a> {
21762206/// Read from `chars` until `predicate` returns `false` or EOF is hit.
21772207/// Return the characters read as String, and keep the first non-matching
21782208/// char available as `chars.next()`.
2179- fn peeking_take_while ( chars : & mut State , mut predicate : impl FnMut ( char ) -> bool ) -> String {
2180- let mut s = String :: new ( ) ;
2209+ fn peeking_take_while ( chars : & mut State , predicate : impl FnMut ( char ) -> bool ) -> String {
2210+ borrow_slice_until ( chars, predicate) . to_string ( )
2211+ }
2212+
2213+ /// Borrow a slice from the original string until `predicate` returns `false` or EOF is hit.
2214+ ///
2215+ /// # Arguments
2216+ /// * `chars` - The character iterator state (contains reference to original source)
2217+ /// * `predicate` - Function that returns true while we should continue taking characters
2218+ ///
2219+ /// # Returns
2220+ /// A borrowed slice of the source string containing the matched characters
2221+ fn borrow_slice_until < ' a > (
2222+ chars : & mut State < ' a > ,
2223+ mut predicate : impl FnMut ( char ) -> bool ,
2224+ ) -> & ' a str {
2225+ // Record the starting byte position
2226+ let start_pos = chars. byte_pos ;
2227+
2228+ // Consume characters while predicate is true
21812229 while let Some ( & ch) = chars. peek ( ) {
21822230 if predicate ( ch) {
2183- chars. next ( ) ; // consume
2184- s. push ( ch) ;
2231+ chars. next ( ) ; // consume (this updates byte_pos)
21852232 } else {
21862233 break ;
21872234 }
21882235 }
2189- s
2236+
2237+ // Get the ending byte position
2238+ let end_pos = chars. byte_pos ;
2239+
2240+ // Return the slice from the original source
2241+ & chars. source [ start_pos..end_pos]
21902242}
21912243
2192- /// Same as peeking_take_while, but also passes the next character to the predicate.
2193- fn peeking_next_take_while (
2194- chars : & mut State ,
2244+ /// Borrow a slice from the original string until `predicate` returns `false` or EOF is hit.
2245+ /// This version also passes the next character to the predicate for lookahead.
2246+ /// This is a zero-copy version of `peeking_next_take_while`.
2247+ ///
2248+ /// # Arguments
2249+ /// * `chars` - The character iterator state (contains reference to original source)
2250+ /// * `predicate` - Function that returns true while we should continue taking characters.
2251+ /// Takes current char and optional next char for lookahead.
2252+ ///
2253+ /// # Returns
2254+ /// A borrowed slice of the source string containing the matched characters
2255+ fn borrow_slice_until_next < ' a > (
2256+ chars : & mut State < ' a > ,
21952257 mut predicate : impl FnMut ( char , Option < char > ) -> bool ,
2196- ) -> String {
2197- let mut s = String :: new ( ) ;
2258+ ) -> & ' a str {
2259+ // Record the starting byte position
2260+ let start_pos = chars. byte_pos ;
2261+
2262+ // Consume characters while predicate is true
21982263 while let Some ( & ch) = chars. peek ( ) {
2199- let next_char = chars. peekable . clone ( ) . nth ( 1 ) ;
2264+ let next_char = chars. peek_next ( ) ;
22002265 if predicate ( ch, next_char) {
2201- chars. next ( ) ; // consume
2202- s. push ( ch) ;
2266+ chars. next ( ) ; // consume (this updates byte_pos)
22032267 } else {
22042268 break ;
22052269 }
22062270 }
2207- s
2271+
2272+ // Get the ending byte position
2273+ let end_pos = chars. byte_pos ;
2274+
2275+ // Return the slice from the original source
2276+ & chars. source [ start_pos..end_pos]
2277+ }
2278+
2279+ /// Same as peeking_take_while, but also passes the next character to the predicate.
2280+ fn peeking_next_take_while (
2281+ chars : & mut State ,
2282+ predicate : impl FnMut ( char , Option < char > ) -> bool ,
2283+ ) -> String {
2284+ borrow_slice_until_next ( chars, predicate) . to_string ( )
22082285}
22092286
22102287fn unescape_single_quoted_string ( chars : & mut State < ' _ > ) -> Option < String > {
@@ -3496,8 +3573,10 @@ mod tests {
34963573 let s = format ! ( "'{s}'" ) ;
34973574 let mut state = State {
34983575 peekable : s. chars ( ) . peekable ( ) ,
3576+ source : & s,
34993577 line : 0 ,
35003578 col : 0 ,
3579+ byte_pos : 0 ,
35013580 } ;
35023581
35033582 assert_eq ! (
0 commit comments