Skip to content

Commit 82c6657

Browse files
eyalsatori authored and eyalleshem committed
Prepare tokenizer for using borrowed strings instead of allocations.
Key points for this commit:
- The Peekable iterator isn't sufficient for using string slices, as we need the byte indexes (start/end) to create string slices, so the current byte position was added to the State struct. (Note: in the long term we could potentially remove Peekable and use only the current position as an iterator.)
- Internal functions were created that build slices from the original query instead of allocating strings; these functions were then converted to return String to maintain compatibility (the idea is to make a small, reviewable commit without changing the Token struct or the parser).
1 parent 67684c8 commit 82c6657

File tree

1 file changed

+110
-31
lines changed

1 file changed

+110
-31
lines changed

src/tokenizer.rs

Lines changed: 110 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -743,8 +743,12 @@ impl std::error::Error for TokenizerError {}
743743

744744
struct State<'a> {
745745
peekable: Peekable<Chars<'a>>,
746+
/// Reference to the original source string being tokenized
747+
source: &'a str,
746748
pub line: u64,
747749
pub col: u64,
750+
/// Byte position in the source string
751+
pub byte_pos: usize,
748752
}
749753

750754
impl State<'_> {
@@ -759,6 +763,8 @@ impl State<'_> {
759763
} else {
760764
self.col += 1;
761765
}
766+
// Update byte position (characters can be multi-byte in UTF-8)
767+
self.byte_pos += s.len_utf8();
762768
Some(s)
763769
}
764770
}
@@ -769,6 +775,12 @@ impl State<'_> {
769775
self.peekable.peek()
770776
}
771777

778+
/// return the character after the next character (lookahead by 2) without advancing the stream
779+
pub fn peek_next(&self) -> Option<char> {
780+
// Use the source and byte_pos instead of cloning the peekable iterator
781+
self.source[self.byte_pos..].chars().nth(1)
782+
}
783+
772784
pub fn location(&self) -> Location {
773785
Location {
774786
line: self.line,
@@ -893,8 +905,10 @@ impl<'a> Tokenizer<'a> {
893905
) -> Result<(), TokenizerError> {
894906
let mut state = State {
895907
peekable: self.query.chars().peekable(),
908+
source: self.query,
896909
line: 1,
897910
col: 1,
911+
byte_pos: 0,
898912
};
899913

900914
let mut location = state.location();
@@ -912,18 +926,21 @@ impl<'a> Tokenizer<'a> {
912926
fn tokenize_identifier_or_keyword(
913927
&self,
914928
ch: impl IntoIterator<Item = char>,
915-
chars: &mut State,
929+
chars: &mut State<'a>,
916930
) -> Result<Option<Token>, TokenizerError> {
917931
chars.next(); // consume the first char
918-
let ch: String = ch.into_iter().collect();
919-
let word = self.tokenize_word(ch, chars);
932+
// Calculate total byte length without allocating a String
933+
let consumed_byte_len: usize = ch.into_iter().map(|c| c.len_utf8()).sum();
934+
let word = self.tokenize_word(consumed_byte_len, chars);
920935

921936
// TODO: implement parsing of exponent here
922937
if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
923938
let mut inner_state = State {
924939
peekable: word.chars().peekable(),
940+
source: &word,
925941
line: 0,
926942
col: 0,
943+
byte_pos: 0,
927944
};
928945
let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
929946
let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
@@ -937,7 +954,7 @@ impl<'a> Tokenizer<'a> {
937954
/// Get the next token or return None
938955
fn next_token(
939956
&self,
940-
chars: &mut State,
957+
chars: &mut State<'a>,
941958
prev_token: Option<&Token>,
942959
) -> Result<Option<Token>, TokenizerError> {
943960
match chars.peek() {
@@ -988,7 +1005,7 @@ impl<'a> Tokenizer<'a> {
9881005
}
9891006
_ => {
9901007
// regular identifier starting with an "b" or "B"
991-
let s = self.tokenize_word(b, chars);
1008+
let s = self.tokenize_word(b.len_utf8(), chars);
9921009
Ok(Some(Token::make_word(&s, None)))
9931010
}
9941011
}
@@ -1015,7 +1032,7 @@ impl<'a> Tokenizer<'a> {
10151032
),
10161033
_ => {
10171034
// regular identifier starting with an "r" or "R"
1018-
let s = self.tokenize_word(b, chars);
1035+
let s = self.tokenize_word(b.len_utf8(), chars);
10191036
Ok(Some(Token::make_word(&s, None)))
10201037
}
10211038
}
@@ -1034,7 +1051,7 @@ impl<'a> Tokenizer<'a> {
10341051
}
10351052
_ => {
10361053
// regular identifier starting with an "N"
1037-
let s = self.tokenize_word(n, chars);
1054+
let s = self.tokenize_word(n.len_utf8(), chars);
10381055
Ok(Some(Token::make_word(&s, None)))
10391056
}
10401057
}
@@ -1051,7 +1068,7 @@ impl<'a> Tokenizer<'a> {
10511068
}
10521069
_ => {
10531070
// regular identifier starting with an "E" or "e"
1054-
let s = self.tokenize_word(x, chars);
1071+
let s = self.tokenize_word(x.len_utf8(), chars);
10551072
Ok(Some(Token::make_word(&s, None)))
10561073
}
10571074
}
@@ -1070,7 +1087,7 @@ impl<'a> Tokenizer<'a> {
10701087
}
10711088
}
10721089
// regular identifier starting with an "U" or "u"
1073-
let s = self.tokenize_word(x, chars);
1090+
let s = self.tokenize_word(x.len_utf8(), chars);
10741091
Ok(Some(Token::make_word(&s, None)))
10751092
}
10761093
// The spec only allows an uppercase 'X' to introduce a hex
@@ -1085,7 +1102,7 @@ impl<'a> Tokenizer<'a> {
10851102
}
10861103
_ => {
10871104
// regular identifier starting with an "X"
1088-
let s = self.tokenize_word(x, chars);
1105+
let s = self.tokenize_word(x.len_utf8(), chars);
10891106
Ok(Some(Token::make_word(&s, None)))
10901107
}
10911108
}
@@ -1876,13 +1893,26 @@ impl<'a> Tokenizer<'a> {
18761893
comment
18771894
}
18781895

1879-
/// Tokenize an identifier or keyword, after the first char is already consumed.
1880-
fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
1881-
let mut s = first_chars.into();
1882-
s.push_str(&peeking_take_while(chars, |ch| {
1883-
self.dialect.is_identifier_part(ch)
1884-
}));
1885-
s
1896+
/// Tokenize an identifier or keyword, after the first char(s) have already been consumed.
1897+
/// `consumed_byte_len` is the byte length of the consumed character(s).
1898+
fn tokenize_word(&self, consumed_byte_len: usize, chars: &mut State<'a>) -> String {
1899+
// Calculate where the first character started
1900+
let first_char_byte_pos = chars.byte_pos - consumed_byte_len;
1901+
1902+
// Use the zero-copy version and convert to String
1903+
self.tokenize_word_borrowed(first_char_byte_pos, chars)
1904+
.to_string()
1905+
}
1906+
1907+
/// Tokenize an identifier or keyword, returning a borrowed slice when possible.
1908+
/// The first character position must be provided (before it was consumed).
1909+
/// Returns a slice with the same lifetime as the State's source.
1910+
fn tokenize_word_borrowed(&self, first_char_byte_pos: usize, chars: &mut State<'a>) -> &'a str {
1911+
// Consume the rest of the word
1912+
borrow_slice_until(chars, |ch| self.dialect.is_identifier_part(ch));
1913+
1914+
// Return a slice from the first char to the current position
1915+
&chars.source[first_char_byte_pos..chars.byte_pos]
18861916
}
18871917

18881918
/// Read a quoted identifier
@@ -2176,35 +2206,82 @@ impl<'a> Tokenizer<'a> {
21762206
/// Read from `chars` until `predicate` returns `false` or EOF is hit.
21772207
/// Return the characters read as String, and keep the first non-matching
21782208
/// char available as `chars.next()`.
2179-
fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
2180-
let mut s = String::new();
2209+
fn peeking_take_while(chars: &mut State, predicate: impl FnMut(char) -> bool) -> String {
2210+
borrow_slice_until(chars, predicate).to_string()
2211+
}
2212+
2213+
/// Borrow a slice from the original string until `predicate` returns `false` or EOF is hit.
2214+
///
2215+
/// # Arguments
2216+
/// * `chars` - The character iterator state (contains reference to original source)
2217+
/// * `predicate` - Function that returns true while we should continue taking characters
2218+
///
2219+
/// # Returns
2220+
/// A borrowed slice of the source string containing the matched characters
2221+
fn borrow_slice_until<'a>(
2222+
chars: &mut State<'a>,
2223+
mut predicate: impl FnMut(char) -> bool,
2224+
) -> &'a str {
2225+
// Record the starting byte position
2226+
let start_pos = chars.byte_pos;
2227+
2228+
// Consume characters while predicate is true
21812229
while let Some(&ch) = chars.peek() {
21822230
if predicate(ch) {
2183-
chars.next(); // consume
2184-
s.push(ch);
2231+
chars.next(); // consume (this updates byte_pos)
21852232
} else {
21862233
break;
21872234
}
21882235
}
2189-
s
2236+
2237+
// Get the ending byte position
2238+
let end_pos = chars.byte_pos;
2239+
2240+
// Return the slice from the original source
2241+
&chars.source[start_pos..end_pos]
21902242
}
21912243

2192-
/// Same as peeking_take_while, but also passes the next character to the predicate.
2193-
fn peeking_next_take_while(
2194-
chars: &mut State,
2244+
/// Borrow a slice from the original string until `predicate` returns `false` or EOF is hit.
2245+
/// This version also passes the next character to the predicate for lookahead.
2246+
/// This is a zero-copy version of `peeking_next_take_while`.
2247+
///
2248+
/// # Arguments
2249+
/// * `chars` - The character iterator state (contains reference to original source)
2250+
/// * `predicate` - Function that returns true while we should continue taking characters.
2251+
/// Takes current char and optional next char for lookahead.
2252+
///
2253+
/// # Returns
2254+
/// A borrowed slice of the source string containing the matched characters
2255+
fn borrow_slice_until_next<'a>(
2256+
chars: &mut State<'a>,
21952257
mut predicate: impl FnMut(char, Option<char>) -> bool,
2196-
) -> String {
2197-
let mut s = String::new();
2258+
) -> &'a str {
2259+
// Record the starting byte position
2260+
let start_pos = chars.byte_pos;
2261+
2262+
// Consume characters while predicate is true
21982263
while let Some(&ch) = chars.peek() {
2199-
let next_char = chars.peekable.clone().nth(1);
2264+
let next_char = chars.peek_next();
22002265
if predicate(ch, next_char) {
2201-
chars.next(); // consume
2202-
s.push(ch);
2266+
chars.next(); // consume (this updates byte_pos)
22032267
} else {
22042268
break;
22052269
}
22062270
}
2207-
s
2271+
2272+
// Get the ending byte position
2273+
let end_pos = chars.byte_pos;
2274+
2275+
// Return the slice from the original source
2276+
&chars.source[start_pos..end_pos]
2277+
}
2278+
2279+
/// Same as peeking_take_while, but also passes the next character to the predicate.
2280+
fn peeking_next_take_while(
2281+
chars: &mut State,
2282+
predicate: impl FnMut(char, Option<char>) -> bool,
2283+
) -> String {
2284+
borrow_slice_until_next(chars, predicate).to_string()
22082285
}
22092286

22102287
fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
@@ -3496,8 +3573,10 @@ mod tests {
34963573
let s = format!("'{s}'");
34973574
let mut state = State {
34983575
peekable: s.chars().peekable(),
3576+
source: &s,
34993577
line: 0,
35003578
col: 0,
3579+
byte_pos: 0,
35013580
};
35023581

35033582
assert_eq!(

0 commit comments

Comments (0)