141 changes: 110 additions & 31 deletions src/tokenizer.rs
@@ -743,8 +743,12 @@ impl std::error::Error for TokenizerError {}

struct State<'a> {
peekable: Peekable<Chars<'a>>,
/// Reference to the original source string being tokenized
source: &'a str,
pub line: u64,
pub col: u64,
/// Byte position in the source string
pub byte_pos: usize,
Contributor:

Suggested change:

```diff
-    pub byte_pos: usize,
+    byte_pos: usize,
```

}

impl State<'_> {
@@ -759,6 +763,8 @@ impl State<'_> {
} else {
self.col += 1;
}
// Update byte position (characters can be multi-byte in UTF-8)
self.byte_pos += s.len_utf8();
Some(s)
}
}
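For context on the `byte_pos` bookkeeping: byte offsets and character counts diverge on multi-byte input, and slicing a `&str` at a non-boundary index panics. A minimal standalone sketch (not part of the PR) of how `len_utf8` keeps the byte cursor on a char boundary:

```rust
fn main() {
    let source = "héllo"; // 5 chars, 6 bytes ('é' is 2 bytes in UTF-8)
    let mut byte_pos = 0usize;
    for ch in source.chars() {
        // Advancing by len_utf8 keeps byte_pos on a char boundary,
        // so &source[byte_pos..] is always a valid slice.
        assert!(source.is_char_boundary(byte_pos));
        byte_pos += ch.len_utf8();
    }
    assert_eq!(byte_pos, source.len());
}
```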
@@ -769,6 +775,12 @@ impl State<'_> {
self.peekable.peek()
}

/// return the character after the next character (lookahead by 2) without advancing the stream
pub fn peek_next(&self) -> Option<char> {
Comment on lines +778 to +779

Contributor:

Suggested change:

```diff
-    /// return the character after the next character (lookahead by 2) without advancing the stream
-    pub fn peek_next(&self) -> Option<char> {
+    /// Return the `nth` next character without advancing the stream.
+    pub fn peek_nth(&self, n: usize) -> Option<char> {
```

Thinking we can make this more generic so that it can be reused in other contexts? Similar to the `peek_nth*` parser methods.
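A free-function sketch of the generic lookahead the reviewer suggests, mirroring the `source`/`byte_pos` fields of `State` (illustrative only, not the PR's code):

```rust
/// Return the `n`th upcoming character of `source`, counting from
/// `byte_pos`, without advancing anything; `n == 0` is the next char.
fn peek_nth(source: &str, byte_pos: usize, n: usize) -> Option<char> {
    // Guard the index so a bug or API misuse yields None instead of
    // panicking on an out-of-range slice.
    if byte_pos >= source.len() {
        return None;
    }
    source[byte_pos..].chars().nth(n)
}

fn main() {
    let s = "ab";
    assert_eq!(peek_nth(s, 0, 0), Some('a'));
    assert_eq!(peek_nth(s, 0, 1), Some('b'));
    assert_eq!(peek_nth(s, 0, 2), None);
    assert_eq!(peek_nth(s, 2, 0), None); // at EOF: guarded, no panic
}
```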

// Use the source and byte_pos instead of cloning the peekable iterator
Contributor:

Can we add a guard here that the index is safe, so that we don't panic if we hit a bug or the API is being misused? e.g.

```rust
if self.byte_pos >= self.source.len() {
    return None;
}
```

self.source[self.byte_pos..].chars().nth(1)
}

pub fn location(&self) -> Location {
Location {
line: self.line,
@@ -893,8 +905,10 @@ impl<'a> Tokenizer<'a> {
) -> Result<(), TokenizerError> {
let mut state = State {
peekable: self.query.chars().peekable(),
source: self.query,
line: 1,
col: 1,
byte_pos: 0,
};

let mut location = state.location();
@@ -912,18 +926,21 @@ impl<'a> Tokenizer<'a> {
fn tokenize_identifier_or_keyword(
&self,
ch: impl IntoIterator<Item = char>,
chars: &mut State,
chars: &mut State<'a>,
) -> Result<Option<Token>, TokenizerError> {
chars.next(); // consume the first char
let ch: String = ch.into_iter().collect();
let word = self.tokenize_word(ch, chars);
// Calculate total byte length without allocating a String
Contributor:

The indentation looks a bit off on this line?
let consumed_byte_len: usize = ch.into_iter().map(|c| c.len_utf8()).sum();
Contributor:

I wonder if we can instead replace the `ch` parameter of this `tokenize_identifier_or_keyword` function with the actual `consumed_byte_len: usize`?

If I'm understanding the intent of this change correctly, the caller of this function should have that value on hand, given that we seem to require that `ch` contains the preceding characters in the stream.

The current flow was initially unclear to me: the caller passes in an iterator of items whose contents we do not use here, and it required digging further into `tokenize_word` to realise why that was the case.

let word = self.tokenize_word(consumed_byte_len, chars);

// TODO: implement parsing of exponent here
if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
let mut inner_state = State {
peekable: word.chars().peekable(),
source: &word,
line: 0,
col: 0,
byte_pos: 0,
};
let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
@@ -937,7 +954,7 @@ impl<'a> Tokenizer<'a> {
/// Get the next token or return None
fn next_token(
&self,
chars: &mut State,
chars: &mut State<'a>,
prev_token: Option<&Token>,
) -> Result<Option<Token>, TokenizerError> {
match chars.peek() {
@@ -988,7 +1005,7 @@ impl<'a> Tokenizer<'a> {
}
_ => {
// regular identifier starting with an "b" or "B"
let s = self.tokenize_word(b, chars);
let s = self.tokenize_word(b.len_utf8(), chars);
Ok(Some(Token::make_word(&s, None)))
}
}
@@ -1015,7 +1032,7 @@ impl<'a> Tokenizer<'a> {
),
_ => {
// regular identifier starting with an "r" or "R"
let s = self.tokenize_word(b, chars);
let s = self.tokenize_word(b.len_utf8(), chars);
Ok(Some(Token::make_word(&s, None)))
}
}
@@ -1034,7 +1051,7 @@ impl<'a> Tokenizer<'a> {
}
_ => {
// regular identifier starting with an "N"
let s = self.tokenize_word(n, chars);
let s = self.tokenize_word(n.len_utf8(), chars);
Ok(Some(Token::make_word(&s, None)))
}
}
@@ -1051,7 +1068,7 @@ impl<'a> Tokenizer<'a> {
}
_ => {
// regular identifier starting with an "E" or "e"
let s = self.tokenize_word(x, chars);
let s = self.tokenize_word(x.len_utf8(), chars);
Ok(Some(Token::make_word(&s, None)))
}
}
@@ -1070,7 +1087,7 @@ impl<'a> Tokenizer<'a> {
}
}
// regular identifier starting with an "U" or "u"
let s = self.tokenize_word(x, chars);
let s = self.tokenize_word(x.len_utf8(), chars);
Ok(Some(Token::make_word(&s, None)))
}
// The spec only allows an uppercase 'X' to introduce a hex
@@ -1085,7 +1102,7 @@ impl<'a> Tokenizer<'a> {
}
_ => {
// regular identifier starting with an "X"
let s = self.tokenize_word(x, chars);
let s = self.tokenize_word(x.len_utf8(), chars);
Ok(Some(Token::make_word(&s, None)))
}
}
@@ -1876,13 +1893,26 @@ impl<'a> Tokenizer<'a> {
comment
}

/// Tokenize an identifier or keyword, after the first char is already consumed.
fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
let mut s = first_chars.into();
s.push_str(&peeking_take_while(chars, |ch| {
self.dialect.is_identifier_part(ch)
}));
s
/// Tokenize an identifier or keyword, after the first char(s) have already been consumed.
/// `consumed_byte_len` is the byte length of the consumed character(s).
fn tokenize_word(&self, consumed_byte_len: usize, chars: &mut State<'a>) -> String {
// Calculate where the first character started
let first_char_byte_pos = chars.byte_pos - consumed_byte_len;
Contributor:

Can we add a check that the operation doesn't overflow, so that we don't panic if we hit a bug or the API is being misused? e.g.

```rust
if consumed_byte_len > chars.byte_pos {
    return "".to_string();
}
```

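A `checked_sub`-based sketch of that guard (illustrative, not the PR's code); note that `consumed_byte_len == chars.byte_pos` is still valid, since it yields a word starting at offset 0:

```rust
/// Compute the start offset of the word, guarding against underflow.
/// None signals a caller bug (claiming more consumed bytes than exist).
fn word_start(byte_pos: usize, consumed_byte_len: usize) -> Option<usize> {
    byte_pos.checked_sub(consumed_byte_len)
}

fn main() {
    assert_eq!(word_start(5, 2), Some(3));
    assert_eq!(word_start(5, 5), Some(0)); // consumed everything so far: valid
    assert_eq!(word_start(2, 5), None);    // underflow: guarded, no panic
}
```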

// Use the zero-copy version and convert to String
self.tokenize_word_borrowed(first_char_byte_pos, chars)
.to_string()
}

/// Tokenize an identifier or keyword, returning a borrowed slice when possible.
/// The first character position must be provided (before it was consumed).
/// Returns a slice with the same lifetime as the State's source.
fn tokenize_word_borrowed(&self, first_char_byte_pos: usize, chars: &mut State<'a>) -> &'a str {
// Consume the rest of the word
borrow_slice_until(chars, |ch| self.dialect.is_identifier_part(ch));

// Return a slice from the first char to the current position
&chars.source[first_char_byte_pos..chars.byte_pos]
Contributor:

Similarly, here we can check that the indexing is safe, and maybe return a literal `""` if not?
}

/// Read a quoted identifier
@@ -2176,35 +2206,82 @@ impl<'a> Tokenizer<'a> {
/// Read from `chars` until `predicate` returns `false` or EOF is hit.
/// Return the characters read as String, and keep the first non-matching
/// char available as `chars.next()`.
fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
let mut s = String::new();
fn peeking_take_while(chars: &mut State, predicate: impl FnMut(char) -> bool) -> String {
borrow_slice_until(chars, predicate).to_string()
}

/// Borrow a slice from the original string until `predicate` returns `false` or EOF is hit.
///
/// # Arguments
/// * `chars` - The character iterator state (contains reference to original source)
/// * `predicate` - Function that returns true while we should continue taking characters
///
/// # Returns
/// A borrowed slice of the source string containing the matched characters
Comment on lines +2213 to +2220

Contributor:

Doc-wise, I think it's easier to only reference the `peeking_take_while` method instead, and mention only the difference to this function; that way we describe the functionality once. Also, the project doesn't use the `# Arguments`, `# Returns` etc. documentation format, so we can skip that for consistency, I think.

fn borrow_slice_until<'a>(
Contributor:

Suggested change:

```diff
-fn borrow_slice_until<'a>(
+fn peeking_take_while_ref<'a>(
```

Thinking so that it's clear that the function is related to `peeking_take_while`.

chars: &mut State<'a>,
mut predicate: impl FnMut(char) -> bool,
) -> &'a str {
// Record the starting byte position
Contributor:

We can sanity-check the index before using it here as well?

let start_pos = chars.byte_pos;

// Consume characters while predicate is true
while let Some(&ch) = chars.peek() {
if predicate(ch) {
chars.next(); // consume
s.push(ch);
chars.next(); // consume (this updates byte_pos)
} else {
break;
}
}
s

// Get the ending byte position
let end_pos = chars.byte_pos;

// Return the slice from the original source
&chars.source[start_pos..end_pos]
}
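For illustration, the zero-copy pattern above in a standalone, runnable form (names are ad hoc, not the PR's): track a byte cursor alongside the char iterator and return a slice of the source instead of building a `String`:

```rust
use std::iter::Peekable;
use std::str::Chars;

/// Sketch of the borrow-instead-of-allocate take-while pattern.
fn take_while_ref<'a>(
    source: &'a str,
    it: &mut Peekable<Chars<'a>>,
    byte_pos: &mut usize,
    mut predicate: impl FnMut(char) -> bool,
) -> &'a str {
    let start = *byte_pos;
    while let Some(&ch) = it.peek() {
        if !predicate(ch) {
            break;
        }
        it.next(); // consume
        *byte_pos += ch.len_utf8(); // keep the cursor on a char boundary
    }
    &source[start..*byte_pos]
}

fn main() {
    let source = "héllo world";
    let mut it = source.chars().peekable();
    let mut pos = 0usize;
    let word = take_while_ref(source, &mut it, &mut pos, |c| !c.is_whitespace());
    assert_eq!(word, "héllo"); // borrowed from `source`, no allocation
}
```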

/// Same as peeking_take_while, but also passes the next character to the predicate.
fn peeking_next_take_while(
chars: &mut State,
/// Borrow a slice from the original string until `predicate` returns `false` or EOF is hit.
Contributor:

Just flagging that similar comments for `borrow_slice_until` apply to this function.

/// This version also passes the next character to the predicate for lookahead.
/// This is a zero-copy version of `peeking_next_take_while`.
///
/// # Arguments
/// * `chars` - The character iterator state (contains reference to original source)
/// * `predicate` - Function that returns true while we should continue taking characters.
/// Takes current char and optional next char for lookahead.
///
/// # Returns
/// A borrowed slice of the source string containing the matched characters
fn borrow_slice_until_next<'a>(
Contributor:

Wondering, given this function is quite similar, would it make sense to implement `borrow_slice_until` as follows instead, to reuse the same definition/impl?

```rust
fn borrow_slice_until<'a>(
    chars: &mut State<'a>,
    mut predicate: impl FnMut(char) -> bool,
) -> &'a str {
    borrow_slice_until_next(chars, |ch, _| predicate(ch))
}
```

chars: &mut State<'a>,
mut predicate: impl FnMut(char, Option<char>) -> bool,
) -> String {
let mut s = String::new();
) -> &'a str {
// Record the starting byte position
let start_pos = chars.byte_pos;

// Consume characters while predicate is true
while let Some(&ch) = chars.peek() {
let next_char = chars.peekable.clone().nth(1);
let next_char = chars.peek_next();
if predicate(ch, next_char) {
chars.next(); // consume
s.push(ch);
chars.next(); // consume (this updates byte_pos)
} else {
break;
}
}
s

// Get the ending byte position
let end_pos = chars.byte_pos;

// Return the slice from the original source
&chars.source[start_pos..end_pos]
}

/// Same as peeking_take_while, but also passes the next character to the predicate.
fn peeking_next_take_while(
chars: &mut State,
predicate: impl FnMut(char, Option<char>) -> bool,
) -> String {
borrow_slice_until_next(chars, predicate).to_string()
}
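As a usage sketch of the `(current, next)` predicate shape these helpers take (illustrative only): accept digits, and accept a `.` only when a digit follows, so a trailing dot is left in the stream:

```rust
fn main() {
    let take = |s: &str| -> String {
        let mut it = s.chars().peekable();
        let mut out = String::new();
        while let Some(&ch) = it.peek() {
            let next = it.clone().nth(1); // one-character lookahead
            let keep = match ch {
                '0'..='9' => true,
                '.' => matches!(next, Some('0'..='9')),
                _ => false,
            };
            if !keep {
                break;
            }
            it.next();
            out.push(ch);
        }
        out
    };
    assert_eq!(take("1.5x"), "1.5");
    assert_eq!(take("12.x"), "12"); // '.' not followed by a digit: stop
}
```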

fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
@@ -3496,8 +3573,10 @@ mod tests {
let s = format!("'{s}'");
let mut state = State {
peekable: s.chars().peekable(),
source: &s,
line: 0,
col: 0,
byte_pos: 0,
};

assert_eq!(