// tokenizer.rs — SQL tokenizer module of the sqlparser crate.

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! SQL Tokenizer
19//!
20//! The tokenizer (a.k.a. lexer) converts a string into a sequence of tokens.
21//!
22//! The tokens then form the input for the parser, which outputs an Abstract Syntax Tree (AST).
23
24#[cfg(not(feature = "std"))]
25use alloc::{
26    borrow::ToOwned,
27    format,
28    string::{String, ToString},
29    vec,
30    vec::Vec,
31};
32use core::num::NonZeroU8;
33use core::str::Chars;
34use core::{cmp, fmt};
35use core::{iter::Peekable, str};
36
37#[cfg(feature = "serde")]
38use serde::{Deserialize, Serialize};
39
40#[cfg(feature = "visitor")]
41use sqlparser_derive::{Visit, VisitMut};
42
43use crate::dialect::Dialect;
44use crate::dialect::{
45    BigQueryDialect, DuckDbDialect, GenericDialect, MySqlDialect, PostgreSqlDialect,
46    SnowflakeDialect,
47};
48use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
49use crate::{
50    ast::{DollarQuotedString, QuoteDelimitedString},
51    dialect::HiveDialect,
52};
53
/// SQL Token enumeration
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Token {
    /// An end-of-file marker, not a real token
    EOF,
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    Word(Word),
    /// An unsigned numeric literal. The boolean flag is true when the
    /// literal carries a trailing `L` (long) suffix.
    Number(String, bool),
    /// A character that could not be tokenized
    Char(char),
    /// Single quoted string: i.e: 'string'
    SingleQuotedString(String),
    /// Double quoted string: i.e: "string"
    DoubleQuotedString(String),
    /// Triple single quoted strings: Example '''abc'''
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedString(String),
    /// Triple double quoted strings: Example """abc"""
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedString(String),
    /// Dollar quoted string: i.e: $$string$$ or $tag_name$string$tag_name$
    DollarQuotedString(DollarQuotedString),
    /// Byte string literal: i.e: b'string' or B'string' (note that some backends, such as
    /// PostgreSQL, may treat this syntax as a bit string literal instead, i.e: b'10010101')
    SingleQuotedByteStringLiteral(String),
    /// Byte string literal: i.e: b"string" or B"string"
    DoubleQuotedByteStringLiteral(String),
    /// Triple single quoted literal with byte string prefix. Example `B'''abc'''`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedByteStringLiteral(String),
    /// Triple double quoted literal with byte string prefix. Example `B"""abc"""`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedByteStringLiteral(String),
    /// Single quoted literal with raw string prefix. Example `R'abc'`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    SingleQuotedRawStringLiteral(String),
    /// Double quoted literal with raw string prefix. Example `R"abc"`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    DoubleQuotedRawStringLiteral(String),
    /// Triple single quoted literal with raw string prefix. Example `R'''abc'''`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedRawStringLiteral(String),
    /// Triple double quoted literal with raw string prefix. Example `R"""abc"""`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedRawStringLiteral(String),
    /// "National" string literal: i.e: N'string'
    NationalStringLiteral(String),
    /// Quote delimited literal. Examples `Q'{ab'c}'`, `Q'|ab'c|'`, `Q'|ab|c|'`
    /// [Oracle](https://docs.oracle.com/en/database/oracle/oracle-database/21/sqlrf/Literals.html#GUID-1824CBAA-6E16-4921-B2A6-112FB02248DA)
    QuoteDelimitedStringLiteral(QuoteDelimitedString),
    /// "National" quote delimited literal. Examples `NQ'{ab'c}'`, `NQ'|ab'c|'`, `NQ'|ab|c|'`
    /// [Oracle](https://docs.oracle.com/en/database/oracle/oracle-database/21/sqlrf/Literals.html#GUID-1824CBAA-6E16-4921-B2A6-112FB02248DA)
    NationalQuoteDelimitedStringLiteral(QuoteDelimitedString),
    /// "escaped" string literal, which are an extension to the SQL standard: i.e: e'first \n second' or E 'first \n second'
    EscapedStringLiteral(String),
    /// Unicode string literal: i.e: U&'first \000A second'
    UnicodeStringLiteral(String),
    /// Hexadecimal string literal: i.e.: X'deadbeef'
    HexStringLiteral(String),
    /// Comma
    Comma,
    /// Whitespace (space, tab, etc)
    Whitespace(Whitespace),
    /// Double equals sign `==`
    DoubleEq,
    /// Equality operator `=`
    Eq,
    /// Not Equals operator `<>` (or `!=` in some dialects)
    Neq,
    /// Less Than operator `<`
    Lt,
    /// Greater Than operator `>`
    Gt,
    /// Less Than Or Equals operator `<=`
    LtEq,
    /// Greater Than Or Equals operator `>=`
    GtEq,
    /// Spaceship operator <=>
    Spaceship,
    /// Plus operator `+`
    Plus,
    /// Minus operator `-`
    Minus,
    /// Multiplication operator `*`
    Mul,
    /// Division operator `/`
    Div,
    /// Integer division operator `//` in DuckDB
    DuckIntDiv,
    /// Modulo Operator `%`
    Mod,
    /// String concatenation `||`
    StringConcat,
    /// Left parenthesis `(`
    LParen,
    /// Right parenthesis `)`
    RParen,
    /// Period (used for compound identifiers or projections into nested types)
    Period,
    /// Colon `:`
    Colon,
    /// DoubleColon `::` (used for casting in PostgreSQL)
    DoubleColon,
    /// Assignment `:=` (used for keyword argument in DuckDB macros and some functions, and for variable declarations in DuckDB and Snowflake)
    Assignment,
    /// SemiColon `;` used as separator for COPY and payload
    SemiColon,
    /// Backslash `\` used in terminating the COPY payload with `\.`
    Backslash,
    /// Left bracket `[`
    LBracket,
    /// Right bracket `]`
    RBracket,
    /// Ampersand `&`
    Ampersand,
    /// Pipe `|`
    Pipe,
    /// Caret `^`
    Caret,
    /// Left brace `{`
    LBrace,
    /// Right brace `}`
    RBrace,
    /// Right Arrow `=>`
    RArrow,
    /// Sharp `#` used for PostgreSQL Bitwise XOR operator, also PostgreSQL/Redshift geometrical unary/binary operator (Number of points in path or polygon/Intersection)
    Sharp,
    /// `##` PostgreSQL/Redshift geometrical binary operator (Point of closest proximity)
    DoubleSharp,
    /// Tilde `~` used for PostgreSQL Bitwise NOT operator or case sensitive match regular expression operator
    Tilde,
    /// `~*` , a case insensitive match regular expression operator in PostgreSQL
    TildeAsterisk,
    /// `!~` , a case sensitive not match regular expression operator in PostgreSQL
    ExclamationMarkTilde,
    /// `!~*` , a case insensitive not match regular expression operator in PostgreSQL
    ExclamationMarkTildeAsterisk,
    /// `~~`, a case sensitive match pattern operator in PostgreSQL
    DoubleTilde,
    /// `~~*`, a case insensitive match pattern operator in PostgreSQL
    DoubleTildeAsterisk,
    /// `!~~`, a case sensitive not match pattern operator in PostgreSQL
    ExclamationMarkDoubleTilde,
    /// `!~~*`, a case insensitive not match pattern operator in PostgreSQL
    ExclamationMarkDoubleTildeAsterisk,
    /// `<<`, a bitwise shift left operator in PostgreSQL
    ShiftLeft,
    /// `>>`, a bitwise shift right operator in PostgreSQL
    ShiftRight,
    /// `&&`, an overlap operator in PostgreSQL
    Overlap,
    /// Exclamation Mark `!` used for PostgreSQL factorial operator
    ExclamationMark,
    /// Double Exclamation Mark `!!` used for PostgreSQL prefix factorial operator
    DoubleExclamationMark,
    /// AtSign `@` used for PostgreSQL abs operator, also PostgreSQL/Redshift geometrical unary/binary operator (Center, Contained or on)
    AtSign,
    /// `^@`, a "starts with" string operator in PostgreSQL
    CaretAt,
    /// `|/`, a square root math operator in PostgreSQL
    PGSquareRoot,
    /// `||/`, a cube root math operator in PostgreSQL
    PGCubeRoot,
    /// `?` or `$` , a prepared statement arg placeholder
    Placeholder(String),
    /// `->`, used as a operator to extract json field in PostgreSQL
    Arrow,
    /// `->>`, used as a operator to extract json field as text in PostgreSQL
    LongArrow,
    /// `#>`, extracts JSON sub-object at the specified path
    HashArrow,
    /// `@-@` PostgreSQL/Redshift geometrical unary operator (Length or circumference)
    AtDashAt,
    /// `?-` PostgreSQL/Redshift geometrical unary/binary operator (Is horizontal?/Are horizontally aligned?)
    QuestionMarkDash,
    /// `&<` PostgreSQL/Redshift geometrical binary operator (Overlaps to left?)
    AmpersandLeftAngleBracket,
    /// `&>` PostgreSQL/Redshift geometrical binary operator (Overlaps to right?)
    AmpersandRightAngleBracket,
    /// `&<|` PostgreSQL/Redshift geometrical binary operator (Does not extend above?)
    AmpersandLeftAngleBracketVerticalBar,
    /// `|&>` PostgreSQL/Redshift geometrical binary operator (Does not extend below?)
    VerticalBarAmpersandRightAngleBracket,
    /// `<->` PostgreSQL/Redshift geometrical binary operator (Distance between)
    TwoWayArrow,
    /// `<^` PostgreSQL/Redshift geometrical binary operator (Is below?)
    LeftAngleBracketCaret,
    /// `>^` PostgreSQL/Redshift geometrical binary operator (Is above?)
    RightAngleBracketCaret,
    /// `?#` PostgreSQL/Redshift geometrical binary operator (Intersects or overlaps)
    QuestionMarkSharp,
    /// `?-|` PostgreSQL/Redshift geometrical binary operator (Is perpendicular?)
    QuestionMarkDashVerticalBar,
    /// `?||` PostgreSQL/Redshift geometrical binary operator (Are parallel?)
    QuestionMarkDoubleVerticalBar,
    /// `~=` PostgreSQL/Redshift geometrical binary operator (Same as)
    TildeEqual,
    /// `<<|` PostgreSQL/Redshift geometrical binary operator (Is strictly below?)
    ShiftLeftVerticalBar,
    /// `|>>` PostgreSQL/Redshift geometrical binary operator (Is strictly above?)
    VerticalBarShiftRight,
    /// `|>` BigQuery pipe operator
    VerticalBarRightAngleBracket,
    /// `#>>`, extracts JSON sub-object at the specified path as text
    HashLongArrow,
    /// jsonb @> jsonb -> boolean: Test whether left json contains the right json
    AtArrow,
    /// jsonb <@ jsonb -> boolean: Test whether right json contains the left json
    ArrowAt,
    /// jsonb #- text[] -> jsonb: Deletes the field or array element at the specified
    /// path, where path elements can be either field keys or array indexes.
    HashMinus,
    /// jsonb @? jsonpath -> boolean: Does JSON path return any item for the specified
    /// JSON value?
    AtQuestion,
    /// jsonb @@ jsonpath → boolean: Returns the result of a JSON path predicate check
    /// for the specified JSON value. Only the first item of the result is taken into
    /// account. If the result is not Boolean, then NULL is returned.
    AtAt,
    /// jsonb ? text -> boolean: Checks whether the string exists as a top-level key within the
    /// jsonb object
    Question,
    /// jsonb ?& text[] -> boolean: Check whether all members of the text array exist as top-level
    /// keys within the jsonb object
    QuestionAnd,
    /// jsonb ?| text[] -> boolean: Check whether any member of the text array exists as top-level
    /// keys within the jsonb object
    QuestionPipe,
    /// Custom binary operator
    /// This is used to represent any custom binary operator that is not part of the SQL standard.
    /// PostgreSQL allows defining custom binary operators using CREATE OPERATOR.
    CustomBinaryOperator(String),
}
290
impl fmt::Display for Token {
    /// Render the token as its SQL text (quotes and prefixes included for
    /// string literals), so a token stream can be written back out as SQL.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Token::EOF => f.write_str("EOF"),
            Token::Word(ref w) => write!(f, "{w}"),
            // A trailing "L" marks a long-suffixed numeric literal.
            Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
            Token::Char(ref c) => write!(f, "{c}"),
            Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
            Token::TripleSingleQuotedString(ref s) => write!(f, "'''{s}'''"),
            Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
            Token::TripleDoubleQuotedString(ref s) => write!(f, "\"\"\"{s}\"\"\""),
            Token::DollarQuotedString(ref s) => write!(f, "{s}"),
            Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
            Token::QuoteDelimitedStringLiteral(ref s) => s.fmt(f),
            Token::NationalQuoteDelimitedStringLiteral(ref s) => write!(f, "N{s}"),
            Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
            Token::UnicodeStringLiteral(ref s) => write!(f, "U&'{s}'"),
            Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
            Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
            Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"),
            Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
            Token::TripleDoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"\"\"{s}\"\"\""),
            Token::SingleQuotedRawStringLiteral(ref s) => write!(f, "R'{s}'"),
            Token::DoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"{s}\""),
            Token::TripleSingleQuotedRawStringLiteral(ref s) => write!(f, "R'''{s}'''"),
            Token::TripleDoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"\"\"{s}\"\"\""),
            Token::Comma => f.write_str(","),
            Token::Whitespace(ws) => write!(f, "{ws}"),
            Token::DoubleEq => f.write_str("=="),
            Token::Spaceship => f.write_str("<=>"),
            Token::Eq => f.write_str("="),
            Token::Neq => f.write_str("<>"),
            Token::Lt => f.write_str("<"),
            Token::Gt => f.write_str(">"),
            Token::LtEq => f.write_str("<="),
            Token::GtEq => f.write_str(">="),
            Token::Plus => f.write_str("+"),
            Token::Minus => f.write_str("-"),
            Token::Mul => f.write_str("*"),
            Token::Div => f.write_str("/"),
            Token::DuckIntDiv => f.write_str("//"),
            Token::StringConcat => f.write_str("||"),
            Token::Mod => f.write_str("%"),
            Token::LParen => f.write_str("("),
            Token::RParen => f.write_str(")"),
            Token::Period => f.write_str("."),
            Token::Colon => f.write_str(":"),
            Token::DoubleColon => f.write_str("::"),
            Token::Assignment => f.write_str(":="),
            Token::SemiColon => f.write_str(";"),
            Token::Backslash => f.write_str("\\"),
            Token::LBracket => f.write_str("["),
            Token::RBracket => f.write_str("]"),
            Token::Ampersand => f.write_str("&"),
            Token::Caret => f.write_str("^"),
            Token::Pipe => f.write_str("|"),
            Token::LBrace => f.write_str("{"),
            Token::RBrace => f.write_str("}"),
            Token::RArrow => f.write_str("=>"),
            Token::Sharp => f.write_str("#"),
            Token::DoubleSharp => f.write_str("##"),
            Token::ExclamationMark => f.write_str("!"),
            Token::DoubleExclamationMark => f.write_str("!!"),
            Token::Tilde => f.write_str("~"),
            Token::TildeAsterisk => f.write_str("~*"),
            Token::ExclamationMarkTilde => f.write_str("!~"),
            Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
            Token::DoubleTilde => f.write_str("~~"),
            Token::DoubleTildeAsterisk => f.write_str("~~*"),
            Token::ExclamationMarkDoubleTilde => f.write_str("!~~"),
            Token::ExclamationMarkDoubleTildeAsterisk => f.write_str("!~~*"),
            Token::AtSign => f.write_str("@"),
            Token::CaretAt => f.write_str("^@"),
            Token::ShiftLeft => f.write_str("<<"),
            Token::ShiftRight => f.write_str(">>"),
            Token::Overlap => f.write_str("&&"),
            Token::PGSquareRoot => f.write_str("|/"),
            Token::PGCubeRoot => f.write_str("||/"),
            Token::AtDashAt => f.write_str("@-@"),
            Token::QuestionMarkDash => f.write_str("?-"),
            Token::AmpersandLeftAngleBracket => f.write_str("&<"),
            Token::AmpersandRightAngleBracket => f.write_str("&>"),
            Token::AmpersandLeftAngleBracketVerticalBar => f.write_str("&<|"),
            Token::VerticalBarAmpersandRightAngleBracket => f.write_str("|&>"),
            Token::VerticalBarRightAngleBracket => f.write_str("|>"),
            Token::TwoWayArrow => f.write_str("<->"),
            Token::LeftAngleBracketCaret => f.write_str("<^"),
            Token::RightAngleBracketCaret => f.write_str(">^"),
            Token::QuestionMarkSharp => f.write_str("?#"),
            Token::QuestionMarkDashVerticalBar => f.write_str("?-|"),
            Token::QuestionMarkDoubleVerticalBar => f.write_str("?||"),
            Token::TildeEqual => f.write_str("~="),
            Token::ShiftLeftVerticalBar => f.write_str("<<|"),
            Token::VerticalBarShiftRight => f.write_str("|>>"),
            Token::Placeholder(ref s) => write!(f, "{s}"),
            Token::Arrow => write!(f, "->"),
            Token::LongArrow => write!(f, "->>"),
            Token::HashArrow => write!(f, "#>"),
            Token::HashLongArrow => write!(f, "#>>"),
            Token::AtArrow => write!(f, "@>"),
            Token::ArrowAt => write!(f, "<@"),
            Token::HashMinus => write!(f, "#-"),
            Token::AtQuestion => write!(f, "@?"),
            Token::AtAt => write!(f, "@@"),
            Token::Question => write!(f, "?"),
            Token::QuestionAnd => write!(f, "?&"),
            Token::QuestionPipe => write!(f, "?|"),
            Token::CustomBinaryOperator(s) => f.write_str(s),
        }
    }
}
402
403impl Token {
404    /// Create a `Token::Word` from an unquoted `keyword`.
405    ///
406    /// The lookup is case-insensitive; unknown values become `Keyword::NoKeyword`.
407    pub fn make_keyword(keyword: &str) -> Self {
408        Token::make_word(keyword, None)
409    }
410
411    /// Create a `Token::Word` from `word` with an optional `quote_style`.
412    ///
413    /// When `quote_style` is `None`, the parser attempts a case-insensitive keyword
414    /// lookup and sets the `Word::keyword` accordingly.
415    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
416        // Only perform keyword lookup for unquoted identifiers.
417        // Use to_ascii_uppercase() since SQL keywords are ASCII,
418        // avoiding Unicode case conversion overhead.
419        let keyword = if quote_style.is_none() {
420            let word_uppercase = word.to_ascii_uppercase();
421            ALL_KEYWORDS
422                .binary_search(&word_uppercase.as_str())
423                .map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
424        } else {
425            Keyword::NoKeyword
426        };
427
428        Token::Word(Word {
429            value: word.to_string(),
430            quote_style,
431            keyword,
432        })
433    }
434}
435
/// A keyword (like SELECT) or an optionally quoted SQL identifier
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Word {
    /// The value of the token, without the enclosing quotes, and with the
    /// escape sequences (if any) processed (TODO: escapes are not handled)
    pub value: String,
    /// An identifier can be "quoted" (&lt;delimited identifier> in ANSI parlance).
    /// The standard and most implementations allow using double quotes for this,
    /// but some implementations support other quoting styles as well (e.g. \[MS SQL])
    pub quote_style: Option<char>,
    /// If the word was not quoted and it matched one of the known keywords,
    /// this will have one of the values from dialect::keywords, otherwise
    /// `Keyword::NoKeyword`
    pub keyword: Keyword,
}
452
453impl fmt::Display for Word {
454    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
455        match self.quote_style {
456            Some(s) if s == '"' || s == '[' || s == '`' => {
457                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
458            }
459            None => f.write_str(&self.value),
460            _ => panic!("Unexpected quote_style!"),
461        }
462    }
463}
464
465impl Word {
466    fn matching_end_quote(ch: char) -> char {
467        match ch {
468            '"' => '"', // ANSI and most dialects
469            '[' => ']', // MS SQL
470            '`' => '`', // MySQL
471            _ => panic!("unexpected quoting style!"),
472        }
473    }
474}
475
/// Represents whitespace in the input: spaces, newlines, tabs and comments.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Whitespace {
    /// A single space character.
    Space,
    /// A newline character.
    Newline,
    /// A tab character.
    Tab,
    /// A single-line comment (e.g. `-- comment` or `# comment`).
    /// The `comment` field contains the text, and `prefix` contains the comment prefix.
    SingleLineComment {
        /// The content of the comment (without the prefix).
        comment: String,
        /// The prefix used for the comment (for example `--` or `#`).
        prefix: String,
    },

    /// A multi-line comment body (without the `/* ... */` delimiters).
    MultiLineComment(String),
}
499
500impl fmt::Display for Whitespace {
501    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
502        match self {
503            Whitespace::Space => f.write_str(" "),
504            Whitespace::Newline => f.write_str("\n"),
505            Whitespace::Tab => f.write_str("\t"),
506            Whitespace::SingleLineComment { prefix, comment } => write!(f, "{prefix}{comment}"),
507            Whitespace::MultiLineComment(s) => write!(f, "/*{s}*/"),
508        }
509    }
510}
511
/// Location in input string
///
/// # Create an "empty" (unknown) `Location`
/// ```
/// # use sqlparser::tokenizer::Location;
/// let location = Location::empty();
/// ```
///
/// # Create a `Location` from a line and column
/// ```
/// # use sqlparser::tokenizer::Location;
/// let location = Location::new(1, 1);
/// ```
///
/// # Create a `Location` from a pair
/// ```
/// # use sqlparser::tokenizer::Location;
/// let location = Location::from((1, 1));
/// ```
#[derive(Eq, PartialEq, Hash, Clone, Copy, Ord, PartialOrd)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Location {
    /// Line number, starting from 1.
    ///
    /// Note: Line 0 is used for empty spans
    pub line: u64,
    /// Line column, starting from 1.
    ///
    /// Note: Column 0 is used for empty spans
    pub column: u64,
}
544
545impl fmt::Display for Location {
546    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
547        if self.line == 0 {
548            return Ok(());
549        }
550        write!(f, " at Line: {}, Column: {}", self.line, self.column)
551    }
552}
553
554impl fmt::Debug for Location {
555    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
556        write!(f, "Location({},{})", self.line, self.column)
557    }
558}
559
560impl Location {
561    /// Return an "empty" / unknown location
562    pub fn empty() -> Self {
563        Self { line: 0, column: 0 }
564    }
565
566    /// Create a new `Location` for a given line and column
567    pub fn new(line: u64, column: u64) -> Self {
568        Self { line, column }
569    }
570
571    /// Create a new location for a given line and column
572    ///
573    /// Alias for [`Self::new`]
574    // TODO: remove / deprecate in favor of` `new` for consistency?
575    pub fn of(line: u64, column: u64) -> Self {
576        Self::new(line, column)
577    }
578
579    /// Combine self and `end` into a new `Span`
580    pub fn span_to(self, end: Self) -> Span {
581        Span { start: self, end }
582    }
583}
584
585impl From<(u64, u64)> for Location {
586    fn from((line, column): (u64, u64)) -> Self {
587        Self { line, column }
588    }
589}
590
/// A span represents a linear portion of the input string (start, end)
///
/// See [Spanned](crate::ast::Spanned) for more information.
#[derive(Eq, PartialEq, Hash, Clone, PartialOrd, Ord, Copy)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Span {
    /// Start `Location` (inclusive).
    pub start: Location,
    /// End `Location` (inclusive).
    pub end: Location,
}
603
604impl fmt::Debug for Span {
605    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
606        write!(f, "Span({:?}..{:?})", self.start, self.end)
607    }
608}
609
impl Span {
    // An empty span (0, 0) -> (0, 0)
    // We need a const instance for pattern matching
    const EMPTY: Span = Self::empty();

    /// Create a new span from a start and end [`Location`]
    pub fn new(start: Location, end: Location) -> Span {
        Span { start, end }
    }

    /// Returns an empty span `(0, 0) -> (0, 0)`
    ///
    /// Empty spans represent no knowledge of source location
    /// See [Spanned](crate::ast::Spanned) for more information.
    pub const fn empty() -> Span {
        Span {
            start: Location { line: 0, column: 0 },
            end: Location { line: 0, column: 0 },
        }
    }

    /// Returns the smallest Span that contains both `self` and `other`
    /// If either span is [Span::empty], the other span is returned
    ///
    /// # Examples
    /// ```
    /// # use sqlparser::tokenizer::{Span, Location};
    /// // line 1, column1 -> line 2, column 5
    /// let span1 = Span::new(Location::new(1, 1), Location::new(2, 5));
    /// // line 2, column 3 -> line 3, column 7
    /// let span2 = Span::new(Location::new(2, 3), Location::new(3, 7));
    /// // Union of the two is the min/max of the two spans
    /// // line 1, column 1 -> line 3, column 7
    /// let union = span1.union(&span2);
    /// assert_eq!(union, Span::new(Location::new(1, 1), Location::new(3, 7)));
    /// ```
    pub fn union(&self, other: &Span) -> Span {
        // If either span is empty, return the other
        // this prevents propagating (0, 0) through the tree
        match (self, other) {
            (&Span::EMPTY, _) => *other,
            (_, &Span::EMPTY) => *self,
            _ => Span {
                start: cmp::min(self.start, other.start),
                end: cmp::max(self.end, other.end),
            },
        }
    }

    /// Same as [Span::union] for `Option<Span>`
    ///
    /// If `other` is `None`, `self` is returned
    pub fn union_opt(&self, other: &Option<Span>) -> Span {
        match other {
            Some(other) => self.union(other),
            None => *self,
        }
    }

    /// Return the [Span::union] of all spans in the iterator
    ///
    /// If the iterator is empty, an empty span is returned
    ///
    /// # Example
    /// ```
    /// # use sqlparser::tokenizer::{Span, Location};
    /// let spans = vec![
    ///     Span::new(Location::new(1, 1), Location::new(2, 5)),
    ///     Span::new(Location::new(2, 3), Location::new(3, 7)),
    ///     Span::new(Location::new(3, 1), Location::new(4, 2)),
    /// ];
    /// // line 1, column 1 -> line 4, column 2
    /// assert_eq!(
    ///   Span::union_iter(spans),
    ///   Span::new(Location::new(1, 1), Location::new(4, 2))
    /// );
    /// ```
    pub fn union_iter<I: IntoIterator<Item = Span>>(iter: I) -> Span {
        iter.into_iter()
            .reduce(|acc, item| acc.union(&item))
            .unwrap_or(Span::empty())
    }
}
692
/// Backwards compatibility alias for [`TokenWithSpan`]
#[deprecated(since = "0.53.0", note = "please use `TokenWithSpan` instead")]
pub type TokenWithLocation = TokenWithSpan;
696
/// A [Token] with [Span] attached to it
///
/// This is used to track the location of a token in the input string
///
/// # Examples
/// ```
/// # use sqlparser::tokenizer::{Location, Span, Token, TokenWithSpan};
/// // commas @ line 1, column 10
/// let tok1 = TokenWithSpan::new(
///   Token::Comma,
///   Span::new(Location::new(1, 10), Location::new(1, 11)),
/// );
/// assert_eq!(tok1, Token::Comma); // can compare the token
///
/// // commas @ line 2, column 20
/// let tok2 = TokenWithSpan::new(
///   Token::Comma,
///   Span::new(Location::new(2, 20), Location::new(2, 21)),
/// );
/// // same token but different locations are not equal
/// assert_ne!(tok1, tok2);
/// ```
#[derive(Debug, Clone, Hash, Ord, PartialOrd, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct TokenWithSpan {
    /// The token value.
    pub token: Token,
    /// The span covering the token in the input.
    pub span: Span,
}
729
730impl TokenWithSpan {
731    /// Create a new [`TokenWithSpan`] from a [`Token`] and a [`Span`]
732    pub fn new(token: Token, span: Span) -> Self {
733        Self { token, span }
734    }
735
736    /// Wrap a token with an empty span
737    pub fn wrap(token: Token) -> Self {
738        Self::new(token, Span::empty())
739    }
740
741    /// Wrap a token with a location from `start` to `end`
742    pub fn at(token: Token, start: Location, end: Location) -> Self {
743        Self::new(token, Span::new(start, end))
744    }
745
746    /// Return an EOF token with no location
747    pub fn new_eof() -> Self {
748        Self::wrap(Token::EOF)
749    }
750}
751
752impl PartialEq<Token> for TokenWithSpan {
753    fn eq(&self, other: &Token) -> bool {
754        &self.token == other
755    }
756}
757
758impl PartialEq<TokenWithSpan> for Token {
759    fn eq(&self, other: &TokenWithSpan) -> bool {
760        self == &other.token
761    }
762}
763
764impl fmt::Display for TokenWithSpan {
765    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
766        self.token.fmt(f)
767    }
768}
769
/// An error reported by the tokenizer, with a human-readable `message` and a `location`.
///
/// Returned by the [`Tokenizer`] tokenization methods when the input cannot
/// be tokenized.
#[derive(Debug, PartialEq, Eq)]
pub struct TokenizerError {
    /// A descriptive error message.
    pub message: String,
    /// The `Location` where the error was detected.
    pub location: Location,
}
778
779impl fmt::Display for TokenizerError {
780    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
781        write!(f, "{}{}", self.message, self.location,)
782    }
783}
784
785impl core::error::Error for TokenizerError {}
786
/// Character stream over the query text, tracking the current position.
struct State<'a> {
    /// Peekable iterator over the remaining input characters.
    peekable: Peekable<Chars<'a>>,
    /// Current line number (1-based; see `tokenize_with_location_into_buf_with_mapper`).
    line: u64,
    /// Current column number (1-based).
    col: u64,
}
792
793impl State<'_> {
794    /// return the next character and advance the stream
795    pub fn next(&mut self) -> Option<char> {
796        match self.peekable.next() {
797            None => None,
798            Some(s) => {
799                if s == '\n' {
800                    self.line += 1;
801                    self.col = 1;
802                } else {
803                    self.col += 1;
804                }
805                Some(s)
806            }
807        }
808    }
809
810    /// return the next character but do not advance the stream
811    pub fn peek(&mut self) -> Option<&char> {
812        self.peekable.peek()
813    }
814
815    /// Return the current `Location` (line and column)
816    pub fn location(&self) -> Location {
817        Location {
818            line: self.line,
819            column: self.col,
820        }
821    }
822}
823
/// Represents how many quote characters enclose a string literal.
#[derive(Copy, Clone)]
enum NumStringQuoteChars {
    /// A single quote character on each side, e.g. `"abc"`, `'abc'`, `r'abc'`
    One,
    /// Multiple quote characters on each side, e.g. `"""abc"""`, `'''abc'''`, `r'''abc'''`
    Many(NonZeroU8),
}
832
/// Settings for tokenizing a quoted string literal.
struct TokenizeQuotedStringSettings {
    /// The character used to quote the string.
    quote_style: char,
    /// Represents how many quote characters enclose the string literal.
    num_quote_chars: NumStringQuoteChars,
    /// The number of opening quotes left to consume, before parsing
    /// the remaining string literal.
    /// For example: given initial string `"""abc"""`. If the caller has
    /// already parsed the first quote for some reason, then this value
    /// is set to 1, flagging to look to consume only 2 leading quotes.
    num_opening_quotes_to_consume: u8,
    /// True if the string uses backslash escaping of special characters,
    /// e.g. `'abc\ndef\'ghi'`
    backslash_escape: bool,
}
849
/// SQL Tokenizer
pub struct Tokenizer<'a> {
    /// The dialect driving dialect-specific tokenization decisions.
    dialect: &'a dyn Dialect,
    /// The SQL text being tokenized.
    query: &'a str,
    /// If true (the default), the tokenizer will un-escape literal
    /// SQL strings. See [`Tokenizer::with_unescape`] for more details.
    unescape: bool,
}
858
859impl<'a> Tokenizer<'a> {
    /// Create a new SQL tokenizer for the specified SQL statement
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Token, Whitespace, Tokenizer};
    /// # use sqlparser::dialect::GenericDialect;
    /// # let dialect = GenericDialect{};
    /// let query = r#"SELECT 'foo'"#;
    ///
    /// // Parsing the query
    /// let tokens = Tokenizer::new(&dialect, &query).tokenize().unwrap();
    ///
    /// assert_eq!(tokens, vec![
    ///   Token::make_word("SELECT", None),
    ///   Token::Whitespace(Whitespace::Space),
    ///   Token::SingleQuotedString("foo".to_string()),
    /// ]);
    /// ```
    pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
        // Unescaping is enabled by default; see `with_unescape`.
        Self {
            dialect,
            query,
            unescape: true,
        }
    }
883
884    /// Set unescape mode
885    ///
886    /// When true (default) the tokenizer unescapes literal values
887    /// (for example, `""` in SQL is unescaped to the literal `"`).
888    ///
889    /// When false, the tokenizer provides the raw strings as provided
890    /// in the query.  This can be helpful for programs that wish to
891    /// recover the *exact* original query text without normalizing
892    /// the escaping
893    ///
894    /// # Example
895    ///
896    /// ```
897    /// # use sqlparser::tokenizer::{Token, Tokenizer};
898    /// # use sqlparser::dialect::GenericDialect;
899    /// # let dialect = GenericDialect{};
900    /// let query = r#""Foo "" Bar""#;
901    /// let unescaped = Token::make_word(r#"Foo " Bar"#, Some('"'));
902    /// let original  = Token::make_word(r#"Foo "" Bar"#, Some('"'));
903    ///
904    /// // Parsing with unescaping (default)
905    /// let tokens = Tokenizer::new(&dialect, &query).tokenize().unwrap();
906    /// assert_eq!(tokens, vec![unescaped]);
907    ///
908    /// // Parsing with unescape = false
909    /// let tokens = Tokenizer::new(&dialect, &query)
910    ///    .with_unescape(false)
911    ///    .tokenize().unwrap();
912    /// assert_eq!(tokens, vec![original]);
913    /// ```
914    pub fn with_unescape(mut self, unescape: bool) -> Self {
915        self.unescape = unescape;
916        self
917    }
918
919    /// Tokenize the statement and produce a vector of tokens
920    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
921        let twl = self.tokenize_with_location()?;
922        Ok(twl.into_iter().map(|t| t.token).collect())
923    }
924
925    /// Tokenize the statement and produce a vector of tokens with location information
926    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithSpan>, TokenizerError> {
927        let mut tokens: Vec<TokenWithSpan> = vec![];
928        self.tokenize_with_location_into_buf(&mut tokens)
929            .map(|_| tokens)
930    }
931
932    /// Tokenize the statement and append tokens with location information into the provided buffer.
933    /// If an error is thrown, the buffer will contain all tokens that were successfully parsed before the error.
934    pub fn tokenize_with_location_into_buf(
935        &mut self,
936        buf: &mut Vec<TokenWithSpan>,
937    ) -> Result<(), TokenizerError> {
938        self.tokenize_with_location_into_buf_with_mapper(buf, |token| token)
939    }
940
    /// Tokenize the statement, passing each produced token through `mapper`
    /// before appending it (with location information) into `buf`.
    ///
    /// If an error is thrown, `buf` contains all tokens that were
    /// successfully produced before the error.
    pub fn tokenize_with_location_into_buf_with_mapper(
        &mut self,
        buf: &mut Vec<TokenWithSpan>,
        mut mapper: impl FnMut(TokenWithSpan) -> TokenWithSpan,
    ) -> Result<(), TokenizerError> {
        // Positions are 1-based: tokenizing starts at line 1, column 1.
        let mut state = State {
            peekable: self.query.chars().peekable(),
            line: 1,
            col: 1,
        };

        // `location` marks where the current token starts; after the token is
        // consumed, `state.location()` marks where it ends.
        let mut location = state.location();
        while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
            let span = location.span_to(state.location());

            // Check if this is a multiline comment hint that should be expanded
            match &token {
                Token::Whitespace(Whitespace::MultiLineComment(comment))
                    if self.dialect.supports_multiline_comment_hints()
                        && comment.starts_with('!') =>
                {
                    // Re-tokenize the hints and add them to the buffer
                    // (the comment itself is not pushed in this case).
                    self.tokenize_comment_hints(comment, span, buf, &mut mapper)?;
                }
                _ => {
                    buf.push(mapper(TokenWithSpan { token, span }));
                }
            }

            location = state.location();
        }
        Ok(())
    }
976
    /// Re-tokenize optimizer hints from a multiline comment and add them to the buffer.
    /// For example, `/*!50110 KEY_BLOCK_SIZE = 1024*/` becomes tokens for `KEY_BLOCK_SIZE = 1024`
    ///
    /// `comment` is the comment body beginning with `!` (as checked by the
    /// caller) and `span` is the span of the whole comment token.
    fn tokenize_comment_hints(
        &self,
        comment: &str,
        span: Span,
        buf: &mut Vec<TokenWithSpan>,
        mut mapper: impl FnMut(TokenWithSpan) -> TokenWithSpan,
    ) -> Result<(), TokenizerError> {
        // Strip the leading '!' and any version digits (e.g., "50110")
        let hint_content = comment
            .strip_prefix('!')
            .unwrap_or(comment)
            .trim_start_matches(|c: char| c.is_ascii_digit());

        // If there's no content after stripping, nothing to tokenize
        if hint_content.is_empty() {
            return Ok(());
        }

        // Create a new tokenizer for the hint content, inheriting this
        // tokenizer's unescape setting so literals are handled consistently.
        let inner = Tokenizer::new(self.dialect, hint_content).with_unescape(self.unescape);

        // Create a state for tracking position within the hint.
        // NOTE(review): the position starts at the comment's own start, so the
        // characters stripped above ('!' plus version digits) are not reflected
        // in the hint tokens' columns — confirm these spans are intentionally
        // approximate.
        let mut state = State {
            peekable: hint_content.chars().peekable(),
            line: span.start.line,
            col: span.start.column,
        };

        // Tokenize the hint content and add tokens to the buffer
        let mut location = state.location();
        while let Some(token) = inner.next_token(&mut state, buf.last().map(|t| &t.token))? {
            let token_span = location.span_to(state.location());
            buf.push(mapper(TokenWithSpan {
                token,
                span: token_span,
            }));
            location = state.location();
        }

        Ok(())
    }
1020
    // Tokenize the identifier or keyword whose leading character(s) are given
    // in `ch`; the first pending character is consumed from `chars` here
    // (the caller has only peeked it).
    fn tokenize_identifier_or_keyword(
        &self,
        ch: impl IntoIterator<Item = char>,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next(); // consume the first char
        let ch: String = ch.into_iter().collect();
        let word = self.tokenize_word(ch, chars);

        // If the consumed "word" is made up solely of digits and dots,
        // re-interpret it as a number instead, continuing to take any further
        // digits/dots from the stream.
        // TODO: implement parsing of exponent here
        if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
            // Throwaway state over `word` just to reuse `peeking_take_while`;
            // its line/col are never read.
            let mut inner_state = State {
                peekable: word.chars().peekable(),
                line: 0,
                col: 0,
            };
            let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
            let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
            s += s2.as_str();
            return Ok(Some(Token::Number(s, false)));
        }

        Ok(Some(Token::make_word(&word, None)))
    }
1046
1047    /// Get the next token or return None
1048    fn next_token(
1049        &self,
1050        chars: &mut State,
1051        prev_token: Option<&Token>,
1052    ) -> Result<Option<Token>, TokenizerError> {
1053        match chars.peek() {
1054            Some(&ch) => match ch {
1055                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
1056                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
1057                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
1058                '\r' => {
1059                    // Emit a single Whitespace::Newline token for \r and \r\n
1060                    chars.next();
1061                    if let Some('\n') = chars.peek() {
1062                        chars.next();
1063                    }
1064                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
1065                }
1066                // BigQuery and MySQL use b or B for byte string literal, Postgres for bit strings
1067                b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | PostgreSqlDialect | MySqlDialect | GenericDialect) =>
1068                {
1069                    chars.next(); // consume
1070                    match chars.peek() {
1071                        Some('\'') => {
1072                            if self.dialect.supports_triple_quoted_string() {
1073                                return self
1074                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1075                                        chars,
1076                                        '\'',
1077                                        false,
1078                                        Token::SingleQuotedByteStringLiteral,
1079                                        Token::TripleSingleQuotedByteStringLiteral,
1080                                    );
1081                            }
1082                            let s = self.tokenize_single_quoted_string(chars, '\'', false)?;
1083                            Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
1084                        }
1085                        Some('\"') => {
1086                            if self.dialect.supports_triple_quoted_string() {
1087                                return self
1088                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1089                                        chars,
1090                                        '"',
1091                                        false,
1092                                        Token::DoubleQuotedByteStringLiteral,
1093                                        Token::TripleDoubleQuotedByteStringLiteral,
1094                                    );
1095                            }
1096                            let s = self.tokenize_single_quoted_string(chars, '\"', false)?;
1097                            Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
1098                        }
1099                        _ => {
1100                            // regular identifier starting with an "b" or "B"
1101                            let s = self.tokenize_word(b, chars);
1102                            Ok(Some(Token::make_word(&s, None)))
1103                        }
1104                    }
1105                }
1106                // BigQuery uses r or R for raw string literal
1107                b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
1108                    chars.next(); // consume
1109                    match chars.peek() {
1110                        Some('\'') => self
1111                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1112                                chars,
1113                                '\'',
1114                                false,
1115                                Token::SingleQuotedRawStringLiteral,
1116                                Token::TripleSingleQuotedRawStringLiteral,
1117                            ),
1118                        Some('\"') => self
1119                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1120                                chars,
1121                                '"',
1122                                false,
1123                                Token::DoubleQuotedRawStringLiteral,
1124                                Token::TripleDoubleQuotedRawStringLiteral,
1125                            ),
1126                        _ => {
1127                            // regular identifier starting with an "r" or "R"
1128                            let s = self.tokenize_word(b, chars);
1129                            Ok(Some(Token::make_word(&s, None)))
1130                        }
1131                    }
1132                }
1133                // Redshift uses lower case n for national string literal
1134                n @ 'N' | n @ 'n' => {
1135                    chars.next(); // consume, to check the next char
1136                    match chars.peek() {
1137                        Some('\'') => {
1138                            // N'...' - a <national character string literal>
1139                            let backslash_escape =
1140                                self.dialect.supports_string_literal_backslash_escape();
1141                            let s =
1142                                self.tokenize_single_quoted_string(chars, '\'', backslash_escape)?;
1143                            Ok(Some(Token::NationalStringLiteral(s)))
1144                        }
1145                        Some(&q @ 'q') | Some(&q @ 'Q')
1146                            if self.dialect.supports_quote_delimited_string() =>
1147                        {
1148                            chars.next(); // consume and check the next char
1149                            if let Some('\'') = chars.peek() {
1150                                self.tokenize_quote_delimited_string(chars, &[n, q])
1151                                    .map(|s| Some(Token::NationalQuoteDelimitedStringLiteral(s)))
1152                            } else {
1153                                let s = self.tokenize_word(String::from_iter([n, q]), chars);
1154                                Ok(Some(Token::make_word(&s, None)))
1155                            }
1156                        }
1157                        _ => {
1158                            // regular identifier starting with an "N"
1159                            let s = self.tokenize_word(n, chars);
1160                            Ok(Some(Token::make_word(&s, None)))
1161                        }
1162                    }
1163                }
1164                q @ 'Q' | q @ 'q' if self.dialect.supports_quote_delimited_string() => {
1165                    chars.next(); // consume and check the next char
1166                    if let Some('\'') = chars.peek() {
1167                        self.tokenize_quote_delimited_string(chars, &[q])
1168                            .map(|s| Some(Token::QuoteDelimitedStringLiteral(s)))
1169                    } else {
1170                        let s = self.tokenize_word(q, chars);
1171                        Ok(Some(Token::make_word(&s, None)))
1172                    }
1173                }
1174                // PostgreSQL accepts "escape" string constants, which are an extension to the SQL standard.
1175                x @ 'e' | x @ 'E' if self.dialect.supports_string_escape_constant() => {
1176                    let starting_loc = chars.location();
1177                    chars.next(); // consume, to check the next char
1178                    match chars.peek() {
1179                        Some('\'') => {
1180                            let s =
1181                                self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
1182                            Ok(Some(Token::EscapedStringLiteral(s)))
1183                        }
1184                        _ => {
1185                            // regular identifier starting with an "E" or "e"
1186                            let s = self.tokenize_word(x, chars);
1187                            Ok(Some(Token::make_word(&s, None)))
1188                        }
1189                    }
1190                }
1191                // Unicode string literals like U&'first \000A second' are supported in some dialects, including PostgreSQL
1192                x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => {
1193                    chars.next(); // consume, to check the next char
1194                    if chars.peek() == Some(&'&') {
1195                        // we cannot advance the iterator here, as we need to consume the '&' later if the 'u' was an identifier
1196                        let mut chars_clone = chars.peekable.clone();
1197                        chars_clone.next(); // consume the '&' in the clone
1198                        if chars_clone.peek() == Some(&'\'') {
1199                            chars.next(); // consume the '&' in the original iterator
1200                            let s = unescape_unicode_single_quoted_string(chars)?;
1201                            return Ok(Some(Token::UnicodeStringLiteral(s)));
1202                        }
1203                    }
1204                    // regular identifier starting with an "U" or "u"
1205                    let s = self.tokenize_word(x, chars);
1206                    Ok(Some(Token::make_word(&s, None)))
1207                }
1208                // The spec only allows an uppercase 'X' to introduce a hex
1209                // string, but PostgreSQL, at least, allows a lowercase 'x' too.
1210                x @ 'x' | x @ 'X' => {
1211                    chars.next(); // consume, to check the next char
1212                    match chars.peek() {
1213                        Some('\'') => {
1214                            // X'...' - a <binary string literal>
1215                            let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
1216                            Ok(Some(Token::HexStringLiteral(s)))
1217                        }
1218                        _ => {
1219                            // regular identifier starting with an "X"
1220                            let s = self.tokenize_word(x, chars);
1221                            Ok(Some(Token::make_word(&s, None)))
1222                        }
1223                    }
1224                }
1225                // single quoted string
1226                '\'' => {
1227                    if self.dialect.supports_triple_quoted_string() {
1228                        return self
1229                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1230                                chars,
1231                                '\'',
1232                                self.dialect.supports_string_literal_backslash_escape(),
1233                                Token::SingleQuotedString,
1234                                Token::TripleSingleQuotedString,
1235                            );
1236                    }
1237                    let s = self.tokenize_single_quoted_string(
1238                        chars,
1239                        '\'',
1240                        self.dialect.supports_string_literal_backslash_escape(),
1241                    )?;
1242
1243                    Ok(Some(Token::SingleQuotedString(s)))
1244                }
1245                // double quoted string
1246                '\"' if !self.dialect.is_delimited_identifier_start(ch)
1247                    && !self.dialect.is_identifier_start(ch) =>
1248                {
1249                    if self.dialect.supports_triple_quoted_string() {
1250                        return self
1251                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1252                                chars,
1253                                '"',
1254                                self.dialect.supports_string_literal_backslash_escape(),
1255                                Token::DoubleQuotedString,
1256                                Token::TripleDoubleQuotedString,
1257                            );
1258                    }
1259                    let s = self.tokenize_single_quoted_string(
1260                        chars,
1261                        '"',
1262                        self.dialect.supports_string_literal_backslash_escape(),
1263                    )?;
1264
1265                    Ok(Some(Token::DoubleQuotedString(s)))
1266                }
1267                // delimited (quoted) identifier
1268                quote_start if self.dialect.is_delimited_identifier_start(ch) => {
1269                    let word = self.tokenize_quoted_identifier(quote_start, chars)?;
1270                    Ok(Some(Token::make_word(&word, Some(quote_start))))
1271                }
1272                // Potentially nested delimited (quoted) identifier
1273                quote_start
1274                    if self
1275                        .dialect
1276                        .is_nested_delimited_identifier_start(quote_start)
1277                        && self
1278                            .dialect
1279                            .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
1280                            .is_some() =>
1281                {
1282                    let Some((quote_start, nested_quote_start)) = self
1283                        .dialect
1284                        .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
1285                    else {
1286                        return self.tokenizer_error(
1287                            chars.location(),
1288                            format!("Expected nested delimiter '{quote_start}' before EOF."),
1289                        );
1290                    };
1291
1292                    let Some(nested_quote_start) = nested_quote_start else {
1293                        let word = self.tokenize_quoted_identifier(quote_start, chars)?;
1294                        return Ok(Some(Token::make_word(&word, Some(quote_start))));
1295                    };
1296
1297                    let mut word = vec![];
1298                    let quote_end = Word::matching_end_quote(quote_start);
1299                    let nested_quote_end = Word::matching_end_quote(nested_quote_start);
1300                    let error_loc = chars.location();
1301
1302                    chars.next(); // skip the first delimiter
1303                    peeking_take_while(chars, |ch| ch.is_whitespace());
1304                    if chars.peek() != Some(&nested_quote_start) {
1305                        return self.tokenizer_error(
1306                            error_loc,
1307                            format!("Expected nested delimiter '{nested_quote_start}' before EOF."),
1308                        );
1309                    }
1310                    word.push(nested_quote_start.into());
1311                    word.push(self.tokenize_quoted_identifier(nested_quote_end, chars)?);
1312                    word.push(nested_quote_end.into());
1313                    peeking_take_while(chars, |ch| ch.is_whitespace());
1314                    if chars.peek() != Some(&quote_end) {
1315                        return self.tokenizer_error(
1316                            error_loc,
1317                            format!("Expected close delimiter '{quote_end}' before EOF."),
1318                        );
1319                    }
1320                    chars.next(); // skip close delimiter
1321
1322                    Ok(Some(Token::make_word(&word.concat(), Some(quote_start))))
1323                }
1324                // numbers and period
1325                '0'..='9' | '.' => {
1326                    // special case where if ._ is encountered after a word then that word
1327                    // is a table and the _ is the start of the col name.
1328                    // if the prev token is not a word, then this is not a valid sql
1329                    // word or number.
1330                    if ch == '.' && chars.peekable.clone().nth(1) == Some('_') {
1331                        if let Some(Token::Word(_)) = prev_token {
1332                            chars.next();
1333                            return Ok(Some(Token::Period));
1334                        }
1335
1336                        return self.tokenizer_error(
1337                            chars.location(),
1338                            "Unexpected character '_'".to_string(),
1339                        );
1340                    }
1341
1342                    // Some dialects support underscore as number separator
1343                    // There can only be one at a time and it must be followed by another digit
1344                    let is_number_separator = |ch: char, next_char: Option<char>| {
1345                        self.dialect.supports_numeric_literal_underscores()
1346                            && ch == '_'
1347                            && next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit())
1348                    };
1349
1350                    let mut s = peeking_next_take_while(chars, |ch, next_ch| {
1351                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
1352                    });
1353
1354                    // match binary literal that starts with 0x
1355                    if s == "0" && chars.peek() == Some(&'x') {
1356                        chars.next();
1357                        let s2 = peeking_next_take_while(chars, |ch, next_ch| {
1358                            ch.is_ascii_hexdigit() || is_number_separator(ch, next_ch)
1359                        });
1360                        return Ok(Some(Token::HexStringLiteral(s2)));
1361                    }
1362
1363                    // match one period
1364                    if let Some('.') = chars.peek() {
1365                        s.push('.');
1366                        chars.next();
1367                    }
1368
1369                    // If the dialect supports identifiers that start with a numeric prefix
1370                    // and we have now consumed a dot, check if the previous token was a Word.
1371                    // If so, what follows is definitely not part of a decimal number and
1372                    // we should yield the dot as a dedicated token so compound identifiers
1373                    // starting with digits can be parsed correctly.
1374                    if s == "." && self.dialect.supports_numeric_prefix() {
1375                        if let Some(Token::Word(_)) = prev_token {
1376                            return Ok(Some(Token::Period));
1377                        }
1378                    }
1379
1380                    // Consume fractional digits.
1381                    s += &peeking_next_take_while(chars, |ch, next_ch| {
1382                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
1383                    });
1384
1385                    // No fraction -> Token::Period
1386                    if s == "." {
1387                        return Ok(Some(Token::Period));
1388                    }
1389
1390                    // Parse exponent as number
1391                    let mut exponent_part = String::new();
1392                    if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
1393                        let mut char_clone = chars.peekable.clone();
1394                        exponent_part.push(char_clone.next().unwrap());
1395
1396                        // Optional sign
1397                        match char_clone.peek() {
1398                            Some(&c) if matches!(c, '+' | '-') => {
1399                                exponent_part.push(c);
1400                                char_clone.next();
1401                            }
1402                            _ => (),
1403                        }
1404
1405                        match char_clone.peek() {
1406                            // Definitely an exponent, get original iterator up to speed and use it
1407                            Some(&c) if c.is_ascii_digit() => {
1408                                for _ in 0..exponent_part.len() {
1409                                    chars.next();
1410                                }
1411                                exponent_part +=
1412                                    &peeking_take_while(chars, |ch| ch.is_ascii_digit());
1413                                s += exponent_part.as_str();
1414                            }
1415                            // Not an exponent, discard the work done
1416                            _ => (),
1417                        }
1418                    }
1419
1420                    // If the dialect supports identifiers that start with a numeric prefix,
1421                    // we need to check if the value is in fact an identifier and must thus
1422                    // be tokenized as a word.
1423                    if self.dialect.supports_numeric_prefix() {
1424                        if exponent_part.is_empty() {
1425                            // If it is not a number with an exponent, it may be
1426                            // an identifier starting with digits.
1427                            let word =
1428                                peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
1429
1430                            if !word.is_empty() {
1431                                s += word.as_str();
1432                                return Ok(Some(Token::make_word(s.as_str(), None)));
1433                            }
1434                        } else if prev_token == Some(&Token::Period) {
1435                            // If the previous token was a period, thus not belonging to a number,
1436                            // the value we have is part of an identifier.
1437                            return Ok(Some(Token::make_word(s.as_str(), None)));
1438                        }
1439                    }
1440
1441                    let long = if chars.peek() == Some(&'L') {
1442                        chars.next();
1443                        true
1444                    } else {
1445                        false
1446                    };
1447                    Ok(Some(Token::Number(s, long)))
1448                }
1449                // punctuation
1450                '(' => self.consume_and_return(chars, Token::LParen),
1451                ')' => self.consume_and_return(chars, Token::RParen),
1452                ',' => self.consume_and_return(chars, Token::Comma),
1453                // operators
1454                '-' => {
1455                    chars.next(); // consume the '-'
1456
1457                    match chars.peek() {
1458                        Some('-') => {
1459                            let mut is_comment = true;
1460                            if self.dialect.requires_single_line_comment_whitespace() {
1461                                is_comment = chars
1462                                    .peekable
1463                                    .clone()
1464                                    .nth(1)
1465                                    .is_some_and(char::is_whitespace);
1466                            }
1467
1468                            if is_comment {
1469                                chars.next(); // consume second '-'
1470                                let comment = self.tokenize_single_line_comment(chars);
1471                                return Ok(Some(Token::Whitespace(
1472                                    Whitespace::SingleLineComment {
1473                                        prefix: "--".to_owned(),
1474                                        comment,
1475                                    },
1476                                )));
1477                            }
1478
1479                            self.start_binop(chars, "-", Token::Minus)
1480                        }
1481                        Some('>') => {
1482                            chars.next();
1483                            match chars.peek() {
1484                                Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow),
1485                                _ => self.start_binop(chars, "->", Token::Arrow),
1486                            }
1487                        }
1488                        // a regular '-' operator
1489                        _ => self.start_binop(chars, "-", Token::Minus),
1490                    }
1491                }
1492                '/' => {
1493                    chars.next(); // consume the '/'
1494                    match chars.peek() {
1495                        Some('*') => {
1496                            chars.next(); // consume the '*', starting a multi-line comment
1497                            self.tokenize_multiline_comment(chars)
1498                        }
1499                        Some('/') if dialect_of!(self is SnowflakeDialect) => {
1500                            chars.next(); // consume the second '/', starting a snowflake single-line comment
1501                            let comment = self.tokenize_single_line_comment(chars);
1502                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
1503                                prefix: "//".to_owned(),
1504                                comment,
1505                            })))
1506                        }
1507                        Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => {
1508                            self.consume_and_return(chars, Token::DuckIntDiv)
1509                        }
1510                        // a regular '/' operator
1511                        _ => Ok(Some(Token::Div)),
1512                    }
1513                }
1514                '+' => self.consume_and_return(chars, Token::Plus),
1515                '*' => self.consume_and_return(chars, Token::Mul),
1516                '%' => {
1517                    chars.next(); // advance past '%'
1518                    match chars.peek() {
1519                        Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)),
1520                        Some(sch) if self.dialect.is_identifier_start('%') => {
1521                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
1522                        }
1523                        _ => self.start_binop(chars, "%", Token::Mod),
1524                    }
1525                }
1526                '|' => {
1527                    chars.next(); // consume the '|'
1528                    match chars.peek() {
1529                        Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot),
1530                        Some('|') => {
1531                            chars.next(); // consume the second '|'
1532                            match chars.peek() {
1533                                Some('/') => {
1534                                    self.consume_for_binop(chars, "||/", Token::PGCubeRoot)
1535                                }
1536                                _ => self.start_binop(chars, "||", Token::StringConcat),
1537                            }
1538                        }
1539                        Some('&') if self.dialect.supports_geometric_types() => {
1540                            chars.next(); // consume
1541                            match chars.peek() {
1542                                Some('>') => self.consume_for_binop(
1543                                    chars,
1544                                    "|&>",
1545                                    Token::VerticalBarAmpersandRightAngleBracket,
1546                                ),
1547                                _ => self.start_binop_opt(chars, "|&", None),
1548                            }
1549                        }
1550                        Some('>') if self.dialect.supports_geometric_types() => {
1551                            chars.next(); // consume
1552                            match chars.peek() {
1553                                Some('>') => self.consume_for_binop(
1554                                    chars,
1555                                    "|>>",
1556                                    Token::VerticalBarShiftRight,
1557                                ),
1558                                _ => self.start_binop_opt(chars, "|>", None),
1559                            }
1560                        }
1561                        Some('>') if self.dialect.supports_pipe_operator() => {
1562                            self.consume_for_binop(chars, "|>", Token::VerticalBarRightAngleBracket)
1563                        }
1564                        // Bitshift '|' operator
1565                        _ => self.start_binop(chars, "|", Token::Pipe),
1566                    }
1567                }
1568                '=' => {
1569                    chars.next(); // consume
1570                    match chars.peek() {
1571                        Some('>') => self.consume_and_return(chars, Token::RArrow),
1572                        Some('=') => self.consume_and_return(chars, Token::DoubleEq),
1573                        _ => Ok(Some(Token::Eq)),
1574                    }
1575                }
1576                '!' => {
1577                    chars.next(); // consume
1578                    match chars.peek() {
1579                        Some('=') => self.consume_and_return(chars, Token::Neq),
1580                        Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
1581                        Some('~') => {
1582                            chars.next();
1583                            match chars.peek() {
1584                                Some('*') => self
1585                                    .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
1586                                Some('~') => {
1587                                    chars.next();
1588                                    match chars.peek() {
1589                                        Some('*') => self.consume_and_return(
1590                                            chars,
1591                                            Token::ExclamationMarkDoubleTildeAsterisk,
1592                                        ),
1593                                        _ => Ok(Some(Token::ExclamationMarkDoubleTilde)),
1594                                    }
1595                                }
1596                                _ => Ok(Some(Token::ExclamationMarkTilde)),
1597                            }
1598                        }
1599                        _ => Ok(Some(Token::ExclamationMark)),
1600                    }
1601                }
1602                '<' => {
1603                    chars.next(); // consume
1604                    match chars.peek() {
1605                        Some('=') => {
1606                            chars.next();
1607                            match chars.peek() {
1608                                Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship),
1609                                _ => self.start_binop(chars, "<=", Token::LtEq),
1610                            }
1611                        }
1612                        Some('|') if self.dialect.supports_geometric_types() => {
1613                            self.consume_for_binop(chars, "<<|", Token::ShiftLeftVerticalBar)
1614                        }
1615                        Some('>') => self.consume_for_binop(chars, "<>", Token::Neq),
1616                        Some('<') if self.dialect.supports_geometric_types() => {
1617                            chars.next(); // consume
1618                            match chars.peek() {
1619                                Some('|') => self.consume_for_binop(
1620                                    chars,
1621                                    "<<|",
1622                                    Token::ShiftLeftVerticalBar,
1623                                ),
1624                                _ => self.start_binop(chars, "<<", Token::ShiftLeft),
1625                            }
1626                        }
1627                        Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft),
1628                        Some('-') if self.dialect.supports_geometric_types() => {
1629                            chars.next(); // consume
1630                            match chars.peek() {
1631                                Some('>') => {
1632                                    self.consume_for_binop(chars, "<->", Token::TwoWayArrow)
1633                                }
1634                                _ => self.start_binop_opt(chars, "<-", None),
1635                            }
1636                        }
1637                        Some('^') if self.dialect.supports_geometric_types() => {
1638                            self.consume_for_binop(chars, "<^", Token::LeftAngleBracketCaret)
1639                        }
1640                        Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt),
1641                        _ => self.start_binop(chars, "<", Token::Lt),
1642                    }
1643                }
1644                '>' => {
1645                    chars.next(); // consume
1646                    match chars.peek() {
1647                        Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq),
1648                        Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight),
1649                        Some('^') if self.dialect.supports_geometric_types() => {
1650                            self.consume_for_binop(chars, ">^", Token::RightAngleBracketCaret)
1651                        }
1652                        _ => self.start_binop(chars, ">", Token::Gt),
1653                    }
1654                }
1655                ':' => {
1656                    chars.next();
1657                    match chars.peek() {
1658                        Some(':') => self.consume_and_return(chars, Token::DoubleColon),
1659                        Some('=') => self.consume_and_return(chars, Token::Assignment),
1660                        _ => Ok(Some(Token::Colon)),
1661                    }
1662                }
1663                ';' => self.consume_and_return(chars, Token::SemiColon),
1664                '\\' => self.consume_and_return(chars, Token::Backslash),
1665                '[' => self.consume_and_return(chars, Token::LBracket),
1666                ']' => self.consume_and_return(chars, Token::RBracket),
1667                '&' => {
1668                    chars.next(); // consume the '&'
1669                    match chars.peek() {
1670                        Some('>') if self.dialect.supports_geometric_types() => {
1671                            chars.next();
1672                            self.consume_and_return(chars, Token::AmpersandRightAngleBracket)
1673                        }
1674                        Some('<') if self.dialect.supports_geometric_types() => {
1675                            chars.next(); // consume
1676                            match chars.peek() {
1677                                Some('|') => self.consume_and_return(
1678                                    chars,
1679                                    Token::AmpersandLeftAngleBracketVerticalBar,
1680                                ),
1681                                _ => {
1682                                    self.start_binop(chars, "&<", Token::AmpersandLeftAngleBracket)
1683                                }
1684                            }
1685                        }
1686                        Some('&') => {
1687                            chars.next(); // consume the second '&'
1688                            self.start_binop(chars, "&&", Token::Overlap)
1689                        }
1690                        // Bitshift '&' operator
1691                        _ => self.start_binop(chars, "&", Token::Ampersand),
1692                    }
1693                }
1694                '^' => {
1695                    chars.next(); // consume the '^'
1696                    match chars.peek() {
1697                        Some('@') => self.consume_and_return(chars, Token::CaretAt),
1698                        _ => Ok(Some(Token::Caret)),
1699                    }
1700                }
1701                '{' => self.consume_and_return(chars, Token::LBrace),
1702                '}' => self.consume_and_return(chars, Token::RBrace),
1703                '#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect | HiveDialect) =>
1704                {
1705                    chars.next(); // consume the '#', starting a snowflake single-line comment
1706                    let comment = self.tokenize_single_line_comment(chars);
1707                    Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
1708                        prefix: "#".to_owned(),
1709                        comment,
1710                    })))
1711                }
1712                '~' => {
1713                    chars.next(); // consume
1714                    match chars.peek() {
1715                        Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk),
1716                        Some('=') if self.dialect.supports_geometric_types() => {
1717                            self.consume_for_binop(chars, "~=", Token::TildeEqual)
1718                        }
1719                        Some('~') => {
1720                            chars.next();
1721                            match chars.peek() {
1722                                Some('*') => {
1723                                    self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk)
1724                                }
1725                                _ => self.start_binop(chars, "~~", Token::DoubleTilde),
1726                            }
1727                        }
1728                        _ => self.start_binop(chars, "~", Token::Tilde),
1729                    }
1730                }
1731                '#' => {
1732                    chars.next();
1733                    match chars.peek() {
1734                        Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus),
1735                        Some('>') => {
1736                            chars.next();
1737                            match chars.peek() {
1738                                Some('>') => {
1739                                    self.consume_for_binop(chars, "#>>", Token::HashLongArrow)
1740                                }
1741                                _ => self.start_binop(chars, "#>", Token::HashArrow),
1742                            }
1743                        }
1744                        Some(' ') => Ok(Some(Token::Sharp)),
1745                        Some('#') if self.dialect.supports_geometric_types() => {
1746                            self.consume_for_binop(chars, "##", Token::DoubleSharp)
1747                        }
1748                        Some(sch) if self.dialect.is_identifier_start('#') => {
1749                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
1750                        }
1751                        _ => self.start_binop(chars, "#", Token::Sharp),
1752                    }
1753                }
1754                '@' => {
1755                    chars.next();
1756                    match chars.peek() {
1757                        Some('@') if self.dialect.supports_geometric_types() => {
1758                            self.consume_and_return(chars, Token::AtAt)
1759                        }
1760                        Some('-') if self.dialect.supports_geometric_types() => {
1761                            chars.next();
1762                            match chars.peek() {
1763                                Some('@') => self.consume_and_return(chars, Token::AtDashAt),
1764                                _ => self.start_binop_opt(chars, "@-", None),
1765                            }
1766                        }
1767                        Some('>') => self.consume_and_return(chars, Token::AtArrow),
1768                        Some('?') => self.consume_and_return(chars, Token::AtQuestion),
1769                        Some('@') => {
1770                            chars.next();
1771                            match chars.peek() {
1772                                Some(' ') => Ok(Some(Token::AtAt)),
1773                                Some(tch) if self.dialect.is_identifier_start('@') => {
1774                                    self.tokenize_identifier_or_keyword([ch, '@', *tch], chars)
1775                                }
1776                                _ => Ok(Some(Token::AtAt)),
1777                            }
1778                        }
1779                        Some(' ') => Ok(Some(Token::AtSign)),
1780                        // We break on quotes here, because no dialect allows identifiers starting
1781                        // with @ and containing quotation marks (e.g. `@'foo'`) unless they are
1782                        // quoted, which is tokenized as a quoted string, not here (e.g.
1783                        // `"@'foo'"`). Further, at least two dialects parse `@` followed by a
1784                        // quoted string as two separate tokens, which this allows. For example,
1785                        // Postgres parses `@'1'` as the absolute value of '1' which is implicitly
1786                        // cast to a numeric type. And when parsing MySQL-style grantees (e.g.
1787                        // `GRANT ALL ON *.* to 'root'@'localhost'`), we also want separate tokens
1788                        // for the user, the `@`, and the host.
1789                        Some('\'') => Ok(Some(Token::AtSign)),
1790                        Some('\"') => Ok(Some(Token::AtSign)),
1791                        Some('`') => Ok(Some(Token::AtSign)),
1792                        Some(sch) if self.dialect.is_identifier_start('@') => {
1793                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
1794                        }
1795                        _ => Ok(Some(Token::AtSign)),
1796                    }
1797                }
1798                // Postgres uses ? for jsonb operators, not prepared statements
1799                '?' if self.dialect.supports_geometric_types() => {
1800                    chars.next(); // consume
1801                    match chars.peek() {
1802                        Some('|') => {
1803                            chars.next();
1804                            match chars.peek() {
1805                                Some('|') => self.consume_and_return(
1806                                    chars,
1807                                    Token::QuestionMarkDoubleVerticalBar,
1808                                ),
1809                                _ => Ok(Some(Token::QuestionPipe)),
1810                            }
1811                        }
1812
1813                        Some('&') => self.consume_and_return(chars, Token::QuestionAnd),
1814                        Some('-') => {
1815                            chars.next(); // consume
1816                            match chars.peek() {
1817                                Some('|') => self
1818                                    .consume_and_return(chars, Token::QuestionMarkDashVerticalBar),
1819                                _ => Ok(Some(Token::QuestionMarkDash)),
1820                            }
1821                        }
1822                        Some('#') => self.consume_and_return(chars, Token::QuestionMarkSharp),
1823                        _ => Ok(Some(Token::Question)),
1824                    }
1825                }
1826                '?' => {
1827                    chars.next();
1828                    let s = peeking_take_while(chars, |ch| ch.is_numeric());
1829                    Ok(Some(Token::Placeholder(format!("?{s}"))))
1830                }
1831
1832                // identifier or keyword
1833                ch if self.dialect.is_identifier_start(ch) => {
1834                    self.tokenize_identifier_or_keyword([ch], chars)
1835                }
1836                '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),
1837
1838                // whitespace check (including unicode chars) should be last as it covers some of the chars above
1839                ch if ch.is_whitespace() => {
1840                    self.consume_and_return(chars, Token::Whitespace(Whitespace::Space))
1841                }
1842                other => self.consume_and_return(chars, Token::Char(other)),
1843            },
1844            None => Ok(None),
1845        }
1846    }
1847
1848    /// Consume the next character, then parse a custom binary operator. The next character should be included in the prefix
1849    fn consume_for_binop(
1850        &self,
1851        chars: &mut State,
1852        prefix: &str,
1853        default: Token,
1854    ) -> Result<Option<Token>, TokenizerError> {
1855        chars.next(); // consume the first char
1856        self.start_binop_opt(chars, prefix, Some(default))
1857    }
1858
1859    /// parse a custom binary operator
1860    fn start_binop(
1861        &self,
1862        chars: &mut State,
1863        prefix: &str,
1864        default: Token,
1865    ) -> Result<Option<Token>, TokenizerError> {
1866        self.start_binop_opt(chars, prefix, Some(default))
1867    }
1868
1869    /// parse a custom binary operator
1870    fn start_binop_opt(
1871        &self,
1872        chars: &mut State,
1873        prefix: &str,
1874        default: Option<Token>,
1875    ) -> Result<Option<Token>, TokenizerError> {
1876        let mut custom = None;
1877        while let Some(&ch) = chars.peek() {
1878            if !self.dialect.is_custom_operator_part(ch) {
1879                break;
1880            }
1881
1882            custom.get_or_insert_with(|| prefix.to_string()).push(ch);
1883            chars.next();
1884        }
1885        match (custom, default) {
1886            (Some(custom), _) => Ok(Token::CustomBinaryOperator(custom).into()),
1887            (None, Some(tok)) => Ok(Some(tok)),
1888            (None, None) => self.tokenizer_error(
1889                chars.location(),
1890                format!("Expected a valid binary operator after '{prefix}'"),
1891            ),
1892        }
1893    }
1894
    /// Tokenize dollar preceded value (i.e: a string/placeholder)
    ///
    /// Called with the cursor on the leading `$`. Depending on the dialect and
    /// what follows, produces one of:
    /// - `Token::DollarQuotedString` for `$$...$$` (untagged) or `$tag$...$tag$`
    ///   (tagged) dollar-quoted strings;
    /// - `Token::Placeholder` (e.g. `$1`, `$name`) when no dollar-quoted string
    ///   applies or the dialect treats `$...` as a placeholder.
    fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
        // `s` accumulates the quoted string body; `value` accumulates the
        // tag (for tagged quoting) or the placeholder name.
        let mut s = String::new();
        let mut value = String::new();

        chars.next(); // consume the leading '$'

        // If the dialect does not support dollar-quoted strings, then `$$` is rather a placeholder.
        if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
            // Untagged dollar-quoted string: `$$ ... $$`.
            chars.next(); // consume the second '$'

            let mut is_terminated = false;
            // One-character lookbehind used to detect the `$$` terminator.
            let mut prev: Option<char> = None;

            while let Some(&ch) = chars.peek() {
                if prev == Some('$') {
                    if ch == '$' {
                        // Found the closing `$$`.
                        chars.next();
                        is_terminated = true;
                        break;
                    } else {
                        // The previous '$' was not a terminator after all;
                        // emit it (it was withheld below) plus the current char.
                        s.push('$');
                        s.push(ch);
                    }
                } else if ch != '$' {
                    s.push(ch);
                }
                // NOTE: a lone '$' is withheld here until we know whether it
                // starts the closing `$$` (handled by the branch above).

                prev = Some(ch);
                chars.next();
            }

            return if chars.peek().is_none() && !is_terminated {
                // Ran out of input without seeing the closing `$$`.
                self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
            } else {
                Ok(Token::DollarQuotedString(DollarQuotedString {
                    value: s,
                    tag: None,
                }))
            };
        } else {
            // Read the tag (or placeholder name) following the '$'.
            value.push_str(&peeking_take_while(chars, |ch| {
                ch.is_alphanumeric()
                    || ch == '_'
                    // Allow $ as a placeholder character if the dialect supports it
                    || matches!(ch, '$' if self.dialect.supports_dollar_placeholder())
            }));

            // If the dialect does not support dollar-quoted strings, don't look for the end delimiter.
            if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
                // Tagged dollar-quoted string: `$tag$ ... $tag$`.
                chars.next(); // consume the '$' closing the opening delimiter

                // Scan forward until the text ends with `$tag$`.
                let mut temp = String::new();
                let end_delimiter = format!("${value}$");

                loop {
                    match chars.next() {
                        Some(ch) => {
                            temp.push(ch);

                            if temp.ends_with(&end_delimiter) {
                                // Strip the delimiter; everything before it is the body.
                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
                                    s.push_str(temp);
                                }
                                break;
                            }
                        }
                        None => {
                            // End of input: accept only if the delimiter was the
                            // very last thing consumed, otherwise it's unterminated.
                            if temp.ends_with(&end_delimiter) {
                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
                                    s.push_str(temp);
                                }
                                break;
                            }

                            return self.tokenizer_error(
                                chars.location(),
                                "Unterminated dollar-quoted, expected $",
                            );
                        }
                    }
                }
            } else {
                // Not a dollar-quoted string: `$name` is a placeholder.
                return Ok(Token::Placeholder(format!("${value}")));
            }
        }

        // Tagged dollar-quoted string result (empty tag folds to None).
        Ok(Token::DollarQuotedString(DollarQuotedString {
            value: s,
            tag: if value.is_empty() { None } else { Some(value) },
        }))
    }
1987
1988    fn tokenizer_error<R>(
1989        &self,
1990        loc: Location,
1991        message: impl Into<String>,
1992    ) -> Result<R, TokenizerError> {
1993        Err(TokenizerError {
1994            message: message.into(),
1995            location: loc,
1996        })
1997    }
1998
1999    // Consume characters until newline
2000    fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
2001        let mut comment = peeking_take_while(chars, |ch| match ch {
2002            '\n' => false,                                           // Always stop at \n
2003            '\r' if dialect_of!(self is PostgreSqlDialect) => false, // Stop at \r for Postgres
2004            _ => true, // Keep consuming for other characters
2005        });
2006
2007        if let Some(ch) = chars.next() {
2008            assert!(ch == '\n' || ch == '\r');
2009            comment.push(ch);
2010        }
2011
2012        comment
2013    }
2014
2015    /// Tokenize an identifier or keyword, after the first char is already consumed.
2016    fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
2017        let mut s = first_chars.into();
2018        s.push_str(&peeking_take_while(chars, |ch| {
2019            self.dialect.is_identifier_part(ch)
2020        }));
2021        s
2022    }
2023
2024    /// Read a quoted identifier
2025    fn tokenize_quoted_identifier(
2026        &self,
2027        quote_start: char,
2028        chars: &mut State,
2029    ) -> Result<String, TokenizerError> {
2030        let error_loc = chars.location();
2031        chars.next(); // consume the opening quote
2032        let quote_end = Word::matching_end_quote(quote_start);
2033        let (s, last_char) = self.parse_quoted_ident(chars, quote_end);
2034
2035        if last_char == Some(quote_end) {
2036            Ok(s)
2037        } else {
2038            self.tokenizer_error(
2039                error_loc,
2040                format!("Expected close delimiter '{quote_end}' before EOF."),
2041            )
2042        }
2043    }
2044
2045    /// Read a single quoted string, starting with the opening quote.
2046    fn tokenize_escaped_single_quoted_string(
2047        &self,
2048        starting_loc: Location,
2049        chars: &mut State,
2050    ) -> Result<String, TokenizerError> {
2051        if let Some(s) = unescape_single_quoted_string(chars) {
2052            return Ok(s);
2053        }
2054
2055        self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
2056    }
2057
2058    /// Reads a string literal quoted by a single or triple quote characters.
2059    /// Examples: `'abc'`, `'''abc'''`, `"""abc"""`.
2060    fn tokenize_single_or_triple_quoted_string<F>(
2061        &self,
2062        chars: &mut State,
2063        quote_style: char,
2064        backslash_escape: bool,
2065        single_quote_token: F,
2066        triple_quote_token: F,
2067    ) -> Result<Option<Token>, TokenizerError>
2068    where
2069        F: Fn(String) -> Token,
2070    {
2071        let error_loc = chars.location();
2072
2073        let mut num_opening_quotes = 0u8;
2074        for _ in 0..3 {
2075            if Some(&quote_style) == chars.peek() {
2076                chars.next(); // Consume quote.
2077                num_opening_quotes += 1;
2078            } else {
2079                break;
2080            }
2081        }
2082
2083        let (token_fn, num_quote_chars) = match num_opening_quotes {
2084            1 => (single_quote_token, NumStringQuoteChars::One),
2085            2 => {
2086                // If we matched double quotes, then this is an empty string.
2087                return Ok(Some(single_quote_token("".into())));
2088            }
2089            3 => {
2090                let Some(num_quote_chars) = NonZeroU8::new(3) else {
2091                    return self.tokenizer_error(error_loc, "invalid number of opening quotes");
2092                };
2093                (
2094                    triple_quote_token,
2095                    NumStringQuoteChars::Many(num_quote_chars),
2096                )
2097            }
2098            _ => {
2099                return self.tokenizer_error(error_loc, "invalid string literal opening");
2100            }
2101        };
2102
2103        let settings = TokenizeQuotedStringSettings {
2104            quote_style,
2105            num_quote_chars,
2106            num_opening_quotes_to_consume: 0,
2107            backslash_escape,
2108        };
2109
2110        self.tokenize_quoted_string(chars, settings)
2111            .map(token_fn)
2112            .map(Some)
2113    }
2114
2115    /// Reads a string literal quoted by a single quote character.
2116    fn tokenize_single_quoted_string(
2117        &self,
2118        chars: &mut State,
2119        quote_style: char,
2120        backslash_escape: bool,
2121    ) -> Result<String, TokenizerError> {
2122        self.tokenize_quoted_string(
2123            chars,
2124            TokenizeQuotedStringSettings {
2125                quote_style,
2126                num_quote_chars: NumStringQuoteChars::One,
2127                num_opening_quotes_to_consume: 1,
2128                backslash_escape,
2129            },
2130        )
2131    }
2132
2133    /// Reads a quote delimited string expecting `chars.next()` to deliver a quote.
2134    ///
2135    /// See <https://docs.oracle.com/en/database/oracle/oracle-database/21/sqlrf/Literals.html#GUID-1824CBAA-6E16-4921-B2A6-112FB02248DA>
2136    fn tokenize_quote_delimited_string(
2137        &self,
2138        chars: &mut State,
2139        // the prefix that introduced the possible literal or word,
2140        // e.g. "Q" or "nq"
2141        literal_prefix: &[char],
2142    ) -> Result<QuoteDelimitedString, TokenizerError> {
2143        let literal_start_loc = chars.location();
2144        chars.next();
2145
2146        let start_quote_loc = chars.location();
2147        let (start_quote, end_quote) = match chars.next() {
2148            None | Some(' ') | Some('\t') | Some('\r') | Some('\n') => {
2149                return self.tokenizer_error(
2150                    start_quote_loc,
2151                    format!(
2152                        "Invalid space, tab, newline, or EOF after '{}''",
2153                        String::from_iter(literal_prefix)
2154                    ),
2155                );
2156            }
2157            Some(c) => (
2158                c,
2159                match c {
2160                    '[' => ']',
2161                    '{' => '}',
2162                    '<' => '>',
2163                    '(' => ')',
2164                    c => c,
2165                },
2166            ),
2167        };
2168
2169        // read the string literal until the "quote character" following a by literal quote
2170        let mut value = String::new();
2171        while let Some(ch) = chars.next() {
2172            if ch == end_quote {
2173                if let Some('\'') = chars.peek() {
2174                    chars.next(); // ~ consume the quote
2175                    return Ok(QuoteDelimitedString {
2176                        start_quote,
2177                        value,
2178                        end_quote,
2179                    });
2180                }
2181            }
2182            value.push(ch);
2183        }
2184
2185        self.tokenizer_error(literal_start_loc, "Unterminated string literal")
2186    }
2187
    /// Read a quoted string according to `settings`, returning its contents.
    ///
    /// Handles doubled-quote escaping (`''` -> `'`), optional backslash
    /// escape sequences, and multi-character closing delimiters (e.g. `"""`
    /// for triple-quoted strings). Errors when the expected opening quotes
    /// are missing or the literal is unterminated.
    fn tokenize_quoted_string(
        &self,
        chars: &mut State,
        settings: TokenizeQuotedStringSettings,
    ) -> Result<String, TokenizerError> {
        let mut s = String::new();
        let error_loc = chars.location();

        // Consume any opening quotes.
        for _ in 0..settings.num_opening_quotes_to_consume {
            if Some(settings.quote_style) != chars.next() {
                return self.tokenizer_error(error_loc, "invalid string literal opening");
            }
        }

        // How many quote chars we have just seen in a row; used to detect a
        // multi-quote closing delimiter incrementally.
        let mut num_consecutive_quotes = 0;
        while let Some(&ch) = chars.peek() {
            // `Some(..)` when the *next* quote char would complete the closing
            // delimiter; `None` while more consecutive quote chars are needed.
            let pending_final_quote = match settings.num_quote_chars {
                NumStringQuoteChars::One => Some(NumStringQuoteChars::One),
                n @ NumStringQuoteChars::Many(count)
                    if num_consecutive_quotes + 1 == count.get() =>
                {
                    Some(n)
                }
                NumStringQuoteChars::Many(_) => None,
            };

            match ch {
                char if char == settings.quote_style && pending_final_quote.is_some() => {
                    chars.next(); // consume

                    if let Some(NumStringQuoteChars::Many(count)) = pending_final_quote {
                        // For an initial string like `"""abc"""`, at this point we have
                        // `abc""` in the buffer and have now matched the final `"`.
                        // However, the string to return is simply `abc`, so we strip off
                        // the trailing quotes before returning.
                        let mut buf = s.chars();
                        for _ in 1..count.get() {
                            buf.next_back();
                        }
                        return Ok(buf.as_str().to_string());
                    } else if chars
                        .peek()
                        .map(|c| *c == settings.quote_style)
                        .unwrap_or(false)
                    {
                        // A doubled quote escapes a literal quote character.
                        s.push(ch);
                        if !self.unescape {
                            // In no-escape mode, the given query has to be saved completely
                            s.push(ch);
                        }
                        chars.next();
                    } else {
                        // Lone closing quote: end of the literal.
                        return Ok(s);
                    }
                }
                '\\' if settings.backslash_escape => {
                    // consume backslash
                    chars.next();

                    num_consecutive_quotes = 0;

                    if let Some(next) = chars.peek() {
                        if !self.unescape
                            || (self.dialect.ignores_wildcard_escapes()
                                && (*next == '%' || *next == '_'))
                        {
                            // In no-escape mode, the given query has to be saved completely
                            // including backslashes. Similarly, with ignore_like_wildcard_escapes,
                            // the backslash is not stripped.
                            s.push(ch);
                            s.push(*next);
                            chars.next(); // consume next
                        } else {
                            // Translate the escape sequence into its character.
                            let n = match next {
                                '0' => '\0',
                                'a' => '\u{7}',
                                'b' => '\u{8}',
                                'f' => '\u{c}',
                                'n' => '\n',
                                'r' => '\r',
                                't' => '\t',
                                'Z' => '\u{1a}',
                                _ => *next,
                            };
                            s.push(n);
                            chars.next(); // consume next
                        }
                    }
                }
                ch => {
                    chars.next(); // consume ch

                    // Track consecutive quote chars for multi-quote delimiters.
                    if ch == settings.quote_style {
                        num_consecutive_quotes += 1;
                    } else {
                        num_consecutive_quotes = 0;
                    }

                    s.push(ch);
                }
            }
        }
        // Reached EOF without seeing the closing delimiter.
        self.tokenizer_error(error_loc, "Unterminated string literal")
    }
2294
2295    fn tokenize_multiline_comment(
2296        &self,
2297        chars: &mut State,
2298    ) -> Result<Option<Token>, TokenizerError> {
2299        let mut s = String::new();
2300        let mut nested = 1;
2301        let supports_nested_comments = self.dialect.supports_nested_comments();
2302        loop {
2303            match chars.next() {
2304                Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
2305                    chars.next(); // consume the '*'
2306                    s.push('/');
2307                    s.push('*');
2308                    nested += 1;
2309                }
2310                Some('*') if matches!(chars.peek(), Some('/')) => {
2311                    chars.next(); // consume the '/'
2312                    nested -= 1;
2313                    if nested == 0 {
2314                        break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
2315                    }
2316                    s.push('*');
2317                    s.push('/');
2318                }
2319                Some(ch) => {
2320                    s.push(ch);
2321                }
2322                None => {
2323                    break self.tokenizer_error(
2324                        chars.location(),
2325                        "Unexpected EOF while in a multi-line comment",
2326                    );
2327                }
2328            }
2329        }
2330    }
2331
2332    fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
2333        let mut last_char = None;
2334        let mut s = String::new();
2335        while let Some(ch) = chars.next() {
2336            if ch == quote_end {
2337                if chars.peek() == Some(&quote_end) {
2338                    chars.next();
2339                    s.push(ch);
2340                    if !self.unescape {
2341                        // In no-escape mode, the given query has to be saved completely
2342                        s.push(ch);
2343                    }
2344                } else {
2345                    last_char = Some(quote_end);
2346                    break;
2347                }
2348            } else {
2349                s.push(ch);
2350            }
2351        }
2352        (s, last_char)
2353    }
2354
2355    #[allow(clippy::unnecessary_wraps)]
2356    fn consume_and_return(
2357        &self,
2358        chars: &mut State,
2359        t: Token,
2360    ) -> Result<Option<Token>, TokenizerError> {
2361        chars.next();
2362        Ok(Some(t))
2363    }
2364}
2365
2366/// Read from `chars` until `predicate` returns `false` or EOF is hit.
2367/// Return the characters read as String, and keep the first non-matching
2368/// char available as `chars.next()`.
2369fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
2370    let mut s = String::new();
2371    while let Some(&ch) = chars.peek() {
2372        if predicate(ch) {
2373            chars.next(); // consume
2374            s.push(ch);
2375        } else {
2376            break;
2377        }
2378    }
2379    s
2380}
2381
2382/// Same as peeking_take_while, but also passes the next character to the predicate.
2383fn peeking_next_take_while(
2384    chars: &mut State,
2385    mut predicate: impl FnMut(char, Option<char>) -> bool,
2386) -> String {
2387    let mut s = String::new();
2388    while let Some(&ch) = chars.peek() {
2389        let next_char = chars.peekable.clone().nth(1);
2390        if predicate(ch, next_char) {
2391            chars.next(); // consume
2392            s.push(ch);
2393        } else {
2394            break;
2395        }
2396    }
2397    s
2398}
2399
2400fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
2401    Unescape::new(chars).unescape()
2402}
2403
/// Stateful helper that decodes backslash escape sequences while reading a
/// single-quoted string literal from the tokenizer's character stream.
struct Unescape<'a: 'b, 'b> {
    // Borrowed character stream; all reads advance the tokenizer's position.
    chars: &'b mut State<'a>,
}
2407
impl<'a: 'b, 'b> Unescape<'a, 'b> {
    /// Wraps the tokenizer's character stream.
    fn new(chars: &'b mut State<'a>) -> Self {
        Self { chars }
    }
    /// Consumes the literal starting at the opening quote and returns its
    /// unescaped contents. Returns `None` when the literal is unterminated,
    /// an escape decodes to NUL, or an escape sequence is malformed.
    fn unescape(mut self) -> Option<String> {
        let mut unescaped = String::new();

        self.chars.next(); // consume the opening quote

        while let Some(c) = self.chars.next() {
            if c == '\'' {
                // case: '''' (a doubled quote encodes a literal quote)
                if self.chars.peek().map(|c| *c == '\'').unwrap_or(false) {
                    self.chars.next();
                    unescaped.push('\'');
                    continue;
                }
                // A lone quote terminates the literal.
                return Some(unescaped);
            }

            if c != '\\' {
                unescaped.push(c);
                continue;
            }

            // Backslash escape: decode the sequence that follows.
            let c = match self.chars.next()? {
                'b' => '\u{0008}',
                'f' => '\u{000C}',
                'n' => '\n',
                'r' => '\r',
                't' => '\t',
                'u' => self.unescape_unicode_16()?,
                'U' => self.unescape_unicode_32()?,
                'x' => self.unescape_hex()?,
                c if c.is_digit(8) => self.unescape_octal(c)?,
                c => c,
            };

            unescaped.push(Self::check_null(c)?);
        }

        // Hit EOF before the closing quote.
        None
    }

    /// Rejects NUL: a decoded `\0` aborts the whole unescape with `None`.
    #[inline]
    fn check_null(c: char) -> Option<char> {
        if c == '\0' {
            None
        } else {
            Some(c)
        }
    }

    /// Parses `s` as a number in the given radix, keeps only its low byte,
    /// and accepts ASCII values (<= 127) only.
    #[inline]
    fn byte_to_char<const RADIX: u32>(s: &str) -> Option<char> {
        // u32 is used here because Pg has an overflow operation rather than throwing an exception directly.
        match u32::from_str_radix(s, RADIX) {
            Err(_) => None,
            Ok(n) => {
                let n = n & 0xFF;
                if n <= 127 {
                    char::from_u32(n)
                } else {
                    None
                }
            }
        }
    }

    // Hexadecimal byte value. \xh, \xhh (h = 0–9, A–F)
    fn unescape_hex(&mut self) -> Option<char> {
        let mut s = String::new();

        // At most two hex digits participate in the escape.
        for _ in 0..2 {
            match self.next_hex_digit() {
                Some(c) => s.push(c),
                None => break,
            }
        }

        // `\x` followed by no hex digit is a literal 'x'.
        if s.is_empty() {
            return Some('x');
        }

        Self::byte_to_char::<16>(&s)
    }

    /// Consumes the next char only if it is a hex digit.
    #[inline]
    fn next_hex_digit(&mut self) -> Option<char> {
        match self.chars.peek() {
            Some(c) if c.is_ascii_hexdigit() => self.chars.next(),
            _ => None,
        }
    }

    // Octal byte value. \o, \oo, \ooo (o = 0–7)
    fn unescape_octal(&mut self, c: char) -> Option<char> {
        let mut s = String::new();

        // `c` is the first octal digit, already consumed by the caller.
        s.push(c);
        for _ in 0..2 {
            match self.next_octal_digest() {
                Some(c) => s.push(c),
                None => break,
            }
        }

        Self::byte_to_char::<8>(&s)
    }

    /// Consumes the next char only if it is an octal digit.
    #[inline]
    fn next_octal_digest(&mut self) -> Option<char> {
        match self.chars.peek() {
            Some(c) if c.is_digit(8) => self.chars.next(),
            _ => None,
        }
    }

    // 16-bit hexadecimal Unicode character value. \uxxxx (x = 0–9, A–F)
    fn unescape_unicode_16(&mut self) -> Option<char> {
        self.unescape_unicode::<4>()
    }

    // 32-bit hexadecimal Unicode character value. \Uxxxxxxxx (x = 0–9, A–F)
    fn unescape_unicode_32(&mut self) -> Option<char> {
        self.unescape_unicode::<8>()
    }

    /// Reads exactly `NUM` chars as hex digits and converts them to a char.
    /// `None` on EOF, a non-hex digit, or an invalid code point.
    fn unescape_unicode<const NUM: usize>(&mut self) -> Option<char> {
        let mut s = String::new();
        for _ in 0..NUM {
            s.push(self.chars.next()?);
        }
        match u32::from_str_radix(&s, 16) {
            Err(_) => None,
            Ok(n) => char::from_u32(n),
        }
    }
}
2547
2548fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> {
2549    let mut unescaped = String::new();
2550    chars.next(); // consume the opening quote
2551    while let Some(c) = chars.next() {
2552        match c {
2553            '\'' => {
2554                if chars.peek() == Some(&'\'') {
2555                    chars.next();
2556                    unescaped.push('\'');
2557                } else {
2558                    return Ok(unescaped);
2559                }
2560            }
2561            '\\' => match chars.peek() {
2562                Some('\\') => {
2563                    chars.next();
2564                    unescaped.push('\\');
2565                }
2566                Some('+') => {
2567                    chars.next();
2568                    unescaped.push(take_char_from_hex_digits(chars, 6)?);
2569                }
2570                _ => unescaped.push(take_char_from_hex_digits(chars, 4)?),
2571            },
2572            _ => {
2573                unescaped.push(c);
2574            }
2575        }
2576    }
2577    Err(TokenizerError {
2578        message: "Unterminated unicode encoded string literal".to_string(),
2579        location: chars.location(),
2580    })
2581}
2582
2583fn take_char_from_hex_digits(
2584    chars: &mut State<'_>,
2585    max_digits: usize,
2586) -> Result<char, TokenizerError> {
2587    let mut result = 0u32;
2588    for _ in 0..max_digits {
2589        let next_char = chars.next().ok_or_else(|| TokenizerError {
2590            message: "Unexpected EOF while parsing hex digit in escaped unicode string."
2591                .to_string(),
2592            location: chars.location(),
2593        })?;
2594        let digit = next_char.to_digit(16).ok_or_else(|| TokenizerError {
2595            message: format!("Invalid hex digit in escaped unicode string: {next_char}"),
2596            location: chars.location(),
2597        })?;
2598        result = result * 16 + digit;
2599    }
2600    char::from_u32(result).ok_or_else(|| TokenizerError {
2601        message: format!("Invalid unicode character: {result:x}"),
2602        location: chars.location(),
2603    })
2604}
2605
2606#[cfg(test)]
2607mod tests {
2608    use super::*;
2609    use crate::dialect::{
2610        BigQueryDialect, ClickHouseDialect, HiveDialect, MsSqlDialect, MySqlDialect, SQLiteDialect,
2611    };
2612    use crate::test_utils::{all_dialects_except, all_dialects_where};
2613    use core::fmt::Debug;
2614
2615    #[test]
2616    fn tokenizer_error_impl() {
2617        let err = TokenizerError {
2618            message: "test".into(),
2619            location: Location { line: 1, column: 1 },
2620        };
2621        {
2622            use core::error::Error;
2623            assert!(err.source().is_none());
2624        }
2625        assert_eq!(err.to_string(), "test at Line: 1, Column: 1");
2626    }
2627
2628    #[test]
2629    fn tokenize_select_1() {
2630        let sql = String::from("SELECT 1");
2631        let dialect = GenericDialect {};
2632        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2633
2634        let expected = vec![
2635            Token::make_keyword("SELECT"),
2636            Token::Whitespace(Whitespace::Space),
2637            Token::Number(String::from("1"), false),
2638        ];
2639
2640        compare(expected, tokens);
2641    }
2642
2643    #[test]
2644    fn tokenize_select_float() {
2645        let sql = String::from("SELECT .1");
2646        let dialect = GenericDialect {};
2647        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2648
2649        let expected = vec![
2650            Token::make_keyword("SELECT"),
2651            Token::Whitespace(Whitespace::Space),
2652            Token::Number(String::from(".1"), false),
2653        ];
2654
2655        compare(expected, tokens);
2656    }
2657
2658    #[test]
2659    fn tokenize_with_mapper() {
2660        let sql = String::from("SELECT ?");
2661        let dialect = GenericDialect {};
2662        let mut param_num = 1;
2663
2664        let mut tokens = vec![];
2665        Tokenizer::new(&dialect, &sql)
2666            .tokenize_with_location_into_buf_with_mapper(&mut tokens, |mut token_span| {
2667                token_span.token = match token_span.token {
2668                    Token::Placeholder(n) => Token::Placeholder(if n == "?" {
2669                        let ret = format!("${}", param_num);
2670                        param_num += 1;
2671                        ret
2672                    } else {
2673                        n
2674                    }),
2675                    token => token,
2676                };
2677                token_span
2678            })
2679            .unwrap();
2680        let actual = tokens.into_iter().map(|t| t.token).collect();
2681        let expected = vec![
2682            Token::make_keyword("SELECT"),
2683            Token::Whitespace(Whitespace::Space),
2684            Token::Placeholder("$1".to_string()),
2685        ];
2686
2687        compare(expected, actual);
2688    }
2689
2690    #[test]
2691    fn tokenize_clickhouse_double_equal() {
2692        let sql = String::from("SELECT foo=='1'");
2693        let dialect = ClickHouseDialect {};
2694        let mut tokenizer = Tokenizer::new(&dialect, &sql);
2695        let tokens = tokenizer.tokenize().unwrap();
2696
2697        let expected = vec![
2698            Token::make_keyword("SELECT"),
2699            Token::Whitespace(Whitespace::Space),
2700            Token::Word(Word {
2701                value: "foo".to_string(),
2702                quote_style: None,
2703                keyword: Keyword::NoKeyword,
2704            }),
2705            Token::DoubleEq,
2706            Token::SingleQuotedString("1".to_string()),
2707        ];
2708
2709        compare(expected, tokens);
2710    }
2711
2712    #[test]
2713    fn tokenize_numeric_literal_underscore() {
2714        let dialect = GenericDialect {};
2715        let sql = String::from("SELECT 10_000");
2716        let mut tokenizer = Tokenizer::new(&dialect, &sql);
2717        let tokens = tokenizer.tokenize().unwrap();
2718        let expected = vec![
2719            Token::make_keyword("SELECT"),
2720            Token::Whitespace(Whitespace::Space),
2721            Token::Number("10".to_string(), false),
2722            Token::make_word("_000", None),
2723        ];
2724        compare(expected, tokens);
2725
2726        all_dialects_where(|dialect| dialect.supports_numeric_literal_underscores()).tokenizes_to(
2727            "SELECT 10_000, _10_000, 10_00_, 10___0",
2728            vec![
2729                Token::make_keyword("SELECT"),
2730                Token::Whitespace(Whitespace::Space),
2731                Token::Number("10_000".to_string(), false),
2732                Token::Comma,
2733                Token::Whitespace(Whitespace::Space),
2734                Token::make_word("_10_000", None), // leading underscore tokenizes as a word (parsed as column identifier)
2735                Token::Comma,
2736                Token::Whitespace(Whitespace::Space),
2737                Token::Number("10_00".to_string(), false),
2738                Token::make_word("_", None), // trailing underscores tokenizes as a word (syntax error in some dialects)
2739                Token::Comma,
2740                Token::Whitespace(Whitespace::Space),
2741                Token::Number("10".to_string(), false),
2742                Token::make_word("___0", None), // multiple underscores tokenizes as a word (syntax error in some dialects)
2743            ],
2744        );
2745    }
2746
2747    #[test]
2748    fn tokenize_select_exponent() {
2749        let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
2750        let dialect = GenericDialect {};
2751        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2752
2753        let expected = vec![
2754            Token::make_keyword("SELECT"),
2755            Token::Whitespace(Whitespace::Space),
2756            Token::Number(String::from("1e10"), false),
2757            Token::Comma,
2758            Token::Whitespace(Whitespace::Space),
2759            Token::Number(String::from("1e-10"), false),
2760            Token::Comma,
2761            Token::Whitespace(Whitespace::Space),
2762            Token::Number(String::from("1e+10"), false),
2763            Token::Comma,
2764            Token::Whitespace(Whitespace::Space),
2765            Token::Number(String::from("1"), false),
2766            Token::make_word("ea", None),
2767            Token::Comma,
2768            Token::Whitespace(Whitespace::Space),
2769            Token::Number(String::from("1e-10"), false),
2770            Token::make_word("a", None),
2771            Token::Comma,
2772            Token::Whitespace(Whitespace::Space),
2773            Token::Number(String::from("1e-10"), false),
2774            Token::Minus,
2775            Token::Number(String::from("10"), false),
2776        ];
2777
2778        compare(expected, tokens);
2779    }
2780
2781    #[test]
2782    fn tokenize_scalar_function() {
2783        let sql = String::from("SELECT sqrt(1)");
2784        let dialect = GenericDialect {};
2785        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2786
2787        let expected = vec![
2788            Token::make_keyword("SELECT"),
2789            Token::Whitespace(Whitespace::Space),
2790            Token::make_word("sqrt", None),
2791            Token::LParen,
2792            Token::Number(String::from("1"), false),
2793            Token::RParen,
2794        ];
2795
2796        compare(expected, tokens);
2797    }
2798
2799    #[test]
2800    fn tokenize_string_string_concat() {
2801        let sql = String::from("SELECT 'a' || 'b'");
2802        let dialect = GenericDialect {};
2803        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2804
2805        let expected = vec![
2806            Token::make_keyword("SELECT"),
2807            Token::Whitespace(Whitespace::Space),
2808            Token::SingleQuotedString(String::from("a")),
2809            Token::Whitespace(Whitespace::Space),
2810            Token::StringConcat,
2811            Token::Whitespace(Whitespace::Space),
2812            Token::SingleQuotedString(String::from("b")),
2813        ];
2814
2815        compare(expected, tokens);
2816    }
2817    #[test]
2818    fn tokenize_bitwise_op() {
2819        let sql = String::from("SELECT one | two ^ three");
2820        let dialect = GenericDialect {};
2821        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2822
2823        let expected = vec![
2824            Token::make_keyword("SELECT"),
2825            Token::Whitespace(Whitespace::Space),
2826            Token::make_word("one", None),
2827            Token::Whitespace(Whitespace::Space),
2828            Token::Pipe,
2829            Token::Whitespace(Whitespace::Space),
2830            Token::make_word("two", None),
2831            Token::Whitespace(Whitespace::Space),
2832            Token::Caret,
2833            Token::Whitespace(Whitespace::Space),
2834            Token::make_word("three", None),
2835        ];
2836        compare(expected, tokens);
2837    }
2838
    // `XOR` must lex as a keyword in every operand position of a SELECT list,
    // for all four true/false operand combinations.
    #[test]
    fn tokenize_logical_xor() {
        let sql =
            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
        ];
        compare(expected, tokens);
    }
2878
    // Smoke test: a basic SELECT/WHERE/LIMIT statement produces the expected
    // keyword, word, operator, number, and whitespace tokens in order.
    #[test]
    fn tokenize_simple_select() {
        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            // The bool flag on Token::Number marks a trailing long suffix ("L");
            // plain integer literals carry `false`.
            Token::Number(String::from("1"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("LIMIT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
        ];

        compare(expected, tokens);
    }
2909
    // An EXPLAIN prefix lexes as an ordinary leading keyword before the SELECT.
    #[test]
    fn tokenize_explain_select() {
        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }
2938
    // EXPLAIN ANALYZE lexes as two separate leading keywords before the SELECT.
    #[test]
    fn tokenize_explain_analyze_select() {
        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("ANALYZE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }
2969
    // `!=` lexes as a single Neq token, and a single-quoted literal keeps its
    // interior spaces intact.
    #[test]
    fn tokenize_string_predicate() {
        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("salary", None),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("Not Provided")),
        ];

        compare(expected, tokens);
    }
2996
    // Non-SQL Unicode input must not panic: an emoji becomes a Char token and
    // the Arabic letters (plus trailing 'h') are treated as one word.
    #[test]
    fn tokenize_invalid_string() {
        let sql = String::from("\n💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        // println!("tokens: {:#?}", tokens);
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }
3011
3012    #[test]
3013    fn tokenize_newline_in_string_literal() {
3014        let sql = String::from("'foo\r\nbar\nbaz'");
3015
3016        let dialect = GenericDialect {};
3017        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3018        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
3019        compare(expected, tokens);
3020    }
3021
    // An unclosed single-quoted literal is a tokenizer error reported at the
    // position of the opening quote (column 8 here).
    #[test]
    fn tokenize_unterminated_string_literal() {
        let sql = String::from("select 'foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location { line: 1, column: 8 },
            })
        );
    }
3036
    // Error locations are counted in characters, not bytes: with multi-byte
    // identifiers ahead of it, the opening `'` sits at character column 35.
    #[test]
    fn tokenize_unterminated_string_literal_utf8() {
        let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location {
                    line: 1,
                    column: 35
                }
            })
        );
    }
3054
    // Unexpected Unicode after valid SQL tokens must not panic; the tokens
    // before the junk are lexed normally, then Char + word for the rest.
    #[test]
    fn tokenize_invalid_string_cols() {
        let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        // println!("tokens: {:#?}", tokens);
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::Newline),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            // `table` is a keyword, so it lexes as a keyword even in this position.
            Token::make_keyword("table"),
            Token::Whitespace(Whitespace::Tab),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }
3078
    // Tagged dollar quoting ($tag$...$tag$): the body may contain bare `$`,
    // `$word$`-lookalikes, and `$$`; only the exact matching `$tag$` closes it.
    // Also covers: mismatched inner tags, empty body, adjacency to numbers,
    // and inner tags that themselves look like nested dollar quotes.
    #[test]
    fn tokenize_dollar_quoted_string_tagged() {
        let test_cases = vec![
            (
                String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$tag$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "dollar '$' quoted strings have $tags like this$ or like this $$".into(),
                        tag: Some("tag".into()),
                    })
                ]
            ),
            (
                // `$ab$` does not close `$abc$`; it stays part of the body.
                String::from("SELECT $abc$x$ab$abc$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "x$ab".into(),
                        tag: Some("abc".into()),
                    })
                ]
            ),
            (
                // Empty tagged body.
                String::from("SELECT $abc$$abc$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "".into(),
                        tag: Some("abc".into()),
                    })
                ]
            ),
            (
                // A dollar-quoted string directly between two number literals.
                String::from("0$abc$$abc$1"),
                vec![
                    Token::Number("0".into(), false),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "".into(),
                        tag: Some("abc".into()),
                    }),
                    Token::Number("1".into(), false),
                ]
            ),
            (
                // Inner `$q$...$q$` pair stays literal inside the outer tag.
                String::from("$function$abc$q$data$q$$function$"),
                vec![
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "abc$q$data$q$".into(),
                        tag: Some("function".into()),
                    }),
                ]
            ),
        ];

        let dialect = GenericDialect {};
        for (sql, expected) in test_cases {
            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
            compare(expected, tokens);
        }
    }
3143
    // A tagged dollar quote closed with a DIFFERENT tag never terminates; the
    // error is reported at end of input (character column 91).
    #[test]
    fn tokenize_dollar_quoted_string_tagged_unterminated() {
        let sql = String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$different tag$");
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted, expected $".into(),
                location: Location {
                    line: 1,
                    column: 91
                }
            })
        );
    }
3159
    // `$abc$abc$` is NOT self-closing: the trailing `$` begins what could be a
    // closing `$abc$` but input ends first, so it is an unterminated quote.
    #[test]
    fn tokenize_dollar_quoted_string_tagged_unterminated_mirror() {
        let sql = String::from("SELECT $abc$abc$");
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted, expected $".into(),
                location: Location {
                    line: 1,
                    column: 17
                }
            })
        );
    }
3175
    // In SQLite, `$`-prefixed forms are placeholders rather than dollar-quoted
    // strings — including `$$`, `$$ABC$$`, `$ABC$`, and a bare `$ABC`.
    #[test]
    fn tokenize_dollar_placeholder() {
        let sql = String::from("SELECT $$, $$ABC$$, $ABC$, $ABC");
        let dialect = SQLiteDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        assert_eq!(
            tokens,
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$$".into()),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$$ABC$$".into()),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$ABC$".into()),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$ABC".into()),
            ]
        );
    }
3199
    // A `$nested$` sequence inside a `$tag$...$tag$` quote is plain body text;
    // it does not open a nested quote or close the outer one.
    #[test]
    fn tokenize_nested_dollar_quoted_strings() {
        let sql = String::from("SELECT $tag$dollar $nested$ string$tag$");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "dollar $nested$ string".into(),
                tag: Some("tag".into()),
            }),
        ];
        compare(expected, tokens);
    }
3215
3216    #[test]
3217    fn tokenize_dollar_quoted_string_untagged_empty() {
3218        let sql = String::from("SELECT $$$$");
3219        let dialect = GenericDialect {};
3220        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3221        let expected = vec![
3222            Token::make_keyword("SELECT"),
3223            Token::Whitespace(Whitespace::Space),
3224            Token::DollarQuotedString(DollarQuotedString {
3225                value: "".into(),
3226                tag: None,
3227            }),
3228        ];
3229        compare(expected, tokens);
3230    }
3231
    // Untagged dollar quoting ($$...$$): the body may contain bare `$` and
    // `$word$` sequences; only `$$` terminates it.
    #[test]
    fn tokenize_dollar_quoted_string_untagged() {
        let sql =
            String::from("SELECT $$within dollar '$' quoted strings have $tags like this$ $$");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "within dollar '$' quoted strings have $tags like this$ ".into(),
                tag: None,
            }),
        ];
        compare(expected, tokens);
    }
3248
    // An untagged `$$` quote with no closing `$$` errors at end of input
    // (character column 86).
    #[test]
    fn tokenize_dollar_quoted_string_untagged_unterminated() {
        let sql = String::from(
            "SELECT $$dollar '$' quoted strings have $tags like this$ or like this $different tag$",
        );
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted string".into(),
                location: Location {
                    line: 1,
                    column: 86
                }
            })
        );
    }
3266
    // `=>` (named-argument syntax) lexes as a single RArrow token, not Eq + Gt.
    #[test]
    fn tokenize_right_arrow() {
        let sql = String::from("FUNCTION(key=>value)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("FUNCTION", None),
            Token::LParen,
            Token::make_word("key", None),
            Token::RArrow,
            Token::make_word("value", None),
            Token::RParen,
        ];
        compare(expected, tokens);
    }
3282
    // `IS NULL` lexes as two separate keyword tokens, not one combined token.
    #[test]
    fn tokenize_is_null() {
        let sql = String::from("a IS NULL");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_word("a", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("IS"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("NULL"),
        ];

        compare(expected, tokens);
    }
3299
    // `--` single-line comment termination in GenericDialect:
    //   - `\n` ends the comment (and is included in the comment text);
    //   - a bare `\r` does NOT end it — the comment runs to end of input;
    //   - `\r\n` ends it (both characters included in the comment text).
    #[test]
    fn tokenize_comment() {
        let test_cases = vec![
            (
                String::from("0--this is a comment\n1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\n".to_string(),
                    }),
                    Token::Number("1".to_string(), false),
                ],
            ),
            (
                // Bare `\r`: the comment swallows the `\r` AND the trailing `1`.
                String::from("0--this is a comment\r1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\r1".to_string(),
                    }),
                ],
            ),
            (
                String::from("0--this is a comment\r\n1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\r\n".to_string(),
                    }),
                    Token::Number("1".to_string(), false),
                ],
            ),
        ];

        let dialect = GenericDialect {};

        for (sql, expected) in test_cases {
            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
            compare(expected, tokens);
        }
    }
3344
    // In PostgreSqlDialect a bare `\r` DOES terminate a `--` comment: the
    // comment text is just "\r" and the following `0` is lexed separately.
    #[test]
    fn tokenize_comment_postgres() {
        let sql = String::from("1--\r0");

        let dialect = PostgreSqlDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("1".to_string(), false),
            Token::Whitespace(Whitespace::SingleLineComment {
                prefix: "--".to_string(),
                comment: "\r".to_string(),
            }),
            Token::Number("0".to_string(), false),
        ];
        compare(expected, tokens);
    }
3361
3362    #[test]
3363    fn tokenize_comment_at_eof() {
3364        let sql = String::from("--this is a comment");
3365
3366        let dialect = GenericDialect {};
3367        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3368        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
3369            prefix: "--".to_string(),
3370            comment: "this is a comment".to_string(),
3371        })];
3372        compare(expected, tokens);
3373    }
3374
    // `/* ... */` block comments keep interior newlines and a non-adjacent
    // `* /` pair without closing early.
    #[test]
    fn tokenize_multiline_comment() {
        let sql = String::from("0/*multi-line\n* /comment*/1");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* /comment".to_string(),
            )),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }
3390
    // Dialects with nested-comment support balance `/*`/`*/` pairs: the outer
    // comment ends only when the nesting depth returns to zero, and any text
    // after that point is lexed as ordinary tokens.
    #[test]
    fn tokenize_nested_multiline_comment() {
        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
            "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
            vec![
                Token::Number("0".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(
                    "multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
                )),
                // After the balanced close, ` /comment*/1` is plain tokens.
                Token::Whitespace(Whitespace::Space),
                Token::Div,
                Token::Word(Word {
                    value: "comment".to_string(),
                    quote_style: None,
                    keyword: Keyword::COMMENT,
                }),
                Token::Mul,
                Token::Div,
                Token::Number("1".to_string(), false),
            ],
        );

        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
            "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
            vec![
                Token::Number("0".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(
                    "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
                )),
                Token::Number("1".to_string(), false),
            ],
        );

        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
            "SELECT 1/* a /* b */ c */0",
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("1".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
                Token::Number("0".to_string(), false),
            ],
        );
    }
3435
    // An empty nested comment `/*/**/*/` is one MultiLineComment whose text is
    // the inner `/**/`.
    #[test]
    fn tokenize_nested_multiline_comment_empty() {
        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
            "select 1/*/**/*/0",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("1".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
                Token::Number("0".to_string(), false),
            ],
        );
    }
3449
    // Without nested-comment support, the FIRST `*/` closes the comment, so
    // the remaining `*/` lexes as separate Mul and Div tokens.
    #[test]
    fn tokenize_nested_comments_if_not_supported() {
        all_dialects_except(|d| d.supports_nested_comments()).tokenizes_to(
            "SELECT 1/*/* nested comment */*/0",
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("1".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(
                    "/* nested comment ".to_string(),
                )),
                Token::Mul,
                Token::Div,
                Token::Number("0".to_string(), false),
            ],
        );
    }
3467
    // `/** Comment **/` keeps the extra asterisks in the comment body; only
    // the final `*/` pair acts as the terminator.
    #[test]
    fn tokenize_multiline_comment_with_even_asterisks() {
        let sql = String::from("\n/** Comment **/\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }
3481
    // Unicode whitespace (U+2003 EM SPACE) is normalized to an ordinary
    // Whitespace::Space token.
    #[test]
    fn tokenize_unicode_whitespace() {
        let sql = String::from(" \u{2003}\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }
3495
3496    #[test]
3497    fn tokenize_mismatched_quotes() {
3498        let sql = String::from("\"foo");
3499
3500        let dialect = GenericDialect {};
3501        let mut tokenizer = Tokenizer::new(&dialect, &sql);
3502        assert_eq!(
3503            tokenizer.tokenize(),
3504            Err(TokenizerError {
3505                message: "Expected close delimiter '\"' before EOF.".to_string(),
3506                location: Location { line: 1, column: 1 },
3507            })
3508        );
3509    }
3510
    // All three line-ending conventions — `\n`, bare `\r`, and `\r\n` — each
    // produce exactly one Whitespace::Newline token.
    #[test]
    fn tokenize_newlines() {
        let sql = String::from("line1\nline2\rline3\r\nline4\r");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("line1", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line2", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line3", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line4", None),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }
3529
    // MS SQL: `[bar]` is a bracket-quoted identifier (quote style '['), and
    // TOP lexes as a keyword.
    #[test]
    fn tokenize_mssql_top() {
        let sql = "SELECT TOP 5 [bar] FROM foo";
        let dialect = MsSqlDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("TOP"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("bar", Some('[')),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("foo", None),
        ];
        compare(expected, tokens);
    }
3550
    // Postgres regex-match operators each lex as one dedicated token:
    // `~`, `~*`, `!~`, and `!~*`.
    #[test]
    fn tokenize_pg_regex_match() {
        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Tilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::TildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
        ];
        compare(expected, tokens);
    }
3588
    // Postgres LIKE-operator spellings each lex as one dedicated token:
    // `~~`, `~~*`, `!~~`, and `!~~*`.
    #[test]
    fn tokenize_pg_like_match() {
        let sql = "SELECT col ~~ '_a%', col ~~* '_a%', col !~~ '_a%', col !~~* '_a%'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkDoubleTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkDoubleTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
        ];
        compare(expected, tokens);
    }
3626
    // With default unescaping, a doubled `""` inside a quoted identifier is
    // collapsed to a single `"` in the word's value.
    #[test]
    fn tokenize_quoted_identifier() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a " b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a ""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }
3643
3644    #[test]
3645    fn tokenize_snowflake_div() {
3646        let sql = r#"field/1000"#;
3647        let dialect = SnowflakeDialect {};
3648        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3649        let expected = vec![
3650            Token::make_word(r#"field"#, None),
3651            Token::Div,
3652            Token::Number("1000".to_string(), false),
3653        ];
3654        compare(expected, tokens);
3655    }
3656
    // With `with_unescape(false)`, doubled `""` sequences inside quoted
    // identifiers are kept verbatim instead of being collapsed.
    #[test]
    fn tokenize_quoted_identifier_with_no_escape() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(false)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a "" b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }
3676
    /// Token spans are 1-based `(line, column)` pairs with an exclusive end:
    /// `SELECT` occupies (1,1)..(1,7), and the newline token ends at column 1
    /// of the following line.
    #[test]
    fn tokenize_with_location() {
        let sql = "SELECT a,\n b";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .tokenize_with_location()
            .unwrap();
        let expected = vec![
            TokenWithSpan::at(Token::make_keyword("SELECT"), (1, 1).into(), (1, 7).into()),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Space),
                (1, 7).into(),
                (1, 8).into(),
            ),
            TokenWithSpan::at(Token::make_word("a", None), (1, 8).into(), (1, 9).into()),
            TokenWithSpan::at(Token::Comma, (1, 9).into(), (1, 10).into()),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Newline),
                (1, 10).into(),
                (2, 1).into(),
            ),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Space),
                (2, 1).into(),
                (2, 2).into(),
            ),
            TokenWithSpan::at(Token::make_word("b", None), (2, 2).into(), (2, 3).into()),
        ];
        compare(expected, tokens);
    }
3707
3708    fn compare<T: PartialEq + fmt::Debug>(expected: Vec<T>, actual: Vec<T>) {
3709        //println!("------------------------------");
3710        //println!("tokens   = {:?}", actual);
3711        //println!("expected = {:?}", expected);
3712        //println!("------------------------------");
3713        assert_eq!(expected, actual);
3714    }
3715
3716    fn check_unescape(s: &str, expected: Option<&str>) {
3717        let s = format!("'{s}'");
3718        let mut state = State {
3719            peekable: s.chars().peekable(),
3720            line: 0,
3721            col: 0,
3722        };
3723
3724        assert_eq!(
3725            unescape_single_quoted_string(&mut state),
3726            expected.map(|s| s.to_string())
3727        );
3728    }
3729
    /// Exercises `unescape_single_quoted_string` across all supported escape
    /// forms (C-style, Unicode `\u`/`\U`, hex `\x`, octal); `None` means the
    /// input is rejected as invalid.
    #[test]
    fn test_unescape() {
        check_unescape(r"\b", Some("\u{0008}"));
        check_unescape(r"\f", Some("\u{000C}"));
        check_unescape(r"\t", Some("\t"));
        check_unescape(r"\r\n", Some("\r\n"));
        check_unescape(r"\/", Some("/"));
        check_unescape(r"/", Some("/"));
        check_unescape(r"\\", Some("\\"));

        // 16 and 32-bit hexadecimal Unicode character value
        check_unescape(r"\u0001", Some("\u{0001}"));
        check_unescape(r"\u4c91", Some("\u{4c91}"));
        check_unescape(r"\u4c916", Some("\u{4c91}6"));
        check_unescape(r"\u4c", None);
        check_unescape(r"\u0000", None);
        check_unescape(r"\U0010FFFF", Some("\u{10FFFF}"));
        check_unescape(r"\U00110000", None);
        check_unescape(r"\U00000000", None);
        check_unescape(r"\u", None);
        check_unescape(r"\U", None);
        check_unescape(r"\U1010FFFF", None);

        // hexadecimal byte value
        check_unescape(r"\x4B", Some("\u{004b}"));
        check_unescape(r"\x4", Some("\u{0004}"));
        check_unescape(r"\x4L", Some("\u{0004}L"));
        check_unescape(r"\x", Some("x"));
        check_unescape(r"\xP", Some("xP"));
        check_unescape(r"\x0", None);
        check_unescape(r"\xCAD", None);
        check_unescape(r"\xA9", None);

        // octal byte value
        check_unescape(r"\1", Some("\u{0001}"));
        check_unescape(r"\12", Some("\u{000a}"));
        check_unescape(r"\123", Some("\u{0053}"));
        check_unescape(r"\1232", Some("\u{0053}2"));
        check_unescape(r"\4", Some("\u{0004}"));
        check_unescape(r"\45", Some("\u{0025}"));
        check_unescape(r"\450", Some("\u{0028}"));
        check_unescape(r"\603", None);
        check_unescape(r"\0", None);
        check_unescape(r"\080", None);

        // others
        check_unescape(r"\9", Some("9"));
        check_unescape(r"''", Some("'"));
        check_unescape(
            r"Hello\r\nRust/\u4c91 SQL Parser\U0010ABCD\1232",
            Some("Hello\r\nRust/\u{4c91} SQL Parser\u{10abcd}\u{0053}2"),
        );
        check_unescape(r"Hello\0", None);
        check_unescape(r"Hello\xCADRust", None);
    }
3785
3786    #[test]
3787    fn tokenize_numeric_prefix_trait() {
3788        #[derive(Debug)]
3789        struct NumericPrefixDialect;
3790
3791        impl Dialect for NumericPrefixDialect {
3792            fn is_identifier_start(&self, ch: char) -> bool {
3793                ch.is_ascii_lowercase()
3794                    || ch.is_ascii_uppercase()
3795                    || ch.is_ascii_digit()
3796                    || ch == '$'
3797            }
3798
3799            fn is_identifier_part(&self, ch: char) -> bool {
3800                ch.is_ascii_lowercase()
3801                    || ch.is_ascii_uppercase()
3802                    || ch.is_ascii_digit()
3803                    || ch == '_'
3804                    || ch == '$'
3805                    || ch == '{'
3806                    || ch == '}'
3807            }
3808
3809            fn supports_numeric_prefix(&self) -> bool {
3810                true
3811            }
3812        }
3813
3814        tokenize_numeric_prefix_inner(&NumericPrefixDialect {});
3815        tokenize_numeric_prefix_inner(&HiveDialect {});
3816        tokenize_numeric_prefix_inner(&MySqlDialect {});
3817    }
3818
3819    fn tokenize_numeric_prefix_inner(dialect: &dyn Dialect) {
3820        let sql = r#"SELECT * FROM 1"#;
3821        let tokens = Tokenizer::new(dialect, sql).tokenize().unwrap();
3822        let expected = vec![
3823            Token::make_keyword("SELECT"),
3824            Token::Whitespace(Whitespace::Space),
3825            Token::Mul,
3826            Token::Whitespace(Whitespace::Space),
3827            Token::make_keyword("FROM"),
3828            Token::Whitespace(Whitespace::Space),
3829            Token::Number(String::from("1"), false),
3830        ];
3831        compare(expected, tokens);
3832    }
3833
    /// Snowflake-style backslash escapes in single-quoted strings:
    /// `with_unescape(false)` preserves the raw escape text, while
    /// `with_unescape(true)` resolves it. Also covers unterminated literals,
    /// a non-escaping dialect (backslash kept literally), and MySQL's
    /// `\%` / `\_` LIKE-pattern escapes, which are kept verbatim.
    #[test]
    fn tokenize_quoted_string_escape() {
        let dialect = SnowflakeDialect {};
        for (sql, expected, expected_unescaped) in [
            (r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
            (r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
            (r#"'\\'"#, r#"\\"#, r#"\"#),
            (
                r#"'\0\a\b\f\n\r\t\Z'"#,
                r#"\0\a\b\f\n\r\t\Z"#,
                "\0\u{7}\u{8}\u{c}\n\r\t\u{1a}",
            ),
            (r#"'\"'"#, r#"\""#, "\""),
            (r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
            (r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
            (r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
            (r#"'\q'"#, r#"\q"#, r#"q"#),
            (r#"'\%\_'"#, r#"\%\_"#, r#"%_"#),
            (r#"'\\%\\_'"#, r#"\\%\\_"#, r#"\%\_"#),
        ] {
            let tokens = Tokenizer::new(&dialect, sql)
                .with_unescape(false)
                .tokenize()
                .unwrap();
            let expected = vec![Token::SingleQuotedString(expected.to_string())];
            compare(expected, tokens);

            let tokens = Tokenizer::new(&dialect, sql)
                .with_unescape(true)
                .tokenize()
                .unwrap();
            let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())];
            compare(expected, tokens);
        }

        // A trailing backslash escapes the closing quote, so the literal never terminates.
        for sql in [r#"'\'"#, r#"'ab\'"#] {
            let mut tokenizer = Tokenizer::new(&dialect, sql);
            assert_eq!(
                "Unterminated string literal",
                tokenizer.tokenize().unwrap_err().message.as_str(),
            );
        }

        // Non-escape dialect
        for (sql, expected) in [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)] {
            let dialect = GenericDialect {};
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();

            let expected = vec![Token::SingleQuotedString(expected.to_string())];

            compare(expected, tokens);
        }

        // MySQL special case for LIKE escapes
        for (sql, expected) in [(r#"'\%'"#, r#"\%"#), (r#"'\_'"#, r#"\_"#)] {
            let dialect = MySqlDialect {};
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();

            let expected = vec![Token::SingleQuotedString(expected.to_string())];

            compare(expected, tokens);
        }
    }
3897
    /// BigQuery triple-quoted strings: the inner `check` runs a battery of
    /// cases for a given quote char `q` (with `r` as the other quote char),
    /// covering escaped quotes, mixed quotes, and unterminated-literal
    /// errors. Afterwards, adjacent empty strings and a dialect without
    /// triple-quoting support are verified.
    #[test]
    fn tokenize_triple_quoted_string() {
        fn check<F>(
            q: char, // The quote character to test
            r: char, // An alternate quote character.
            quote_token: F,
        ) where
            F: Fn(String) -> Token,
        {
            let dialect = BigQueryDialect {};

            for (sql, expected, expected_unescaped) in [
                // Empty string
                (format!(r#"{q}{q}{q}{q}{q}{q}"#), "".into(), "".into()),
                // Should not count escaped quote as end of string.
                (
                    format!(r#"{q}{q}{q}ab{q}{q}\{q}{q}cd{q}{q}{q}"#),
                    format!(r#"ab{q}{q}\{q}{q}cd"#),
                    format!(r#"ab{q}{q}{q}{q}cd"#),
                ),
                // Simple string
                (
                    format!(r#"{q}{q}{q}abc{q}{q}{q}"#),
                    "abc".into(),
                    "abc".into(),
                ),
                // Mix single-double quotes unescaped.
                (
                    format!(r#"{q}{q}{q}ab{r}{r}{r}c{r}def{r}{r}{r}{q}{q}{q}"#),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                ),
                // Escaped quote.
                (
                    format!(r#"{q}{q}{q}ab{q}{q}c{q}{q}\{q}de{q}{q}f{q}{q}{q}"#),
                    format!(r#"ab{q}{q}c{q}{q}\{q}de{q}{q}f"#),
                    format!(r#"ab{q}{q}c{q}{q}{q}de{q}{q}f"#),
                ),
                // backslash-escaped quote characters.
                (
                    format!(r#"{q}{q}{q}a\'\'b\'c\'d{q}{q}{q}"#),
                    r#"a\'\'b\'c\'d"#.into(),
                    r#"a''b'c'd"#.into(),
                ),
                // backslash-escaped characters
                (
                    format!(r#"{q}{q}{q}abc\0\n\rdef{q}{q}{q}"#),
                    r#"abc\0\n\rdef"#.into(),
                    "abc\0\n\rdef".into(),
                ),
            ] {
                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(false)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected.to_string())];
                compare(expected, tokens);

                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(true)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected_unescaped.to_string())];
                compare(expected, tokens);
            }

            // Each of these is missing a closing triple-quote delimiter.
            for sql in [
                format!(r#"{q}{q}{q}{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}{q}"#),
                format!(r#"{q}{q}{q}{r}{r}"#),
                format!(r#"{q}{q}{q}abc{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}"#),
                format!(r#"{q}{q}{q}abc"#),
            ] {
                let dialect = BigQueryDialect {};
                let mut tokenizer = Tokenizer::new(&dialect, sql.as_str());
                assert_eq!(
                    "Unterminated string literal",
                    tokenizer.tokenize().unwrap_err().message.as_str(),
                );
            }
        }

        check('"', '\'', Token::TripleDoubleQuotedString);

        check('\'', '"', Token::TripleSingleQuotedString);

        let dialect = BigQueryDialect {};

        // Two adjacent empty strings, not one triple-quoted string.
        let sql = r#"""''"#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::DoubleQuotedString("".to_string()),
            Token::SingleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        let sql = r#"''"""#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::SingleQuotedString("".to_string()),
            Token::DoubleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        // Non-triple quoted string dialect
        let dialect = SnowflakeDialect {};
        let sql = r#"''''''"#;
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("''".to_string())];
        compare(expected, tokens);
    }
4017
4018    #[test]
4019    fn test_mysql_users_grantees() {
4020        let dialect = MySqlDialect {};
4021
4022        let sql = "CREATE USER `root`@`%`";
4023        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
4024        let expected = vec![
4025            Token::make_keyword("CREATE"),
4026            Token::Whitespace(Whitespace::Space),
4027            Token::make_keyword("USER"),
4028            Token::Whitespace(Whitespace::Space),
4029            Token::make_word("root", Some('`')),
4030            Token::AtSign,
4031            Token::make_word("%", Some('`')),
4032        ];
4033        compare(expected, tokens);
4034    }
4035
    /// `@` immediately followed by a string literal tokenizes as `AtSign`
    /// plus the literal, with no whitespace in between.
    /// NOTE(review): the test name says "postgres" but the body uses
    /// `MySqlDialect` — confirm which dialect was intended.
    #[test]
    fn test_postgres_abs_without_space_and_string_literal() {
        let dialect = MySqlDialect {};

        let sql = "SELECT @'1'";
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::AtSign,
            Token::SingleQuotedString("1".to_string()),
        ];
        compare(expected, tokens);
    }
4050
    /// `@` immediately followed by a double-quoted string tokenizes as
    /// `AtSign` plus the quoted string, with no whitespace in between.
    /// NOTE(review): the test name says "postgres" but the body uses
    /// `MySqlDialect` — confirm which dialect was intended.
    #[test]
    fn test_postgres_abs_without_space_and_quoted_column() {
        let dialect = MySqlDialect {};

        let sql = r#"SELECT @"bar" FROM foo"#;
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::AtSign,
            Token::DoubleQuotedString("bar".to_string()),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("foo", None),
        ];
        compare(expected, tokens);
    }
4069
    /// In dialects without backslash escaping, `\` inside `n'...'` is a
    /// literal character and only doubled `''` collapses to `'`:
    /// `n'''''\'` unescapes to `''\`.
    #[test]
    fn test_national_strings_backslash_escape_not_supported() {
        all_dialects_where(|dialect| !dialect.supports_string_literal_backslash_escape())
            .tokenizes_to(
                "select n'''''\\'",
                vec![
                    Token::make_keyword("select"),
                    Token::Whitespace(Whitespace::Space),
                    Token::NationalStringLiteral("''\\".to_string()),
                ],
            );
    }
4082
    /// With backslash escaping enabled, `\'` inside `n'...'` escapes a quote,
    /// so `n'''''\''` unescapes to three quotes (`'''`).
    #[test]
    fn test_national_strings_backslash_escape_supported() {
        all_dialects_where(|dialect| dialect.supports_string_literal_backslash_escape())
            .tokenizes_to(
                "select n'''''\\''",
                vec![
                    Token::make_keyword("select"),
                    Token::Whitespace(Whitespace::Space),
                    Token::NationalStringLiteral("'''".to_string()),
                ],
            );
    }
4095
4096    #[test]
4097    fn test_string_escape_constant_not_supported() {
4098        all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
4099            "select e'...'",
4100            vec![
4101                Token::make_keyword("select"),
4102                Token::Whitespace(Whitespace::Space),
4103                Token::make_word("e", None),
4104                Token::SingleQuotedString("...".to_string()),
4105            ],
4106        );
4107
4108        all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
4109            "select E'...'",
4110            vec![
4111                Token::make_keyword("select"),
4112                Token::Whitespace(Whitespace::Space),
4113                Token::make_word("E", None),
4114                Token::SingleQuotedString("...".to_string()),
4115            ],
4116        );
4117    }
4118
4119    #[test]
4120    fn test_string_escape_constant_supported() {
4121        all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
4122            "select e'\\''",
4123            vec![
4124                Token::make_keyword("select"),
4125                Token::Whitespace(Whitespace::Space),
4126                Token::EscapedStringLiteral("'".to_string()),
4127            ],
4128        );
4129
4130        all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
4131            "select E'\\''",
4132            vec![
4133                Token::make_keyword("select"),
4134                Token::Whitespace(Whitespace::Space),
4135                Token::EscapedStringLiteral("'".to_string()),
4136            ],
4137        );
4138    }
4139
    /// For dialects requiring whitespace after `--`: `--'abc'` tokenizes as
    /// two `Minus` tokens, while `-- ...` (space) or `--` followed by a
    /// newline still forms a single-line comment; a trailing bare `--` at
    /// end of input is two minuses.
    #[test]
    fn test_whitespace_required_after_single_line_comment() {
        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Minus,
                    Token::Minus,
                    Token::SingleQuotedString("abc".to_string()),
                ],
            );

        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT -- 'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: " 'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Minus,
                    Token::Minus,
                ],
            );

        all_dialects_where(|d| d.requires_single_line_comment_whitespace()).tokenizes_to(
            "--\n-- Table structure for table...\n--\n",
            vec![
                Token::Whitespace(Whitespace::SingleLineComment {
                    prefix: "--".to_string(),
                    comment: "\n".to_string(),
                }),
                Token::Whitespace(Whitespace::SingleLineComment {
                    prefix: "--".to_string(),
                    comment: " Table structure for table...\n".to_string(),
                }),
                Token::Whitespace(Whitespace::SingleLineComment {
                    prefix: "--".to_string(),
                    comment: "\n".to_string(),
                }),
            ],
        );
    }
4196
    /// For dialects NOT requiring whitespace after `--`: `--'abc'` and a
    /// bare trailing `--` both form single-line comments (empty comment for
    /// the latter).
    #[test]
    fn test_whitespace_not_required_after_single_line_comment() {
        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT -- 'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: " 'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "".to_string(),
                    }),
                ],
            );
    }
4238
    /// Numeric-prefix dialects: `123abc` is a word and bare `12e34` is a
    /// number, but after a period (`t.12e34`, `t.1two3`) the token becomes a
    /// word.
    #[test]
    fn test_tokenize_identifiers_numeric_prefix() {
        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
            .tokenizes_to("123abc", vec![Token::make_word("123abc", None)]);

        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
            .tokenizes_to("12e34", vec![Token::Number("12e34".to_string(), false)]);

        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
            "t.12e34",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("12e34", None),
            ],
        );

        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
            "t.1two3",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("1two3", None),
            ],
        );
    }
4265
4266    #[test]
4267    fn tokenize_period_underscore() {
4268        let sql = String::from("SELECT table._col");
4269        // a dialect that supports underscores in numeric literals
4270        let dialect = PostgreSqlDialect {};
4271        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
4272
4273        let expected = vec![
4274            Token::make_keyword("SELECT"),
4275            Token::Whitespace(Whitespace::Space),
4276            Token::Word(Word {
4277                value: "table".to_string(),
4278                quote_style: None,
4279                keyword: Keyword::TABLE,
4280            }),
4281            Token::Period,
4282            Token::Word(Word {
4283                value: "_col".to_string(),
4284                quote_style: None,
4285                keyword: Keyword::NoKeyword,
4286            }),
4287        ];
4288
4289        compare(expected, tokens);
4290
4291        let sql = String::from("SELECT ._123");
4292        if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() {
4293            panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
4294        }
4295
4296        let sql = String::from("SELECT ._abc");
4297        if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() {
4298            panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
4299        }
4300    }
4301
4302    #[test]
4303    fn tokenize_question_mark() {
4304        let dialect = PostgreSqlDialect {};
4305        let sql = "SELECT x ? y";
4306        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
4307        compare(
4308            tokens,
4309            vec![
4310                Token::make_keyword("SELECT"),
4311                Token::Whitespace(Whitespace::Space),
4312                Token::make_word("x", None),
4313                Token::Whitespace(Whitespace::Space),
4314                Token::Question,
4315                Token::Whitespace(Whitespace::Space),
4316                Token::make_word("y", None),
4317            ],
4318        );
4319    }
4320
4321    #[test]
4322    fn tokenize_multiline_comment_with_comment_hint() {
4323        let sql = String::from("0/*! word */1");
4324
4325        let dialect = MySqlDialect {};
4326        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
4327        let expected = vec![
4328            Token::Number("0".to_string(), false),
4329            Token::Whitespace(Whitespace::Space),
4330            Token::Word(Word {
4331                value: "word".to_string(),
4332                quote_style: None,
4333                keyword: Keyword::NoKeyword,
4334            }),
4335            Token::Whitespace(Whitespace::Space),
4336            Token::Number("1".to_string(), false),
4337        ];
4338        compare(expected, tokens);
4339    }
4340
    /// MySQL versioned comment hints `/*!50110 ... */`: the version number is
    /// consumed and the remaining contents are tokenized normally; the
    /// expected token lists show the hint delimiters contributing the
    /// surrounding `Whitespace::Space` tokens.
    #[test]
    fn tokenize_multiline_comment_with_comment_hint_and_version() {
        let sql_multi = String::from("0 /*!50110 KEY_BLOCK_SIZE = 1024*/ 1");
        let dialect = MySqlDialect {};
        let tokens = Tokenizer::new(&dialect, &sql_multi).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "KEY_BLOCK_SIZE".to_string(),
                quote_style: None,
                keyword: Keyword::KEY_BLOCK_SIZE,
            }),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number("1024".to_string(), false),
            Token::Whitespace(Whitespace::Space),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);

        // Version-only hint with no content after the version number.
        let tokens = Tokenizer::new(&dialect, "0 /*!50110 */ 1")
            .tokenize()
            .unwrap();
        compare(
            vec![
                Token::Number("0".to_string(), false),
                Token::Whitespace(Whitespace::Space),
                Token::Whitespace(Whitespace::Space),
                Token::Whitespace(Whitespace::Space),
                Token::Number("1".to_string(), false),
            ],
            tokens,
        );

        // Empty hint with no version number.
        let tokens = Tokenizer::new(&dialect, "0 /*!*/ 1").tokenize().unwrap();
        compare(
            vec![
                Token::Number("0".to_string(), false),
                Token::Whitespace(Whitespace::Space),
                Token::Whitespace(Whitespace::Space),
                Token::Number("1".to_string(), false),
            ],
            tokens,
        );
        // Hint containing only spaces; each inner space is its own token.
        let tokens = Tokenizer::new(&dialect, "0 /*!   */ 1").tokenize().unwrap();
        compare(
            vec![
                Token::Number("0".to_string(), false),
                Token::Whitespace(Whitespace::Space),
                Token::Whitespace(Whitespace::Space),
                Token::Whitespace(Whitespace::Space),
                Token::Whitespace(Whitespace::Space),
                Token::Whitespace(Whitespace::Space),
                Token::Number("1".to_string(), false),
            ],
            tokens,
        );
    }
4402}