// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! SQL Tokenizer
19//!
20//! The tokenizer (a.k.a. lexer) converts a string into a sequence of tokens.
21//!
22//! The tokens then form the input for the parser, which outputs an Abstract Syntax Tree (AST).
23
24#[cfg(not(feature = "std"))]
25use alloc::{
26 borrow::ToOwned,
27 format,
28 string::{String, ToString},
29 vec,
30 vec::Vec,
31};
32use core::num::NonZeroU8;
33use core::str::Chars;
34use core::{cmp, fmt};
35use core::{iter::Peekable, str};
36
37#[cfg(feature = "serde")]
38use serde::{Deserialize, Serialize};
39
40#[cfg(feature = "visitor")]
41use sqlparser_derive::{Visit, VisitMut};
42
43use crate::dialect::Dialect;
44use crate::dialect::{
45 BigQueryDialect, DuckDbDialect, GenericDialect, MySqlDialect, PostgreSqlDialect,
46 SnowflakeDialect,
47};
48use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
49use crate::{
50 ast::{DollarQuotedString, QuoteDelimitedString},
51 dialect::HiveDialect,
52};
53
/// SQL Token enumeration
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Token {
    /// An end-of-file marker, not a real token
    EOF,
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    Word(Word),
    /// An unsigned numeric literal; the `bool` flag is true when the literal
    /// carried a trailing `L` (long) suffix
    Number(String, bool),
    /// A character that could not be tokenized
    Char(char),
    /// Single quoted string: i.e: 'string'
    SingleQuotedString(String),
    /// Double quoted string: i.e: "string"
    DoubleQuotedString(String),
    /// Triple single quoted strings: Example '''abc'''
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedString(String),
    /// Triple double quoted strings: Example """abc"""
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedString(String),
    /// Dollar quoted string: i.e: $$string$$ or $tag_name$string$tag_name$
    DollarQuotedString(DollarQuotedString),
    /// Byte string literal: i.e: b'string' or B'string' (note that some backends, such as
    /// PostgreSQL, may treat this syntax as a bit string literal instead, i.e: b'10010101')
    SingleQuotedByteStringLiteral(String),
    /// Byte string literal: i.e: b"string" or B"string"
    DoubleQuotedByteStringLiteral(String),
    /// Triple single quoted literal with byte string prefix. Example `B'''abc'''`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedByteStringLiteral(String),
    /// Triple double quoted literal with byte string prefix. Example `B"""abc"""`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedByteStringLiteral(String),
    /// Single quoted literal with raw string prefix. Example `R'abc'`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    SingleQuotedRawStringLiteral(String),
    /// Double quoted literal with raw string prefix. Example `R"abc"`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    DoubleQuotedRawStringLiteral(String),
    /// Triple single quoted literal with raw string prefix. Example `R'''abc'''`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedRawStringLiteral(String),
    /// Triple double quoted literal with raw string prefix. Example `R"""abc"""`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedRawStringLiteral(String),
    /// "National" string literal: i.e: N'string'
    NationalStringLiteral(String),
    /// Quote delimited literal. Examples `Q'{ab'c}'`, `Q'|ab'c|'`, `Q'|ab|c|'`
    /// [Oracle](https://docs.oracle.com/en/database/oracle/oracle-database/21/sqlrf/Literals.html#GUID-1824CBAA-6E16-4921-B2A6-112FB02248DA)
    QuoteDelimitedStringLiteral(QuoteDelimitedString),
    /// "National" quote delimited literal. Examples `NQ'{ab'c}'`, `NQ'|ab'c|'`, `NQ'|ab|c|'`
    /// [Oracle](https://docs.oracle.com/en/database/oracle/oracle-database/21/sqlrf/Literals.html#GUID-1824CBAA-6E16-4921-B2A6-112FB02248DA)
    NationalQuoteDelimitedStringLiteral(QuoteDelimitedString),
    /// "escaped" string literal, which are an extension to the SQL standard: i.e: e'first \n second' or E 'first \n second'
    EscapedStringLiteral(String),
    /// Unicode string literal: i.e: U&'first \000A second'
    UnicodeStringLiteral(String),
    /// Hexadecimal string literal: i.e.: X'deadbeef'
    HexStringLiteral(String),
    /// Comma
    Comma,
    /// Whitespace (space, tab, etc)
    Whitespace(Whitespace),
    /// Double equals sign `==`
    DoubleEq,
    /// Equality operator `=`
    Eq,
    /// Not Equals operator `<>` (or `!=` in some dialects)
    Neq,
    /// Less Than operator `<`
    Lt,
    /// Greater Than operator `>`
    Gt,
    /// Less Than Or Equals operator `<=`
    LtEq,
    /// Greater Than Or Equals operator `>=`
    GtEq,
    /// Spaceship operator <=>
    Spaceship,
    /// Plus operator `+`
    Plus,
    /// Minus operator `-`
    Minus,
    /// Multiplication operator `*`
    Mul,
    /// Division operator `/`
    Div,
    /// Integer division operator `//` in DuckDB
    DuckIntDiv,
    /// Modulo Operator `%`
    Mod,
    /// String concatenation `||`
    StringConcat,
    /// Left parenthesis `(`
    LParen,
    /// Right parenthesis `)`
    RParen,
    /// Period (used for compound identifiers or projections into nested types)
    Period,
    /// Colon `:`
    Colon,
    /// DoubleColon `::` (used for casting in PostgreSQL)
    DoubleColon,
    /// Assignment `:=` (used for keyword argument in DuckDB macros and some functions, and for variable declarations in DuckDB and Snowflake)
    Assignment,
    /// SemiColon `;` used as separator for COPY and payload
    SemiColon,
    /// Backslash `\` used in terminating the COPY payload with `\.`
    Backslash,
    /// Left bracket `[`
    LBracket,
    /// Right bracket `]`
    RBracket,
    /// Ampersand `&`
    Ampersand,
    /// Pipe `|`
    Pipe,
    /// Caret `^`
    Caret,
    /// Left brace `{`
    LBrace,
    /// Right brace `}`
    RBrace,
    /// Right Arrow `=>`
    RArrow,
    /// Sharp `#` used for PostgreSQL Bitwise XOR operator, also PostgreSQL/Redshift geometrical unary/binary operator (Number of points in path or polygon/Intersection)
    Sharp,
    /// `##` PostgreSQL/Redshift geometrical binary operator (Point of closest proximity)
    DoubleSharp,
    /// Tilde `~` used for PostgreSQL Bitwise NOT operator or case sensitive match regular expression operator
    Tilde,
    /// `~*` , a case insensitive match regular expression operator in PostgreSQL
    TildeAsterisk,
    /// `!~` , a case sensitive not match regular expression operator in PostgreSQL
    ExclamationMarkTilde,
    /// `!~*` , a case insensitive not match regular expression operator in PostgreSQL
    ExclamationMarkTildeAsterisk,
    /// `~~`, a case sensitive match pattern operator in PostgreSQL
    DoubleTilde,
    /// `~~*`, a case insensitive match pattern operator in PostgreSQL
    DoubleTildeAsterisk,
    /// `!~~`, a case sensitive not match pattern operator in PostgreSQL
    ExclamationMarkDoubleTilde,
    /// `!~~*`, a case insensitive not match pattern operator in PostgreSQL
    ExclamationMarkDoubleTildeAsterisk,
    /// `<<`, a bitwise shift left operator in PostgreSQL
    ShiftLeft,
    /// `>>`, a bitwise shift right operator in PostgreSQL
    ShiftRight,
    /// `&&`, an overlap operator in PostgreSQL
    Overlap,
    /// Exclamation Mark `!` used for PostgreSQL factorial operator
    ExclamationMark,
    /// Double Exclamation Mark `!!` used for PostgreSQL prefix factorial operator
    DoubleExclamationMark,
    /// AtSign `@` used for PostgreSQL abs operator, also PostgreSQL/Redshift geometrical unary/binary operator (Center, Contained or on)
    AtSign,
    /// `^@`, a "starts with" string operator in PostgreSQL
    CaretAt,
    /// `|/`, a square root math operator in PostgreSQL
    PGSquareRoot,
    /// `||/`, a cube root math operator in PostgreSQL
    PGCubeRoot,
    /// `?` or `$` , a prepared statement arg placeholder
    Placeholder(String),
    /// `->`, used as a operator to extract json field in PostgreSQL
    Arrow,
    /// `->>`, used as a operator to extract json field as text in PostgreSQL
    LongArrow,
    /// `#>`, extracts JSON sub-object at the specified path
    HashArrow,
    /// `@-@` PostgreSQL/Redshift geometrical unary operator (Length or circumference)
    AtDashAt,
    /// `?-` PostgreSQL/Redshift geometrical unary/binary operator (Is horizontal?/Are horizontally aligned?)
    QuestionMarkDash,
    /// `&<` PostgreSQL/Redshift geometrical binary operator (Overlaps to left?)
    AmpersandLeftAngleBracket,
    /// `&>` PostgreSQL/Redshift geometrical binary operator (Overlaps to right?)
    AmpersandRightAngleBracket,
    /// `&<|` PostgreSQL/Redshift geometrical binary operator (Does not extend above?)
    AmpersandLeftAngleBracketVerticalBar,
    /// `|&>` PostgreSQL/Redshift geometrical binary operator (Does not extend below?)
    VerticalBarAmpersandRightAngleBracket,
    /// `<->` PostgreSQL/Redshift geometrical binary operator (Distance between)
    TwoWayArrow,
    /// `<^` PostgreSQL/Redshift geometrical binary operator (Is below?)
    LeftAngleBracketCaret,
    /// `>^` PostgreSQL/Redshift geometrical binary operator (Is above?)
    RightAngleBracketCaret,
    /// `?#` PostgreSQL/Redshift geometrical binary operator (Intersects or overlaps)
    QuestionMarkSharp,
    /// `?-|` PostgreSQL/Redshift geometrical binary operator (Is perpendicular?)
    QuestionMarkDashVerticalBar,
    /// `?||` PostgreSQL/Redshift geometrical binary operator (Are parallel?)
    QuestionMarkDoubleVerticalBar,
    /// `~=` PostgreSQL/Redshift geometrical binary operator (Same as)
    TildeEqual,
    /// `<<|` PostgreSQL/Redshift geometrical binary operator (Is strictly below?)
    ShiftLeftVerticalBar,
    /// `|>>` PostgreSQL/Redshift geometrical binary operator (Is strictly above?)
    VerticalBarShiftRight,
    /// `|>` BigQuery pipe operator
    VerticalBarRightAngleBracket,
    /// `#>>`, extracts JSON sub-object at the specified path as text
    HashLongArrow,
    /// jsonb @> jsonb -> boolean: Test whether left json contains the right json
    AtArrow,
    /// jsonb <@ jsonb -> boolean: Test whether right json contains the left json
    ArrowAt,
    /// jsonb #- text[] -> jsonb: Deletes the field or array element at the specified
    /// path, where path elements can be either field keys or array indexes.
    HashMinus,
    /// jsonb @? jsonpath -> boolean: Does JSON path return any item for the specified
    /// JSON value?
    AtQuestion,
    /// jsonb @@ jsonpath → boolean: Returns the result of a JSON path predicate check
    /// for the specified JSON value. Only the first item of the result is taken into
    /// account. If the result is not Boolean, then NULL is returned.
    AtAt,
    /// jsonb ? text -> boolean: Checks whether the string exists as a top-level key within the
    /// jsonb object
    Question,
    /// jsonb ?& text[] -> boolean: Check whether all members of the text array exist as top-level
    /// keys within the jsonb object
    QuestionAnd,
    /// jsonb ?| text[] -> boolean: Check whether any member of the text array exists as top-level
    /// keys within the jsonb object
    QuestionPipe,
    /// Custom binary operator
    /// This is used to represent any custom binary operator that is not part of the SQL standard.
    /// PostgreSQL allows defining custom binary operators using CREATE OPERATOR.
    CustomBinaryOperator(String),
}
290
impl fmt::Display for Token {
    /// Writes the token back out as SQL text, re-adding any quote/prefix
    /// characters (e.g. `N'...'`, `X'...'`) that the tokenizer consumed.
    /// Note: string contents are written as stored, so if the tokenizer
    /// unescaped them the output may differ from the original input.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Token::EOF => f.write_str("EOF"),
            Token::Word(ref w) => write!(f, "{w}"),
            // The `L` suffix is re-emitted when the "long" flag is set.
            Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
            Token::Char(ref c) => write!(f, "{c}"),
            Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
            Token::TripleSingleQuotedString(ref s) => write!(f, "'''{s}'''"),
            Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
            Token::TripleDoubleQuotedString(ref s) => write!(f, "\"\"\"{s}\"\"\""),
            Token::DollarQuotedString(ref s) => write!(f, "{s}"),
            Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
            Token::QuoteDelimitedStringLiteral(ref s) => s.fmt(f),
            Token::NationalQuoteDelimitedStringLiteral(ref s) => write!(f, "N{s}"),
            Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
            Token::UnicodeStringLiteral(ref s) => write!(f, "U&'{s}'"),
            Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
            Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
            Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"),
            Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
            Token::TripleDoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"\"\"{s}\"\"\""),
            Token::SingleQuotedRawStringLiteral(ref s) => write!(f, "R'{s}'"),
            Token::DoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"{s}\""),
            Token::TripleSingleQuotedRawStringLiteral(ref s) => write!(f, "R'''{s}'''"),
            Token::TripleDoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"\"\"{s}\"\"\""),
            // Fixed-spelling punctuation and operator tokens.
            Token::Comma => f.write_str(","),
            Token::Whitespace(ws) => write!(f, "{ws}"),
            Token::DoubleEq => f.write_str("=="),
            Token::Spaceship => f.write_str("<=>"),
            Token::Eq => f.write_str("="),
            Token::Neq => f.write_str("<>"),
            Token::Lt => f.write_str("<"),
            Token::Gt => f.write_str(">"),
            Token::LtEq => f.write_str("<="),
            Token::GtEq => f.write_str(">="),
            Token::Plus => f.write_str("+"),
            Token::Minus => f.write_str("-"),
            Token::Mul => f.write_str("*"),
            Token::Div => f.write_str("/"),
            Token::DuckIntDiv => f.write_str("//"),
            Token::StringConcat => f.write_str("||"),
            Token::Mod => f.write_str("%"),
            Token::LParen => f.write_str("("),
            Token::RParen => f.write_str(")"),
            Token::Period => f.write_str("."),
            Token::Colon => f.write_str(":"),
            Token::DoubleColon => f.write_str("::"),
            Token::Assignment => f.write_str(":="),
            Token::SemiColon => f.write_str(";"),
            Token::Backslash => f.write_str("\\"),
            Token::LBracket => f.write_str("["),
            Token::RBracket => f.write_str("]"),
            Token::Ampersand => f.write_str("&"),
            Token::Caret => f.write_str("^"),
            Token::Pipe => f.write_str("|"),
            Token::LBrace => f.write_str("{"),
            Token::RBrace => f.write_str("}"),
            Token::RArrow => f.write_str("=>"),
            Token::Sharp => f.write_str("#"),
            Token::DoubleSharp => f.write_str("##"),
            Token::ExclamationMark => f.write_str("!"),
            Token::DoubleExclamationMark => f.write_str("!!"),
            Token::Tilde => f.write_str("~"),
            Token::TildeAsterisk => f.write_str("~*"),
            Token::ExclamationMarkTilde => f.write_str("!~"),
            Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
            Token::DoubleTilde => f.write_str("~~"),
            Token::DoubleTildeAsterisk => f.write_str("~~*"),
            Token::ExclamationMarkDoubleTilde => f.write_str("!~~"),
            Token::ExclamationMarkDoubleTildeAsterisk => f.write_str("!~~*"),
            Token::AtSign => f.write_str("@"),
            Token::CaretAt => f.write_str("^@"),
            Token::ShiftLeft => f.write_str("<<"),
            Token::ShiftRight => f.write_str(">>"),
            Token::Overlap => f.write_str("&&"),
            Token::PGSquareRoot => f.write_str("|/"),
            Token::PGCubeRoot => f.write_str("||/"),
            Token::AtDashAt => f.write_str("@-@"),
            Token::QuestionMarkDash => f.write_str("?-"),
            Token::AmpersandLeftAngleBracket => f.write_str("&<"),
            Token::AmpersandRightAngleBracket => f.write_str("&>"),
            Token::AmpersandLeftAngleBracketVerticalBar => f.write_str("&<|"),
            Token::VerticalBarAmpersandRightAngleBracket => f.write_str("|&>"),
            Token::VerticalBarRightAngleBracket => f.write_str("|>"),
            Token::TwoWayArrow => f.write_str("<->"),
            Token::LeftAngleBracketCaret => f.write_str("<^"),
            Token::RightAngleBracketCaret => f.write_str(">^"),
            Token::QuestionMarkSharp => f.write_str("?#"),
            Token::QuestionMarkDashVerticalBar => f.write_str("?-|"),
            Token::QuestionMarkDoubleVerticalBar => f.write_str("?||"),
            Token::TildeEqual => f.write_str("~="),
            Token::ShiftLeftVerticalBar => f.write_str("<<|"),
            Token::VerticalBarShiftRight => f.write_str("|>>"),
            Token::Placeholder(ref s) => write!(f, "{s}"),
            // JSON/JSONB operators.
            Token::Arrow => write!(f, "->"),
            Token::LongArrow => write!(f, "->>"),
            Token::HashArrow => write!(f, "#>"),
            Token::HashLongArrow => write!(f, "#>>"),
            Token::AtArrow => write!(f, "@>"),
            Token::ArrowAt => write!(f, "<@"),
            Token::HashMinus => write!(f, "#-"),
            Token::AtQuestion => write!(f, "@?"),
            Token::AtAt => write!(f, "@@"),
            Token::Question => write!(f, "?"),
            Token::QuestionAnd => write!(f, "?&"),
            Token::QuestionPipe => write!(f, "?|"),
            Token::CustomBinaryOperator(s) => f.write_str(s),
        }
    }
}
402
403impl Token {
404 /// Create a `Token::Word` from an unquoted `keyword`.
405 ///
406 /// The lookup is case-insensitive; unknown values become `Keyword::NoKeyword`.
407 pub fn make_keyword(keyword: &str) -> Self {
408 Token::make_word(keyword, None)
409 }
410
411 /// Create a `Token::Word` from `word` with an optional `quote_style`.
412 ///
413 /// When `quote_style` is `None`, the parser attempts a case-insensitive keyword
414 /// lookup and sets the `Word::keyword` accordingly.
415 pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
416 // Only perform keyword lookup for unquoted identifiers.
417 // Use to_ascii_uppercase() since SQL keywords are ASCII,
418 // avoiding Unicode case conversion overhead.
419 let keyword = if quote_style.is_none() {
420 let word_uppercase = word.to_ascii_uppercase();
421 ALL_KEYWORDS
422 .binary_search(&word_uppercase.as_str())
423 .map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
424 } else {
425 Keyword::NoKeyword
426 };
427
428 Token::Word(Word {
429 value: word.to_string(),
430 quote_style,
431 keyword,
432 })
433 }
434}
435
/// A keyword (like SELECT) or an optionally quoted SQL identifier
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Word {
    /// The value of the token, without the enclosing quotes, and with the
    /// escape sequences (if any) processed (TODO: escapes are not handled)
    pub value: String,
    /// An identifier can be "quoted" (<delimited identifier> in ANSI parlance).
    /// The standard and most implementations allow using double quotes for this,
    /// but some implementations support other quoting styles as well (e.g. \[MS SQL])
    /// Only the *opening* quote character is stored; see `Word::matching_end_quote`.
    pub quote_style: Option<char>,
    /// If the word was not quoted and it matched one of the known keywords,
    /// this will have one of the values from dialect::keywords, otherwise
    /// `Keyword::NoKeyword`
    pub keyword: Keyword,
}
452
453impl fmt::Display for Word {
454 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
455 match self.quote_style {
456 Some(s) if s == '"' || s == '[' || s == '`' => {
457 write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
458 }
459 None => f.write_str(&self.value),
460 _ => panic!("Unexpected quote_style!"),
461 }
462 }
463}
464
465impl Word {
466 fn matching_end_quote(ch: char) -> char {
467 match ch {
468 '"' => '"', // ANSI and most dialects
469 '[' => ']', // MS SQL
470 '`' => '`', // MySQL
471 _ => panic!("unexpected quoting style!"),
472 }
473 }
474}
475
/// Represents whitespace in the input: spaces, newlines, tabs and comments.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Whitespace {
    /// A single space character.
    Space,
    /// A newline character.
    Newline,
    /// A tab character.
    Tab,
    /// A single-line comment (e.g. `-- comment` or `# comment`).
    /// The `comment` field contains the text, and `prefix` contains the comment prefix.
    SingleLineComment {
        /// The content of the comment (without the prefix).
        comment: String,
        /// The prefix used for the comment (for example `--` or `#`).
        prefix: String,
    },

    /// A multi-line comment body (without the `/* ... */` delimiters, which
    /// are re-added by the `Display` impl).
    MultiLineComment(String),
}
499
500impl fmt::Display for Whitespace {
501 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
502 match self {
503 Whitespace::Space => f.write_str(" "),
504 Whitespace::Newline => f.write_str("\n"),
505 Whitespace::Tab => f.write_str("\t"),
506 Whitespace::SingleLineComment { prefix, comment } => write!(f, "{prefix}{comment}"),
507 Whitespace::MultiLineComment(s) => write!(f, "/*{s}*/"),
508 }
509 }
510}
511
/// Location in input string
///
/// # Create an "empty" (unknown) `Location`
/// ```
/// # use sqlparser::tokenizer::Location;
/// let location = Location::empty();
/// ```
///
/// # Create a `Location` from a line and column
/// ```
/// # use sqlparser::tokenizer::Location;
/// let location = Location::new(1, 1);
/// ```
///
/// # Create a `Location` from a pair
/// ```
/// # use sqlparser::tokenizer::Location;
/// let location = Location::from((1, 1));
/// ```
#[derive(Eq, PartialEq, Hash, Clone, Copy, Ord, PartialOrd)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Location {
    /// Line number, starting from 1.
    ///
    /// Note: Line 0 is used for empty spans
    pub line: u64,
    /// Line column, starting from 1.
    ///
    /// Note: Column 0 is used for empty spans
    pub column: u64,
}

impl fmt::Display for Location {
    /// Renders as `" at Line: <l>, Column: <c>"`, or as nothing at all for an
    /// empty/unknown location (line 0).
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.line {
            0 => Ok(()),
            line => write!(f, " at Line: {line}, Column: {}", self.column),
        }
    }
}

impl fmt::Debug for Location {
    /// Compact `Location(line,column)` form for diagnostics.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Location({line},{column})", line = self.line, column = self.column)
    }
}

impl Location {
    /// Return an "empty" / unknown location
    pub fn empty() -> Self {
        Self::new(0, 0)
    }

    /// Create a new `Location` for a given line and column
    pub fn new(line: u64, column: u64) -> Self {
        Self { line, column }
    }

    /// Create a new location for a given line and column
    ///
    /// Alias for [`Self::new`]
    // TODO: remove / deprecate in favor of `new` for consistency?
    pub fn of(line: u64, column: u64) -> Self {
        Self::new(line, column)
    }

    /// Combine self and `end` into a new `Span`
    pub fn span_to(self, end: Self) -> Span {
        Span::new(self, end)
    }
}

impl From<(u64, u64)> for Location {
    fn from(value: (u64, u64)) -> Self {
        let (line, column) = value;
        Self { line, column }
    }
}

/// A span represents a linear portion of the input string (start, end)
///
/// See [Spanned](crate::ast::Spanned) for more information.
#[derive(Eq, PartialEq, Hash, Clone, PartialOrd, Ord, Copy)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Span {
    /// Start `Location` (inclusive).
    pub start: Location,
    /// End `Location` (inclusive).
    pub end: Location,
}

impl fmt::Debug for Span {
    /// Compact `Span(start..end)` form for diagnostics.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Span({:?}..{:?})", self.start, self.end)
    }
}

impl Span {
    // An empty span (0, 0) -> (0, 0).
    // A const instance is needed so it can be compared against cheaply.
    const EMPTY: Span = Self::empty();

    /// Create a new span from a start and end [`Location`]
    pub fn new(start: Location, end: Location) -> Span {
        Span { start, end }
    }

    /// Returns an empty span `(0, 0) -> (0, 0)`
    ///
    /// Empty spans represent no knowledge of source location
    /// See [Spanned](crate::ast::Spanned) for more information.
    pub const fn empty() -> Span {
        Span {
            start: Location { line: 0, column: 0 },
            end: Location { line: 0, column: 0 },
        }
    }

    /// Returns the smallest Span that contains both `self` and `other`
    /// If either span is [Span::empty], the other span is returned
    ///
    /// # Examples
    /// ```
    /// # use sqlparser::tokenizer::{Span, Location};
    /// // line 1, column1 -> line 2, column 5
    /// let span1 = Span::new(Location::new(1, 1), Location::new(2, 5));
    /// // line 2, column 3 -> line 3, column 7
    /// let span2 = Span::new(Location::new(2, 3), Location::new(3, 7));
    /// // Union of the two is the min/max of the two spans
    /// // line 1, column 1 -> line 3, column 7
    /// let union = span1.union(&span2);
    /// assert_eq!(union, Span::new(Location::new(1, 1), Location::new(3, 7)));
    /// ```
    pub fn union(&self, other: &Span) -> Span {
        // Empty spans carry no location knowledge, so never let their
        // (0, 0) coordinates leak into the combined result.
        if *self == Span::EMPTY {
            *other
        } else if *other == Span::EMPTY {
            *self
        } else {
            Span {
                start: cmp::min(self.start, other.start),
                end: cmp::max(self.end, other.end),
            }
        }
    }

    /// Same as [Span::union] for `Option<Span>`
    ///
    /// If `other` is `None`, `self` is returned
    pub fn union_opt(&self, other: &Option<Span>) -> Span {
        other.as_ref().map_or(*self, |o| self.union(o))
    }

    /// Return the [Span::union] of all spans in the iterator
    ///
    /// If the iterator is empty, an empty span is returned
    ///
    /// # Example
    /// ```
    /// # use sqlparser::tokenizer::{Span, Location};
    /// let spans = vec![
    ///     Span::new(Location::new(1, 1), Location::new(2, 5)),
    ///     Span::new(Location::new(2, 3), Location::new(3, 7)),
    ///     Span::new(Location::new(3, 1), Location::new(4, 2)),
    /// ];
    /// // line 1, column 1 -> line 4, column 2
    /// assert_eq!(
    ///     Span::union_iter(spans),
    ///     Span::new(Location::new(1, 1), Location::new(4, 2))
    /// );
    /// ```
    pub fn union_iter<I: IntoIterator<Item = Span>>(iter: I) -> Span {
        // Folding from the empty span is equivalent to reduce + unwrap_or:
        // union() treats the empty span as the identity element.
        iter.into_iter()
            .fold(Span::empty(), |acc, item| acc.union(&item))
    }
}
692
/// Backwards compatibility alias for [`TokenWithSpan`]
#[deprecated(since = "0.53.0", note = "please use `TokenWithSpan` instead")]
pub type TokenWithLocation = TokenWithSpan;

/// A [Token] with [Span] attached to it
///
/// This is used to track the location of a token in the input string
///
/// # Examples
/// ```
/// # use sqlparser::tokenizer::{Location, Span, Token, TokenWithSpan};
/// // commas @ line 1, column 10
/// let tok1 = TokenWithSpan::new(
///     Token::Comma,
///     Span::new(Location::new(1, 10), Location::new(1, 11)),
/// );
/// assert_eq!(tok1, Token::Comma); // can compare the token
///
/// // commas @ line 2, column 20
/// let tok2 = TokenWithSpan::new(
///     Token::Comma,
///     Span::new(Location::new(2, 20), Location::new(2, 21)),
/// );
/// // same token but different locations are not equal
/// assert_ne!(tok1, tok2);
/// ```
#[derive(Debug, Clone, Hash, Ord, PartialOrd, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct TokenWithSpan {
    /// The token value.
    pub token: Token,
    /// The span covering the token in the input.
    pub span: Span,
}
729
730impl TokenWithSpan {
731 /// Create a new [`TokenWithSpan`] from a [`Token`] and a [`Span`]
732 pub fn new(token: Token, span: Span) -> Self {
733 Self { token, span }
734 }
735
736 /// Wrap a token with an empty span
737 pub fn wrap(token: Token) -> Self {
738 Self::new(token, Span::empty())
739 }
740
741 /// Wrap a token with a location from `start` to `end`
742 pub fn at(token: Token, start: Location, end: Location) -> Self {
743 Self::new(token, Span::new(start, end))
744 }
745
746 /// Return an EOF token with no location
747 pub fn new_eof() -> Self {
748 Self::wrap(Token::EOF)
749 }
750}
751
752impl PartialEq<Token> for TokenWithSpan {
753 fn eq(&self, other: &Token) -> bool {
754 &self.token == other
755 }
756}
757
758impl PartialEq<TokenWithSpan> for Token {
759 fn eq(&self, other: &TokenWithSpan) -> bool {
760 self == &other.token
761 }
762}
763
764impl fmt::Display for TokenWithSpan {
765 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
766 self.token.fmt(f)
767 }
768}
769
770/// An error reported by the tokenizer, with a human-readable `message` and a `location`.
771#[derive(Debug, PartialEq, Eq)]
772pub struct TokenizerError {
773 /// A descriptive error message.
774 pub message: String,
775 /// The `Location` where the error was detected.
776 pub location: Location,
777}
778
779impl fmt::Display for TokenizerError {
780 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
781 write!(f, "{}{}", self.message, self.location,)
782 }
783}
784
785impl core::error::Error for TokenizerError {}
786
/// Tokenizer input state: a peekable character stream plus line/column
/// tracking that is updated as characters are consumed.
struct State<'a> {
    /// Remaining input characters, with single-character lookahead.
    peekable: Peekable<Chars<'a>>,
    /// Current line number; incremented each time a '\n' is consumed.
    line: u64,
    /// Current column number; reset to 1 after each newline, otherwise
    /// incremented per consumed character.
    col: u64,
}
792
793impl State<'_> {
794 /// return the next character and advance the stream
795 pub fn next(&mut self) -> Option<char> {
796 match self.peekable.next() {
797 None => None,
798 Some(s) => {
799 if s == '\n' {
800 self.line += 1;
801 self.col = 1;
802 } else {
803 self.col += 1;
804 }
805 Some(s)
806 }
807 }
808 }
809
810 /// return the next character but do not advance the stream
811 pub fn peek(&mut self) -> Option<&char> {
812 self.peekable.peek()
813 }
814
815 /// Return the current `Location` (line and column)
816 pub fn location(&self) -> Location {
817 Location {
818 line: self.line,
819 column: self.col,
820 }
821 }
822}
823
/// Represents how many quote characters enclose a string literal.
#[derive(Copy, Clone)]
enum NumStringQuoteChars {
    /// A single enclosing quote, e.g. `"abc"`, `'abc'`, `r'abc'`
    One,
    /// More than one enclosing quote, e.g. `"""abc"""`, `'''abc'''`, `r'''abc'''`
    Many(NonZeroU8),
}
832
/// Settings for tokenizing a quoted string literal.
struct TokenizeQuotedStringSettings {
    /// The character used to quote the string.
    quote_style: char,
    /// Represents how many quote characters enclose the string literal.
    num_quote_chars: NumStringQuoteChars,
    /// The number of opening quotes left to consume, before parsing
    /// the remaining string literal.
    /// For example: given initial string `"""abc"""`. If the caller has
    /// already parsed the first quote for some reason, then this value
    /// is set to 1, flagging to look to consume only 2 leading quotes.
    num_opening_quotes_to_consume: u8,
    /// True if the string uses backslash escaping of special characters,
    /// e.g. `'abc\ndef\'ghi'`
    backslash_escape: bool,
}
849
/// SQL Tokenizer
pub struct Tokenizer<'a> {
    /// Dialect controlling dialect-specific tokenization behavior.
    dialect: &'a dyn Dialect,
    /// The SQL text being tokenized.
    query: &'a str,
    /// If true (the default), the tokenizer will un-escape literal
    /// SQL strings. See [`Tokenizer::with_unescape`] for more details.
    unescape: bool,
}
858
859impl<'a> Tokenizer<'a> {
860 /// Create a new SQL tokenizer for the specified SQL statement
861 ///
862 /// ```
863 /// # use sqlparser::tokenizer::{Token, Whitespace, Tokenizer};
864 /// # use sqlparser::dialect::GenericDialect;
865 /// # let dialect = GenericDialect{};
866 /// let query = r#"SELECT 'foo'"#;
867 ///
868 /// // Parsing the query
869 /// let tokens = Tokenizer::new(&dialect, &query).tokenize().unwrap();
870 ///
871 /// assert_eq!(tokens, vec![
872 /// Token::make_word("SELECT", None),
873 /// Token::Whitespace(Whitespace::Space),
874 /// Token::SingleQuotedString("foo".to_string()),
875 /// ]);
876 pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
877 Self {
878 dialect,
879 query,
880 unescape: true,
881 }
882 }
883
884 /// Set unescape mode
885 ///
886 /// When true (default) the tokenizer unescapes literal values
887 /// (for example, `""` in SQL is unescaped to the literal `"`).
888 ///
889 /// When false, the tokenizer provides the raw strings as provided
890 /// in the query. This can be helpful for programs that wish to
891 /// recover the *exact* original query text without normalizing
892 /// the escaping
893 ///
894 /// # Example
895 ///
896 /// ```
897 /// # use sqlparser::tokenizer::{Token, Tokenizer};
898 /// # use sqlparser::dialect::GenericDialect;
899 /// # let dialect = GenericDialect{};
900 /// let query = r#""Foo "" Bar""#;
901 /// let unescaped = Token::make_word(r#"Foo " Bar"#, Some('"'));
902 /// let original = Token::make_word(r#"Foo "" Bar"#, Some('"'));
903 ///
904 /// // Parsing with unescaping (default)
905 /// let tokens = Tokenizer::new(&dialect, &query).tokenize().unwrap();
906 /// assert_eq!(tokens, vec![unescaped]);
907 ///
908 /// // Parsing with unescape = false
909 /// let tokens = Tokenizer::new(&dialect, &query)
910 /// .with_unescape(false)
911 /// .tokenize().unwrap();
912 /// assert_eq!(tokens, vec![original]);
913 /// ```
914 pub fn with_unescape(mut self, unescape: bool) -> Self {
915 self.unescape = unescape;
916 self
917 }
918
919 /// Tokenize the statement and produce a vector of tokens
920 pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
921 let twl = self.tokenize_with_location()?;
922 Ok(twl.into_iter().map(|t| t.token).collect())
923 }
924
925 /// Tokenize the statement and produce a vector of tokens with location information
926 pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithSpan>, TokenizerError> {
927 let mut tokens: Vec<TokenWithSpan> = vec![];
928 self.tokenize_with_location_into_buf(&mut tokens)
929 .map(|_| tokens)
930 }
931
932 /// Tokenize the statement and append tokens with location information into the provided buffer.
933 /// If an error is thrown, the buffer will contain all tokens that were successfully parsed before the error.
934 pub fn tokenize_with_location_into_buf(
935 &mut self,
936 buf: &mut Vec<TokenWithSpan>,
937 ) -> Result<(), TokenizerError> {
938 self.tokenize_with_location_into_buf_with_mapper(buf, |token| token)
939 }
940
941 /// Tokenize the statement and produce a vector of tokens, mapping each token
942 /// with provided `mapper`
943 pub fn tokenize_with_location_into_buf_with_mapper(
944 &mut self,
945 buf: &mut Vec<TokenWithSpan>,
946 mut mapper: impl FnMut(TokenWithSpan) -> TokenWithSpan,
947 ) -> Result<(), TokenizerError> {
948 let mut state = State {
949 peekable: self.query.chars().peekable(),
950 line: 1,
951 col: 1,
952 };
953
954 let mut location = state.location();
955 while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
956 let span = location.span_to(state.location());
957
958 // Check if this is a multiline comment hint that should be expanded
959 match &token {
960 Token::Whitespace(Whitespace::MultiLineComment(comment))
961 if self.dialect.supports_multiline_comment_hints()
962 && comment.starts_with('!') =>
963 {
964 // Re-tokenize the hints and add them to the buffer
965 self.tokenize_comment_hints(comment, span, buf, &mut mapper)?;
966 }
967 _ => {
968 buf.push(mapper(TokenWithSpan { token, span }));
969 }
970 }
971
972 location = state.location();
973 }
974 Ok(())
975 }
976
977 /// Re-tokenize optimizer hints from a multiline comment and add them to the buffer.
978 /// For example, `/*!50110 KEY_BLOCK_SIZE = 1024*/` becomes tokens for `KEY_BLOCK_SIZE = 1024`
979 fn tokenize_comment_hints(
980 &self,
981 comment: &str,
982 span: Span,
983 buf: &mut Vec<TokenWithSpan>,
984 mut mapper: impl FnMut(TokenWithSpan) -> TokenWithSpan,
985 ) -> Result<(), TokenizerError> {
986 // Strip the leading '!' and any version digits (e.g., "50110")
987 let hint_content = comment
988 .strip_prefix('!')
989 .unwrap_or(comment)
990 .trim_start_matches(|c: char| c.is_ascii_digit());
991
992 // If there's no content after stripping, nothing to tokenize
993 if hint_content.is_empty() {
994 return Ok(());
995 }
996
997 // Create a new tokenizer for the hint content
998 let inner = Tokenizer::new(self.dialect, hint_content).with_unescape(self.unescape);
999
1000 // Create a state for tracking position within the hint
1001 let mut state = State {
1002 peekable: hint_content.chars().peekable(),
1003 line: span.start.line,
1004 col: span.start.column,
1005 };
1006
1007 // Tokenize the hint content and add tokens to the buffer
1008 let mut location = state.location();
1009 while let Some(token) = inner.next_token(&mut state, buf.last().map(|t| &t.token))? {
1010 let token_span = location.span_to(state.location());
1011 buf.push(mapper(TokenWithSpan {
1012 token,
1013 span: token_span,
1014 }));
1015 location = state.location();
1016 }
1017
1018 Ok(())
1019 }
1020
1021 // Tokenize the identifier or keywords in `ch`
1022 fn tokenize_identifier_or_keyword(
1023 &self,
1024 ch: impl IntoIterator<Item = char>,
1025 chars: &mut State,
1026 ) -> Result<Option<Token>, TokenizerError> {
1027 chars.next(); // consume the first char
1028 let ch: String = ch.into_iter().collect();
1029 let word = self.tokenize_word(ch, chars);
1030
1031 // TODO: implement parsing of exponent here
1032 if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
1033 let mut inner_state = State {
1034 peekable: word.chars().peekable(),
1035 line: 0,
1036 col: 0,
1037 };
1038 let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
1039 let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
1040 s += s2.as_str();
1041 return Ok(Some(Token::Number(s, false)));
1042 }
1043
1044 Ok(Some(Token::make_word(&word, None)))
1045 }
1046
1047 /// Get the next token or return None
1048 fn next_token(
1049 &self,
1050 chars: &mut State,
1051 prev_token: Option<&Token>,
1052 ) -> Result<Option<Token>, TokenizerError> {
1053 match chars.peek() {
1054 Some(&ch) => match ch {
1055 ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
1056 '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
1057 '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
1058 '\r' => {
1059 // Emit a single Whitespace::Newline token for \r and \r\n
1060 chars.next();
1061 if let Some('\n') = chars.peek() {
1062 chars.next();
1063 }
1064 Ok(Some(Token::Whitespace(Whitespace::Newline)))
1065 }
1066 // BigQuery and MySQL use b or B for byte string literal, Postgres for bit strings
1067 b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | PostgreSqlDialect | MySqlDialect | GenericDialect) =>
1068 {
1069 chars.next(); // consume
1070 match chars.peek() {
1071 Some('\'') => {
1072 if self.dialect.supports_triple_quoted_string() {
1073 return self
1074 .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1075 chars,
1076 '\'',
1077 false,
1078 Token::SingleQuotedByteStringLiteral,
1079 Token::TripleSingleQuotedByteStringLiteral,
1080 );
1081 }
1082 let s = self.tokenize_single_quoted_string(chars, '\'', false)?;
1083 Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
1084 }
1085 Some('\"') => {
1086 if self.dialect.supports_triple_quoted_string() {
1087 return self
1088 .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1089 chars,
1090 '"',
1091 false,
1092 Token::DoubleQuotedByteStringLiteral,
1093 Token::TripleDoubleQuotedByteStringLiteral,
1094 );
1095 }
1096 let s = self.tokenize_single_quoted_string(chars, '\"', false)?;
1097 Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
1098 }
1099 _ => {
1100 // regular identifier starting with an "b" or "B"
1101 let s = self.tokenize_word(b, chars);
1102 Ok(Some(Token::make_word(&s, None)))
1103 }
1104 }
1105 }
1106 // BigQuery uses r or R for raw string literal
1107 b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
1108 chars.next(); // consume
1109 match chars.peek() {
1110 Some('\'') => self
1111 .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1112 chars,
1113 '\'',
1114 false,
1115 Token::SingleQuotedRawStringLiteral,
1116 Token::TripleSingleQuotedRawStringLiteral,
1117 ),
1118 Some('\"') => self
1119 .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1120 chars,
1121 '"',
1122 false,
1123 Token::DoubleQuotedRawStringLiteral,
1124 Token::TripleDoubleQuotedRawStringLiteral,
1125 ),
1126 _ => {
1127 // regular identifier starting with an "r" or "R"
1128 let s = self.tokenize_word(b, chars);
1129 Ok(Some(Token::make_word(&s, None)))
1130 }
1131 }
1132 }
1133 // Redshift uses lower case n for national string literal
1134 n @ 'N' | n @ 'n' => {
1135 chars.next(); // consume, to check the next char
1136 match chars.peek() {
1137 Some('\'') => {
1138 // N'...' - a <national character string literal>
1139 let backslash_escape =
1140 self.dialect.supports_string_literal_backslash_escape();
1141 let s =
1142 self.tokenize_single_quoted_string(chars, '\'', backslash_escape)?;
1143 Ok(Some(Token::NationalStringLiteral(s)))
1144 }
1145 Some(&q @ 'q') | Some(&q @ 'Q')
1146 if self.dialect.supports_quote_delimited_string() =>
1147 {
1148 chars.next(); // consume and check the next char
1149 if let Some('\'') = chars.peek() {
1150 self.tokenize_quote_delimited_string(chars, &[n, q])
1151 .map(|s| Some(Token::NationalQuoteDelimitedStringLiteral(s)))
1152 } else {
1153 let s = self.tokenize_word(String::from_iter([n, q]), chars);
1154 Ok(Some(Token::make_word(&s, None)))
1155 }
1156 }
1157 _ => {
1158 // regular identifier starting with an "N"
1159 let s = self.tokenize_word(n, chars);
1160 Ok(Some(Token::make_word(&s, None)))
1161 }
1162 }
1163 }
1164 q @ 'Q' | q @ 'q' if self.dialect.supports_quote_delimited_string() => {
1165 chars.next(); // consume and check the next char
1166 if let Some('\'') = chars.peek() {
1167 self.tokenize_quote_delimited_string(chars, &[q])
1168 .map(|s| Some(Token::QuoteDelimitedStringLiteral(s)))
1169 } else {
1170 let s = self.tokenize_word(q, chars);
1171 Ok(Some(Token::make_word(&s, None)))
1172 }
1173 }
1174 // PostgreSQL accepts "escape" string constants, which are an extension to the SQL standard.
1175 x @ 'e' | x @ 'E' if self.dialect.supports_string_escape_constant() => {
1176 let starting_loc = chars.location();
1177 chars.next(); // consume, to check the next char
1178 match chars.peek() {
1179 Some('\'') => {
1180 let s =
1181 self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
1182 Ok(Some(Token::EscapedStringLiteral(s)))
1183 }
1184 _ => {
1185 // regular identifier starting with an "E" or "e"
1186 let s = self.tokenize_word(x, chars);
1187 Ok(Some(Token::make_word(&s, None)))
1188 }
1189 }
1190 }
1191 // Unicode string literals like U&'first \000A second' are supported in some dialects, including PostgreSQL
1192 x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => {
1193 chars.next(); // consume, to check the next char
1194 if chars.peek() == Some(&'&') {
1195 // we cannot advance the iterator here, as we need to consume the '&' later if the 'u' was an identifier
1196 let mut chars_clone = chars.peekable.clone();
1197 chars_clone.next(); // consume the '&' in the clone
1198 if chars_clone.peek() == Some(&'\'') {
1199 chars.next(); // consume the '&' in the original iterator
1200 let s = unescape_unicode_single_quoted_string(chars)?;
1201 return Ok(Some(Token::UnicodeStringLiteral(s)));
1202 }
1203 }
1204 // regular identifier starting with an "U" or "u"
1205 let s = self.tokenize_word(x, chars);
1206 Ok(Some(Token::make_word(&s, None)))
1207 }
1208 // The spec only allows an uppercase 'X' to introduce a hex
1209 // string, but PostgreSQL, at least, allows a lowercase 'x' too.
1210 x @ 'x' | x @ 'X' => {
1211 chars.next(); // consume, to check the next char
1212 match chars.peek() {
1213 Some('\'') => {
1214 // X'...' - a <binary string literal>
1215 let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
1216 Ok(Some(Token::HexStringLiteral(s)))
1217 }
1218 _ => {
1219 // regular identifier starting with an "X"
1220 let s = self.tokenize_word(x, chars);
1221 Ok(Some(Token::make_word(&s, None)))
1222 }
1223 }
1224 }
1225 // single quoted string
1226 '\'' => {
1227 if self.dialect.supports_triple_quoted_string() {
1228 return self
1229 .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1230 chars,
1231 '\'',
1232 self.dialect.supports_string_literal_backslash_escape(),
1233 Token::SingleQuotedString,
1234 Token::TripleSingleQuotedString,
1235 );
1236 }
1237 let s = self.tokenize_single_quoted_string(
1238 chars,
1239 '\'',
1240 self.dialect.supports_string_literal_backslash_escape(),
1241 )?;
1242
1243 Ok(Some(Token::SingleQuotedString(s)))
1244 }
1245 // double quoted string
1246 '\"' if !self.dialect.is_delimited_identifier_start(ch)
1247 && !self.dialect.is_identifier_start(ch) =>
1248 {
1249 if self.dialect.supports_triple_quoted_string() {
1250 return self
1251 .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1252 chars,
1253 '"',
1254 self.dialect.supports_string_literal_backslash_escape(),
1255 Token::DoubleQuotedString,
1256 Token::TripleDoubleQuotedString,
1257 );
1258 }
1259 let s = self.tokenize_single_quoted_string(
1260 chars,
1261 '"',
1262 self.dialect.supports_string_literal_backslash_escape(),
1263 )?;
1264
1265 Ok(Some(Token::DoubleQuotedString(s)))
1266 }
1267 // delimited (quoted) identifier
1268 quote_start if self.dialect.is_delimited_identifier_start(ch) => {
1269 let word = self.tokenize_quoted_identifier(quote_start, chars)?;
1270 Ok(Some(Token::make_word(&word, Some(quote_start))))
1271 }
1272 // Potentially nested delimited (quoted) identifier
1273 quote_start
1274 if self
1275 .dialect
1276 .is_nested_delimited_identifier_start(quote_start)
1277 && self
1278 .dialect
1279 .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
1280 .is_some() =>
1281 {
1282 let Some((quote_start, nested_quote_start)) = self
1283 .dialect
1284 .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
1285 else {
1286 return self.tokenizer_error(
1287 chars.location(),
1288 format!("Expected nested delimiter '{quote_start}' before EOF."),
1289 );
1290 };
1291
1292 let Some(nested_quote_start) = nested_quote_start else {
1293 let word = self.tokenize_quoted_identifier(quote_start, chars)?;
1294 return Ok(Some(Token::make_word(&word, Some(quote_start))));
1295 };
1296
1297 let mut word = vec![];
1298 let quote_end = Word::matching_end_quote(quote_start);
1299 let nested_quote_end = Word::matching_end_quote(nested_quote_start);
1300 let error_loc = chars.location();
1301
1302 chars.next(); // skip the first delimiter
1303 peeking_take_while(chars, |ch| ch.is_whitespace());
1304 if chars.peek() != Some(&nested_quote_start) {
1305 return self.tokenizer_error(
1306 error_loc,
1307 format!("Expected nested delimiter '{nested_quote_start}' before EOF."),
1308 );
1309 }
1310 word.push(nested_quote_start.into());
1311 word.push(self.tokenize_quoted_identifier(nested_quote_end, chars)?);
1312 word.push(nested_quote_end.into());
1313 peeking_take_while(chars, |ch| ch.is_whitespace());
1314 if chars.peek() != Some("e_end) {
1315 return self.tokenizer_error(
1316 error_loc,
1317 format!("Expected close delimiter '{quote_end}' before EOF."),
1318 );
1319 }
1320 chars.next(); // skip close delimiter
1321
1322 Ok(Some(Token::make_word(&word.concat(), Some(quote_start))))
1323 }
1324 // numbers and period
1325 '0'..='9' | '.' => {
1326 // special case where if ._ is encountered after a word then that word
1327 // is a table and the _ is the start of the col name.
1328 // if the prev token is not a word, then this is not a valid sql
1329 // word or number.
1330 if ch == '.' && chars.peekable.clone().nth(1) == Some('_') {
1331 if let Some(Token::Word(_)) = prev_token {
1332 chars.next();
1333 return Ok(Some(Token::Period));
1334 }
1335
1336 return self.tokenizer_error(
1337 chars.location(),
1338 "Unexpected character '_'".to_string(),
1339 );
1340 }
1341
1342 // Some dialects support underscore as number separator
1343 // There can only be one at a time and it must be followed by another digit
1344 let is_number_separator = |ch: char, next_char: Option<char>| {
1345 self.dialect.supports_numeric_literal_underscores()
1346 && ch == '_'
1347 && next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit())
1348 };
1349
1350 let mut s = peeking_next_take_while(chars, |ch, next_ch| {
1351 ch.is_ascii_digit() || is_number_separator(ch, next_ch)
1352 });
1353
1354 // match binary literal that starts with 0x
1355 if s == "0" && chars.peek() == Some(&'x') {
1356 chars.next();
1357 let s2 = peeking_next_take_while(chars, |ch, next_ch| {
1358 ch.is_ascii_hexdigit() || is_number_separator(ch, next_ch)
1359 });
1360 return Ok(Some(Token::HexStringLiteral(s2)));
1361 }
1362
1363 // match one period
1364 if let Some('.') = chars.peek() {
1365 s.push('.');
1366 chars.next();
1367 }
1368
1369 // If the dialect supports identifiers that start with a numeric prefix
1370 // and we have now consumed a dot, check if the previous token was a Word.
1371 // If so, what follows is definitely not part of a decimal number and
1372 // we should yield the dot as a dedicated token so compound identifiers
1373 // starting with digits can be parsed correctly.
1374 if s == "." && self.dialect.supports_numeric_prefix() {
1375 if let Some(Token::Word(_)) = prev_token {
1376 return Ok(Some(Token::Period));
1377 }
1378 }
1379
1380 // Consume fractional digits.
1381 s += &peeking_next_take_while(chars, |ch, next_ch| {
1382 ch.is_ascii_digit() || is_number_separator(ch, next_ch)
1383 });
1384
1385 // No fraction -> Token::Period
1386 if s == "." {
1387 return Ok(Some(Token::Period));
1388 }
1389
1390 // Parse exponent as number
1391 let mut exponent_part = String::new();
1392 if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
1393 let mut char_clone = chars.peekable.clone();
1394 exponent_part.push(char_clone.next().unwrap());
1395
1396 // Optional sign
1397 match char_clone.peek() {
1398 Some(&c) if matches!(c, '+' | '-') => {
1399 exponent_part.push(c);
1400 char_clone.next();
1401 }
1402 _ => (),
1403 }
1404
1405 match char_clone.peek() {
1406 // Definitely an exponent, get original iterator up to speed and use it
1407 Some(&c) if c.is_ascii_digit() => {
1408 for _ in 0..exponent_part.len() {
1409 chars.next();
1410 }
1411 exponent_part +=
1412 &peeking_take_while(chars, |ch| ch.is_ascii_digit());
1413 s += exponent_part.as_str();
1414 }
1415 // Not an exponent, discard the work done
1416 _ => (),
1417 }
1418 }
1419
1420 // If the dialect supports identifiers that start with a numeric prefix,
1421 // we need to check if the value is in fact an identifier and must thus
1422 // be tokenized as a word.
1423 if self.dialect.supports_numeric_prefix() {
1424 if exponent_part.is_empty() {
1425 // If it is not a number with an exponent, it may be
1426 // an identifier starting with digits.
1427 let word =
1428 peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
1429
1430 if !word.is_empty() {
1431 s += word.as_str();
1432 return Ok(Some(Token::make_word(s.as_str(), None)));
1433 }
1434 } else if prev_token == Some(&Token::Period) {
1435 // If the previous token was a period, thus not belonging to a number,
1436 // the value we have is part of an identifier.
1437 return Ok(Some(Token::make_word(s.as_str(), None)));
1438 }
1439 }
1440
1441 let long = if chars.peek() == Some(&'L') {
1442 chars.next();
1443 true
1444 } else {
1445 false
1446 };
1447 Ok(Some(Token::Number(s, long)))
1448 }
1449 // punctuation
1450 '(' => self.consume_and_return(chars, Token::LParen),
1451 ')' => self.consume_and_return(chars, Token::RParen),
1452 ',' => self.consume_and_return(chars, Token::Comma),
1453 // operators
1454 '-' => {
1455 chars.next(); // consume the '-'
1456
1457 match chars.peek() {
1458 Some('-') => {
1459 let mut is_comment = true;
1460 if self.dialect.requires_single_line_comment_whitespace() {
1461 is_comment = chars
1462 .peekable
1463 .clone()
1464 .nth(1)
1465 .is_some_and(char::is_whitespace);
1466 }
1467
1468 if is_comment {
1469 chars.next(); // consume second '-'
1470 let comment = self.tokenize_single_line_comment(chars);
1471 return Ok(Some(Token::Whitespace(
1472 Whitespace::SingleLineComment {
1473 prefix: "--".to_owned(),
1474 comment,
1475 },
1476 )));
1477 }
1478
1479 self.start_binop(chars, "-", Token::Minus)
1480 }
1481 Some('>') => {
1482 chars.next();
1483 match chars.peek() {
1484 Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow),
1485 _ => self.start_binop(chars, "->", Token::Arrow),
1486 }
1487 }
1488 // a regular '-' operator
1489 _ => self.start_binop(chars, "-", Token::Minus),
1490 }
1491 }
1492 '/' => {
1493 chars.next(); // consume the '/'
1494 match chars.peek() {
1495 Some('*') => {
1496 chars.next(); // consume the '*', starting a multi-line comment
1497 self.tokenize_multiline_comment(chars)
1498 }
1499 Some('/') if dialect_of!(self is SnowflakeDialect) => {
1500 chars.next(); // consume the second '/', starting a snowflake single-line comment
1501 let comment = self.tokenize_single_line_comment(chars);
1502 Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
1503 prefix: "//".to_owned(),
1504 comment,
1505 })))
1506 }
1507 Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => {
1508 self.consume_and_return(chars, Token::DuckIntDiv)
1509 }
1510 // a regular '/' operator
1511 _ => Ok(Some(Token::Div)),
1512 }
1513 }
1514 '+' => self.consume_and_return(chars, Token::Plus),
1515 '*' => self.consume_and_return(chars, Token::Mul),
1516 '%' => {
1517 chars.next(); // advance past '%'
1518 match chars.peek() {
1519 Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)),
1520 Some(sch) if self.dialect.is_identifier_start('%') => {
1521 self.tokenize_identifier_or_keyword([ch, *sch], chars)
1522 }
1523 _ => self.start_binop(chars, "%", Token::Mod),
1524 }
1525 }
1526 '|' => {
1527 chars.next(); // consume the '|'
1528 match chars.peek() {
1529 Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot),
1530 Some('|') => {
1531 chars.next(); // consume the second '|'
1532 match chars.peek() {
1533 Some('/') => {
1534 self.consume_for_binop(chars, "||/", Token::PGCubeRoot)
1535 }
1536 _ => self.start_binop(chars, "||", Token::StringConcat),
1537 }
1538 }
1539 Some('&') if self.dialect.supports_geometric_types() => {
1540 chars.next(); // consume
1541 match chars.peek() {
1542 Some('>') => self.consume_for_binop(
1543 chars,
1544 "|&>",
1545 Token::VerticalBarAmpersandRightAngleBracket,
1546 ),
1547 _ => self.start_binop_opt(chars, "|&", None),
1548 }
1549 }
1550 Some('>') if self.dialect.supports_geometric_types() => {
1551 chars.next(); // consume
1552 match chars.peek() {
1553 Some('>') => self.consume_for_binop(
1554 chars,
1555 "|>>",
1556 Token::VerticalBarShiftRight,
1557 ),
1558 _ => self.start_binop_opt(chars, "|>", None),
1559 }
1560 }
1561 Some('>') if self.dialect.supports_pipe_operator() => {
1562 self.consume_for_binop(chars, "|>", Token::VerticalBarRightAngleBracket)
1563 }
1564 // Bitshift '|' operator
1565 _ => self.start_binop(chars, "|", Token::Pipe),
1566 }
1567 }
1568 '=' => {
1569 chars.next(); // consume
1570 match chars.peek() {
1571 Some('>') => self.consume_and_return(chars, Token::RArrow),
1572 Some('=') => self.consume_and_return(chars, Token::DoubleEq),
1573 _ => Ok(Some(Token::Eq)),
1574 }
1575 }
1576 '!' => {
1577 chars.next(); // consume
1578 match chars.peek() {
1579 Some('=') => self.consume_and_return(chars, Token::Neq),
1580 Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
1581 Some('~') => {
1582 chars.next();
1583 match chars.peek() {
1584 Some('*') => self
1585 .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
1586 Some('~') => {
1587 chars.next();
1588 match chars.peek() {
1589 Some('*') => self.consume_and_return(
1590 chars,
1591 Token::ExclamationMarkDoubleTildeAsterisk,
1592 ),
1593 _ => Ok(Some(Token::ExclamationMarkDoubleTilde)),
1594 }
1595 }
1596 _ => Ok(Some(Token::ExclamationMarkTilde)),
1597 }
1598 }
1599 _ => Ok(Some(Token::ExclamationMark)),
1600 }
1601 }
1602 '<' => {
1603 chars.next(); // consume
1604 match chars.peek() {
1605 Some('=') => {
1606 chars.next();
1607 match chars.peek() {
1608 Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship),
1609 _ => self.start_binop(chars, "<=", Token::LtEq),
1610 }
1611 }
1612 Some('|') if self.dialect.supports_geometric_types() => {
1613 self.consume_for_binop(chars, "<<|", Token::ShiftLeftVerticalBar)
1614 }
1615 Some('>') => self.consume_for_binop(chars, "<>", Token::Neq),
1616 Some('<') if self.dialect.supports_geometric_types() => {
1617 chars.next(); // consume
1618 match chars.peek() {
1619 Some('|') => self.consume_for_binop(
1620 chars,
1621 "<<|",
1622 Token::ShiftLeftVerticalBar,
1623 ),
1624 _ => self.start_binop(chars, "<<", Token::ShiftLeft),
1625 }
1626 }
1627 Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft),
1628 Some('-') if self.dialect.supports_geometric_types() => {
1629 chars.next(); // consume
1630 match chars.peek() {
1631 Some('>') => {
1632 self.consume_for_binop(chars, "<->", Token::TwoWayArrow)
1633 }
1634 _ => self.start_binop_opt(chars, "<-", None),
1635 }
1636 }
1637 Some('^') if self.dialect.supports_geometric_types() => {
1638 self.consume_for_binop(chars, "<^", Token::LeftAngleBracketCaret)
1639 }
1640 Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt),
1641 _ => self.start_binop(chars, "<", Token::Lt),
1642 }
1643 }
1644 '>' => {
1645 chars.next(); // consume
1646 match chars.peek() {
1647 Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq),
1648 Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight),
1649 Some('^') if self.dialect.supports_geometric_types() => {
1650 self.consume_for_binop(chars, ">^", Token::RightAngleBracketCaret)
1651 }
1652 _ => self.start_binop(chars, ">", Token::Gt),
1653 }
1654 }
1655 ':' => {
1656 chars.next();
1657 match chars.peek() {
1658 Some(':') => self.consume_and_return(chars, Token::DoubleColon),
1659 Some('=') => self.consume_and_return(chars, Token::Assignment),
1660 _ => Ok(Some(Token::Colon)),
1661 }
1662 }
1663 ';' => self.consume_and_return(chars, Token::SemiColon),
1664 '\\' => self.consume_and_return(chars, Token::Backslash),
1665 '[' => self.consume_and_return(chars, Token::LBracket),
1666 ']' => self.consume_and_return(chars, Token::RBracket),
1667 '&' => {
1668 chars.next(); // consume the '&'
1669 match chars.peek() {
1670 Some('>') if self.dialect.supports_geometric_types() => {
1671 chars.next();
1672 self.consume_and_return(chars, Token::AmpersandRightAngleBracket)
1673 }
1674 Some('<') if self.dialect.supports_geometric_types() => {
1675 chars.next(); // consume
1676 match chars.peek() {
1677 Some('|') => self.consume_and_return(
1678 chars,
1679 Token::AmpersandLeftAngleBracketVerticalBar,
1680 ),
1681 _ => {
1682 self.start_binop(chars, "&<", Token::AmpersandLeftAngleBracket)
1683 }
1684 }
1685 }
1686 Some('&') => {
1687 chars.next(); // consume the second '&'
1688 self.start_binop(chars, "&&", Token::Overlap)
1689 }
1690 // Bitshift '&' operator
1691 _ => self.start_binop(chars, "&", Token::Ampersand),
1692 }
1693 }
1694 '^' => {
1695 chars.next(); // consume the '^'
1696 match chars.peek() {
1697 Some('@') => self.consume_and_return(chars, Token::CaretAt),
1698 _ => Ok(Some(Token::Caret)),
1699 }
1700 }
1701 '{' => self.consume_and_return(chars, Token::LBrace),
1702 '}' => self.consume_and_return(chars, Token::RBrace),
1703 '#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect | HiveDialect) =>
1704 {
1705 chars.next(); // consume the '#', starting a snowflake single-line comment
1706 let comment = self.tokenize_single_line_comment(chars);
1707 Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
1708 prefix: "#".to_owned(),
1709 comment,
1710 })))
1711 }
1712 '~' => {
1713 chars.next(); // consume
1714 match chars.peek() {
1715 Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk),
1716 Some('=') if self.dialect.supports_geometric_types() => {
1717 self.consume_for_binop(chars, "~=", Token::TildeEqual)
1718 }
1719 Some('~') => {
1720 chars.next();
1721 match chars.peek() {
1722 Some('*') => {
1723 self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk)
1724 }
1725 _ => self.start_binop(chars, "~~", Token::DoubleTilde),
1726 }
1727 }
1728 _ => self.start_binop(chars, "~", Token::Tilde),
1729 }
1730 }
1731 '#' => {
1732 chars.next();
1733 match chars.peek() {
1734 Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus),
1735 Some('>') => {
1736 chars.next();
1737 match chars.peek() {
1738 Some('>') => {
1739 self.consume_for_binop(chars, "#>>", Token::HashLongArrow)
1740 }
1741 _ => self.start_binop(chars, "#>", Token::HashArrow),
1742 }
1743 }
1744 Some(' ') => Ok(Some(Token::Sharp)),
1745 Some('#') if self.dialect.supports_geometric_types() => {
1746 self.consume_for_binop(chars, "##", Token::DoubleSharp)
1747 }
1748 Some(sch) if self.dialect.is_identifier_start('#') => {
1749 self.tokenize_identifier_or_keyword([ch, *sch], chars)
1750 }
1751 _ => self.start_binop(chars, "#", Token::Sharp),
1752 }
1753 }
1754 '@' => {
1755 chars.next();
1756 match chars.peek() {
1757 Some('@') if self.dialect.supports_geometric_types() => {
1758 self.consume_and_return(chars, Token::AtAt)
1759 }
1760 Some('-') if self.dialect.supports_geometric_types() => {
1761 chars.next();
1762 match chars.peek() {
1763 Some('@') => self.consume_and_return(chars, Token::AtDashAt),
1764 _ => self.start_binop_opt(chars, "@-", None),
1765 }
1766 }
1767 Some('>') => self.consume_and_return(chars, Token::AtArrow),
1768 Some('?') => self.consume_and_return(chars, Token::AtQuestion),
1769 Some('@') => {
1770 chars.next();
1771 match chars.peek() {
1772 Some(' ') => Ok(Some(Token::AtAt)),
1773 Some(tch) if self.dialect.is_identifier_start('@') => {
1774 self.tokenize_identifier_or_keyword([ch, '@', *tch], chars)
1775 }
1776 _ => Ok(Some(Token::AtAt)),
1777 }
1778 }
1779 Some(' ') => Ok(Some(Token::AtSign)),
1780 // We break on quotes here, because no dialect allows identifiers starting
1781 // with @ and containing quotation marks (e.g. `@'foo'`) unless they are
1782 // quoted, which is tokenized as a quoted string, not here (e.g.
1783 // `"@'foo'"`). Further, at least two dialects parse `@` followed by a
1784 // quoted string as two separate tokens, which this allows. For example,
1785 // Postgres parses `@'1'` as the absolute value of '1' which is implicitly
1786 // cast to a numeric type. And when parsing MySQL-style grantees (e.g.
1787 // `GRANT ALL ON *.* to 'root'@'localhost'`), we also want separate tokens
1788 // for the user, the `@`, and the host.
1789 Some('\'') => Ok(Some(Token::AtSign)),
1790 Some('\"') => Ok(Some(Token::AtSign)),
1791 Some('`') => Ok(Some(Token::AtSign)),
1792 Some(sch) if self.dialect.is_identifier_start('@') => {
1793 self.tokenize_identifier_or_keyword([ch, *sch], chars)
1794 }
1795 _ => Ok(Some(Token::AtSign)),
1796 }
1797 }
1798 // Postgres uses ? for jsonb operators, not prepared statements
1799 '?' if self.dialect.supports_geometric_types() => {
1800 chars.next(); // consume
1801 match chars.peek() {
1802 Some('|') => {
1803 chars.next();
1804 match chars.peek() {
1805 Some('|') => self.consume_and_return(
1806 chars,
1807 Token::QuestionMarkDoubleVerticalBar,
1808 ),
1809 _ => Ok(Some(Token::QuestionPipe)),
1810 }
1811 }
1812
1813 Some('&') => self.consume_and_return(chars, Token::QuestionAnd),
1814 Some('-') => {
1815 chars.next(); // consume
1816 match chars.peek() {
1817 Some('|') => self
1818 .consume_and_return(chars, Token::QuestionMarkDashVerticalBar),
1819 _ => Ok(Some(Token::QuestionMarkDash)),
1820 }
1821 }
1822 Some('#') => self.consume_and_return(chars, Token::QuestionMarkSharp),
1823 _ => Ok(Some(Token::Question)),
1824 }
1825 }
1826 '?' => {
1827 chars.next();
1828 let s = peeking_take_while(chars, |ch| ch.is_numeric());
1829 Ok(Some(Token::Placeholder(format!("?{s}"))))
1830 }
1831
1832 // identifier or keyword
1833 ch if self.dialect.is_identifier_start(ch) => {
1834 self.tokenize_identifier_or_keyword([ch], chars)
1835 }
1836 '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),
1837
1838 // whitespace check (including unicode chars) should be last as it covers some of the chars above
1839 ch if ch.is_whitespace() => {
1840 self.consume_and_return(chars, Token::Whitespace(Whitespace::Space))
1841 }
1842 other => self.consume_and_return(chars, Token::Char(other)),
1843 },
1844 None => Ok(None),
1845 }
1846 }
1847
1848 /// Consume the next character, then parse a custom binary operator. The next character should be included in the prefix
1849 fn consume_for_binop(
1850 &self,
1851 chars: &mut State,
1852 prefix: &str,
1853 default: Token,
1854 ) -> Result<Option<Token>, TokenizerError> {
1855 chars.next(); // consume the first char
1856 self.start_binop_opt(chars, prefix, Some(default))
1857 }
1858
1859 /// parse a custom binary operator
1860 fn start_binop(
1861 &self,
1862 chars: &mut State,
1863 prefix: &str,
1864 default: Token,
1865 ) -> Result<Option<Token>, TokenizerError> {
1866 self.start_binop_opt(chars, prefix, Some(default))
1867 }
1868
1869 /// parse a custom binary operator
1870 fn start_binop_opt(
1871 &self,
1872 chars: &mut State,
1873 prefix: &str,
1874 default: Option<Token>,
1875 ) -> Result<Option<Token>, TokenizerError> {
1876 let mut custom = None;
1877 while let Some(&ch) = chars.peek() {
1878 if !self.dialect.is_custom_operator_part(ch) {
1879 break;
1880 }
1881
1882 custom.get_or_insert_with(|| prefix.to_string()).push(ch);
1883 chars.next();
1884 }
1885 match (custom, default) {
1886 (Some(custom), _) => Ok(Token::CustomBinaryOperator(custom).into()),
1887 (None, Some(tok)) => Ok(Some(tok)),
1888 (None, None) => self.tokenizer_error(
1889 chars.location(),
1890 format!("Expected a valid binary operator after '{prefix}'"),
1891 ),
1892 }
1893 }
1894
    /// Tokenize dollar preceded value (i.e: a string/placeholder)
    ///
    /// Depending on dialect support, this handles:
    /// - untagged dollar-quoted strings: `$$abc$$`
    /// - tagged dollar-quoted strings: `$tag$abc$tag$`
    /// - placeholders: `$1`, `$name` (and `$…$…` forms when the dialect
    ///   supports `$` inside placeholders)
    fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
        let mut s = String::new();
        let mut value = String::new();

        // Consume the leading `$`.
        chars.next();

        // A second `$` immediately follows: this is an untagged dollar-quoted
        // string (`$$...$$`) — unless the dialect treats `$` as a placeholder
        // character, in which case `$$` is tokenized as a placeholder below.
        if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
            chars.next();

            let mut is_terminated = false;
            let mut prev: Option<char> = None;

            // Scan for the terminating `$$`, buffering everything in between.
            while let Some(&ch) = chars.peek() {
                if prev == Some('$') {
                    if ch == '$' {
                        chars.next();
                        is_terminated = true;
                        break;
                    } else {
                        // The previous `$` was not a terminator after all;
                        // emit it (it was withheld) along with this char.
                        s.push('$');
                        s.push(ch);
                    }
                } else if ch != '$' {
                    s.push(ch);
                }

                prev = Some(ch);
                chars.next();
            }

            return if chars.peek().is_none() && !is_terminated {
                self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
            } else {
                Ok(Token::DollarQuotedString(DollarQuotedString {
                    value: s,
                    tag: None,
                }))
            };
        } else {
            // Read a tag (for `$tag$...$tag$`) or a placeholder name/number.
            value.push_str(&peeking_take_while(chars, |ch| {
                ch.is_alphanumeric()
                    || ch == '_'
                    // Allow $ as a placeholder character if the dialect supports it
                    || matches!(ch, '$' if self.dialect.supports_dollar_placeholder())
            }));

            // A `$` follows the tag: this is a tagged dollar-quoted string,
            // unless the dialect treats `$` as a placeholder character.
            if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
                chars.next();

                let mut temp = String::new();
                let end_delimiter = format!("${value}$");

                // Accumulate until the closing `$tag$` delimiter appears.
                loop {
                    match chars.next() {
                        Some(ch) => {
                            temp.push(ch);

                            if temp.ends_with(&end_delimiter) {
                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
                                    s.push_str(temp);
                                }
                                break;
                            }
                        }
                        None => {
                            if temp.ends_with(&end_delimiter) {
                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
                                    s.push_str(temp);
                                }
                                break;
                            }

                            return self.tokenizer_error(
                                chars.location(),
                                "Unterminated dollar-quoted, expected $",
                            );
                        }
                    }
                }
            } else {
                // No dollar-quoted body: it was a plain placeholder like `$1`.
                return Ok(Token::Placeholder(format!("${value}")));
            }
        }

        Ok(Token::DollarQuotedString(DollarQuotedString {
            value: s,
            tag: if value.is_empty() { None } else { Some(value) },
        }))
    }
1987
1988 fn tokenizer_error<R>(
1989 &self,
1990 loc: Location,
1991 message: impl Into<String>,
1992 ) -> Result<R, TokenizerError> {
1993 Err(TokenizerError {
1994 message: message.into(),
1995 location: loc,
1996 })
1997 }
1998
1999 // Consume characters until newline
2000 fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
2001 let mut comment = peeking_take_while(chars, |ch| match ch {
2002 '\n' => false, // Always stop at \n
2003 '\r' if dialect_of!(self is PostgreSqlDialect) => false, // Stop at \r for Postgres
2004 _ => true, // Keep consuming for other characters
2005 });
2006
2007 if let Some(ch) = chars.next() {
2008 assert!(ch == '\n' || ch == '\r');
2009 comment.push(ch);
2010 }
2011
2012 comment
2013 }
2014
2015 /// Tokenize an identifier or keyword, after the first char is already consumed.
2016 fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
2017 let mut s = first_chars.into();
2018 s.push_str(&peeking_take_while(chars, |ch| {
2019 self.dialect.is_identifier_part(ch)
2020 }));
2021 s
2022 }
2023
2024 /// Read a quoted identifier
2025 fn tokenize_quoted_identifier(
2026 &self,
2027 quote_start: char,
2028 chars: &mut State,
2029 ) -> Result<String, TokenizerError> {
2030 let error_loc = chars.location();
2031 chars.next(); // consume the opening quote
2032 let quote_end = Word::matching_end_quote(quote_start);
2033 let (s, last_char) = self.parse_quoted_ident(chars, quote_end);
2034
2035 if last_char == Some(quote_end) {
2036 Ok(s)
2037 } else {
2038 self.tokenizer_error(
2039 error_loc,
2040 format!("Expected close delimiter '{quote_end}' before EOF."),
2041 )
2042 }
2043 }
2044
2045 /// Read a single quoted string, starting with the opening quote.
2046 fn tokenize_escaped_single_quoted_string(
2047 &self,
2048 starting_loc: Location,
2049 chars: &mut State,
2050 ) -> Result<String, TokenizerError> {
2051 if let Some(s) = unescape_single_quoted_string(chars) {
2052 return Ok(s);
2053 }
2054
2055 self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
2056 }
2057
    /// Reads a string literal quoted by a single or triple quote characters.
    /// Examples: `'abc'`, `'''abc'''`, `"""abc"""`.
    ///
    /// `single_quote_token` and `triple_quote_token` build the returned token
    /// from the literal's contents, depending on which quoting form matched.
    fn tokenize_single_or_triple_quoted_string<F>(
        &self,
        chars: &mut State,
        quote_style: char,
        backslash_escape: bool,
        single_quote_token: F,
        triple_quote_token: F,
    ) -> Result<Option<Token>, TokenizerError>
    where
        F: Fn(String) -> Token,
    {
        let error_loc = chars.location();

        // Count (and consume) up to three consecutive opening quote chars.
        let mut num_opening_quotes = 0u8;
        for _ in 0..3 {
            if Some(&quote_style) == chars.peek() {
                chars.next(); // Consume quote.
                num_opening_quotes += 1;
            } else {
                break;
            }
        }

        let (token_fn, num_quote_chars) = match num_opening_quotes {
            1 => (single_quote_token, NumStringQuoteChars::One),
            2 => {
                // If we matched double quotes, then this is an empty string.
                return Ok(Some(single_quote_token("".into())));
            }
            3 => {
                let Some(num_quote_chars) = NonZeroU8::new(3) else {
                    return self.tokenizer_error(error_loc, "invalid number of opening quotes");
                };
                (
                    triple_quote_token,
                    NumStringQuoteChars::Many(num_quote_chars),
                )
            }
            _ => {
                return self.tokenizer_error(error_loc, "invalid string literal opening");
            }
        };

        let settings = TokenizeQuotedStringSettings {
            quote_style,
            num_quote_chars,
            // The opening quotes were already consumed by the loop above.
            num_opening_quotes_to_consume: 0,
            backslash_escape,
        };

        self.tokenize_quoted_string(chars, settings)
            .map(token_fn)
            .map(Some)
    }
2114
2115 /// Reads a string literal quoted by a single quote character.
2116 fn tokenize_single_quoted_string(
2117 &self,
2118 chars: &mut State,
2119 quote_style: char,
2120 backslash_escape: bool,
2121 ) -> Result<String, TokenizerError> {
2122 self.tokenize_quoted_string(
2123 chars,
2124 TokenizeQuotedStringSettings {
2125 quote_style,
2126 num_quote_chars: NumStringQuoteChars::One,
2127 num_opening_quotes_to_consume: 1,
2128 backslash_escape,
2129 },
2130 )
2131 }
2132
    /// Reads a quote delimited string expecting `chars.next()` to deliver a quote.
    ///
    /// See <https://docs.oracle.com/en/database/oracle/oracle-database/21/sqlrf/Literals.html#GUID-1824CBAA-6E16-4921-B2A6-112FB02248DA>
    fn tokenize_quote_delimited_string(
        &self,
        chars: &mut State,
        // the prefix that introduced the possible literal or word,
        // e.g. "Q" or "nq"
        literal_prefix: &[char],
    ) -> Result<QuoteDelimitedString, TokenizerError> {
        let literal_start_loc = chars.location();
        // Consume the quote that follows the prefix (e.g. the `'` in `q'...`).
        chars.next();

        let start_quote_loc = chars.location();
        // The next char is the user-chosen delimiter; whitespace or EOF here
        // is invalid.
        let (start_quote, end_quote) = match chars.next() {
            None | Some(' ') | Some('\t') | Some('\r') | Some('\n') => {
                // NOTE(review): the trailing `''` in this format string renders
                // e.g. `after 'q''` — confirm whether the doubled quote is
                // intended (showing prefix + quote) or a typo.
                return self.tokenizer_error(
                    start_quote_loc,
                    format!(
                        "Invalid space, tab, newline, or EOF after '{}''",
                        String::from_iter(literal_prefix)
                    ),
                );
            }
            Some(c) => (
                c,
                // Bracket-like delimiters pair with their closing counterpart;
                // any other char closes with itself.
                match c {
                    '[' => ']',
                    '{' => '}',
                    '<' => '>',
                    '(' => ')',
                    c => c,
                },
            ),
        };

        // read the string literal until the "quote character" following a by literal quote
        let mut value = String::new();
        while let Some(ch) = chars.next() {
            if ch == end_quote {
                // The literal ends only at the end delimiter immediately
                // followed by a single quote; a lone delimiter is content.
                if let Some('\'') = chars.peek() {
                    chars.next(); // ~ consume the quote
                    return Ok(QuoteDelimitedString {
                        start_quote,
                        value,
                        end_quote,
                    });
                }
            }
            value.push(ch);
        }

        self.tokenizer_error(literal_start_loc, "Unterminated string literal")
    }
2187
    /// Read a quoted string.
    ///
    /// Consumes `settings.num_opening_quotes_to_consume` opening quotes, then
    /// reads until the literal's closing quote sequence, handling doubled
    /// quotes and (optionally) backslash escapes along the way.
    fn tokenize_quoted_string(
        &self,
        chars: &mut State,
        settings: TokenizeQuotedStringSettings,
    ) -> Result<String, TokenizerError> {
        let mut s = String::new();
        let error_loc = chars.location();

        // Consume any opening quotes.
        for _ in 0..settings.num_opening_quotes_to_consume {
            if Some(settings.quote_style) != chars.next() {
                return self.tokenizer_error(error_loc, "invalid string literal opening");
            }
        }

        // Tracks how many quote chars in a row we have just read, so that a
        // triple-quoted literal can recognize its 3-quote terminator.
        let mut num_consecutive_quotes = 0;
        while let Some(&ch) = chars.peek() {
            // `Some(..)` when one more quote char would terminate the literal.
            let pending_final_quote = match settings.num_quote_chars {
                NumStringQuoteChars::One => Some(NumStringQuoteChars::One),
                n @ NumStringQuoteChars::Many(count)
                    if num_consecutive_quotes + 1 == count.get() =>
                {
                    Some(n)
                }
                NumStringQuoteChars::Many(_) => None,
            };

            match ch {
                char if char == settings.quote_style && pending_final_quote.is_some() => {
                    chars.next(); // consume

                    if let Some(NumStringQuoteChars::Many(count)) = pending_final_quote {
                        // For an initial string like `"""abc"""`, at this point we have
                        // `abc""` in the buffer and have now matched the final `"`.
                        // However, the string to return is simply `abc`, so we strip off
                        // the trailing quotes before returning.
                        let mut buf = s.chars();
                        for _ in 1..count.get() {
                            buf.next_back();
                        }
                        return Ok(buf.as_str().to_string());
                    } else if chars
                        .peek()
                        .map(|c| *c == settings.quote_style)
                        .unwrap_or(false)
                    {
                        // A doubled quote is an escaped quote character.
                        s.push(ch);
                        if !self.unescape {
                            // In no-escape mode, the given query has to be saved completely
                            s.push(ch);
                        }
                        chars.next();
                    } else {
                        // Single closing quote: the literal is complete.
                        return Ok(s);
                    }
                }
                '\\' if settings.backslash_escape => {
                    // consume backslash
                    chars.next();

                    num_consecutive_quotes = 0;

                    if let Some(next) = chars.peek() {
                        if !self.unescape
                            || (self.dialect.ignores_wildcard_escapes()
                                && (*next == '%' || *next == '_'))
                        {
                            // In no-escape mode, the given query has to be saved completely
                            // including backslashes. Similarly, with ignore_like_wildcard_escapes,
                            // the backslash is not stripped.
                            s.push(ch);
                            s.push(*next);
                            chars.next(); // consume next
                        } else {
                            // Translate recognized escape sequences; any other
                            // char is kept as-is (dropping the backslash).
                            let n = match next {
                                '0' => '\0',
                                'a' => '\u{7}',
                                'b' => '\u{8}',
                                'f' => '\u{c}',
                                'n' => '\n',
                                'r' => '\r',
                                't' => '\t',
                                'Z' => '\u{1a}',
                                _ => *next,
                            };
                            s.push(n);
                            chars.next(); // consume next
                        }
                    }
                }
                ch => {
                    chars.next(); // consume ch

                    if ch == settings.quote_style {
                        num_consecutive_quotes += 1;
                    } else {
                        num_consecutive_quotes = 0;
                    }

                    s.push(ch);
                }
            }
        }
        self.tokenizer_error(error_loc, "Unterminated string literal")
    }
2294
    /// Reads a `/* ... */` comment after the opening `/*` has been consumed,
    /// honoring nested `/* */` pairs when the dialect supports them.
    ///
    /// Errors if EOF is reached before the matching `*/`.
    fn tokenize_multiline_comment(
        &self,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut s = String::new();
        // Depth of open `/*` pairs; the opening one is already consumed.
        let mut nested = 1;
        let supports_nested_comments = self.dialect.supports_nested_comments();
        loop {
            match chars.next() {
                Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
                    chars.next(); // consume the '*'
                    s.push('/');
                    s.push('*');
                    nested += 1;
                }
                Some('*') if matches!(chars.peek(), Some('/')) => {
                    chars.next(); // consume the '/'
                    nested -= 1;
                    if nested == 0 {
                        break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                    }
                    // Closing an inner nested comment: keep its `*/` in the text.
                    s.push('*');
                    s.push('/');
                }
                Some(ch) => {
                    s.push(ch);
                }
                None => {
                    break self.tokenizer_error(
                        chars.location(),
                        "Unexpected EOF while in a multi-line comment",
                    );
                }
            }
        }
    }
2331
2332 fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
2333 let mut last_char = None;
2334 let mut s = String::new();
2335 while let Some(ch) = chars.next() {
2336 if ch == quote_end {
2337 if chars.peek() == Some("e_end) {
2338 chars.next();
2339 s.push(ch);
2340 if !self.unescape {
2341 // In no-escape mode, the given query has to be saved completely
2342 s.push(ch);
2343 }
2344 } else {
2345 last_char = Some(quote_end);
2346 break;
2347 }
2348 } else {
2349 s.push(ch);
2350 }
2351 }
2352 (s, last_char)
2353 }
2354
2355 #[allow(clippy::unnecessary_wraps)]
2356 fn consume_and_return(
2357 &self,
2358 chars: &mut State,
2359 t: Token,
2360 ) -> Result<Option<Token>, TokenizerError> {
2361 chars.next();
2362 Ok(Some(t))
2363 }
2364}
2365
2366/// Read from `chars` until `predicate` returns `false` or EOF is hit.
2367/// Return the characters read as String, and keep the first non-matching
2368/// char available as `chars.next()`.
2369fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
2370 let mut s = String::new();
2371 while let Some(&ch) = chars.peek() {
2372 if predicate(ch) {
2373 chars.next(); // consume
2374 s.push(ch);
2375 } else {
2376 break;
2377 }
2378 }
2379 s
2380}
2381
2382/// Same as peeking_take_while, but also passes the next character to the predicate.
2383fn peeking_next_take_while(
2384 chars: &mut State,
2385 mut predicate: impl FnMut(char, Option<char>) -> bool,
2386) -> String {
2387 let mut s = String::new();
2388 while let Some(&ch) = chars.peek() {
2389 let next_char = chars.peekable.clone().nth(1);
2390 if predicate(ch, next_char) {
2391 chars.next(); // consume
2392 s.push(ch);
2393 } else {
2394 break;
2395 }
2396 }
2397 s
2398}
2399
/// Consumes a single-quoted string literal (opening quote included) from
/// `chars`, resolving backslash and doubled-quote escapes.
///
/// Returns `None` on unterminated input, a malformed escape, or an escape
/// resolving to the disallowed NUL character.
fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
    Unescape::new(chars).unescape()
}
2403
/// Helper that incrementally consumes and unescapes a single-quoted string
/// literal from the tokenizer's character stream.
struct Unescape<'a: 'b, 'b> {
    chars: &'b mut State<'a>,
}
2407
2408impl<'a: 'b, 'b> Unescape<'a, 'b> {
2409 fn new(chars: &'b mut State<'a>) -> Self {
2410 Self { chars }
2411 }
2412 fn unescape(mut self) -> Option<String> {
2413 let mut unescaped = String::new();
2414
2415 self.chars.next();
2416
2417 while let Some(c) = self.chars.next() {
2418 if c == '\'' {
2419 // case: ''''
2420 if self.chars.peek().map(|c| *c == '\'').unwrap_or(false) {
2421 self.chars.next();
2422 unescaped.push('\'');
2423 continue;
2424 }
2425 return Some(unescaped);
2426 }
2427
2428 if c != '\\' {
2429 unescaped.push(c);
2430 continue;
2431 }
2432
2433 let c = match self.chars.next()? {
2434 'b' => '\u{0008}',
2435 'f' => '\u{000C}',
2436 'n' => '\n',
2437 'r' => '\r',
2438 't' => '\t',
2439 'u' => self.unescape_unicode_16()?,
2440 'U' => self.unescape_unicode_32()?,
2441 'x' => self.unescape_hex()?,
2442 c if c.is_digit(8) => self.unescape_octal(c)?,
2443 c => c,
2444 };
2445
2446 unescaped.push(Self::check_null(c)?);
2447 }
2448
2449 None
2450 }
2451
2452 #[inline]
2453 fn check_null(c: char) -> Option<char> {
2454 if c == '\0' {
2455 None
2456 } else {
2457 Some(c)
2458 }
2459 }
2460
2461 #[inline]
2462 fn byte_to_char<const RADIX: u32>(s: &str) -> Option<char> {
2463 // u32 is used here because Pg has an overflow operation rather than throwing an exception directly.
2464 match u32::from_str_radix(s, RADIX) {
2465 Err(_) => None,
2466 Ok(n) => {
2467 let n = n & 0xFF;
2468 if n <= 127 {
2469 char::from_u32(n)
2470 } else {
2471 None
2472 }
2473 }
2474 }
2475 }
2476
2477 // Hexadecimal byte value. \xh, \xhh (h = 0–9, A–F)
2478 fn unescape_hex(&mut self) -> Option<char> {
2479 let mut s = String::new();
2480
2481 for _ in 0..2 {
2482 match self.next_hex_digit() {
2483 Some(c) => s.push(c),
2484 None => break,
2485 }
2486 }
2487
2488 if s.is_empty() {
2489 return Some('x');
2490 }
2491
2492 Self::byte_to_char::<16>(&s)
2493 }
2494
2495 #[inline]
2496 fn next_hex_digit(&mut self) -> Option<char> {
2497 match self.chars.peek() {
2498 Some(c) if c.is_ascii_hexdigit() => self.chars.next(),
2499 _ => None,
2500 }
2501 }
2502
2503 // Octal byte value. \o, \oo, \ooo (o = 0–7)
2504 fn unescape_octal(&mut self, c: char) -> Option<char> {
2505 let mut s = String::new();
2506
2507 s.push(c);
2508 for _ in 0..2 {
2509 match self.next_octal_digest() {
2510 Some(c) => s.push(c),
2511 None => break,
2512 }
2513 }
2514
2515 Self::byte_to_char::<8>(&s)
2516 }
2517
2518 #[inline]
2519 fn next_octal_digest(&mut self) -> Option<char> {
2520 match self.chars.peek() {
2521 Some(c) if c.is_digit(8) => self.chars.next(),
2522 _ => None,
2523 }
2524 }
2525
2526 // 16-bit hexadecimal Unicode character value. \uxxxx (x = 0–9, A–F)
2527 fn unescape_unicode_16(&mut self) -> Option<char> {
2528 self.unescape_unicode::<4>()
2529 }
2530
2531 // 32-bit hexadecimal Unicode character value. \Uxxxxxxxx (x = 0–9, A–F)
2532 fn unescape_unicode_32(&mut self) -> Option<char> {
2533 self.unescape_unicode::<8>()
2534 }
2535
2536 fn unescape_unicode<const NUM: usize>(&mut self) -> Option<char> {
2537 let mut s = String::new();
2538 for _ in 0..NUM {
2539 s.push(self.chars.next()?);
2540 }
2541 match u32::from_str_radix(&s, 16) {
2542 Err(_) => None,
2543 Ok(n) => char::from_u32(n),
2544 }
2545 }
2546}
2547
/// Unescapes a unicode-escaped single-quoted string literal, with `chars`
/// positioned at the opening quote.
///
/// Recognized escapes: `''` (literal quote), `\\` (literal backslash),
/// `\XXXX` (4 hex digits) and `\+XXXXXX` (6 hex digits).
fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> {
    let mut unescaped = String::new();
    chars.next(); // consume the opening quote
    while let Some(c) = chars.next() {
        match c {
            '\'' => {
                // A doubled quote is an escaped quote; otherwise the literal ends.
                if chars.peek() == Some(&'\'') {
                    chars.next();
                    unescaped.push('\'');
                } else {
                    return Ok(unescaped);
                }
            }
            '\\' => match chars.peek() {
                Some('\\') => {
                    chars.next();
                    unescaped.push('\\');
                }
                Some('+') => {
                    // `\+XXXXXX`: 6-digit code point form.
                    chars.next();
                    unescaped.push(take_char_from_hex_digits(chars, 6)?);
                }
                // `\XXXX`: 4-digit code point form.
                _ => unescaped.push(take_char_from_hex_digits(chars, 4)?),
            },
            _ => {
                unescaped.push(c);
            }
        }
    }
    Err(TokenizerError {
        message: "Unterminated unicode encoded string literal".to_string(),
        location: chars.location(),
    })
}
2582
2583fn take_char_from_hex_digits(
2584 chars: &mut State<'_>,
2585 max_digits: usize,
2586) -> Result<char, TokenizerError> {
2587 let mut result = 0u32;
2588 for _ in 0..max_digits {
2589 let next_char = chars.next().ok_or_else(|| TokenizerError {
2590 message: "Unexpected EOF while parsing hex digit in escaped unicode string."
2591 .to_string(),
2592 location: chars.location(),
2593 })?;
2594 let digit = next_char.to_digit(16).ok_or_else(|| TokenizerError {
2595 message: format!("Invalid hex digit in escaped unicode string: {next_char}"),
2596 location: chars.location(),
2597 })?;
2598 result = result * 16 + digit;
2599 }
2600 char::from_u32(result).ok_or_else(|| TokenizerError {
2601 message: format!("Invalid unicode character: {result:x}"),
2602 location: chars.location(),
2603 })
2604}
2605
2606#[cfg(test)]
2607mod tests {
2608 use super::*;
2609 use crate::dialect::{
2610 BigQueryDialect, ClickHouseDialect, HiveDialect, MsSqlDialect, MySqlDialect, SQLiteDialect,
2611 };
2612 use crate::test_utils::{all_dialects_except, all_dialects_where};
2613 use core::fmt::Debug;
2614
    #[test]
    // `TokenizerError` implements `Error` (with no source) and a `Display`
    // that includes the error location.
    fn tokenizer_error_impl() {
        let err = TokenizerError {
            message: "test".into(),
            location: Location { line: 1, column: 1 },
        };
        {
            use core::error::Error;
            assert!(err.source().is_none());
        }
        assert_eq!(err.to_string(), "test at Line: 1, Column: 1");
    }
2627
2628 #[test]
2629 fn tokenize_select_1() {
2630 let sql = String::from("SELECT 1");
2631 let dialect = GenericDialect {};
2632 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2633
2634 let expected = vec![
2635 Token::make_keyword("SELECT"),
2636 Token::Whitespace(Whitespace::Space),
2637 Token::Number(String::from("1"), false),
2638 ];
2639
2640 compare(expected, tokens);
2641 }
2642
2643 #[test]
2644 fn tokenize_select_float() {
2645 let sql = String::from("SELECT .1");
2646 let dialect = GenericDialect {};
2647 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2648
2649 let expected = vec![
2650 Token::make_keyword("SELECT"),
2651 Token::Whitespace(Whitespace::Space),
2652 Token::Number(String::from(".1"), false),
2653 ];
2654
2655 compare(expected, tokens);
2656 }
2657
    #[test]
    // Verifies that `tokenize_with_location_into_buf_with_mapper` lets the
    // caller rewrite tokens on the fly (here: numbering `?` placeholders).
    fn tokenize_with_mapper() {
        let sql = String::from("SELECT ?");
        let dialect = GenericDialect {};
        let mut param_num = 1;

        let mut tokens = vec![];
        Tokenizer::new(&dialect, &sql)
            .tokenize_with_location_into_buf_with_mapper(&mut tokens, |mut token_span| {
                token_span.token = match token_span.token {
                    Token::Placeholder(n) => Token::Placeholder(if n == "?" {
                        let ret = format!("${}", param_num);
                        param_num += 1;
                        ret
                    } else {
                        n
                    }),
                    token => token,
                };
                token_span
            })
            .unwrap();
        let actual = tokens.into_iter().map(|t| t.token).collect();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Placeholder("$1".to_string()),
        ];

        compare(expected, actual);
    }
2689
2690 #[test]
2691 fn tokenize_clickhouse_double_equal() {
2692 let sql = String::from("SELECT foo=='1'");
2693 let dialect = ClickHouseDialect {};
2694 let mut tokenizer = Tokenizer::new(&dialect, &sql);
2695 let tokens = tokenizer.tokenize().unwrap();
2696
2697 let expected = vec![
2698 Token::make_keyword("SELECT"),
2699 Token::Whitespace(Whitespace::Space),
2700 Token::Word(Word {
2701 value: "foo".to_string(),
2702 quote_style: None,
2703 keyword: Keyword::NoKeyword,
2704 }),
2705 Token::DoubleEq,
2706 Token::SingleQuotedString("1".to_string()),
2707 ];
2708
2709 compare(expected, tokens);
2710 }
2711
    #[test]
    // Underscore separators in numeric literals are only part of the number
    // for dialects that opt in; otherwise they start a new word token.
    fn tokenize_numeric_literal_underscore() {
        let dialect = GenericDialect {};
        let sql = String::from("SELECT 10_000");
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number("10".to_string(), false),
            Token::make_word("_000", None),
        ];
        compare(expected, tokens);

        all_dialects_where(|dialect| dialect.supports_numeric_literal_underscores()).tokenizes_to(
            "SELECT 10_000, _10_000, 10_00_, 10___0",
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("10_000".to_string(), false),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::make_word("_10_000", None), // leading underscore tokenizes as a word (parsed as column identifier)
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Number("10_00".to_string(), false),
                Token::make_word("_", None), // trailing underscores tokenizes as a word (syntax error in some dialects)
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Number("10".to_string(), false),
                Token::make_word("___0", None), // multiple underscores tokenizes as a word (syntax error in some dialects)
            ],
        );
    }
2746
    #[test]
    // Exponent notation is part of the number only when a well-formed
    // exponent follows; otherwise the trailing text becomes a word token.
    fn tokenize_select_exponent() {
        let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e+10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::make_word("ea", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::make_word("a", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Minus,
            Token::Number(String::from("10"), false),
        ];

        compare(expected, tokens);
    }
2780
2781 #[test]
2782 fn tokenize_scalar_function() {
2783 let sql = String::from("SELECT sqrt(1)");
2784 let dialect = GenericDialect {};
2785 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2786
2787 let expected = vec![
2788 Token::make_keyword("SELECT"),
2789 Token::Whitespace(Whitespace::Space),
2790 Token::make_word("sqrt", None),
2791 Token::LParen,
2792 Token::Number(String::from("1"), false),
2793 Token::RParen,
2794 ];
2795
2796 compare(expected, tokens);
2797 }
2798
    #[test]
    // `||` between string literals tokenizes as the string-concat operator.
    fn tokenize_string_string_concat() {
        let sql = String::from("SELECT 'a' || 'b'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("a")),
            Token::Whitespace(Whitespace::Space),
            Token::StringConcat,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("b")),
        ];

        compare(expected, tokens);
    }
    #[test]
    // Single `|` and `^` tokenize as the bitwise OR / XOR operators.
    fn tokenize_bitwise_op() {
        let sql = String::from("SELECT one | two ^ three");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("one", None),
            Token::Whitespace(Whitespace::Space),
            Token::Pipe,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("two", None),
            Token::Whitespace(Whitespace::Space),
            Token::Caret,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("three", None),
        ];
        compare(expected, tokens);
    }
2838
    #[test]
    // `XOR` tokenizes as a keyword, not an operator token.
    fn tokenize_logical_xor() {
        let sql =
            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
        ];
        compare(expected, tokens);
    }
2878
    #[test]
    // End-to-end tokenization of a simple SELECT with WHERE and LIMIT.
    fn tokenize_simple_select() {
        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("LIMIT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
        ];

        compare(expected, tokens);
    }
2909
2910 #[test]
2911 fn tokenize_explain_select() {
2912 let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
2913 let dialect = GenericDialect {};
2914 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2915
2916 let expected = vec![
2917 Token::make_keyword("EXPLAIN"),
2918 Token::Whitespace(Whitespace::Space),
2919 Token::make_keyword("SELECT"),
2920 Token::Whitespace(Whitespace::Space),
2921 Token::Mul,
2922 Token::Whitespace(Whitespace::Space),
2923 Token::make_keyword("FROM"),
2924 Token::Whitespace(Whitespace::Space),
2925 Token::make_word("customer", None),
2926 Token::Whitespace(Whitespace::Space),
2927 Token::make_keyword("WHERE"),
2928 Token::Whitespace(Whitespace::Space),
2929 Token::make_word("id", None),
2930 Token::Whitespace(Whitespace::Space),
2931 Token::Eq,
2932 Token::Whitespace(Whitespace::Space),
2933 Token::Number(String::from("1"), false),
2934 ];
2935
2936 compare(expected, tokens);
2937 }
2938
2939 #[test]
2940 fn tokenize_explain_analyze_select() {
2941 let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
2942 let dialect = GenericDialect {};
2943 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2944
2945 let expected = vec![
2946 Token::make_keyword("EXPLAIN"),
2947 Token::Whitespace(Whitespace::Space),
2948 Token::make_keyword("ANALYZE"),
2949 Token::Whitespace(Whitespace::Space),
2950 Token::make_keyword("SELECT"),
2951 Token::Whitespace(Whitespace::Space),
2952 Token::Mul,
2953 Token::Whitespace(Whitespace::Space),
2954 Token::make_keyword("FROM"),
2955 Token::Whitespace(Whitespace::Space),
2956 Token::make_word("customer", None),
2957 Token::Whitespace(Whitespace::Space),
2958 Token::make_keyword("WHERE"),
2959 Token::Whitespace(Whitespace::Space),
2960 Token::make_word("id", None),
2961 Token::Whitespace(Whitespace::Space),
2962 Token::Eq,
2963 Token::Whitespace(Whitespace::Space),
2964 Token::Number(String::from("1"), false),
2965 ];
2966
2967 compare(expected, tokens);
2968 }
2969
2970 #[test]
2971 fn tokenize_string_predicate() {
2972 let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
2973 let dialect = GenericDialect {};
2974 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2975
2976 let expected = vec![
2977 Token::make_keyword("SELECT"),
2978 Token::Whitespace(Whitespace::Space),
2979 Token::Mul,
2980 Token::Whitespace(Whitespace::Space),
2981 Token::make_keyword("FROM"),
2982 Token::Whitespace(Whitespace::Space),
2983 Token::make_word("customer", None),
2984 Token::Whitespace(Whitespace::Space),
2985 Token::make_keyword("WHERE"),
2986 Token::Whitespace(Whitespace::Space),
2987 Token::make_word("salary", None),
2988 Token::Whitespace(Whitespace::Space),
2989 Token::Neq,
2990 Token::Whitespace(Whitespace::Space),
2991 Token::SingleQuotedString(String::from("Not Provided")),
2992 ];
2993
2994 compare(expected, tokens);
2995 }
2996
2997 #[test]
2998 fn tokenize_invalid_string() {
2999 let sql = String::from("\n💝مصطفىh");
3000
3001 let dialect = GenericDialect {};
3002 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3003 // println!("tokens: {:#?}", tokens);
3004 let expected = vec![
3005 Token::Whitespace(Whitespace::Newline),
3006 Token::Char('💝'),
3007 Token::make_word("مصطفىh", None),
3008 ];
3009 compare(expected, tokens);
3010 }
3011
3012 #[test]
3013 fn tokenize_newline_in_string_literal() {
3014 let sql = String::from("'foo\r\nbar\nbaz'");
3015
3016 let dialect = GenericDialect {};
3017 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3018 let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
3019 compare(expected, tokens);
3020 }
3021
3022 #[test]
3023 fn tokenize_unterminated_string_literal() {
3024 let sql = String::from("select 'foo");
3025
3026 let dialect = GenericDialect {};
3027 let mut tokenizer = Tokenizer::new(&dialect, &sql);
3028 assert_eq!(
3029 tokenizer.tokenize(),
3030 Err(TokenizerError {
3031 message: "Unterminated string literal".to_string(),
3032 location: Location { line: 1, column: 8 },
3033 })
3034 );
3035 }
3036
3037 #[test]
3038 fn tokenize_unterminated_string_literal_utf8() {
3039 let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;");
3040
3041 let dialect = GenericDialect {};
3042 let mut tokenizer = Tokenizer::new(&dialect, &sql);
3043 assert_eq!(
3044 tokenizer.tokenize(),
3045 Err(TokenizerError {
3046 message: "Unterminated string literal".to_string(),
3047 location: Location {
3048 line: 1,
3049 column: 35
3050 }
3051 })
3052 );
3053 }
3054
3055 #[test]
3056 fn tokenize_invalid_string_cols() {
3057 let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh");
3058
3059 let dialect = GenericDialect {};
3060 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3061 // println!("tokens: {:#?}", tokens);
3062 let expected = vec![
3063 Token::Whitespace(Whitespace::Newline),
3064 Token::Whitespace(Whitespace::Newline),
3065 Token::make_keyword("SELECT"),
3066 Token::Whitespace(Whitespace::Space),
3067 Token::Mul,
3068 Token::Whitespace(Whitespace::Space),
3069 Token::make_keyword("FROM"),
3070 Token::Whitespace(Whitespace::Space),
3071 Token::make_keyword("table"),
3072 Token::Whitespace(Whitespace::Tab),
3073 Token::Char('💝'),
3074 Token::make_word("مصطفىh", None),
3075 ];
3076 compare(expected, tokens);
3077 }
3078
    /// Tagged dollar quoting (`$tag$ ... $tag$`): the body may contain `$`,
    /// unrelated `$tags$`, and partial matches of the closing tag without
    /// terminating the string.
    #[test]
    fn tokenize_dollar_quoted_string_tagged() {
        let test_cases = vec![
            // Dollar signs and unrelated `$tags$` inside the body are content.
            (
                String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$tag$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "dollar '$' quoted strings have $tags like this$ or like this $$".into(),
                        tag: Some("tag".into()),
                    })
                ]
            ),
            // A prefix of the closing tag (`$ab`) inside the body stays content.
            (
                String::from("SELECT $abc$x$ab$abc$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "x$ab".into(),
                        tag: Some("abc".into()),
                    })
                ]
            ),
            // Empty tagged body.
            (
                String::from("SELECT $abc$$abc$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "".into(),
                        tag: Some("abc".into()),
                    })
                ]
            ),
            // Dollar quoting immediately adjacent to other tokens.
            (
                String::from("0$abc$$abc$1"),
                vec![
                    Token::Number("0".into(), false),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "".into(),
                        tag: Some("abc".into()),
                    }),
                    Token::Number("1".into(), false),
                ]
            ),
            // A differently-tagged quote nested in the body is plain content.
            (
                String::from("$function$abc$q$data$q$$function$"),
                vec![
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "abc$q$data$q$".into(),
                        tag: Some("function".into()),
                    }),
                ]
            ),
        ];

        let dialect = GenericDialect {};
        for (sql, expected) in test_cases {
            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
            compare(expected, tokens);
        }
    }
3143
3144 #[test]
3145 fn tokenize_dollar_quoted_string_tagged_unterminated() {
3146 let sql = String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$different tag$");
3147 let dialect = GenericDialect {};
3148 assert_eq!(
3149 Tokenizer::new(&dialect, &sql).tokenize(),
3150 Err(TokenizerError {
3151 message: "Unterminated dollar-quoted, expected $".into(),
3152 location: Location {
3153 line: 1,
3154 column: 91
3155 }
3156 })
3157 );
3158 }
3159
3160 #[test]
3161 fn tokenize_dollar_quoted_string_tagged_unterminated_mirror() {
3162 let sql = String::from("SELECT $abc$abc$");
3163 let dialect = GenericDialect {};
3164 assert_eq!(
3165 Tokenizer::new(&dialect, &sql).tokenize(),
3166 Err(TokenizerError {
3167 message: "Unterminated dollar-quoted, expected $".into(),
3168 location: Location {
3169 line: 1,
3170 column: 17
3171 }
3172 })
3173 );
3174 }
3175
3176 #[test]
3177 fn tokenize_dollar_placeholder() {
3178 let sql = String::from("SELECT $$, $$ABC$$, $ABC$, $ABC");
3179 let dialect = SQLiteDialect {};
3180 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3181 assert_eq!(
3182 tokens,
3183 vec![
3184 Token::make_keyword("SELECT"),
3185 Token::Whitespace(Whitespace::Space),
3186 Token::Placeholder("$$".into()),
3187 Token::Comma,
3188 Token::Whitespace(Whitespace::Space),
3189 Token::Placeholder("$$ABC$$".into()),
3190 Token::Comma,
3191 Token::Whitespace(Whitespace::Space),
3192 Token::Placeholder("$ABC$".into()),
3193 Token::Comma,
3194 Token::Whitespace(Whitespace::Space),
3195 Token::Placeholder("$ABC".into()),
3196 ]
3197 );
3198 }
3199
3200 #[test]
3201 fn tokenize_nested_dollar_quoted_strings() {
3202 let sql = String::from("SELECT $tag$dollar $nested$ string$tag$");
3203 let dialect = GenericDialect {};
3204 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3205 let expected = vec![
3206 Token::make_keyword("SELECT"),
3207 Token::Whitespace(Whitespace::Space),
3208 Token::DollarQuotedString(DollarQuotedString {
3209 value: "dollar $nested$ string".into(),
3210 tag: Some("tag".into()),
3211 }),
3212 ];
3213 compare(expected, tokens);
3214 }
3215
3216 #[test]
3217 fn tokenize_dollar_quoted_string_untagged_empty() {
3218 let sql = String::from("SELECT $$$$");
3219 let dialect = GenericDialect {};
3220 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3221 let expected = vec![
3222 Token::make_keyword("SELECT"),
3223 Token::Whitespace(Whitespace::Space),
3224 Token::DollarQuotedString(DollarQuotedString {
3225 value: "".into(),
3226 tag: None,
3227 }),
3228 ];
3229 compare(expected, tokens);
3230 }
3231
3232 #[test]
3233 fn tokenize_dollar_quoted_string_untagged() {
3234 let sql =
3235 String::from("SELECT $$within dollar '$' quoted strings have $tags like this$ $$");
3236 let dialect = GenericDialect {};
3237 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3238 let expected = vec![
3239 Token::make_keyword("SELECT"),
3240 Token::Whitespace(Whitespace::Space),
3241 Token::DollarQuotedString(DollarQuotedString {
3242 value: "within dollar '$' quoted strings have $tags like this$ ".into(),
3243 tag: None,
3244 }),
3245 ];
3246 compare(expected, tokens);
3247 }
3248
3249 #[test]
3250 fn tokenize_dollar_quoted_string_untagged_unterminated() {
3251 let sql = String::from(
3252 "SELECT $$dollar '$' quoted strings have $tags like this$ or like this $different tag$",
3253 );
3254 let dialect = GenericDialect {};
3255 assert_eq!(
3256 Tokenizer::new(&dialect, &sql).tokenize(),
3257 Err(TokenizerError {
3258 message: "Unterminated dollar-quoted string".into(),
3259 location: Location {
3260 line: 1,
3261 column: 86
3262 }
3263 })
3264 );
3265 }
3266
3267 #[test]
3268 fn tokenize_right_arrow() {
3269 let sql = String::from("FUNCTION(key=>value)");
3270 let dialect = GenericDialect {};
3271 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3272 let expected = vec![
3273 Token::make_word("FUNCTION", None),
3274 Token::LParen,
3275 Token::make_word("key", None),
3276 Token::RArrow,
3277 Token::make_word("value", None),
3278 Token::RParen,
3279 ];
3280 compare(expected, tokens);
3281 }
3282
3283 #[test]
3284 fn tokenize_is_null() {
3285 let sql = String::from("a IS NULL");
3286 let dialect = GenericDialect {};
3287 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3288
3289 let expected = vec![
3290 Token::make_word("a", None),
3291 Token::Whitespace(Whitespace::Space),
3292 Token::make_keyword("IS"),
3293 Token::Whitespace(Whitespace::Space),
3294 Token::make_keyword("NULL"),
3295 ];
3296
3297 compare(expected, tokens);
3298 }
3299
    /// Single-line (`--`) comments in the generic dialect: the terminating
    /// `\n` (or `\r\n`) is included in the comment token, while a lone `\r`
    /// does NOT end the comment.
    #[test]
    fn tokenize_comment() {
        let test_cases = vec![
            (
                String::from("0--this is a comment\n1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\n".to_string(),
                    }),
                    Token::Number("1".to_string(), false),
                ],
            ),
            (
                // A bare `\r` does not terminate the comment, so the trailing
                // `1` is swallowed into the comment text.
                String::from("0--this is a comment\r1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\r1".to_string(),
                    }),
                ],
            ),
            (
                String::from("0--this is a comment\r\n1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\r\n".to_string(),
                    }),
                    Token::Number("1".to_string(), false),
                ],
            ),
        ];

        let dialect = GenericDialect {};

        for (sql, expected) in test_cases {
            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
            compare(expected, tokens);
        }
    }
3344
3345 #[test]
3346 fn tokenize_comment_postgres() {
3347 let sql = String::from("1--\r0");
3348
3349 let dialect = PostgreSqlDialect {};
3350 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3351 let expected = vec![
3352 Token::Number("1".to_string(), false),
3353 Token::Whitespace(Whitespace::SingleLineComment {
3354 prefix: "--".to_string(),
3355 comment: "\r".to_string(),
3356 }),
3357 Token::Number("0".to_string(), false),
3358 ];
3359 compare(expected, tokens);
3360 }
3361
3362 #[test]
3363 fn tokenize_comment_at_eof() {
3364 let sql = String::from("--this is a comment");
3365
3366 let dialect = GenericDialect {};
3367 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3368 let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
3369 prefix: "--".to_string(),
3370 comment: "this is a comment".to_string(),
3371 })];
3372 compare(expected, tokens);
3373 }
3374
3375 #[test]
3376 fn tokenize_multiline_comment() {
3377 let sql = String::from("0/*multi-line\n* /comment*/1");
3378
3379 let dialect = GenericDialect {};
3380 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3381 let expected = vec![
3382 Token::Number("0".to_string(), false),
3383 Token::Whitespace(Whitespace::MultiLineComment(
3384 "multi-line\n* /comment".to_string(),
3385 )),
3386 Token::Number("1".to_string(), false),
3387 ];
3388 compare(expected, tokens);
3389 }
3390
    /// Nested `/* ... */` comments (for dialects that support them): each
    /// inner `/*` must be matched by its own `*/` before the outer comment
    /// closes; anything after the balanced comment tokenizes normally.
    #[test]
    fn tokenize_nested_multiline_comment() {
        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
            "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
            vec![
                Token::Number("0".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(
                    "multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
                )),
                // The trailing ` /comment*/` is outside the balanced comment.
                Token::Whitespace(Whitespace::Space),
                Token::Div,
                Token::Word(Word {
                    value: "comment".to_string(),
                    quote_style: None,
                    keyword: Keyword::COMMENT,
                }),
                Token::Mul,
                Token::Div,
                Token::Number("1".to_string(), false),
            ],
        );

        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
            "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
            vec![
                Token::Number("0".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(
                    "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
                )),
                Token::Number("1".to_string(), false),
            ],
        );

        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
            "SELECT 1/* a /* b */ c */0",
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("1".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
                Token::Number("0".to_string(), false),
            ],
        );
    }
3435
3436 #[test]
3437 fn tokenize_nested_multiline_comment_empty() {
3438 all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
3439 "select 1/*/**/*/0",
3440 vec![
3441 Token::make_keyword("select"),
3442 Token::Whitespace(Whitespace::Space),
3443 Token::Number("1".to_string(), false),
3444 Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
3445 Token::Number("0".to_string(), false),
3446 ],
3447 );
3448 }
3449
3450 #[test]
3451 fn tokenize_nested_comments_if_not_supported() {
3452 all_dialects_except(|d| d.supports_nested_comments()).tokenizes_to(
3453 "SELECT 1/*/* nested comment */*/0",
3454 vec![
3455 Token::make_keyword("SELECT"),
3456 Token::Whitespace(Whitespace::Space),
3457 Token::Number("1".to_string(), false),
3458 Token::Whitespace(Whitespace::MultiLineComment(
3459 "/* nested comment ".to_string(),
3460 )),
3461 Token::Mul,
3462 Token::Div,
3463 Token::Number("0".to_string(), false),
3464 ],
3465 );
3466 }
3467
3468 #[test]
3469 fn tokenize_multiline_comment_with_even_asterisks() {
3470 let sql = String::from("\n/** Comment **/\n");
3471
3472 let dialect = GenericDialect {};
3473 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3474 let expected = vec![
3475 Token::Whitespace(Whitespace::Newline),
3476 Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
3477 Token::Whitespace(Whitespace::Newline),
3478 ];
3479 compare(expected, tokens);
3480 }
3481
3482 #[test]
3483 fn tokenize_unicode_whitespace() {
3484 let sql = String::from(" \u{2003}\n");
3485
3486 let dialect = GenericDialect {};
3487 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3488 let expected = vec![
3489 Token::Whitespace(Whitespace::Space),
3490 Token::Whitespace(Whitespace::Space),
3491 Token::Whitespace(Whitespace::Newline),
3492 ];
3493 compare(expected, tokens);
3494 }
3495
3496 #[test]
3497 fn tokenize_mismatched_quotes() {
3498 let sql = String::from("\"foo");
3499
3500 let dialect = GenericDialect {};
3501 let mut tokenizer = Tokenizer::new(&dialect, &sql);
3502 assert_eq!(
3503 tokenizer.tokenize(),
3504 Err(TokenizerError {
3505 message: "Expected close delimiter '\"' before EOF.".to_string(),
3506 location: Location { line: 1, column: 1 },
3507 })
3508 );
3509 }
3510
3511 #[test]
3512 fn tokenize_newlines() {
3513 let sql = String::from("line1\nline2\rline3\r\nline4\r");
3514
3515 let dialect = GenericDialect {};
3516 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3517 let expected = vec![
3518 Token::make_word("line1", None),
3519 Token::Whitespace(Whitespace::Newline),
3520 Token::make_word("line2", None),
3521 Token::Whitespace(Whitespace::Newline),
3522 Token::make_word("line3", None),
3523 Token::Whitespace(Whitespace::Newline),
3524 Token::make_word("line4", None),
3525 Token::Whitespace(Whitespace::Newline),
3526 ];
3527 compare(expected, tokens);
3528 }
3529
3530 #[test]
3531 fn tokenize_mssql_top() {
3532 let sql = "SELECT TOP 5 [bar] FROM foo";
3533 let dialect = MsSqlDialect {};
3534 let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3535 let expected = vec![
3536 Token::make_keyword("SELECT"),
3537 Token::Whitespace(Whitespace::Space),
3538 Token::make_keyword("TOP"),
3539 Token::Whitespace(Whitespace::Space),
3540 Token::Number(String::from("5"), false),
3541 Token::Whitespace(Whitespace::Space),
3542 Token::make_word("bar", Some('[')),
3543 Token::Whitespace(Whitespace::Space),
3544 Token::make_keyword("FROM"),
3545 Token::Whitespace(Whitespace::Space),
3546 Token::make_word("foo", None),
3547 ];
3548 compare(expected, tokens);
3549 }
3550
    /// PostgreSQL regex-match operators: `~`, `~*`, `!~`, and `!~*` each
    /// tokenize as a single dedicated token.
    #[test]
    fn tokenize_pg_regex_match() {
        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Tilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::TildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
        ];
        compare(expected, tokens);
    }
3588
    /// PostgreSQL LIKE-operator spellings: `~~`, `~~*`, `!~~`, and `!~~*`
    /// each tokenize as a single dedicated token.
    #[test]
    fn tokenize_pg_like_match() {
        let sql = "SELECT col ~~ '_a%', col ~~* '_a%', col !~~ '_a%', col !~~* '_a%'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkDoubleTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkDoubleTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
        ];
        compare(expected, tokens);
    }
3626
    /// Doubled quotes (`""`) inside a `"`-delimited identifier unescape to a
    /// single `"` — unescaping is on by default; see
    /// `tokenize_quoted_identifier_with_no_escape` for the disabled case.
    #[test]
    fn tokenize_quoted_identifier() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a " b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a ""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }
3643
3644 #[test]
3645 fn tokenize_snowflake_div() {
3646 let sql = r#"field/1000"#;
3647 let dialect = SnowflakeDialect {};
3648 let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3649 let expected = vec![
3650 Token::make_word(r#"field"#, None),
3651 Token::Div,
3652 Token::Number("1000".to_string(), false),
3653 ];
3654 compare(expected, tokens);
3655 }
3656
    /// With `with_unescape(false)`, doubled quotes are preserved verbatim in
    /// the word value — same input as `tokenize_quoted_identifier`.
    #[test]
    fn tokenize_quoted_identifier_with_no_escape() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(false)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a "" b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }
3676
    /// Token spans: start is inclusive, end is exclusive, both 1-based; the
    /// newline token ends at column 1 of the next line.
    #[test]
    fn tokenize_with_location() {
        let sql = "SELECT a,\n b";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .tokenize_with_location()
            .unwrap();
        let expected = vec![
            TokenWithSpan::at(Token::make_keyword("SELECT"), (1, 1).into(), (1, 7).into()),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Space),
                (1, 7).into(),
                (1, 8).into(),
            ),
            TokenWithSpan::at(Token::make_word("a", None), (1, 8).into(), (1, 9).into()),
            TokenWithSpan::at(Token::Comma, (1, 9).into(), (1, 10).into()),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Newline),
                (1, 10).into(),
                (2, 1).into(),
            ),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Space),
                (2, 1).into(),
                (2, 2).into(),
            ),
            TokenWithSpan::at(Token::make_word("b", None), (2, 2).into(), (2, 3).into()),
        ];
        compare(expected, tokens);
    }
3707
3708 fn compare<T: PartialEq + fmt::Debug>(expected: Vec<T>, actual: Vec<T>) {
3709 //println!("------------------------------");
3710 //println!("tokens = {:?}", actual);
3711 //println!("expected = {:?}", expected);
3712 //println!("------------------------------");
3713 assert_eq!(expected, actual);
3714 }
3715
3716 fn check_unescape(s: &str, expected: Option<&str>) {
3717 let s = format!("'{s}'");
3718 let mut state = State {
3719 peekable: s.chars().peekable(),
3720 line: 0,
3721 col: 0,
3722 };
3723
3724 assert_eq!(
3725 unescape_single_quoted_string(&mut state),
3726 expected.map(|s| s.to_string())
3727 );
3728 }
3729
    /// Covers the escape forms handled by `unescape_single_quoted_string`;
    /// `None` means the escape is rejected and the whole literal is invalid.
    #[test]
    fn test_unescape() {
        check_unescape(r"\b", Some("\u{0008}"));
        check_unescape(r"\f", Some("\u{000C}"));
        check_unescape(r"\t", Some("\t"));
        check_unescape(r"\r\n", Some("\r\n"));
        check_unescape(r"\/", Some("/"));
        check_unescape(r"/", Some("/"));
        check_unescape(r"\\", Some("\\"));

        // 16 and 32-bit hexadecimal Unicode character value
        check_unescape(r"\u0001", Some("\u{0001}"));
        check_unescape(r"\u4c91", Some("\u{4c91}"));
        // Digits beyond the escape's fixed width are ordinary characters.
        check_unescape(r"\u4c916", Some("\u{4c91}6"));
        check_unescape(r"\u4c", None); // too few digits
        check_unescape(r"\u0000", None); // NUL is rejected
        check_unescape(r"\U0010FFFF", Some("\u{10FFFF}")); // max scalar value
        check_unescape(r"\U00110000", None); // beyond the Unicode range
        check_unescape(r"\U00000000", None);
        check_unescape(r"\u", None);
        check_unescape(r"\U", None);
        check_unescape(r"\U1010FFFF", None);

        // hexadecimal byte value
        check_unescape(r"\x4B", Some("\u{004b}"));
        check_unescape(r"\x4", Some("\u{0004}")); // a single digit is allowed
        check_unescape(r"\x4L", Some("\u{0004}L"));
        check_unescape(r"\x", Some("x")); // no digits: literal 'x'
        check_unescape(r"\xP", Some("xP"));
        check_unescape(r"\x0", None);
        check_unescape(r"\xCAD", None);
        check_unescape(r"\xA9", None);

        // octal byte value
        check_unescape(r"\1", Some("\u{0001}"));
        check_unescape(r"\12", Some("\u{000a}"));
        check_unescape(r"\123", Some("\u{0053}")); // 0o123 = 83 = 'S'
        check_unescape(r"\1232", Some("\u{0053}2")); // at most 3 octal digits consumed
        check_unescape(r"\4", Some("\u{0004}"));
        check_unescape(r"\45", Some("\u{0025}"));
        // NOTE(review): 0o450 = 296 exceeds a byte yet yields U+0028 (= 296 mod 256),
        // while \603 below is rejected — the overflow rule looks asymmetric; confirm
        // against the unescape implementation.
        check_unescape(r"\450", Some("\u{0028}"));
        check_unescape(r"\603", None);
        check_unescape(r"\0", None); // NUL is rejected
        check_unescape(r"\080", None);

        // others
        check_unescape(r"\9", Some("9")); // '9' is not an octal digit
        check_unescape(r"''", Some("'")); // doubled quote unescapes to one quote
        check_unescape(
            r"Hello\r\nRust/\u4c91 SQL Parser\U0010ABCD\1232",
            Some("Hello\r\nRust/\u{4c91} SQL Parser\u{10abcd}\u{0053}2"),
        );
        // One invalid escape anywhere invalidates the whole string.
        check_unescape(r"Hello\0", None);
        check_unescape(r"Hello\xCADRust", None);
    }
3785
    /// A dialect where identifiers may *start* with a digit
    /// (`supports_numeric_prefix`) must still tokenize a lone digit as a
    /// number, not a word; Hive and MySQL are the built-in dialects with
    /// this behavior.
    #[test]
    fn tokenize_numeric_prefix_trait() {
        // Minimal ad-hoc dialect exercising only the numeric-prefix path.
        #[derive(Debug)]
        struct NumericPrefixDialect;

        impl Dialect for NumericPrefixDialect {
            fn is_identifier_start(&self, ch: char) -> bool {
                ch.is_ascii_lowercase()
                    || ch.is_ascii_uppercase()
                    || ch.is_ascii_digit()
                    || ch == '$'
            }

            fn is_identifier_part(&self, ch: char) -> bool {
                ch.is_ascii_lowercase()
                    || ch.is_ascii_uppercase()
                    || ch.is_ascii_digit()
                    || ch == '_'
                    || ch == '$'
                    || ch == '{'
                    || ch == '}'
            }

            fn supports_numeric_prefix(&self) -> bool {
                true
            }
        }

        tokenize_numeric_prefix_inner(&NumericPrefixDialect {});
        tokenize_numeric_prefix_inner(&HiveDialect {});
        tokenize_numeric_prefix_inner(&MySqlDialect {});
    }
3818
3819 fn tokenize_numeric_prefix_inner(dialect: &dyn Dialect) {
3820 let sql = r#"SELECT * FROM 1"#;
3821 let tokens = Tokenizer::new(dialect, sql).tokenize().unwrap();
3822 let expected = vec![
3823 Token::make_keyword("SELECT"),
3824 Token::Whitespace(Whitespace::Space),
3825 Token::Mul,
3826 Token::Whitespace(Whitespace::Space),
3827 Token::make_keyword("FROM"),
3828 Token::Whitespace(Whitespace::Space),
3829 Token::Number(String::from("1"), false),
3830 ];
3831 compare(expected, tokens);
3832 }
3833
    /// Snowflake backslash escapes: each case is (raw SQL, expected token
    /// value with unescaping disabled, expected value with unescaping on).
    #[test]
    fn tokenize_quoted_string_escape() {
        let dialect = SnowflakeDialect {};
        for (sql, expected, expected_unescaped) in [
            (r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
            (r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
            (r#"'\\'"#, r#"\\"#, r#"\"#),
            (
                r#"'\0\a\b\f\n\r\t\Z'"#,
                r#"\0\a\b\f\n\r\t\Z"#,
                "\0\u{7}\u{8}\u{c}\n\r\t\u{1a}",
            ),
            (r#"'\"'"#, r#"\""#, "\""),
            (r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
            (r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
            (r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
            // Unrecognized escapes drop the backslash.
            (r#"'\q'"#, r#"\q"#, r#"q"#),
            (r#"'\%\_'"#, r#"\%\_"#, r#"%_"#),
            (r#"'\\%\\_'"#, r#"\\%\\_"#, r#"\%\_"#),
        ] {
            let tokens = Tokenizer::new(&dialect, sql)
                .with_unescape(false)
                .tokenize()
                .unwrap();
            let expected = vec![Token::SingleQuotedString(expected.to_string())];
            compare(expected, tokens);

            let tokens = Tokenizer::new(&dialect, sql)
                .with_unescape(true)
                .tokenize()
                .unwrap();
            let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())];
            compare(expected, tokens);
        }

        // A trailing `\'` escapes the would-be closing quote, so the literal
        // is unterminated.
        for sql in [r#"'\'"#, r#"'ab\'"#] {
            let mut tokenizer = Tokenizer::new(&dialect, sql);
            assert_eq!(
                "Unterminated string literal",
                tokenizer.tokenize().unwrap_err().message.as_str(),
            );
        }

        // Non-escape dialect: the backslash is an ordinary character and the
        // same inputs tokenize successfully.
        for (sql, expected) in [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)] {
            let dialect = GenericDialect {};
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();

            let expected = vec![Token::SingleQuotedString(expected.to_string())];

            compare(expected, tokens);
        }

        // MySQL special case for LIKE escapes: `\%` and `\_` keep the backslash.
        for (sql, expected) in [(r#"'\%'"#, r#"\%"#), (r#"'\_'"#, r#"\_"#)] {
            let dialect = MySqlDialect {};
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();

            let expected = vec![Token::SingleQuotedString(expected.to_string())];

            compare(expected, tokens);
        }
    }
3897
    #[test]
    fn tokenize_triple_quoted_string() {
        // Exercises BigQuery triple-quoted strings for one quote character at a
        // time: `q` is the delimiter under test, `r` the other quote character
        // (which must pass through unescaped), and `quote_token` builds the
        // expected token variant. Each case is checked twice: raw
        // (`with_unescape(false)`) and unescaped (`with_unescape(true)`).
        fn check<F>(
            q: char, // The quote character to test
            r: char, // An alternate quote character.
            quote_token: F,
        ) where
            F: Fn(String) -> Token,
        {
            let dialect = BigQueryDialect {};

            // (input SQL, expected raw content, expected unescaped content)
            for (sql, expected, expected_unescaped) in [
                // Empty string
                (format!(r#"{q}{q}{q}{q}{q}{q}"#), "".into(), "".into()),
                // Should not count escaped quote as end of string.
                (
                    format!(r#"{q}{q}{q}ab{q}{q}\{q}{q}cd{q}{q}{q}"#),
                    format!(r#"ab{q}{q}\{q}{q}cd"#),
                    format!(r#"ab{q}{q}{q}{q}cd"#),
                ),
                // Simple string
                (
                    format!(r#"{q}{q}{q}abc{q}{q}{q}"#),
                    "abc".into(),
                    "abc".into(),
                ),
                // Mix single-double quotes unescaped.
                (
                    format!(r#"{q}{q}{q}ab{r}{r}{r}c{r}def{r}{r}{r}{q}{q}{q}"#),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                ),
                // Escaped quote.
                (
                    format!(r#"{q}{q}{q}ab{q}{q}c{q}{q}\{q}de{q}{q}f{q}{q}{q}"#),
                    format!(r#"ab{q}{q}c{q}{q}\{q}de{q}{q}f"#),
                    format!(r#"ab{q}{q}c{q}{q}{q}de{q}{q}f"#),
                ),
                // backslash-escaped quote characters.
                (
                    format!(r#"{q}{q}{q}a\'\'b\'c\'d{q}{q}{q}"#),
                    r#"a\'\'b\'c\'d"#.into(),
                    r#"a''b'c'd"#.into(),
                ),
                // backslash-escaped characters
                (
                    format!(r#"{q}{q}{q}abc\0\n\rdef{q}{q}{q}"#),
                    r#"abc\0\n\rdef"#.into(),
                    "abc\0\n\rdef".into(),
                ),
            ] {
                // Raw mode: escape sequences are preserved verbatim.
                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(false)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected.to_string())];
                compare(expected, tokens);

                // Unescape mode: backslash escapes are resolved.
                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(true)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected_unescaped.to_string())];
                compare(expected, tokens);
            }

            // Inputs that open a triple-quoted string (or leave a dangling
            // escaped quote) without ever closing it must error out.
            for sql in [
                format!(r#"{q}{q}{q}{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}{q}"#),
                format!(r#"{q}{q}{q}{r}{r}"#),
                format!(r#"{q}{q}{q}abc{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}"#),
                format!(r#"{q}{q}{q}abc"#),
            ] {
                let dialect = BigQueryDialect {};
                let mut tokenizer = Tokenizer::new(&dialect, sql.as_str());
                assert_eq!(
                    "Unterminated string literal",
                    tokenizer.tokenize().unwrap_err().message.as_str(),
                );
            }
        }

        // Run the whole suite for both delimiters.
        check('"', '\'', Token::TripleDoubleQuotedString);

        check('\'', '"', Token::TripleSingleQuotedString);

        let dialect = BigQueryDialect {};

        // Four quote characters in a row are two adjacent empty strings,
        // not the start of a triple-quoted string.
        let sql = r#"""''"#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::DoubleQuotedString("".to_string()),
            Token::SingleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        // Same as above with the quote kinds reversed.
        let sql = r#"''"""#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::SingleQuotedString("".to_string()),
            Token::DoubleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        // Non-triple quoted string dialect
        let dialect = SnowflakeDialect {};
        let sql = r#"''''''"#;
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("''".to_string())];
        compare(expected, tokens);
    }
4017
4018 #[test]
4019 fn test_mysql_users_grantees() {
4020 let dialect = MySqlDialect {};
4021
4022 let sql = "CREATE USER `root`@`%`";
4023 let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
4024 let expected = vec![
4025 Token::make_keyword("CREATE"),
4026 Token::Whitespace(Whitespace::Space),
4027 Token::make_keyword("USER"),
4028 Token::Whitespace(Whitespace::Space),
4029 Token::make_word("root", Some('`')),
4030 Token::AtSign,
4031 Token::make_word("%", Some('`')),
4032 ];
4033 compare(expected, tokens);
4034 }
4035
4036 #[test]
4037 fn test_postgres_abs_without_space_and_string_literal() {
4038 let dialect = MySqlDialect {};
4039
4040 let sql = "SELECT @'1'";
4041 let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
4042 let expected = vec![
4043 Token::make_keyword("SELECT"),
4044 Token::Whitespace(Whitespace::Space),
4045 Token::AtSign,
4046 Token::SingleQuotedString("1".to_string()),
4047 ];
4048 compare(expected, tokens);
4049 }
4050
4051 #[test]
4052 fn test_postgres_abs_without_space_and_quoted_column() {
4053 let dialect = MySqlDialect {};
4054
4055 let sql = r#"SELECT @"bar" FROM foo"#;
4056 let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
4057 let expected = vec![
4058 Token::make_keyword("SELECT"),
4059 Token::Whitespace(Whitespace::Space),
4060 Token::AtSign,
4061 Token::DoubleQuotedString("bar".to_string()),
4062 Token::Whitespace(Whitespace::Space),
4063 Token::make_keyword("FROM"),
4064 Token::Whitespace(Whitespace::Space),
4065 Token::make_word("foo", None),
4066 ];
4067 compare(expected, tokens);
4068 }
4069
4070 #[test]
4071 fn test_national_strings_backslash_escape_not_supported() {
4072 all_dialects_where(|dialect| !dialect.supports_string_literal_backslash_escape())
4073 .tokenizes_to(
4074 "select n'''''\\'",
4075 vec![
4076 Token::make_keyword("select"),
4077 Token::Whitespace(Whitespace::Space),
4078 Token::NationalStringLiteral("''\\".to_string()),
4079 ],
4080 );
4081 }
4082
4083 #[test]
4084 fn test_national_strings_backslash_escape_supported() {
4085 all_dialects_where(|dialect| dialect.supports_string_literal_backslash_escape())
4086 .tokenizes_to(
4087 "select n'''''\\''",
4088 vec![
4089 Token::make_keyword("select"),
4090 Token::Whitespace(Whitespace::Space),
4091 Token::NationalStringLiteral("'''".to_string()),
4092 ],
4093 );
4094 }
4095
4096 #[test]
4097 fn test_string_escape_constant_not_supported() {
4098 all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
4099 "select e'...'",
4100 vec![
4101 Token::make_keyword("select"),
4102 Token::Whitespace(Whitespace::Space),
4103 Token::make_word("e", None),
4104 Token::SingleQuotedString("...".to_string()),
4105 ],
4106 );
4107
4108 all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
4109 "select E'...'",
4110 vec![
4111 Token::make_keyword("select"),
4112 Token::Whitespace(Whitespace::Space),
4113 Token::make_word("E", None),
4114 Token::SingleQuotedString("...".to_string()),
4115 ],
4116 );
4117 }
4118
4119 #[test]
4120 fn test_string_escape_constant_supported() {
4121 all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
4122 "select e'\\''",
4123 vec![
4124 Token::make_keyword("select"),
4125 Token::Whitespace(Whitespace::Space),
4126 Token::EscapedStringLiteral("'".to_string()),
4127 ],
4128 );
4129
4130 all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
4131 "select E'\\''",
4132 vec![
4133 Token::make_keyword("select"),
4134 Token::Whitespace(Whitespace::Space),
4135 Token::EscapedStringLiteral("'".to_string()),
4136 ],
4137 );
4138 }
4139
4140 #[test]
4141 fn test_whitespace_required_after_single_line_comment() {
4142 all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
4143 .tokenizes_to(
4144 "SELECT --'abc'",
4145 vec![
4146 Token::make_keyword("SELECT"),
4147 Token::Whitespace(Whitespace::Space),
4148 Token::Minus,
4149 Token::Minus,
4150 Token::SingleQuotedString("abc".to_string()),
4151 ],
4152 );
4153
4154 all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
4155 .tokenizes_to(
4156 "SELECT -- 'abc'",
4157 vec![
4158 Token::make_keyword("SELECT"),
4159 Token::Whitespace(Whitespace::Space),
4160 Token::Whitespace(Whitespace::SingleLineComment {
4161 prefix: "--".to_string(),
4162 comment: " 'abc'".to_string(),
4163 }),
4164 ],
4165 );
4166
4167 all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
4168 .tokenizes_to(
4169 "SELECT --",
4170 vec![
4171 Token::make_keyword("SELECT"),
4172 Token::Whitespace(Whitespace::Space),
4173 Token::Minus,
4174 Token::Minus,
4175 ],
4176 );
4177
4178 all_dialects_where(|d| d.requires_single_line_comment_whitespace()).tokenizes_to(
4179 "--\n-- Table structure for table...\n--\n",
4180 vec![
4181 Token::Whitespace(Whitespace::SingleLineComment {
4182 prefix: "--".to_string(),
4183 comment: "\n".to_string(),
4184 }),
4185 Token::Whitespace(Whitespace::SingleLineComment {
4186 prefix: "--".to_string(),
4187 comment: " Table structure for table...\n".to_string(),
4188 }),
4189 Token::Whitespace(Whitespace::SingleLineComment {
4190 prefix: "--".to_string(),
4191 comment: "\n".to_string(),
4192 }),
4193 ],
4194 );
4195 }
4196
4197 #[test]
4198 fn test_whitespace_not_required_after_single_line_comment() {
4199 all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
4200 .tokenizes_to(
4201 "SELECT --'abc'",
4202 vec![
4203 Token::make_keyword("SELECT"),
4204 Token::Whitespace(Whitespace::Space),
4205 Token::Whitespace(Whitespace::SingleLineComment {
4206 prefix: "--".to_string(),
4207 comment: "'abc'".to_string(),
4208 }),
4209 ],
4210 );
4211
4212 all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
4213 .tokenizes_to(
4214 "SELECT -- 'abc'",
4215 vec![
4216 Token::make_keyword("SELECT"),
4217 Token::Whitespace(Whitespace::Space),
4218 Token::Whitespace(Whitespace::SingleLineComment {
4219 prefix: "--".to_string(),
4220 comment: " 'abc'".to_string(),
4221 }),
4222 ],
4223 );
4224
4225 all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
4226 .tokenizes_to(
4227 "SELECT --",
4228 vec![
4229 Token::make_keyword("SELECT"),
4230 Token::Whitespace(Whitespace::Space),
4231 Token::Whitespace(Whitespace::SingleLineComment {
4232 prefix: "--".to_string(),
4233 comment: "".to_string(),
4234 }),
4235 ],
4236 );
4237 }
4238
4239 #[test]
4240 fn test_tokenize_identifiers_numeric_prefix() {
4241 all_dialects_where(|dialect| dialect.supports_numeric_prefix())
4242 .tokenizes_to("123abc", vec![Token::make_word("123abc", None)]);
4243
4244 all_dialects_where(|dialect| dialect.supports_numeric_prefix())
4245 .tokenizes_to("12e34", vec![Token::Number("12e34".to_string(), false)]);
4246
4247 all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
4248 "t.12e34",
4249 vec![
4250 Token::make_word("t", None),
4251 Token::Period,
4252 Token::make_word("12e34", None),
4253 ],
4254 );
4255
4256 all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
4257 "t.1two3",
4258 vec![
4259 Token::make_word("t", None),
4260 Token::Period,
4261 Token::make_word("1two3", None),
4262 ],
4263 );
4264 }
4265
4266 #[test]
4267 fn tokenize_period_underscore() {
4268 let sql = String::from("SELECT table._col");
4269 // a dialect that supports underscores in numeric literals
4270 let dialect = PostgreSqlDialect {};
4271 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
4272
4273 let expected = vec![
4274 Token::make_keyword("SELECT"),
4275 Token::Whitespace(Whitespace::Space),
4276 Token::Word(Word {
4277 value: "table".to_string(),
4278 quote_style: None,
4279 keyword: Keyword::TABLE,
4280 }),
4281 Token::Period,
4282 Token::Word(Word {
4283 value: "_col".to_string(),
4284 quote_style: None,
4285 keyword: Keyword::NoKeyword,
4286 }),
4287 ];
4288
4289 compare(expected, tokens);
4290
4291 let sql = String::from("SELECT ._123");
4292 if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() {
4293 panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
4294 }
4295
4296 let sql = String::from("SELECT ._abc");
4297 if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() {
4298 panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
4299 }
4300 }
4301
4302 #[test]
4303 fn tokenize_question_mark() {
4304 let dialect = PostgreSqlDialect {};
4305 let sql = "SELECT x ? y";
4306 let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
4307 compare(
4308 tokens,
4309 vec![
4310 Token::make_keyword("SELECT"),
4311 Token::Whitespace(Whitespace::Space),
4312 Token::make_word("x", None),
4313 Token::Whitespace(Whitespace::Space),
4314 Token::Question,
4315 Token::Whitespace(Whitespace::Space),
4316 Token::make_word("y", None),
4317 ],
4318 );
4319 }
4320
4321 #[test]
4322 fn tokenize_multiline_comment_with_comment_hint() {
4323 let sql = String::from("0/*! word */1");
4324
4325 let dialect = MySqlDialect {};
4326 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
4327 let expected = vec![
4328 Token::Number("0".to_string(), false),
4329 Token::Whitespace(Whitespace::Space),
4330 Token::Word(Word {
4331 value: "word".to_string(),
4332 quote_style: None,
4333 keyword: Keyword::NoKeyword,
4334 }),
4335 Token::Whitespace(Whitespace::Space),
4336 Token::Number("1".to_string(), false),
4337 ];
4338 compare(expected, tokens);
4339 }
4340
4341 #[test]
4342 fn tokenize_multiline_comment_with_comment_hint_and_version() {
4343 let sql_multi = String::from("0 /*!50110 KEY_BLOCK_SIZE = 1024*/ 1");
4344 let dialect = MySqlDialect {};
4345 let tokens = Tokenizer::new(&dialect, &sql_multi).tokenize().unwrap();
4346 let expected = vec![
4347 Token::Number("0".to_string(), false),
4348 Token::Whitespace(Whitespace::Space),
4349 Token::Whitespace(Whitespace::Space),
4350 Token::Word(Word {
4351 value: "KEY_BLOCK_SIZE".to_string(),
4352 quote_style: None,
4353 keyword: Keyword::KEY_BLOCK_SIZE,
4354 }),
4355 Token::Whitespace(Whitespace::Space),
4356 Token::Eq,
4357 Token::Whitespace(Whitespace::Space),
4358 Token::Number("1024".to_string(), false),
4359 Token::Whitespace(Whitespace::Space),
4360 Token::Number("1".to_string(), false),
4361 ];
4362 compare(expected, tokens);
4363
4364 let tokens = Tokenizer::new(&dialect, "0 /*!50110 */ 1")
4365 .tokenize()
4366 .unwrap();
4367 compare(
4368 vec![
4369 Token::Number("0".to_string(), false),
4370 Token::Whitespace(Whitespace::Space),
4371 Token::Whitespace(Whitespace::Space),
4372 Token::Whitespace(Whitespace::Space),
4373 Token::Number("1".to_string(), false),
4374 ],
4375 tokens,
4376 );
4377
4378 let tokens = Tokenizer::new(&dialect, "0 /*!*/ 1").tokenize().unwrap();
4379 compare(
4380 vec![
4381 Token::Number("0".to_string(), false),
4382 Token::Whitespace(Whitespace::Space),
4383 Token::Whitespace(Whitespace::Space),
4384 Token::Number("1".to_string(), false),
4385 ],
4386 tokens,
4387 );
4388 let tokens = Tokenizer::new(&dialect, "0 /*! */ 1").tokenize().unwrap();
4389 compare(
4390 vec![
4391 Token::Number("0".to_string(), false),
4392 Token::Whitespace(Whitespace::Space),
4393 Token::Whitespace(Whitespace::Space),
4394 Token::Whitespace(Whitespace::Space),
4395 Token::Whitespace(Whitespace::Space),
4396 Token::Whitespace(Whitespace::Space),
4397 Token::Number("1".to_string(), false),
4398 ],
4399 tokens,
4400 );
4401 }
4402}