# Source code for bigframes.bigquery._operations.sql

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""SQL escape hatch features."""

from __future__ import annotations

from typing import cast, Optional, Sequence, Union

import google.cloud.bigquery

from bigframes.core.compile.sqlglot import sql
import bigframes.dataframe
import bigframes.dtypes
import bigframes.operations
import bigframes.series


def _format_names(sql_template: str, dataframe: bigframes.dataframe.DataFrame):
    """Convert a name-based SQL template into a position-based one.

    Each ``{column_name}`` placeholder is rewritten to ``{i}``, where ``i``
    is that column's position in ``dataframe``; placeholders that are
    already positional (``{0}``, ``{1}``, ...) pass through unchanged.
    """
    positional = [f"{{{index}}}" for index in range(len(dataframe.columns))]
    by_name = dict(zip(dataframe.columns, positional))
    # Supplying both positional and keyword substitutions lets the template
    # freely mix "{name}" and "{i}" references in a single pass.
    return sql_template.format(*positional, **by_name)


def sql_scalar(
    sql_template: str,
    columns: Union[bigframes.dataframe.DataFrame, Sequence[bigframes.series.Series]],
    *,
    output_dtype: Optional[bigframes.dtypes.Dtype] = None,
) -> bigframes.series.Series:
    """Create a Series from a SQL template.

    **Examples:**

        >>> import pandas as pd
        >>> import pyarrow as pa
        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq

        Either pass in a sequence of series, in which case use integers in the
        format strings.

        >>> s = bpd.Series(["1.5", "2.5", "3.5"])
        >>> s = s.astype(pd.ArrowDtype(pa.decimal128(38, 9)))
        >>> bbq.sql_scalar("ROUND({0}, 0, 'ROUND_HALF_EVEN')", [s])
        0    2.000000000
        1    2.000000000
        2    4.000000000
        dtype: decimal128(38, 9)[pyarrow]

        Or pass in a DataFrame, in which case use the column names in the
        format strings.

        >>> df = bpd.DataFrame({"a": ["1.5", "2.5", "3.5"]})
        >>> df = df.astype({"a": pd.ArrowDtype(pa.decimal128(38, 9))})
        >>> bbq.sql_scalar("ROUND({a}, 0, 'ROUND_HALF_EVEN')", df)
        0    2.000000000
        1    2.000000000
        2    4.000000000
        dtype: decimal128(38, 9)[pyarrow]

        You can also use the `.bigquery` DataFrame accessor to apply a SQL
        scalar function.

        Compute SQL scalar using a pandas DataFrame:

        >>> df = pd.DataFrame({"x": [1, 2, 3]})
        >>> bpd.options.display.progress_bar = None  # doctest: +SKIP
        >>> pandas_s = df.bigquery.sql_scalar("POW({0}, 2)")  # doctest: +SKIP
        >>> type(pandas_s)  # doctest: +SKIP
        <class 'pandas.core.series.Series'>

        Compute SQL scalar using a BigFrames DataFrame:

        >>> bf_df = bpd.DataFrame({"x": [1, 2, 3]})
        >>> bf_s = bf_df.bigquery.sql_scalar("POW({0}, 2)")  # doctest: +SKIP
        >>> type(bf_s)  # doctest: +SKIP
        <class 'bigframes.series.Series'>

    Args:
        sql_template (str):
            A SQL format string with Python-style {0} placeholders for each of
            the Series objects in ``columns``.
        columns (Sequence[bigframes.pandas.Series] | bigframes.pandas.DataFrame):
            Series objects representing the column inputs to the
            ``sql_template``. Must contain at least one Series.
        output_dtype (a BigQuery DataFrames compatible dtype, optional):
            If provided, BigQuery DataFrames uses this to determine the output
            of the returned Series. This avoids a dry run query.

    Returns:
        bigframes.pandas.Series:
            A Series with the SQL applied.

    Raises:
        ValueError: If ``columns`` is empty.
    """
    if isinstance(columns, bigframes.dataframe.DataFrame):
        # Rewrite "{name}" placeholders to positional "{i}" placeholders,
        # then flatten the DataFrame into its component Series.
        sql_template = _format_names(sql_template, columns)
        columns = [
            cast(bigframes.series.Series, columns[column])
            for column in columns.columns
        ]

    if len(columns) == 0:
        raise ValueError("Must provide at least one column in columns")

    base_series = columns[0]

    # To integrate this into our expression trees, we need to get the output
    # type, so we do some manual compilation and a dry run query to get that.
    # Another benefit of this is that if there is a syntax error in the SQL
    # template, then this will fail with an error earlier in the process,
    # aiding users in debugging.
    if output_dtype is None:
        # Substitute typed NULL literals for each input so the dry run can
        # infer the expression's result type without touching any table data.
        literals_sql = [
            sql.to_sql(sql.literal(None, column.dtype)) for column in columns
        ]
        select_sql = sql_template.format(*literals_sql)
        dry_run_sql = f"SELECT {select_sql}"

        # Use the executor directly, because we want the original column IDs,
        # not the user-friendly column names that block.to_sql_query() would
        # produce.
        bqclient = base_series._session.bqclient
        job = bqclient.query(
            dry_run_sql,
            job_config=google.cloud.bigquery.QueryJobConfig(dry_run=True),
        )
        _, output_dtype = bigframes.dtypes.convert_schema_field(job.schema[0])

    op = bigframes.operations.SqlScalarOp(
        _output_type=output_dtype, sql_template=sql_template
    )
    return base_series._apply_nary_op(op, columns[1:])