# feat: add pandas_gbq.sample (#983) · googleapis/python-bigquery-pandas@ac771c1
1+# Copyright (c) 2025 pandas-gbq Authors All rights reserved.
2+# Use of this source code is governed by a BSD-style
3+# license that can be found in the LICENSE file.
4+5+from __future__ import annotations
6+7+import typing
8+from typing import Any, Dict, Optional, Sequence
9+import warnings
10+11+import google.cloud.bigquery
12+import google.cloud.bigquery.table
13+import numpy as np
14+15+import pandas_gbq
16+import pandas_gbq.constants
17+import pandas_gbq.exceptions
18+import pandas_gbq.features
19+import pandas_gbq.timestamp
20+21+# Only import at module-level at type checking time to avoid circular
22+# dependencies in the pandas package, which has an optional dependency on
23+# pandas-gbq.
24+if typing.TYPE_CHECKING: # pragma: NO COVER
25+import pandas
def _bqschema_to_nullsafe_dtypes(schema_fields):
    """Build a column-name -> dtype mapping from a BigQuery schema.

    An explicit dtype is chosen only when that dtype can represent nulls.
    Every other column is left to pandas's default dtype inference.

    See: http://pandas.pydata.org/pandas-docs/dev/missing_data.html
    #missing-data-casting-rules-and-indexing
    """
    import db_dtypes

    # If you update this mapping, also update the table at
    # `docs/reading.rst`.
    type_to_dtype = {
        "FLOAT": np.dtype(float),
        "INTEGER": "Int64",
        "TIME": db_dtypes.TimeDtype(),
        # Note: Other types such as 'datetime64[ns]' and db_types.DateDtype()
        # are not included because the pandas range does not align with the
        # BigQuery range. We need to attempt a conversion to those types and
        # fall back to 'object' when there are out-of-range values.
    }

    # Amend the mapping with newer extension types if pandas version allows.
    if pandas_gbq.features.FEATURES.pandas_has_boolean_dtype:
        type_to_dtype["BOOLEAN"] = "boolean"

    selected = {}
    for field in schema_fields:
        column = str(field["name"])

        # The Array BigQuery type is represented as an object column
        # containing list objects.
        if field["mode"].upper() == "REPEATED":
            selected[column] = "object"
        else:
            chosen = type_to_dtype.get(field["type"].upper())
            if chosen:
                selected[column] = chosen

    return selected
def _finalize_dtypes(
    df: pandas.DataFrame, schema_fields: Sequence[Dict[str, Any]]
) -> pandas.DataFrame:
    """
    Attempt to change the dtypes of those columns that don't map exactly.

    For example db_dtypes.DateDtype() and datetime64[ns] cannot represent
    0001-01-01, but they can represent dates within a couple hundred years of
    1970. See:
    https://github.com/googleapis/python-bigquery-pandas/issues/365
    """
    import db_dtypes
    import pandas.api.types

    # If you update this mapping, also update the table at
    # `docs/reading.rst`.
    target_dtypes = {
        "DATE": db_dtypes.DateDtype(),
        "DATETIME": "datetime64[ns]",
        "TIMESTAMP": "datetime64[ns]",
    }

    for field in schema_fields:
        # ARRAY/REPEATED columns are left untouched by this method.
        if field["mode"].upper() == "REPEATED":
            continue

        column = str(field["name"])
        target = target_dtypes.get(field["type"].upper())

        # Only cast object-dtype columns, which avoids a deprecated
        # conversion to a timezone-naive dtype.
        if target and pandas.api.types.is_object_dtype(df[column]):
            df[column] = df[column].astype(target, errors="ignore")

    # Ensure any TIMESTAMP columns are tz-aware.
    return pandas_gbq.timestamp.localize_df(df, schema_fields)
def download_results(
    results: google.cloud.bigquery.table.RowIterator,
    *,
    bqclient: google.cloud.bigquery.Client,
    progress_bar_type: Optional[str],
    warn_on_large_results: bool = True,
    max_results: Optional[int],
    user_dtypes: Optional[dict],
    use_bqstorage_api: bool,
) -> Optional[pandas.DataFrame]:
    """Download BigQuery results into a pandas DataFrame.

    Args:
        results: Row iterator for the query/table results to download.
        bqclient: Client used to fetch table metadata for the large-results
            warning.
        progress_bar_type: Passed through to ``RowIterator.to_dataframe``.
        warn_on_large_results: If True, emit a ``LargeResultsWarning``
            recommending BigQuery DataFrames when the backing table exceeds
            ``BYTES_TO_RECOMMEND_BIGFRAMES``.
        max_results: Maximum rows requested by the caller. ``0`` means no
            download at all; any non-None value disables the BQ Storage API.
        user_dtypes: Caller-supplied dtypes that override the schema-derived
            null-safe dtypes.
        use_bqstorage_api: Whether to create a BigQuery Storage client for
            the download (ignored when ``max_results`` is set).

    Returns:
        The downloaded DataFrame, or None when ``max_results == 0``.

    Raises:
        pandas_gbq.exceptions (translated): HTTP errors raised while
            downloading are translated via
            ``pandas_gbq.exceptions.translate_exception``.
    """
    # No results are desired, so don't bother downloading anything.
    if max_results == 0:
        return None

    if user_dtypes is None:
        user_dtypes = {}

    # The BQ Storage API can't be used when a row limit is requested.
    create_bqstorage_client = use_bqstorage_api
    if max_results is not None:
        create_bqstorage_client = False

    # If we're downloading a large table, BigQuery DataFrames might be a
    # better fit. Not all code paths will populate rows_iter._table, but
    # if it's not populated that means we are working with a small result
    # set.
    if (
        warn_on_large_results
        and (table_ref := getattr(results, "_table", None)) is not None
    ):
        table = bqclient.get_table(table_ref)
        if (
            isinstance((num_bytes := table.num_bytes), int)
            and num_bytes > pandas_gbq.constants.BYTES_TO_RECOMMEND_BIGFRAMES
        ):
            num_gib = num_bytes / pandas_gbq.constants.BYTES_IN_GIB
            warnings.warn(
                f"Recommendation: Your results are {num_gib:.1f} GiB. "
                # Fix: trailing space added so the message doesn't render
                # as "...bigframes-intro)to process...".
                "Consider using BigQuery DataFrames (https://bit.ly/bigframes-intro) "
                "to process large results with pandas compatible APIs with transparent SQL "
                "pushdown to BigQuery engine. This provides an opportunity to save on costs "
                "and improve performance. "
                "Please reach out to bigframes-feedback@google.com with any "
                "questions or concerns. To disable this message, run "
                "warnings.simplefilter('ignore', category=pandas_gbq.exceptions.LargeResultsWarning)",
                category=pandas_gbq.exceptions.LargeResultsWarning,
                # user's code
                # -> read_gbq
                # -> run_query
                # -> download_results
                stacklevel=4,
            )

    try:
        schema_fields = [field.to_api_repr() for field in results.schema]
        # Start from null-safe dtypes derived from the schema, then let any
        # caller-supplied dtypes take precedence.
        conversion_dtypes = _bqschema_to_nullsafe_dtypes(schema_fields)
        conversion_dtypes.update(user_dtypes)
        df = results.to_dataframe(
            dtypes=conversion_dtypes,
            progress_bar_type=progress_bar_type,
            create_bqstorage_client=create_bqstorage_client,
        )
    except pandas_gbq.constants.HTTP_ERRORS as ex:
        raise pandas_gbq.exceptions.translate_exception(ex) from ex

    df = _finalize_dtypes(df, schema_fields)

    pandas_gbq.logger.debug("Got {} rows.\n".format(results.total_rows))
    return df