feat: add pandas_gbq.sample (#983) · googleapis/python-bigquery-pandas@ac771c1

1+

# Copyright (c) 2025 pandas-gbq Authors All rights reserved.

2+

# Use of this source code is governed by a BSD-style

3+

# license that can be found in the LICENSE file.

4+5+

from __future__ import annotations

6+7+

import typing

8+

from typing import Any, Dict, Optional, Sequence

9+

import warnings

10+11+

import google.cloud.bigquery

12+

import google.cloud.bigquery.table

13+

import numpy as np

14+15+

import pandas_gbq

16+

import pandas_gbq.constants

17+

import pandas_gbq.exceptions

18+

import pandas_gbq.features

19+

import pandas_gbq.timestamp

20+21+

# Only import at module-level at type checking time to avoid circular

22+

# dependencies in the pandas package, which has an optional dependency on

23+

# pandas-gbq.

24+

if typing.TYPE_CHECKING: # pragma: NO COVER

25+

import pandas

26+27+28+

def _bqschema_to_nullsafe_dtypes(schema_fields):
    """Specify explicit dtypes based on BigQuery schema.

    This function only specifies a dtype when the dtype allows nulls.
    Otherwise, use pandas's default dtype choice.

    See: http://pandas.pydata.org/pandas-docs/dev/missing_data.html
    #missing-data-casting-rules-and-indexing
    """
    import db_dtypes

    # If you update this mapping, also update the table at
    # `docs/reading.rst`.
    type_to_dtype = {
        "FLOAT": np.dtype(float),
        "INTEGER": "Int64",
        "TIME": db_dtypes.TimeDtype(),
        # Note: Other types such as 'datetime64[ns]' and db_types.DateDtype()
        # are not included because the pandas range does not align with the
        # BigQuery range. We need to attempt a conversion to those types and
        # fall back to 'object' when there are out-of-range values.
    }

    # Amend the mapping with newer extension types if pandas version allows.
    if pandas_gbq.features.FEATURES.pandas_has_boolean_dtype:
        type_to_dtype["BOOLEAN"] = "boolean"

    result = {}
    for field in schema_fields:
        column = str(field["name"])
        if field["mode"].upper() == "REPEATED":
            # Array BigQuery type is represented as an object column
            # containing list objects.
            result[column] = "object"
        else:
            mapped = type_to_dtype.get(field["type"].upper())
            if mapped:
                result[column] = mapped

    return result

69+70+71+

def _finalize_dtypes(
    df: pandas.DataFrame, schema_fields: Sequence[Dict[str, Any]]
) -> pandas.DataFrame:
    """
    Attempt to change the dtypes of those columns that don't map exactly.

    For example db_dtypes.DateDtype() and datetime64[ns] cannot represent
    0001-01-01, but they can represent dates within a couple hundred years of
    1970. See:
    https://github.com/googleapis/python-bigquery-pandas/issues/365
    """
    import db_dtypes
    import pandas.api.types

    # If you update this mapping, also update the table at
    # `docs/reading.rst`.
    target_dtypes = {
        "DATE": db_dtypes.DateDtype(),
        "DATETIME": "datetime64[ns]",
        "TIMESTAMP": "datetime64[ns]",
    }

    for field in schema_fields:
        # This method doesn't modify ARRAY/REPEATED columns.
        if field["mode"].upper() == "REPEATED":
            continue

        column = str(field["name"])
        wanted = target_dtypes.get(field["type"].upper())
        if wanted is None:
            continue

        # Avoid deprecated conversion to timezone-naive dtype by only casting
        # object dtypes.
        if pandas.api.types.is_object_dtype(df[column]):
            df[column] = df[column].astype(wanted, errors="ignore")

    # Ensure any TIMESTAMP columns are tz-aware.
    df = pandas_gbq.timestamp.localize_df(df, schema_fields)

    return df

110+111+112+

def download_results(
    results: google.cloud.bigquery.table.RowIterator,
    *,
    bqclient: google.cloud.bigquery.Client,
    progress_bar_type: Optional[str],
    warn_on_large_results: bool = True,
    max_results: Optional[int],
    user_dtypes: Optional[dict],
    use_bqstorage_api: bool,
) -> Optional[pandas.DataFrame]:
    """Download query/table rows into a DataFrame with null-safe dtypes.

    Args:
        results: Row iterator for the finished query or table read.
        bqclient: Client used to look up table metadata for the large-results
            recommendation.
        progress_bar_type: Passed through to ``RowIterator.to_dataframe``.
        warn_on_large_results: If True, emit a ``LargeResultsWarning``
            recommending BigQuery DataFrames when the backing table is large.
        max_results: If 0, skip the download entirely. If not None, the
            BigQuery Storage API is disabled (it doesn't support row limits).
        user_dtypes: Optional per-column dtype overrides; these take
            precedence over the schema-derived dtypes.
        use_bqstorage_api: Whether to create a BigQuery Storage client for
            faster downloads (ignored when ``max_results`` is set).

    Returns:
        The downloaded DataFrame, or None when ``max_results == 0``.

    Raises:
        pandas_gbq.exceptions translated from any HTTP error raised while
        downloading.
    """
    # No results are desired, so don't bother downloading anything.
    if max_results == 0:
        return None

    if user_dtypes is None:
        user_dtypes = {}

    create_bqstorage_client = use_bqstorage_api
    if max_results is not None:
        # The BigQuery Storage API cannot limit rows, so fall back to the
        # REST API when a row cap was requested.
        create_bqstorage_client = False

    # If we're downloading a large table, BigQuery DataFrames might be a
    # better fit. Not all code paths will populate rows_iter._table, but
    # if it's not populated that means we are working with a small result
    # set.
    if (
        warn_on_large_results
        and (table_ref := getattr(results, "_table", None)) is not None
    ):
        table = bqclient.get_table(table_ref)
        if (
            isinstance((num_bytes := table.num_bytes), int)
            and num_bytes > pandas_gbq.constants.BYTES_TO_RECOMMEND_BIGFRAMES
        ):
            num_gib = num_bytes / pandas_gbq.constants.BYTES_IN_GIB
            warnings.warn(
                f"Recommendation: Your results are {num_gib:.1f} GiB. "
                # BUGFIX: a space was missing after the closing parenthesis,
                # which rendered as "…bigframes-intro)to process…".
                "Consider using BigQuery DataFrames (https://bit.ly/bigframes-intro) "
                "to process large results with pandas compatible APIs with transparent SQL "
                "pushdown to BigQuery engine. This provides an opportunity to save on costs "
                "and improve performance. "
                "Please reach out to bigframes-feedback@google.com with any "
                "questions or concerns. To disable this message, run "
                "warnings.simplefilter('ignore', category=pandas_gbq.exceptions.LargeResultsWarning)",
                category=pandas_gbq.exceptions.LargeResultsWarning,
                # user's code
                # -> read_gbq
                # -> run_query
                # -> download_results
                stacklevel=4,
            )

    try:
        schema_fields = [field.to_api_repr() for field in results.schema]
        conversion_dtypes = _bqschema_to_nullsafe_dtypes(schema_fields)
        # User-supplied dtypes win over the schema-derived defaults.
        conversion_dtypes.update(user_dtypes)
        df = results.to_dataframe(
            dtypes=conversion_dtypes,
            progress_bar_type=progress_bar_type,
            create_bqstorage_client=create_bqstorage_client,
        )
    except pandas_gbq.constants.HTTP_ERRORS as ex:
        raise pandas_gbq.exceptions.translate_exception(ex) from ex

    df = _finalize_dtypes(df, schema_fields)

    pandas_gbq.logger.debug("Got {} rows.\n".format(results.total_rows))
    return df