# feat: add pandas_gbq.sample (#983) · googleapis/python-bigquery-pandas@ac771c1
1+# Copyright (c) 2025 pandas-gbq Authors All rights reserved.
2+# Use of this source code is governed by a BSD-style
3+# license that can be found in the LICENSE file.
4+5+from __future__ import annotations
6+7+import typing
8+from typing import Any, Dict, Optional, Sequence
9+import warnings
10+11+import google.cloud.bigquery
12+import google.cloud.bigquery.table
13+import numpy as np
14+15+import pandas_gbq
16+import pandas_gbq.constants
17+import pandas_gbq.exceptions
18+import pandas_gbq.features
19+import pandas_gbq.timestamp
20+21+# Only import at module-level at type checking time to avoid circular
22+# dependencies in the pandas package, which has an optional dependency on
23+# pandas-gbq.
24+if typing.TYPE_CHECKING: # pragma: NO COVER
25+import pandas
def _bqschema_to_nullsafe_dtypes(schema_fields):
    """Build a column-name -> dtype mapping from a BigQuery schema.

    An explicit dtype is chosen only when that dtype can represent nulls.
    Every other column is left to pandas's default dtype inference.

    See: http://pandas.pydata.org/pandas-docs/dev/missing_data.html
    #missing-data-casting-rules-and-indexing
    """
    import db_dtypes

    # If you update this mapping, also update the table at
    # `docs/reading.rst`.
    type_to_dtype = {
        "FLOAT": np.dtype(float),
        "INTEGER": "Int64",
        "TIME": db_dtypes.TimeDtype(),
        # Note: Other types such as 'datetime64[ns]' and db_types.DateDtype()
        # are not included because the pandas range does not align with the
        # BigQuery range. We need to attempt a conversion to those types and
        # fall back to 'object' when there are out-of-range values.
    }

    # Amend the mapping with newer extension types if pandas version allows.
    if pandas_gbq.features.FEATURES.pandas_has_boolean_dtype:
        type_to_dtype["BOOLEAN"] = "boolean"

    selected = {}
    for field in schema_fields:
        column = str(field["name"])

        # The Array BigQuery type is represented as an object column
        # containing list objects.
        if field["mode"].upper() == "REPEATED":
            selected[column] = "object"
        else:
            chosen = type_to_dtype.get(field["type"].upper())
            if chosen:
                selected[column] = chosen

    return selected
def _finalize_dtypes(
    df: pandas.DataFrame, schema_fields: Sequence[Dict[str, Any]]
) -> pandas.DataFrame:
    """
    Attempt to change the dtypes of those columns that don't map exactly.

    For example db_dtypes.DateDtype() and datetime64[ns] cannot represent
    0001-01-01, but they can represent dates within a couple hundred years of
    1970. See:
    https://github.com/googleapis/python-bigquery-pandas/issues/365
    """
    import db_dtypes
    import pandas.api.types

    # If you update this mapping, also update the table at
    # `docs/reading.rst`.
    target_dtypes = {
        "DATE": db_dtypes.DateDtype(),
        "DATETIME": "datetime64[ns]",
        "TIMESTAMP": "datetime64[ns]",
    }

    for field in schema_fields:
        # ARRAY/REPEATED columns are left untouched by this method.
        if field["mode"].upper() == "REPEATED":
            continue

        column = str(field["name"])
        target = target_dtypes.get(field["type"].upper())

        # Only cast object-dtype columns, which avoids a deprecated
        # conversion to a timezone-naive dtype.
        if target and pandas.api.types.is_object_dtype(df[column]):
            df[column] = df[column].astype(target, errors="ignore")

    # Ensure any TIMESTAMP columns are tz-aware.
    return pandas_gbq.timestamp.localize_df(df, schema_fields)
def download_results(
    results: google.cloud.bigquery.table.RowIterator,
    *,
    bqclient: google.cloud.bigquery.Client,
    progress_bar_type: Optional[str],
    warn_on_large_results: bool = True,
    max_results: Optional[int],
    user_dtypes: Optional[dict],
    use_bqstorage_api: bool,
) -> Optional[pandas.DataFrame]:
    """Download BigQuery results into a pandas DataFrame.

    Args:
        results: Row iterator for the query/table results to download.
        bqclient: Client used to fetch table metadata for the large-results
            warning.
        progress_bar_type: Passed through to ``RowIterator.to_dataframe``.
        warn_on_large_results: If True, emit a ``LargeResultsWarning``
            recommending BigQuery DataFrames when the backing table exceeds
            ``BYTES_TO_RECOMMEND_BIGFRAMES``.
        max_results: Maximum rows requested by the caller. ``0`` means no
            download at all; any non-None value disables the BQ Storage API.
        user_dtypes: Caller-supplied dtypes that override the schema-derived
            null-safe dtypes.
        use_bqstorage_api: Whether to create a BigQuery Storage client for
            the download (ignored when ``max_results`` is set).

    Returns:
        The downloaded DataFrame, or None when ``max_results == 0``.

    Raises:
        pandas_gbq.exceptions (translated): HTTP errors raised while
            downloading are translated via
            ``pandas_gbq.exceptions.translate_exception``.
    """
    # No results are desired, so don't bother downloading anything.
    if max_results == 0:
        return None

    if user_dtypes is None:
        user_dtypes = {}

    # The BQ Storage API can't be used when a row limit is requested.
    create_bqstorage_client = use_bqstorage_api
    if max_results is not None:
        create_bqstorage_client = False

    # If we're downloading a large table, BigQuery DataFrames might be a
    # better fit. Not all code paths will populate rows_iter._table, but
    # if it's not populated that means we are working with a small result
    # set.
    if (
        warn_on_large_results
        and (table_ref := getattr(results, "_table", None)) is not None
    ):
        table = bqclient.get_table(table_ref)
        if (
            isinstance((num_bytes := table.num_bytes), int)
            and num_bytes > pandas_gbq.constants.BYTES_TO_RECOMMEND_BIGFRAMES
        ):
            num_gib = num_bytes / pandas_gbq.constants.BYTES_IN_GIB
            warnings.warn(
                f"Recommendation: Your results are {num_gib:.1f} GiB. "
                # Fix: trailing space added so the message doesn't render
                # as "...bigframes-intro)to process...".
                "Consider using BigQuery DataFrames (https://bit.ly/bigframes-intro) "
                "to process large results with pandas compatible APIs with transparent SQL "
                "pushdown to BigQuery engine. This provides an opportunity to save on costs "
                "and improve performance. "
                "Please reach out to bigframes-feedback@google.com with any "
                "questions or concerns. To disable this message, run "
                "warnings.simplefilter('ignore', category=pandas_gbq.exceptions.LargeResultsWarning)",
                category=pandas_gbq.exceptions.LargeResultsWarning,
                # user's code
                # -> read_gbq
                # -> run_query
                # -> download_results
                stacklevel=4,
            )

    try:
        schema_fields = [field.to_api_repr() for field in results.schema]
        # Start from null-safe dtypes derived from the schema, then let any
        # caller-supplied dtypes take precedence.
        conversion_dtypes = _bqschema_to_nullsafe_dtypes(schema_fields)
        conversion_dtypes.update(user_dtypes)
        df = results.to_dataframe(
            dtypes=conversion_dtypes,
            progress_bar_type=progress_bar_type,
            create_bqstorage_client=create_bqstorage_client,
        )
    except pandas_gbq.constants.HTTP_ERRORS as ex:
        raise pandas_gbq.exceptions.translate_exception(ex) from ex

    df = _finalize_dtypes(df, schema_fields)

    pandas_gbq.logger.debug("Got {} rows.\n".format(results.total_rows))
    return df