# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
When using BigQuery DataFrames, columns containing data in BigQuery’s JSON format (a lightweight standard) are represented as pandas.ArrowDtype. The exact underlying Arrow type depends on your library versions. Older environments typically use db_dtypes.JSONArrowType() for compatibility, which is an Arrow extension type acting as a light wrapper around pa.string(). In contrast, newer setups (pandas 3.0+ and pyarrow 19.0+) utilize the more recent pa.json_(pa.string()) representation.
import db_dtypes
import pandas as pd
import pyarrow as pa

import bigframes.bigquery as bbq
import bigframes.pandas as bpd
# Notebook configuration: choose the BigQuery location and quiet the display.
REGION = "US"  # @param {type: "string"}

# Suppress per-query progress bars so cell outputs stay compact.
bpd.options.display.progress_bar = None
# Route all BigQuery jobs issued by this session to the chosen region.
bpd.options.bigquery.location = REGION
Create Series with JSON columns#
Example 1: Create a Series with a JSON dtype from local data
This example demonstrates creating a JSON Series from a list of JSON strings. Note that BigQuery standardizes these strings, for instance, by removing extra spaces and ordering dictionary keys. Specifying the dtype is essential; if omitted, a string-type Series will be generated.
# Sample payloads covering every JSON kind: number, string, boolean,
# array, object, and SQL NULL.
raw_json_values = [
    "1",
    '"str"',
    "false",
    '["a", {"b": 1}, null]',
    '{"a": {"b": [1, 2, 3], "c": true}}',
    None,
]

# The JSON dtype must be passed explicitly — without it a plain string
# Series is produced instead.
json_dtype = pd.ArrowDtype(db_dtypes.JSONArrowType())
bpd.Series(raw_json_values, dtype=json_dtype)
0 1
1 "str"
2 false
3 ["a",{"b":1},null]
4 {"a":{"b":[1,2,3],"c":true}}
5 <NA>
dtype: extension<dbjson<JSONArrowType>>[pyarrow]
Example 2: Create a Series with a Nested JSON dtype from local data
To create a BigQuery DataFrame Series containing JSON data nested within a STRUCT or LIST type, you must represent the JSON data in a pa.array defined with the pa.string type. This workaround is necessary because PyArrow lacks support for creating structs or lists that directly contain extension types (see the upstream Apache Arrow issue tracker).
# Nested records: each row is a LIST of STRUCTs whose "key" field carries
# JSON encoded as a plain string.
nested_records = [
    [{"key": "1"}],
    [{"key": None}],
    [{"key": '["1","3","5"]'}],
    [{"key": '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}'}],
]

# Build the Arrow array with pa.string() for the JSON field: PyArrow cannot
# construct lists/structs that directly contain extension types.
string_struct = pa.struct([("key", pa.string())])
arrow_values = pa.array(nested_records, type=pa.list_(string_struct))

# Re-declare the logical dtype so the "key" field is treated as JSON.
json_struct = pa.struct([("key", db_dtypes.JSONArrowType())])
bpd.Series(
    pd.arrays.ArrowExtensionArray(arrow_values),
    dtype=pd.ArrowDtype(pa.list_(json_struct)),
)
0 [{'key': '1'}]
1 [{'key': None}]
2 [{'key': '["1","3","5"]'}]
3 [{'key': '{"a":1,"b":["x","y"],"c":{"x":[],"z"...
dtype: list<item: struct<key: extension<dbjson<JSONArrowType>>>>[pyarrow]
Example 3: Create a Series with a Nested JSON dtype using BigQuery SQLs
# Build a two-row table whose struct_col nests a JSON value alongside an
# INT64, directly in GoogleSQL.
# Fix: the second SELECT list ended with a stray trailing comma
# ("STRUCT(JSON_OBJECT('b', 100), 2),"), which is inconsistent with the
# first branch and rejected by engines without trailing-comma support.
sql = """
SELECT 0 AS id, STRUCT(JSON_OBJECT('a', True) AS data, 1 AS number) AS struct_col
UNION ALL
SELECT 1, STRUCT(JSON_OBJECT('b', 100), 2)
"""
df = bpd.read_gbq(sql)
df
| | id | struct_col |
|---|---|---|
| 0 | 1 | {'data': '{"b":100}', 'number': 2} |
| 1 | 0 | {'data': '{"a":true}', 'number': 1} |
2 rows × 2 columns
[2 rows x 2 columns in total]
id Int64 struct_col struct<data: extension<dbjson<JSONArrowType>>,... dtype: object
Operate on JSON data#
The bigframes.bigquery module (often abbreviated as bbq) provides access within BigQuery DataFrames to various BigQuery built-in functions. Examples relevant for JSON data include json_extract and parse_json.