# Copyright 2023 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
Note: This notebook describes a feature that is currently in preview. There may be breaking changes to the functionality when using “NULL index” objects.
Use the “NULL” index for more efficient query generation, but some pandas-compatible methods may not be possible without an index.
import bigframes.enums import bigframes.exceptions import bigframes.pandas as bpd df = bpd.read_gbq( "bigquery-public-data.baseball.schedules", index_col=bigframes.enums.DefaultIndexKind.NULL, )
Use peek() to view an arbitrary selection of rows from the DataFrame. This is much more efficient than head(), which requires a total ordering for determinism.
Query job 1b8726ce-c4ea-47fe-a47c-d6fae50d8fb0 is DONE. 582.8 kB processed. Open Job
| gameId | gameNumber | seasonId | year | type | dayNight | duration | duration_minutes | homeTeamId | homeTeamName | awayTeamId | awayTeamName | startTime | attendance | status | created | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | e14b6493-9e7f-404f-840a-8a680cc364bf | 1 | 565de4be-dc80-4849-a7e1-54bc79156cc8 | 2016 | REG | D | 3:07 | 187 | 03556285-bdbb-4576-a06d-42f71f46ddc5 | Marlins | 55714da8-fcaf-4574-8443-59bfb511a524 | Cubs | 2016-06-26 17:10:00+00:00 | 27318 | closed | 2016-10-06 06:25:15+00:00 |
| 1 | 1f32b347-cbcb-4c31-a145-0e685306d168 | 1 | 565de4be-dc80-4849-a7e1-54bc79156cc8 | 2016 | REG | D | 3:09 | 189 | 03556285-bdbb-4576-a06d-42f71f46ddc5 | Marlins | 55714da8-fcaf-4574-8443-59bfb511a524 | Cubs | 2016-06-25 20:10:00+00:00 | 29457 | closed | 2016-10-06 06:25:15+00:00 |
| 2 | 0c2292d1-7398-48be-bf8e-b41dad5e1a43 | 1 | 565de4be-dc80-4849-a7e1-54bc79156cc8 | 2016 | REG | D | 2:45 | 165 | 12079497-e414-450a-8bf2-29f91de646bf | Braves | 55714da8-fcaf-4574-8443-59bfb511a524 | Cubs | 2016-06-11 20:10:00+00:00 | 43114 | closed | 2016-10-06 06:25:15+00:00 |
| 3 | 8fbec734-a15a-42ab-8d51-60790de7750b | 1 | 565de4be-dc80-4849-a7e1-54bc79156cc8 | 2016 | REG | D | 3:42 | 222 | 12079497-e414-450a-8bf2-29f91de646bf | Braves | 55714da8-fcaf-4574-8443-59bfb511a524 | Cubs | 2016-06-12 17:35:00+00:00 | 31625 | closed | 2016-10-06 06:25:15+00:00 |
| 4 | 89e514d5-fbf5-4b9d-bdac-6ca45bfd18dd | 1 | 565de4be-dc80-4849-a7e1-54bc79156cc8 | 2016 | REG | D | 2:44 | 164 | 2142e1ba-3b40-445c-b8bb-f1f8b1054220 | Phillies | 55714da8-fcaf-4574-8443-59bfb511a524 | Cubs | 2016-06-08 17:05:00+00:00 | 28650 | closed | 2016-10-06 06:25:15+00:00 |
Inspect the properties of the DataFrame#
Some properties, such as dtypes, can be retrieved without executing a query job.
gameId string[pyarrow] gameNumber Int64 seasonId string[pyarrow] year Int64 type string[pyarrow] dayNight string[pyarrow] duration string[pyarrow] duration_minutes Int64 homeTeamId string[pyarrow] homeTeamName string[pyarrow] awayTeamId string[pyarrow] awayTeamName string[pyarrow] startTime timestamp[us, tz=UTC][pyarrow] attendance Int64 status string[pyarrow] created timestamp[us, tz=UTC][pyarrow] dtype: object
Other properties, such as shape require a query. In this case, shape runs a COUNT(1) query.
Query job 0f85f12c-227c-4001-b851-6e9b9087ab7e is DONE. 0 Bytes processed. Open Job
Select a subset of the DataFrame#
Filter columns by selecting a list of columns from the DataFrame.
Note: Even with index_col=bigframes.enums.DefaultIndexKind.NULL, it is more efficient to do this selection in read_gbq / read_gbq_table except in cases where the total ordering ID columns can be pruned.
column_filtered = df[["gameId", "year", "homeTeamName", "awayTeamName", "duration_minutes"]] column_filtered.peek()
Query job efa6b4be-cf60-4951-9125-7d77fb6b6b44 is DONE. 174.4 kB processed. Open Job
| gameId | year | homeTeamName | awayTeamName | duration_minutes | |
|---|---|---|---|---|---|
| 0 | e14b6493-9e7f-404f-840a-8a680cc364bf | 2016 | Marlins | Cubs | 187 |
| 1 | 1f32b347-cbcb-4c31-a145-0e685306d168 | 2016 | Marlins | Cubs | 189 |
| 2 | 0c2292d1-7398-48be-bf8e-b41dad5e1a43 | 2016 | Braves | Cubs | 165 |
| 3 | 8fbec734-a15a-42ab-8d51-60790de7750b | 2016 | Braves | Cubs | 222 |
| 4 | 89e514d5-fbf5-4b9d-bdac-6ca45bfd18dd | 2016 | Phillies | Cubs | 164 |
Filter by rows using a boolean Series. This Series must be derived from the DataFrame being filtered so that the NULL index can still align correctly.
night_games = df[df['dayNight'] == 'N'] night_games.peek()
Query job 0be8e44d-854a-45ca-950b-269280e3de41 is DONE. 582.8 kB processed. Open Job
| gameId | gameNumber | seasonId | year | type | dayNight | duration | duration_minutes | homeTeamId | homeTeamName | awayTeamId | awayTeamName | startTime | attendance | status | created | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 63f14670-c28e-432b-84ee-1a2c6ac29527 | 1 | 565de4be-dc80-4849-a7e1-54bc79156cc8 | 2016 | REG | N | 2:43 | 163 | 03556285-bdbb-4576-a06d-42f71f46ddc5 | Marlins | 55714da8-fcaf-4574-8443-59bfb511a524 | Cubs | 2016-06-23 23:10:00+00:00 | 25291 | closed | 2016-10-06 06:25:15+00:00 |
| 1 | bf4e80d1-3125-44fa-8a89-de93d039d465 | 1 | 565de4be-dc80-4849-a7e1-54bc79156cc8 | 2016 | REG | N | 3:24 | 204 | 03556285-bdbb-4576-a06d-42f71f46ddc5 | Marlins | 55714da8-fcaf-4574-8443-59bfb511a524 | Cubs | 2016-06-24 23:10:00+00:00 | 24385 | closed | 2016-10-06 06:25:15+00:00 |
| 2 | e8af534c-36ed-4ff9-8511-780825fdd041 | 1 | 565de4be-dc80-4849-a7e1-54bc79156cc8 | 2016 | REG | N | 2:51 | 171 | 12079497-e414-450a-8bf2-29f91de646bf | Braves | 55714da8-fcaf-4574-8443-59bfb511a524 | Cubs | 2016-06-10 23:35:00+00:00 | 30547 | closed | 2016-10-06 06:25:15+00:00 |
| 3 | e599c525-ac42-4b54-928d-7ee5fbe67dd9 | 1 | 565de4be-dc80-4849-a7e1-54bc79156cc8 | 2016 | REG | N | 2:45 | 165 | 2142e1ba-3b40-445c-b8bb-f1f8b1054220 | Phillies | 55714da8-fcaf-4574-8443-59bfb511a524 | Cubs | 2016-06-07 23:05:00+00:00 | 27381 | closed | 2016-10-06 06:25:15+00:00 |
| 4 | d80ffb65-57a4-42c9-ae1c-2c51d0650336 | 1 | 565de4be-dc80-4849-a7e1-54bc79156cc8 | 2016 | REG | N | 3:05 | 185 | 2142e1ba-3b40-445c-b8bb-f1f8b1054220 | Phillies | 55714da8-fcaf-4574-8443-59bfb511a524 | Cubs | 2016-06-06 23:05:00+00:00 | 22162 | closed | 2016-10-06 06:25:15+00:00 |
Join two DataFrames#
Even though pandas usually joins by the index, NULL index objects can still be manually joined by a column using the on parameter in merge.
df1 = df[["gameId", "homeTeamName"]] df1.peek()
Query job 5d2c69d2-33fe-4513-923b-fd64f4da098b is DONE. 113.9 kB processed. Open Job
| gameId | homeTeamName | |
|---|---|---|
| 0 | e14b6493-9e7f-404f-840a-8a680cc364bf | Marlins |
| 1 | 1f32b347-cbcb-4c31-a145-0e685306d168 | Marlins |
| 2 | 0c2292d1-7398-48be-bf8e-b41dad5e1a43 | Braves |
| 3 | 8fbec734-a15a-42ab-8d51-60790de7750b | Braves |
| 4 | 89e514d5-fbf5-4b9d-bdac-6ca45bfd18dd | Phillies |
df2 = df[["gameId", "awayTeamName"]].head(2) df2.peek()
merged = df1.merge(df2, on="gameId", how="inner") merged.peek()
Query job 0ac171dd-3859-4589-b7ff-59fd81ec3c3a is DONE. 582.9 kB processed. Open Job
Query job 034f8807-c128-444a-8033-0c95f34b0e32 is DONE. 111 Bytes processed. Open Job
| gameId | homeTeamName | awayTeamName | |
|---|---|---|---|
| 0 | af72a0b9-65f7-49fb-9b30-d505068bdf6d | Reds | Brewers |
| 1 | d60c6036-0ce1-4c90-8dd9-de3b403c92a8 | Nationals | Brewers |
merged = df1.merge(df2, on="gameId", how="outer") merged.peek()
Query job 30fd5a60-772c-4ef0-a151-5ab390ff4322 is DONE. 582.9 kB processed. Open Job
Query job 701fa9a8-1ec6-49b9-ac41-228cb34d4c8c is DONE. 114.0 kB processed. Open Job
| gameId | homeTeamName | awayTeamName | |
|---|---|---|---|
| 0 | 039bb40e-7613-4674-a653-584b93e9b21b | American League | <NA> |
| 1 | 78000e12-2ef3-4246-adc1-c8a4d157631c | Angels | <NA> |
| 2 | de5555dc-9228-4f7c-88ae-4451e3ffb980 | Angels | <NA> |
| 3 | f29a2754-004b-436c-91fe-3d86c0bb17a8 | Angels | <NA> |
| 4 | 8e5af008-8a07-4f9a-90cb-336ca4c84c71 | Angels | <NA> |
merged = df1.merge(df2, on="gameId", how="left") merged.peek()
Query job e3d8168c-48e9-4ba9-a916-10259ad9c0ea is DONE. 582.9 kB processed. Open Job
Query job 82d2a5e4-66a8-4478-92de-57d3f806aa76 is DONE. 114.0 kB processed. Open Job
| gameId | homeTeamName | awayTeamName | |
|---|---|---|---|
| 0 | 039bb40e-7613-4674-a653-584b93e9b21b | American League | <NA> |
| 1 | f6fcd83c-e130-487c-a0cc-d00b2712d08b | Angels | <NA> |
| 2 | fe401dd2-089c-4822-8657-4d510d460f38 | Angels | <NA> |
| 3 | c894bdee-5dda-49f4-87c8-53b9b9bfcd3b | Angels | <NA> |
| 4 | bbda59d9-fd52-4bed-bcfb-2ceed4be997c | Angels | <NA> |
merged = df1.merge(df2, on="gameId", how="right") merged.peek()
Query job 518ed511-606a-42b2-a28d-61a601eccfa7 is DONE. 582.9 kB processed. Open Job
Query job bc381640-74e0-4885-9c32-87805a49f357 is DONE. 111 Bytes processed. Open Job
| gameId | homeTeamName | awayTeamName | |
|---|---|---|---|
| 0 | af72a0b9-65f7-49fb-9b30-d505068bdf6d | Reds | Brewers |
| 1 | d60c6036-0ce1-4c90-8dd9-de3b403c92a8 | Nationals | Brewers |
Download the result as (in-memory) pandas DataFrame#
Use the ordered=False argument for more efficient query execution.
dfp = merged.to_pandas(ordered=False) dfp
Query job 2d4fbd55-ba6a-46d2-87ae-5da416ad3642 is DONE. 159 Bytes processed. Open Job
| gameId | homeTeamName | awayTeamName | |
|---|---|---|---|
| 0 | d60c6036-0ce1-4c90-8dd9-de3b403c92a8 | Nationals | Brewers |
| 1 | af72a0b9-65f7-49fb-9b30-d505068bdf6d | Reds | Brewers |