# Copyright 2023 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
import bigframes.pandas as bpd
# Read the public baseball schedules table and keep only the team-name and
# duration columns.
df = bpd.read_gbq("bigquery-public-data.baseball.schedules")[
    ["homeTeamName", "awayTeamName", "duration_minutes"]
]
df.peek()
Query job 1f6094e9-1942-477c-9ce3-87a614d71294 is DONE. 0 Bytes processed. Open Job
Query job ba19f29c-33d3-4f12-9605-ddeafb74918e is DONE. 582.8 kB processed. Open Job
Query job dd1ff8be-700a-4ce5-91a0-31413f70cfad is DONE. 82.0 kB processed. Open Job
| homeTeamName | awayTeamName | duration_minutes | |
|---|---|---|---|
| 88 | Royals | Athletics | 176 |
| 106 | Dodgers | Giants | 216 |
| 166 | Phillies | Royals | 162 |
| 247 | Rangers | Royals | 161 |
| 374 | Athletics | Astros | 161 |
Notes#
The API reference documentation for `remote_function` can be found at https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.session.Session#bigframes_session_Session_remote_function

More code samples for `remote_function` can be found in the BigQuery DataFrames API reference documentation, e.g.
https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.series.Series#bigframes_series_Series_apply
https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.dataframe.DataFrame#bigframes_dataframe_DataFrame_map
https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.dataframe.DataFrame#bigframes_dataframe_DataFrame_apply
The following examples are only for the purpose of demonstrating `remote_function` usage. They are not necessarily the best way to achieve the end result.

In the examples in this notebook we are using `reuse=False` just as a caution, to avoid concurrent runs of this notebook in the same Google Cloud project stepping over each other’s remote function deployment. It may not be necessary in a simple use case.
Self-contained function#
Let’s consider a scenario where we want to categorize the matches as short,
medium or long duration based on the duration_minutes column.
@bpd.remote_function(reuse=False, cloud_function_service_account="default")
def duration_category(duration_minutes: int) -> str:
    """Classify a match duration as "short", "medium" or "long".

    Durations under 90 minutes are "short", durations from 90 up to (but not
    including) 180 minutes are "medium", and anything else is "long".
    """
    # Guard clauses, checked from the longest bucket down.
    if duration_minutes >= 180:
        return "long"
    if duration_minutes >= 90:
        return "medium"
    return "short"


# Report the cloud function and BigQuery remote function names exposed on the
# decorated function.
print(f"Created cloud function '{duration_category.bigframes_cloud_function}' and BQ remote function '{duration_category.bigframes_remote_function}'.")
/usr/local/google/home/arwas/src1/python-bigquery-dataframes/bigframes/functions/_function_session.py:335: UserWarning: You have not explicitly set a user-managed cloud_function_service_account. Using the default compute service account, {cloud_function_service_account}. To use Bigframes 2.0, please set an explicit user-managed cloud_function_service_account or set cloud_function_service_account explicitly to `default`.See, https://cloud.google.com/functions/docs/securing/function-identity.
warnings.warn(msg, category=UserWarning)
Query job 7c021760-59c4-4f3a-846c-9693a4d16eef is DONE. 0 Bytes processed. Open Job
Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-sessionca6012-ca541a90249f8b62951f38b7aba6a711-49to' and BQ remote function 'bigframes-dev._ed1e4d0f7d41174ba506d34d15dccf040d13f69e.bigframes_sessionca6012_ca541a90249f8b62951f38b7aba6a711_49to'.
# Apply the remote function to every duration and attach the result as a new
# `duration_cat` column.
df1 = df.assign(
    duration_cat=df["duration_minutes"].apply(duration_category),
)
df1.peek()
Query job 4b116e3e-d4d3-4eb6-9764-0a29a7c5d036 is DONE. 58.3 kB processed. Open Job
Query job d62ac4f0-47c9-47ae-8611-c9ecf78f20c9 is DONE. 157.2 kB processed. Open Job
Query job 5f876ebb-2d95-4c68-9d84-947e02b37bad is DONE. 98.8 kB processed. Open Job
| homeTeamName | awayTeamName | duration_minutes | duration_cat | |
|---|---|---|---|---|
| 1911 | Dodgers | Angels | 132 | medium |
| 2365 | Athletics | Angels | 134 | medium |
| 1977 | Athletics | Angels | 139 | medium |
| 554 | Cubs | Angels | 142 | medium |
| 654 | Astros | Angels | 143 | medium |
Function referring to variables outside the function body#
Let’s consider a slight variation of the earlier example where the labels for
the short, medium and long duration matches are defined outside the function
body. They would be captured at the time of remote_function deployment and
any change in their values in the notebook after the deployment will not
automatically propagate to the remote_function.
# Category labels live at module level, outside the function body that uses
# them.
DURATION_CATEGORY_SHORT = "S"
DURATION_CATEGORY_MEDIUM = "M"
DURATION_CATEGORY_LONG = "L"


@bpd.remote_function(reuse=False, cloud_function_service_account="default")
def duration_category(duration_minutes: int) -> str:
    """Map a match duration to one of the module-level category labels.

    Durations under 90 minutes map to DURATION_CATEGORY_SHORT, durations from
    90 up to (but not including) 180 minutes map to DURATION_CATEGORY_MEDIUM,
    and anything else maps to DURATION_CATEGORY_LONG.
    """
    # Guard clauses, checked from the longest bucket down.
    if duration_minutes >= 180:
        return DURATION_CATEGORY_LONG
    if duration_minutes >= 90:
        return DURATION_CATEGORY_MEDIUM
    return DURATION_CATEGORY_SHORT


# Report the cloud function and BigQuery remote function names exposed on the
# decorated function.
print(f"Created cloud function '{duration_category.bigframes_cloud_function}' and BQ remote function '{duration_category.bigframes_remote_function}'.")
Query job 1909a652-5735-401b-8a77-674d8539ded0 is DONE. 0 Bytes processed. Open Job
Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-session54c8b0-4191f0fce98d46cc09359de47e203236-e009' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_session54c8b0_4191f0fce98d46cc09359de47e203236_e009'.
# Re-run the categorization with the label-based remote function and preview
# the resulting `duration_cat` column.
df1 = df.assign(
    duration_cat=df["duration_minutes"].apply(duration_category),
)
df1.peek()
Query job a942bdc5-6a6d-4db8-b2aa-a556197377b3 is DONE. 58.3 kB processed. Open Job
Query job 175ae9d3-604f-495b-a167-8b06c0283bd2 is DONE. 147.7 kB processed. Open Job
Query job d331a785-e574-45c9-86c8-d29ddd79a4d1 is DONE. 89.3 kB processed. Open Job
| homeTeamName | awayTeamName | duration_minutes | duration_cat | |
|---|---|---|---|---|
| 1911 | Dodgers | Angels | 132 | M |
| 2365 | Athletics | Angels | 134 | M |
| 1977 | Athletics | Angels | 139 | M |
| 554 | Cubs | Angels | 142 | M |
| 654 | Astros | Angels | 143 | M |
Function referring to imports (built-in) outside the function body#
Let’s consider a scenario in which we want to categorize the matches in terms of
hour buckets. E.g. a match finishing in 0-60 minutes would be in 1h category,
61-120 minutes in 2h category and so on. The function itself makes use of the
math module (a built-in module in a standard python installation) which
happens to be imported outside the function body, let’s say in one of the
previous cells. For the demo purpose we have aliased the import to mymath, but
it is not necessary.
Later in the notebook we will see another example with a third-party module.
# The standard-library `math` module is imported outside the function body on
# purpose, under the alias `mymath`; the alias is only for demonstration.
import math as mymath


@bpd.remote_function(reuse=False, cloud_function_service_account="default")
def duration_category(duration_minutes: int) -> str:
    """Return the hour bucket for a match duration, e.g. "3h" for 132 minutes.

    The duration is divided by 60 and rounded up to a whole number of hours,
    so 1-60 minutes gives "1h", 61-120 minutes gives "2h", and so on.
    """
    duration_hours = mymath.ceil(duration_minutes / 60)
    return f"{duration_hours}h"


# Report the cloud function and BigQuery remote function names exposed on the
# decorated function.
print(f"Created cloud function '{duration_category.bigframes_cloud_function}' and BQ remote function '{duration_category.bigframes_remote_function}'.")
Query job bbc0b78f-bc04-4bd5-b711-399786a51519 is DONE. 0 Bytes processed. Open Job
Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-session54c8b0-cf31fc2d2c7fe111afa5526f5a9cdf06-gmmo' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_session54c8b0_cf31fc2d2c7fe111afa5526f5a9cdf06_gmmo'.
# Bucket every duration into whole hours with the remote function and preview
# the resulting `duration_cat` column.
df1 = df.assign(
    duration_cat=df["duration_minutes"].apply(duration_category),
)
df1.peek()
Query job 991b54ed-9eaa-450f-9208-3e73404bb112 is DONE. 58.3 kB processed. Open Job
Query job 4e464a58-ac5b-42fd-91e3-92c115bdd273 is DONE. 150.1 kB processed. Open Job
Query job d340f55d-1511-431a-970d-a70ed4356935 is DONE. 91.7 kB processed. Open Job
| homeTeamName | awayTeamName | duration_minutes | duration_cat | |
|---|---|---|---|---|
| 1911 | Dodgers | Angels | 132 | 3h |
| 2365 | Athletics | Angels | 134 | 3h |
| 1977 | Athletics | Angels | 139 | 3h |
| 554 | Cubs | Angels | 142 | 3h |
| 654 | Astros | Angels | 143 | 3h |
Function referring to another function outside the function body#
In this example let’s create a remote_function from a function
duration_category which depends upon another function get_hour_ceiling,
which further depends on another function get_minutes_in_hour. This dependency
chain could be even longer in a real world example. The behaviors of the
dependencies would be captured at the time of the remote function
deployment.
Please note that any changes in those functions in the notebook after the deployment would not automatically propagate to the remote function.
import math


def get_minutes_in_hour():
    """Return the number of minutes in one hour."""
    return 60


def get_hour_ceiling(minutes):
    """Return `minutes` converted to hours, rounded up to a whole number."""
    minutes_per_hour = get_minutes_in_hour()
    return math.ceil(minutes / minutes_per_hour)
@bpd.remote_function(reuse=False, cloud_function_service_account="default")
def duration_category(duration_minutes: int) -> str:
    """Return the match duration as a rounded-up hour label, e.g. "3 hrs".

    The rounding is delegated to `get_hour_ceiling`, which is defined outside
    this function body.
    """
    hours = get_hour_ceiling(duration_minutes)
    return f"{hours} hrs"


# Report the cloud function and BigQuery remote function names exposed on the
# decorated function.
print(f"Created cloud function '{duration_category.bigframes_cloud_function}' and BQ remote function '{duration_category.bigframes_remote_function}'.")
Query job 10d1afa3-349b-49a8-adbd-79a8309ce77c is DONE. 0 Bytes processed. Open Job
Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-session54c8b0-3c03836c2044bf625d02e25ccdbfe101-k1m4' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_session54c8b0_3c03836c2044bf625d02e25ccdbfe101_k1m4'.
# Apply the helper-backed remote function to every duration and preview the
# resulting `duration_cat` column.
df1 = df.assign(
    duration_cat=df["duration_minutes"].apply(duration_category),
)
df1.peek()
Query job 33aff336-48d6-4caa-8cae-f459d21b180e is DONE. 58.3 kB processed. Open Job
Query job 561e0aa7-3962-4ef3-b308-a117a0ac3a7d is DONE. 157.4 kB processed. Open Job
Query job 759dccf8-3d88-40e1-a38a-2a2064e1d269 is DONE. 99.0 kB processed. Open Job
| homeTeamName | awayTeamName | duration_minutes | duration_cat | |
|---|---|---|---|---|
| 1911 | Dodgers | Angels | 132 | 3 hrs |
| 2365 | Athletics | Angels | 134 | 3 hrs |
| 1977 | Athletics | Angels | 139 | 3 hrs |
| 554 | Cubs | Angels | 142 | 3 hrs |
| 654 | Astros | Angels | 143 | 3 hrs |
Function requiring external packages#
In this example let’s say we want to redact the homeTeamName values, and we
choose to use a third party library cryptography. Any third party dependencies
can be specified in pip format
(with or without version number) as a list via the packages parameter.
@bpd.remote_function(reuse=False, packages=["cryptography"], cloud_function_service_account="default")
def get_hash(input: str) -> str:
    """Return a Fernet-encrypted, text-encoded token for `input`.

    A missing value (None) is treated as the empty string.
    """
    from cryptography.fernet import Fernet

    # Treat a missing value as empty text so there is always something to
    # encode.
    plaintext = "" if input is None else input

    # A fresh key is generated on every call; this function neither stores nor
    # returns it.
    cipher = Fernet(Fernet.generate_key())
    return cipher.encrypt(plaintext.encode()).decode()
Query job e2a44878-2564-44a5-8dec-b7ea2f42afd4 is DONE. 0 Bytes processed. Open Job
# Redact the home team names with the remote function and attach the result as
# a new `homeTeamNameRedacted` column.
df1 = df.assign(
    homeTeamNameRedacted=df["homeTeamName"].apply(get_hash),
)
df1.peek()
Query job bcfab000-ca19-4633-bf0e-45e7d053f3eb is DONE. 60.5 kB processed. Open Job
Query job 139a6449-c07e-41ff-9aed-c6fdd633740a is DONE. 388.3 kB processed. Open Job
Query job 035fa2fb-0a55-4358-bb50-3ef915f5bf54 is DONE. 330.0 kB processed. Open Job
| homeTeamName | awayTeamName | duration_minutes | homeTeamNameRedacted | |
|---|---|---|---|---|
| 641 | American League | National League | 185 | gAAAAABmo0n2I391cbYwIYeg8lyJq1MSFZatrtpvuUD5v-... |
| 349 | Angels | Astros | 187 | gAAAAABmo0n2pX-siRwl2tIZA4m--swndC_b7vgGXrqSNM... |
| 2349 | Angels | Astros | 160 | gAAAAABmo0n28Q9RwH62HvYRhTDpQ9lo8c6G8F5bnn7wgF... |
| 557 | Angels | Astros | 166 | gAAAAABmo0n2YlwHlSGQ0_XvXd-QVBtB_Lq2zUifu7vKhg... |
| 220 | Angels | Astros | 162 | gAAAAABmo0n2l8HMSGKYizxfEmRvGQy96mrjwx734-Rl_Z... |
Function referring to imports (third-party) outside the function body#
In this scenario the function depends on a third party library and the module
from the third party library used in the function is imported outside the
function body in a previous cell. Below is such an example where the third-party
dependency is humanize and its module of the same name is imported outside the
function body.
import datetime as dt import humanize
@bpd.remote_function(reuse=False, packages=["humanize"], cloud_function_service_account="default")
def duration_category(duration_minutes: int) -> str:
    """Return a human-readable description of the match duration.

    The minutes are wrapped in a `datetime.timedelta` and formatted with
    `humanize.naturaldelta`; both modules are imported outside this function
    body.
    """
    return humanize.naturaldelta(dt.timedelta(minutes=duration_minutes))


# Report the cloud function and BigQuery remote function names exposed on the
# decorated function.
print(f"Created cloud function '{duration_category.bigframes_cloud_function}' and BQ remote function '{duration_category.bigframes_remote_function}'.")
Query job af73ab2d-8d88-4cbe-863f-d35e48af84e1 is DONE. 0 Bytes processed. Open Job
Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-session54c8b0-a5e21a4ad488ce8b90de19c3c8cd33b6-0ab2' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_session54c8b0_a5e21a4ad488ce8b90de19c3c8cd33b6_0ab2'.
# Describe every duration in natural language with the remote function and
# preview the resulting `duration_cat` column.
df1 = df.assign(
    duration_cat=df["duration_minutes"].apply(duration_category),
)
df1.peek()
Query job 0a9ac329-619d-4303-8dbd-176a576d4ce8 is DONE. 58.3 kB processed. Open Job
Query job 456bb9b4-0576-4c04-b707-4a04496aa538 is DONE. 162.2 kB processed. Open Job
Query job 37f59939-5d2c-4fb1-839b-282ae3702d3d is DONE. 103.9 kB processed. Open Job
| homeTeamName | awayTeamName | duration_minutes | duration_cat | |
|---|---|---|---|---|
| 1911 | Dodgers | Angels | 132 | 2 hours |
| 2365 | Athletics | Angels | 134 | 2 hours |
| 1977 | Athletics | Angels | 139 | 2 hours |
| 554 | Cubs | Angels | 142 | 2 hours |
| 654 | Astros | Angels | 143 | 2 hours |