diff --git a/docs/user_guide/index.rst b/docs/user_guide/index.rst index af09616e05..a9695cf8c7 100644 --- a/docs/user_guide/index.rst +++ b/docs/user_guide/index.rst @@ -44,6 +44,7 @@ User Guide :maxdepth: 1 AI Functions <../notebooks/generative_ai/ai_functions.ipynb> + AI Functions for Poster Analysis <../notebooks/generative_ai/ai_movie_poster.ipynb> AI Forecast <../notebooks/generative_ai/bq_dataframes_ai_forecast.ipynb> LLM Code Generation <../notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb> LLM KMeans <../notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb> diff --git a/notebooks/generative_ai/ai_movie_poster.ipynb b/notebooks/generative_ai/ai_movie_poster.ipynb new file mode 100644 index 0000000000..b25e2b556e --- /dev/null +++ b/notebooks/generative_ai/ai_movie_poster.ipynb @@ -0,0 +1,732 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "XZpKUoHjXw3_" + }, + "outputs": [], + "source": [ + "# Copyright 2026 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SEKzWP6jW9Oj" + }, + "source": [ + "# Analyzing movie posters with BigQuery Dataframe AI functions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Colab Run in Colab\n", + " \n", + " \n", + " \n", + " \"GitHub\n", + " View on GitHub\n", + " \n", + " \n", + " \n", + " \"BQ\n", + " Open in BQ Studio\n", + " \n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "c9CCKXG5XTb-" + }, + "source": [ + "BigQuery Dataframe provides a Pythonic way to use AI functions directly with your dataframes. In this notebook, you will use these functions to analyze old\n", + "movie posters. These posters are images stored in a public Google Cloud Storage bucket: `gs://cloud-samples-data/vertex-ai/dataset-management/datasets/classic-movie-posters`" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CUJDa_7MPbL9" + }, + "source": [ + "## Set up" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "D3iYtBSkYpCK" + }, + "source": [ + "Before you begin, you need to\n", + "\n", + "* Set up your permissions for generative AI functions with [these instructions](https://docs.cloud.google.com/bigquery/docs/permissions-for-ai-functions)\n", + "* Set up your Cloud Resource connection by following [these instructions](https://docs.cloud.google.com/bigquery/docs/create-cloud-resource-connection)\n", + "\n", + "Once you have the permissions set up, import the `bigframes.pandas` package, and\n", + "set your cloud project ID." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6nqoRHYbPAx3" + }, + "outputs": [], + "source": [ + "import bigframes.pandas as bpd\n", + "\n", + "MY_RPOJECT_ID = \"bigframes-dev\" # @param {type:\"string\"}\n", + "\n", + "bpd.options.bigquery.project = MY_RPOJECT_ID" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2XHcNHtvPhNW" + }, + "source": [ + "## Load data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eS-9A7DijfoQ" + }, + "source": [ + "First, you load the data from the GCS bucket to a BigQuery Dataframe with the `from_glob_path` method:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "ZNPzFjCyPap0", + "outputId": "346d20b2-d615-4094-d24e-2d40e5c90ee2" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.12/dist-packages/bigframes/core/global_session.py:113: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + " _global_session = bigframes.session.connect(\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " Query processed 0 Bytes in a moment of slot time. [Job bigframes-dev:US.48a27954-7a4a-4b9e-8176-ea227fd188ad details]\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.12/dist-packages/bigframes/dtypes.py:1010: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/lib/python3.12/dist-packages/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. Use bigframes.bigquery.obj functions instead.\n", + " return prop(*args, **kwargs)\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " Query processed 1.3 kB in a minute of slot time. [Job bigframes-dev:US.09c48ecb-e041-4c18-a390-ca5a36fd07c3 details]\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " Query processed 1.2 kB in a moment of slot time.\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
poster
0
\n", + "

1 rows × 1 columns

\n", + "
[1 rows x 1 columns in total]" + ], + "text/plain": [ + " poster\n", + "0 {\"access_urls\":{\"expiry_time\":\"2026-03-27T02:0...\n", + "\n", + "[1 rows x 1 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Replace with your own connection name.\n", + "MY_CONNECTION = 'bigframes-default-connection' # @param {type:\"string\"}\n", + "\n", + "movies = bpd.from_glob_path(\n", + " \"gs://cloud-samples-data/vertex-ai/dataset-management/datasets/classic-movie-posters/*\",\n", + " connection = MY_CONNECTION,\n", + " name='poster')\n", + "movies.head(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EfkdDH08QnYw" + }, + "source": [ + "## Extract titles from posters" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "6CoZZ5tSQm1r", + "outputId": "1b3915ce-eb83-4be9-b1c1-d9a326dc9408" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.12/dist-packages/bigframes/dtypes.py:1010: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/lib/python3.12/dist-packages/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. Use bigframes.bigquery.obj functions instead.\n", + " return prop(*args, **kwargs)\n", + "/usr/local/lib/python3.12/dist-packages/bigframes/dtypes.py:1010: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/lib/python3.12/dist-packages/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. Use bigframes.bigquery.obj functions instead.\n", + " return prop(*args, **kwargs)\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " Query processed 1.3 kB in 2 minutes of slot time. [Job bigframes-dev:US.4a08a15f-5a2f-463b-bba8-734858ec992b details]\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " Query processed 1.2 kB in a moment of slot time.\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
postertitle
0Der Student von Prag
\n", + "

1 rows × 2 columns

\n", + "
[1 rows x 2 columns in total]" + ], + "text/plain": [ + " poster title\n", + "0 {\"access_urls\":{\"expiry_time\":\"2026-03-27T02:0... Der Student von Prag\n", + "\n", + "[1 rows x 2 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import bigframes.bigquery as bbq\n", + "\n", + "movies['title'] = bbq.ai.generate(\n", + " (\"What is the movie title for this poster? Name only\", movies['poster']),\n", + " endpoint='gemini-2.5-pro'\n", + ").struct.field(\"result\")\n", + "movies.head(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cFQHQ9S2lr6t" + }, + "source": [ + "Notice that `ai.generate()` has a `struct` return type, which holds not only the LLM response, but also the status. If you do not provide a field name for your answer, `\"result\"` will be the default name. You can access LLM response content with the struct accessor (e.g. `my_response.struct.filed(\"result\")`);." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R8kkUhgoS5Xz" + }, + "source": [ + "## Get movie release year\n", + "\n", + "In the example below, you will use `ai.generate_int()` to find the release year for each movie poster:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 976 + }, + "id": "cKZdHq0XS1iW", + "outputId": "72cbad57-4518-4e1e-97bb-333d424dba73" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.12/dist-packages/bigframes/dtypes.py:1010: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/lib/python3.12/dist-packages/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. Use bigframes.bigquery.obj functions instead.\n", + " return prop(*args, **kwargs)\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " Query processed 1.3 kB in 4 minutes of slot time. [Job bigframes-dev:US.b60a151a-6cbc-405e-9c40-8a7461981a00 details]\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " Query processed 1.3 kB in a moment of slot time.\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
postertitleyear
0Der Student von Prag1913
\n", + "

1 rows × 3 columns

\n", + "
[1 rows x 3 columns in total]" + ], + "text/plain": [ + " poster title \\\n", + "0 {\"access_urls\":{\"expiry_time\":\"2026-03-27T02:0... Der Student von Prag \n", + "\n", + " year \n", + "0 1913 \n", + "\n", + "[1 rows x 3 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "movies['year'] = bbq.ai.generate_int(\n", + " (\"What is the release year for this movie?\", movies['title']),\n", + " endpoint='gemini-2.5-pro'\n", + ").struct.field(\"result\")\n", + "\n", + "movies.head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 250 + }, + "id": "yqRiNRY8_8fs", + "outputId": "efa60107-6883-4f5c-8e40-43c7287ea7fb" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.12/dist-packages/bigframes/dtypes.py:1010: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0
posterstruct<uri: string, version: string, authorize...
titlestring[pyarrow]
yearInt64
\n", + "

" + ], + "text/plain": [ + "poster structJob bigframes-dev:US.c9bb23f0-5ceb-4d6c-8241-960c496274ae details]\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " Query processed 1.2 kB in a moment of slot time.\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
postertitleyear
8Shoulder Arms1918
\n", + "

1 rows × 3 columns

\n", + "
[1 rows x 3 columns in total]" + ], + "text/plain": [ + " poster title year\n", + "8 {\"access_urls\":{\"expiry_time\":\"2026-03-27T02:0... Shoulder Arms 1918\n", + "\n", + "[1 rows x 3 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "us_movies = movies[bbq.ai.if_(\n", + " (\"The movie \", movies['title'], \" was made in US\")\n", + ")]\n", + "us_movies.head(1)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}