{ "cells": [ { "cell_type": "markdown", "id": "8064a52e", "metadata": {}, "source": [ "### Use embeddings\n", "\n", "In this tutorial, we'll use product group embeddings to compare a customer's latest invoice with their past purchases from the last 26 weeks.\n", "\n", "To learn how to create a SQL Embedding User-Defined Function (UDF), check out the ['Bring Your Own Transformer' tutorials](https://docs.featurebyte.com/latest/get_started/bring_your_own_transformer/overview/).\n", "\n", "For our hosted tutorials, we have pre-configured a SQL UDF using the SBERT Transformer model on our Snowflake data warehouse. We'll register this UDF in the Catalog and apply it to analyze the ProductGroup descriptions." ] }, { "cell_type": "code", "execution_count": 1, "id": "f517d2ac-9dca-47a7-80d9-121269e43bf6", "metadata": { "execution": { "iopub.execute_input": "2024-06-12T08:09:03.033820Z", "iopub.status.busy": "2024-06-12T08:09:03.033733Z", "iopub.status.idle": "2024-06-12T08:09:06.652491Z", "shell.execute_reply": "2024-06-12T08:09:06.652142Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32;20m16:09:06\u001b[0m | \u001b[1m\u001b[33;20mWARNING \u001b[0m\u001b[0m | \u001b[1m\u001b[33;20mService endpoint is inaccessible: http://featurebyte-server:8088\u001b[0m\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32;20m16:09:06\u001b[0m | \u001b[1m\u001b[38;20mINFO \u001b[0m\u001b[0m | \u001b[1m\u001b[38;20mUsing profile: tutorial\u001b[0m\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32;20m16:09:06\u001b[0m | \u001b[1m\u001b[38;20mINFO \u001b[0m\u001b[0m | \u001b[1m\u001b[38;20mUsing configuration file at: /Users/gxav/.featurebyte/config.yaml\u001b[0m\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32;20m16:09:06\u001b[0m | \u001b[1m\u001b[38;20mINFO \u001b[0m\u001b[0m | \u001b[1m\u001b[38;20mActive profile: tutorial (https://tutorials.featurebyte.com/api/v1)\u001b[0m\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32;20m16:09:06\u001b[0m | \u001b[1m\u001b[33;20mWARNING \u001b[0m\u001b[0m | \u001b[1m\u001b[33;20mRemote SDK version (1.1.0.dev7) is different from local (1.1.0.dev1). Update local SDK to avoid unexpected behavior.\u001b[0m\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32;20m16:09:06\u001b[0m | \u001b[1m\u001b[38;20mINFO \u001b[0m\u001b[0m | \u001b[1m\u001b[38;20mNo catalog activated.\u001b[0m\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32;20m16:09:06\u001b[0m | \u001b[1m\u001b[38;20mINFO \u001b[0m\u001b[0m | \u001b[1m\u001b[38;20mCatalog activated: Grocery Dataset Tutorial\u001b[0m\u001b[0m\n" ] } ], "source": [ "import featurebyte as fb\n", "\n", "# Set your profile to the tutorial environment\n", "fb.use_profile(\"tutorial\")\n", "\n", "catalog_name = \"Grocery Dataset Tutorial\"\n", "catalog = fb.Catalog.activate(catalog_name) " ] }, { "cell_type": "markdown", "id": "0a49a221-6061-4df7-8fc1-65fc07450ef5", "metadata": {}, "source": [ "#### Register the F_SBERT_EMBEDDING UDF to the Catalog" ] }, { "cell_type": "code", "execution_count": 2, "id": "b573c7ba-8321-4a4c-9c32-80591b9982b3", "metadata": { "execution": { "iopub.execute_input": "2024-06-12T08:09:06.654607Z", "iopub.status.busy": "2024-06-12T08:09:06.654443Z", "iopub.status.idle": "2024-06-12T08:09:48.769811Z", "shell.execute_reply": "2024-06-12T08:09:48.769535Z" } }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t
name | \n", "\t\t\tembedding | \n", "\t\t||||||||||
created_at | \n", "\t\t\t2024-06-12 08:09:06 | \n", "\t\t||||||||||
updated_at | \n", "\t\t\tNone | \n", "\t\t||||||||||
description | \n", "\t\t\tNone | \n", "\t\t||||||||||
sql_function_name | \n", "\t\t\tF_SBERT_EMBEDDING | \n", "\t\t||||||||||
function_parameters | \n", "\t\t\t\n",
"\t\t\t\t
| \n",
"\t\t||||||||||
signature | \n", "\t\t\tembedding(x: str) -> array | \n", "\t\t||||||||||
output_dtype | \n", "\t\t\tARRAY | \n", "\t\t||||||||||
feature_store_name | \n", "\t\t\tplayground | \n", "\t\t||||||||||
used_by_features | \n", "\t\t\t[] | \n", "\t\t
\n", " | POINT_IN_TIME | \n", "GROCERYINVOICEITEMGUID | \n", "CUSTOMER_Mean_vector_of_item_product_ProductGroup_embedding_26w | \n", "
---|---|---|---|
0 | \n", "2023-04-11 17:23:57 | \n", "6084f39f-9d2c-4111-b1cc-502e1559c0c0 | \n", "[-0.049583778360973, 0.040724096369544, -0.019... | \n", "
1 | \n", "2023-02-07 11:04:26 | \n", "fd1caae1-77e6-4667-8c83-df13f05bf2f5 | \n", "[-0.047664110747538006, 0.040520216686427, -0.... | \n", "
2 | \n", "2023-03-17 11:15:09 | \n", "1b627a25-7eb4-4f61-b243-c93db487bff0 | \n", "[-0.05412369494863901, 0.028804376621762003, -... | \n", "
3 | \n", "2022-09-18 18:52:36 | \n", "ac7edfb5-63ed-49fb-9b89-76b0288ed2f8 | \n", "[-0.051115894625539005, 0.033894273326497006, ... | \n", "
4 | \n", "2023-05-28 19:27:14 | \n", "15973b2f-2256-4caa-b65b-cbbfdff0905b | \n", "[-0.046703376632649, 0.035018777767219005, -0.... | \n", "
5 | \n", "2022-12-26 15:01:07 | \n", "264f79fd-c24a-47cc-8a68-fe3753a4d74b | \n", "[-0.057210504562501006, 0.026236914188101004, ... | \n", "
6 | \n", "2023-03-31 18:50:00 | \n", "213ef7d3-c27b-43e0-bc0a-57d6c7c254b0 | \n", "[-0.049429968671181004, 0.033228590529173, -0.... | \n", "
7 | \n", "2022-12-10 21:08:26 | \n", "77d02174-f1e1-41c1-9fb9-01c6246b0009 | \n", "[-0.051941385508354004, 0.032673704067397, -0.... | \n", "
8 | \n", "2022-08-17 19:13:52 | \n", "40a07ca4-a991-4d21-b5cf-74ee61220f96 | \n", "[-0.0460019625995, 0.045009310261646, -0.02764... | \n", "
9 | \n", "2023-05-05 08:00:42 | \n", "57ca0770-eb8b-4769-8e67-eb1b7cc0a934 | \n", "[-0.05695966097050201, 0.03538753235396, -0.01... | \n", "
\n", " | POINT_IN_TIME | \n", "GROCERYINVOICEITEMGUID | \n", "CUSTOMER_Mean_vector_of_item_product_ProductGroup_embedding_26w_vs_latest_invoice | \n", "
---|---|---|---|
0 | \n", "2023-02-07 11:04:26 | \n", "fd1caae1-77e6-4667-8c83-df13f05bf2f5 | \n", "0.901249 | \n", "
1 | \n", "2023-05-28 19:27:14 | \n", "15973b2f-2256-4caa-b65b-cbbfdff0905b | \n", "0.577883 | \n", "
2 | \n", "2022-09-18 18:52:36 | \n", "ac7edfb5-63ed-49fb-9b89-76b0288ed2f8 | \n", "0.955820 | \n", "
3 | \n", "2023-03-31 18:50:00 | \n", "213ef7d3-c27b-43e0-bc0a-57d6c7c254b0 | \n", "0.943237 | \n", "
4 | \n", "2022-12-26 15:01:07 | \n", "264f79fd-c24a-47cc-8a68-fe3753a4d74b | \n", "0.927726 | \n", "
5 | \n", "2023-04-11 17:23:57 | \n", "6084f39f-9d2c-4111-b1cc-502e1559c0c0 | \n", "0.802008 | \n", "
6 | \n", "2022-08-17 19:13:52 | \n", "40a07ca4-a991-4d21-b5cf-74ee61220f96 | \n", "0.892769 | \n", "
7 | \n", "2022-12-10 21:08:26 | \n", "77d02174-f1e1-41c1-9fb9-01c6246b0009 | \n", "0.912434 | \n", "
8 | \n", "2023-03-17 11:15:09 | \n", "1b627a25-7eb4-4f61-b243-c93db487bff0 | \n", "0.758240 | \n", "
9 | \n", "2023-05-05 08:00:42 | \n", "57ca0770-eb8b-4769-8e67-eb1b7cc0a934 | \n", "0.843593 | \n", "
# Generated by SDK version: 1.1.0.dev7\n",
"from bson import ObjectId\n",
"from featurebyte import DimensionTable\n",
"from featurebyte import EventTable\n",
"from featurebyte import FeatureJobSetting\n",
"from featurebyte import ItemTable\n",
"from featurebyte import UserDefinedFunction\n",
"\n",
"\n",
"# dimension_table name: "GROCERYPRODUCT"\n",
"dimension_table = DimensionTable.get_by_id(ObjectId("666956c78080c62d0dc616e2"))\n",
"dimension_view = dimension_table.get_view(\n",
" view_mode="manual", drop_column_names=[], column_cleaning_operations=[]\n",
")\n",
"col = dimension_view["ProductGroup"]\n",
"\n",
"# udf_name: embedding, sql_function_name: F_SBERT_EMBEDDING\n",
"udf_embedding = UserDefinedFunction.get_by_id(\n",
" ObjectId("666957a23fab5208644858ad")\n",
")\n",
"col_1 = udf_embedding(col)\n",
"view = dimension_view.copy()\n",
"view["ProductGroup_embedding"] = col_1\n",
"\n",
"# item_table name: "INVOICEITEMS", event_table name: "GROCERYINVOICE"\n",
"item_table = ItemTable.get_by_id(ObjectId("666956c58080c62d0dc616e1"))\n",
"item_view = item_table.get_view(\n",
" event_suffix=None,\n",
" view_mode="manual",\n",
" drop_column_names=["record_available_at"],\n",
" column_cleaning_operations=[],\n",
" event_drop_column_names=["record_available_at"],\n",
" event_column_cleaning_operations=[],\n",
" event_join_column_names=[\n",
" "Timestamp",\n",
" "GroceryInvoiceGuid",\n",
" "GroceryCustomerGuid",\n",
" "tz_offset",\n",
" ],\n",
")\n",
"joined_view = item_view.join(\n",
" view, on="GroceryProductGuid", how="left", rsuffix="", rprefix="product_"\n",
")\n",
"grouped = joined_view.groupby(\n",
" by_keys=["GroceryCustomerGuid"], category=None\n",
").aggregate_over(\n",
" value_column="product_ProductGroup_embedding",\n",
" method="avg",\n",
" windows=["26w"],\n",
" feature_names=[\n",
" "CUSTOMER_Mean_vector_of_item_product_ProductGroup_embedding_26w"\n",
" ],\n",
" feature_job_setting=FeatureJobSetting(\n",
" blind_spot="120s", period="3600s", offset="120s"\n",
" ),\n",
" skip_fill_na=True,\n",
" offset=None,\n",
")\n",
"feat = grouped[\n",
" "CUSTOMER_Mean_vector_of_item_product_ProductGroup_embedding_26w"\n",
"]\n",
"feat_1 = joined_view.groupby(\n",
" by_keys=["GroceryInvoiceGuid"], category=None\n",
").aggregate(\n",
" value_column="product_ProductGroup_embedding",\n",
" method="avg",\n",
" feature_name="INVOICE_Mean_vector_of_item_product_ProductGroup_embedding",\n",
" skip_fill_na=True,\n",
")\n",
"\n",
"# event_table name: "GROCERYINVOICE"\n",
"event_table = EventTable.get_by_id(ObjectId("666956c38080c62d0dc616e0"))\n",
"event_view = event_table.get_view(\n",
" view_mode="manual",\n",
" drop_column_names=["record_available_at"],\n",
" column_cleaning_operations=[],\n",
")\n",
"joined_view_1 = event_view.add_feature(\n",
" new_column_name="INVOICE_Mean_vector_of_item_product_ProductGroup_embedding",\n",
" feature=feat_1,\n",
" entity_column="GroceryInvoiceGuid",\n",
")\n",
"grouped_1 = joined_view_1.groupby(\n",
" by_keys=["GroceryCustomerGuid"], category=None\n",
").aggregate_over(\n",
" value_column="INVOICE_Mean_vector_of_item_product_ProductGroup_embedding",\n",
" method="latest",\n",
" windows=[None],\n",
" feature_names=[\n",
" "CUSTOMER_Latest_INVOICE_Mean_vector_of_item_product_ProductGroup_embedding"\n",
" ],\n",
" feature_job_setting=FeatureJobSetting(\n",
" blind_spot="120s", period="3600s", offset="120s"\n",
" ),\n",
" skip_fill_na=True,\n",
" offset=None,\n",
")\n",
"feat_2 = grouped_1[\n",
" "CUSTOMER_Latest_INVOICE_Mean_vector_of_item_product_ProductGroup_embedding"\n",
"]\n",
"feat_3 = feat_2.vec.cosine_similarity(other=feat)\n",
"feat_3.name = "CUSTOMER_Mean_vector_of_item_product_ProductGroup_embedding_26w_vs_latest_invoice"\n",
"output = feat_3\n",
"output.save(_id=ObjectId("666957cd3fab5208644858b2"))\n",
"