12. Use Embeddings
Use embeddings¶
In this tutorial, we'll use product group embeddings to compare a customer's latest invoice with their past purchases from the last 26 weeks.
To learn how to create a SQL Embedding User-Defined Function (UDF), check out the 'Bring Your Own Transformer' tutorials.
For our hosted tutorials, we have pre-configured a SQL UDF using the SBERT Transformer model on our Snowflake data warehouse. We'll register this UDF in the Catalog and apply it to analyze the ProductGroup descriptions.
In [1]:
Copied!
import featurebyte as fb

# Select the SDK profile named "tutorial" before talking to the service.
fb.use_profile("tutorial")

# Activate the catalog used throughout this notebook and keep a handle on it.
catalog_name = "Grocery Dataset Tutorial"
catalog = fb.Catalog.activate(catalog_name)
import featurebyte as fb

# Select the SDK profile named "tutorial" before talking to the service.
fb.use_profile("tutorial")

# Activate the catalog used throughout this notebook and keep a handle on it.
catalog_name = "Grocery Dataset Tutorial"
catalog = fb.Catalog.activate(catalog_name)
16:09:06 | WARNING | Service endpoint is inaccessible: http://featurebyte-server:8088 16:09:06 | INFO | Using profile: tutorial 16:09:06 | INFO | Using configuration file at: /Users/gxav/.featurebyte/config.yaml 16:09:06 | INFO | Active profile: tutorial (https://tutorials.featurebyte.com/api/v1) 16:09:06 | WARNING | Remote SDK version (1.1.0.dev7) is different from local (1.1.0.dev1). Update local SDK to avoid unexpected behavior. 16:09:06 | INFO | No catalog activated. 16:09:06 | INFO | Catalog activated: Grocery Dataset Tutorial
Register the F_SBERT_EMBEDDING UDF to the Catalog¶
In [2]:
Copied!
# Register the SQL function F_SBERT_EMBEDDING under the SDK name "embedding".
# Declared signature: one VARCHAR parameter "x", ARRAY output.
# NOTE(review): is_global=False presumably scopes the UDF to the active catalog — confirm.
fb.UserDefinedFunction.create(
    name="embedding",
    sql_function_name="F_SBERT_EMBEDDING",
    function_parameters=[
        fb.FunctionParameter(name="x", dtype=fb.enum.DBVarType.VARCHAR),
    ],
    output_dtype=fb.enum.DBVarType.ARRAY,
    is_global=False,
)
# Register the SQL function F_SBERT_EMBEDDING under the SDK name "embedding".
# Declared signature: one VARCHAR parameter "x", ARRAY output.
# NOTE(review): is_global=False presumably scopes the UDF to the active catalog — confirm.
fb.UserDefinedFunction.create(
    name="embedding",
    sql_function_name="F_SBERT_EMBEDDING",
    function_parameters=[
        fb.FunctionParameter(name="x", dtype=fb.enum.DBVarType.VARCHAR),
    ],
    output_dtype=fb.enum.DBVarType.ARRAY,
    is_global=False,
)
Out[2]:
User Defined Function
name | embedding | ||||||||||
created_at | 2024-06-12 08:09:06 | ||||||||||
updated_at | None | ||||||||||
description | None | ||||||||||
sql_function_name | F_SBERT_EMBEDDING | ||||||||||
function_parameters |
|
||||||||||
signature | embedding(x: str) -> array | ||||||||||
output_dtype | ARRAY | ||||||||||
feature_store_name | playground | ||||||||||
used_by_features | [] |
Apply the embedding UDF instance to ProductGroup¶
In [3]:
Copied!
# Fetch the UDF registered as "embedding" from the active catalog.
embedding_udf = catalog.get_user_defined_function("embedding")
# Fetch the UDF registered as "embedding" from the active catalog.
embedding_udf = catalog.get_user_defined_function("embedding")
In [4]:
Copied!
# Open a view on the GROCERYPRODUCT dimension table.
groceryproduct_view = catalog.get_view("GROCERYPRODUCT")

# Run the embedding UDF over the ProductGroup column and store the result
# as a new "ProductGroup_embedding" column on the same view.
groceryproduct_view["ProductGroup_embedding"] = embedding_udf(
    groceryproduct_view["ProductGroup"]
)
# Open a view on the GROCERYPRODUCT dimension table.
groceryproduct_view = catalog.get_view("GROCERYPRODUCT")

# Run the embedding UDF over the ProductGroup column and store the result
# as a new "ProductGroup_embedding" column on the same view.
groceryproduct_view["ProductGroup_embedding"] = embedding_udf(
    groceryproduct_view["ProductGroup"]
)
Get other views¶
In [5]:
Copied!
# Event-level view: one view over the GROCERYINVOICE event table.
groceryinvoice_view = catalog.get_view("GROCERYINVOICE")

# Item-level view: one view over the INVOICEITEMS item table.
invoiceitems_view = catalog.get_view("INVOICEITEMS")
# Event-level view: one view over the GROCERYINVOICE event table.
groceryinvoice_view = catalog.get_view("GROCERYINVOICE")

# Item-level view: one view over the INVOICEITEMS item table.
invoiceitems_view = catalog.get_view("INVOICEITEMS")
Join views¶
In [6]:
Copied!
# Bring the GROCERYPRODUCT columns (including ProductGroup_embedding) into the
# INVOICEITEMS view; joined columns are prefixed with "product_".
invoiceitems_view = invoiceitems_view.join(groceryproduct_view, rprefix="product_")
# Bring the GROCERYPRODUCT columns (including ProductGroup_embedding) into the
# INVOICEITEMS view; joined columns are prefixed with "product_".
invoiceitems_view = invoiceitems_view.join(groceryproduct_view, rprefix="product_")
Get the mean vector of an invoice's Product Group descriptions¶
In [7]:
Copied!
# Group the invoice items by the invoice key (GroceryInvoiceGuid).
invoiceitems_view_by_invoice = invoiceitems_view.groupby("GroceryInvoiceGuid")
# Group the invoice items by the invoice key (GroceryInvoiceGuid).
invoiceitems_view_by_invoice = invoiceitems_view.groupby("GroceryInvoiceGuid")
In [8]:
Copied!
# Per-invoice feature: average (AggFunc.AVG) of the items'
# product_ProductGroup_embedding column.
invoice_mean_vector_of_item_product_productgroup_embedding = (
    invoiceitems_view_by_invoice.aggregate(
        "product_ProductGroup_embedding",
        method=fb.AggFunc.AVG,
        feature_name="INVOICE_Mean_vector_of_item_product_ProductGroup_embedding",
    )
)
# Per-invoice feature: average (AggFunc.AVG) of the items'
# product_ProductGroup_embedding column.
invoice_mean_vector_of_item_product_productgroup_embedding = (
    invoiceitems_view_by_invoice.aggregate(
        "product_ProductGroup_embedding",
        method=fb.AggFunc.AVG,
        feature_name="INVOICE_Mean_vector_of_item_product_ProductGroup_embedding",
    )
)
Get the mean vector of the Customer's latest invoice¶
In [9]:
Copied!
# Attach the per-invoice mean-vector feature to the GROCERYINVOICE view as a
# column named INVOICE_Mean_vector_of_item_product_ProductGroup_embedding,
# so it can be aggregated per customer afterwards.
groceryinvoice_view = groceryinvoice_view.add_feature(
    "INVOICE_Mean_vector_of_item_product_ProductGroup_embedding",
    invoice_mean_vector_of_item_product_productgroup_embedding,
)
# Attach the per-invoice mean-vector feature to the GROCERYINVOICE view as a
# column named INVOICE_Mean_vector_of_item_product_ProductGroup_embedding,
# so it can be aggregated per customer afterwards.
groceryinvoice_view = groceryinvoice_view.add_feature(
    "INVOICE_Mean_vector_of_item_product_ProductGroup_embedding",
    invoice_mean_vector_of_item_product_productgroup_embedding,
)
In [10]:
Copied!
# Group the invoices by the customer key (GroceryCustomerGuid).
groceryinvoice_view_by_customer = groceryinvoice_view.groupby(["GroceryCustomerGuid"])
# Group the invoices by the customer key (GroceryCustomerGuid).
groceryinvoice_view_by_customer = groceryinvoice_view.groupby(["GroceryCustomerGuid"])
In [11]:
Copied!
# Per-customer feature: the "latest" value of the invoice mean-vector column.
# aggregate_over returns a feature group, so the single feature is picked out by name.
# NOTE(review): windows=[None] presumably means no time bound on the lookback — confirm.
customer_latest_invoice_mean_vector_of_item_product_productgroup_embedding = (
    groceryinvoice_view_by_customer.aggregate_over(
        "INVOICE_Mean_vector_of_item_product_ProductGroup_embedding",
        method="latest",
        feature_names=[
            "CUSTOMER_Latest_INVOICE_Mean_vector_of_item_product_ProductGroup_embedding"
        ],
        windows=[None],
    )["CUSTOMER_Latest_INVOICE_Mean_vector_of_item_product_ProductGroup_embedding"]
)
# Per-customer feature: the "latest" value of the invoice mean-vector column.
# aggregate_over returns a feature group, so the single feature is picked out by name.
# NOTE(review): windows=[None] presumably means no time bound on the lookback — confirm.
customer_latest_invoice_mean_vector_of_item_product_productgroup_embedding = (
    groceryinvoice_view_by_customer.aggregate_over(
        "INVOICE_Mean_vector_of_item_product_ProductGroup_embedding",
        method="latest",
        feature_names=[
            "CUSTOMER_Latest_INVOICE_Mean_vector_of_item_product_ProductGroup_embedding"
        ],
        windows=[None],
    )["CUSTOMER_Latest_INVOICE_Mean_vector_of_item_product_ProductGroup_embedding"]
)
Get the mean vector for the Customer's Product Group descriptions over the past 26 weeks¶
In [12]:
Copied!
# Group the invoice items by the customer key (GroceryCustomerGuid).
invoiceitems_view_by_customer = invoiceitems_view.groupby(["GroceryCustomerGuid"])
# Group the invoice items by the customer key (GroceryCustomerGuid).
invoiceitems_view_by_customer = invoiceitems_view.groupby(["GroceryCustomerGuid"])
In [13]:
Copied!
# Per-customer feature group: average of product_ProductGroup_embedding over a
# single 26-week window.
feature_group = invoiceitems_view_by_customer.aggregate_over(
    "product_ProductGroup_embedding",
    method="avg",
    feature_names=["CUSTOMER_Mean_vector_of_item_product_ProductGroup_embedding_26w"],
    windows=["26w"],
)

# Pull the one feature out of the group by its name.
customer_mean_vector_of_item_product_productgroup_embedding_26w = feature_group[
    "CUSTOMER_Mean_vector_of_item_product_ProductGroup_embedding_26w"
]
# Per-customer feature group: average of product_ProductGroup_embedding over a
# single 26-week window.
feature_group = invoiceitems_view_by_customer.aggregate_over(
    "product_ProductGroup_embedding",
    method="avg",
    feature_names=["CUSTOMER_Mean_vector_of_item_product_ProductGroup_embedding_26w"],
    windows=["26w"],
)

# Pull the one feature out of the group by its name.
customer_mean_vector_of_item_product_productgroup_embedding_26w = feature_group[
    "CUSTOMER_Mean_vector_of_item_product_ProductGroup_embedding_26w"
]
Derive similarity between the latest invoice and the past 26 weeks of purchases¶
In [14]:
Copied!
# Similarity feature: cosine similarity between the customer's latest-invoice
# mean vector and the customer's 26-week mean vector.
customer_mean_vector_of_item_product_productgroup_embedding_26w_vs_latest_invoice = (
    customer_latest_invoice_mean_vector_of_item_product_productgroup_embedding.vec.cosine_similarity(
        customer_mean_vector_of_item_product_productgroup_embedding_26w
    )
)

# The derived feature is given an explicit name here, before it is previewed and saved.
customer_mean_vector_of_item_product_productgroup_embedding_26w_vs_latest_invoice.name = (
    "CUSTOMER_Mean_vector_of_item_product_ProductGroup_embedding_26w_vs_latest_invoice"
)
# Similarity feature: cosine similarity between the customer's latest-invoice
# mean vector and the customer's 26-week mean vector.
customer_mean_vector_of_item_product_productgroup_embedding_26w_vs_latest_invoice = (
    customer_latest_invoice_mean_vector_of_item_product_productgroup_embedding.vec.cosine_similarity(
        customer_mean_vector_of_item_product_productgroup_embedding_26w
    )
)

# The derived feature is given an explicit name here, before it is previewed and saved.
customer_mean_vector_of_item_product_productgroup_embedding_26w_vs_latest_invoice.name = (
    "CUSTOMER_Mean_vector_of_item_product_ProductGroup_embedding_26w_vs_latest_invoice"
)
Preview feature¶
In [15]:
Copied!
# Load the observation table named "Preview Table with 10 items" for previews.
preview_table = catalog.get_observation_table("Preview Table with 10 items")
# Load the observation table named "Preview Table with 10 items" for previews.
preview_table = catalog.get_observation_table("Preview Table with 10 items")
In [16]:
Copied!
# Materialize the 26-week mean-vector feature on the preview observation table.
customer_mean_vector_of_item_product_productgroup_embedding_26w.preview(preview_table)
# Materialize the 26-week mean-vector feature on the preview observation table.
customer_mean_vector_of_item_product_productgroup_embedding_26w.preview(preview_table)
Out[16]:
POINT_IN_TIME | GROCERYINVOICEITEMGUID | CUSTOMER_Mean_vector_of_item_product_ProductGroup_embedding_26w | |
---|---|---|---|
0 | 2023-04-11 17:23:57 | 6084f39f-9d2c-4111-b1cc-502e1559c0c0 | [-0.049583778360973, 0.040724096369544, -0.019... |
1 | 2023-02-07 11:04:26 | fd1caae1-77e6-4667-8c83-df13f05bf2f5 | [-0.047664110747538006, 0.040520216686427, -0.... |
2 | 2023-03-17 11:15:09 | 1b627a25-7eb4-4f61-b243-c93db487bff0 | [-0.05412369494863901, 0.028804376621762003, -... |
3 | 2022-09-18 18:52:36 | ac7edfb5-63ed-49fb-9b89-76b0288ed2f8 | [-0.051115894625539005, 0.033894273326497006, ... |
4 | 2023-05-28 19:27:14 | 15973b2f-2256-4caa-b65b-cbbfdff0905b | [-0.046703376632649, 0.035018777767219005, -0.... |
5 | 2022-12-26 15:01:07 | 264f79fd-c24a-47cc-8a68-fe3753a4d74b | [-0.057210504562501006, 0.026236914188101004, ... |
6 | 2023-03-31 18:50:00 | 213ef7d3-c27b-43e0-bc0a-57d6c7c254b0 | [-0.049429968671181004, 0.033228590529173, -0.... |
7 | 2022-12-10 21:08:26 | 77d02174-f1e1-41c1-9fb9-01c6246b0009 | [-0.051941385508354004, 0.032673704067397, -0.... |
8 | 2022-08-17 19:13:52 | 40a07ca4-a991-4d21-b5cf-74ee61220f96 | [-0.0460019625995, 0.045009310261646, -0.02764... |
9 | 2023-05-05 08:00:42 | 57ca0770-eb8b-4769-8e67-eb1b7cc0a934 | [-0.05695966097050201, 0.03538753235396, -0.01... |
In [17]:
Copied!
# Materialize the cosine-similarity feature on the preview observation table.
customer_mean_vector_of_item_product_productgroup_embedding_26w_vs_latest_invoice.preview(
    preview_table
)
# Materialize the cosine-similarity feature on the preview observation table.
customer_mean_vector_of_item_product_productgroup_embedding_26w_vs_latest_invoice.preview(
    preview_table
)
Out[17]:
POINT_IN_TIME | GROCERYINVOICEITEMGUID | CUSTOMER_Mean_vector_of_item_product_ProductGroup_embedding_26w_vs_latest_invoice | |
---|---|---|---|
0 | 2023-02-07 11:04:26 | fd1caae1-77e6-4667-8c83-df13f05bf2f5 | 0.901249 |
1 | 2023-05-28 19:27:14 | 15973b2f-2256-4caa-b65b-cbbfdff0905b | 0.577883 |
2 | 2022-09-18 18:52:36 | ac7edfb5-63ed-49fb-9b89-76b0288ed2f8 | 0.955820 |
3 | 2023-03-31 18:50:00 | 213ef7d3-c27b-43e0-bc0a-57d6c7c254b0 | 0.943237 |
4 | 2022-12-26 15:01:07 | 264f79fd-c24a-47cc-8a68-fe3753a4d74b | 0.927726 |
5 | 2023-04-11 17:23:57 | 6084f39f-9d2c-4111-b1cc-502e1559c0c0 | 0.802008 |
6 | 2022-08-17 19:13:52 | 40a07ca4-a991-4d21-b5cf-74ee61220f96 | 0.892769 |
7 | 2022-12-10 21:08:26 | 77d02174-f1e1-41c1-9fb9-01c6246b0009 | 0.912434 |
8 | 2023-03-17 11:15:09 | 1b627a25-7eb4-4f61-b243-c93db487bff0 | 0.758240 |
9 | 2023-05-05 08:00:42 | 57ca0770-eb8b-4769-8e67-eb1b7cc0a934 | 0.843593 |
Save feature¶
In [18]:
Copied!
# Persist the similarity feature via its save() method.
customer_mean_vector_of_item_product_productgroup_embedding_26w_vs_latest_invoice.save()
# Persist the similarity feature via its save() method.
customer_mean_vector_of_item_product_productgroup_embedding_26w_vs_latest_invoice.save()
Done! |████████████████████████████████████████| 100% in 6.1s (0.17%/s)
As always, add a description and view the definition file¶
In [19]:
Copied!
# Attach a human-readable description to the saved feature.
# The window in the text is 26w, matching windows=["26w"] used to build the
# customer mean vector and the "_26w" suffix in the feature name
# (the earlier text said "4w", which did not describe this feature).
customer_mean_vector_of_item_product_productgroup_embedding_26w_vs_latest_invoice.update_description(
    "Compare the customer's 26w Mean vector of item "
    "product_ProductGroup_embedding with the customer's most recent "
    "invoice. This comparison is done using the Cosine Similarity metric to"
    " measure how similar these mean vector embeddings are."
)

# Show the SDK-generated definition of the feature (displayed as the cell output).
customer_mean_vector_of_item_product_productgroup_embedding_26w_vs_latest_invoice.definition
# Attach a human-readable description to the saved feature.
# The window in the text is 26w, matching windows=["26w"] used to build the
# customer mean vector and the "_26w" suffix in the feature name
# (the earlier text said "4w", which did not describe this feature).
customer_mean_vector_of_item_product_productgroup_embedding_26w_vs_latest_invoice.update_description(
    "Compare the customer's 26w Mean vector of item "
    "product_ProductGroup_embedding with the customer's most recent "
    "invoice. This comparison is done using the Cosine Similarity metric to"
    " measure how similar these mean vector embeddings are."
)

# Show the SDK-generated definition of the feature (displayed as the cell output).
customer_mean_vector_of_item_product_productgroup_embedding_26w_vs_latest_invoice.definition
Out[19]:
# Generated by SDK version: 1.1.0.dev7
from bson import ObjectId
from featurebyte import DimensionTable
from featurebyte import EventTable
from featurebyte import FeatureJobSetting
from featurebyte import ItemTable
from featurebyte import UserDefinedFunction
# dimension_table name: "GROCERYPRODUCT"
dimension_table = DimensionTable.get_by_id(ObjectId("666956c78080c62d0dc616e2"))
dimension_view = dimension_table.get_view(
view_mode="manual", drop_column_names=[], column_cleaning_operations=[]
)
col = dimension_view["ProductGroup"]
# udf_name: embedding, sql_function_name: F_SBERT_EMBEDDING
udf_embedding = UserDefinedFunction.get_by_id(
ObjectId("666957a23fab5208644858ad")
)
col_1 = udf_embedding(col)
view = dimension_view.copy()
view["ProductGroup_embedding"] = col_1
# item_table name: "INVOICEITEMS", event_table name: "GROCERYINVOICE"
item_table = ItemTable.get_by_id(ObjectId("666956c58080c62d0dc616e1"))
item_view = item_table.get_view(
event_suffix=None,
view_mode="manual",
drop_column_names=["record_available_at"],
column_cleaning_operations=[],
event_drop_column_names=["record_available_at"],
event_column_cleaning_operations=[],
event_join_column_names=[
"Timestamp",
"GroceryInvoiceGuid",
"GroceryCustomerGuid",
"tz_offset",
],
)
joined_view = item_view.join(
view, on="GroceryProductGuid", how="left", rsuffix="", rprefix="product_"
)
grouped = joined_view.groupby(
by_keys=["GroceryCustomerGuid"], category=None
).aggregate_over(
value_column="product_ProductGroup_embedding",
method="avg",
windows=["26w"],
feature_names=[
"CUSTOMER_Mean_vector_of_item_product_ProductGroup_embedding_26w"
],
feature_job_setting=FeatureJobSetting(
blind_spot="120s", period="3600s", offset="120s"
),
skip_fill_na=True,
offset=None,
)
feat = grouped[
"CUSTOMER_Mean_vector_of_item_product_ProductGroup_embedding_26w"
]
feat_1 = joined_view.groupby(
by_keys=["GroceryInvoiceGuid"], category=None
).aggregate(
value_column="product_ProductGroup_embedding",
method="avg",
feature_name="INVOICE_Mean_vector_of_item_product_ProductGroup_embedding",
skip_fill_na=True,
)
# event_table name: "GROCERYINVOICE"
event_table = EventTable.get_by_id(ObjectId("666956c38080c62d0dc616e0"))
event_view = event_table.get_view(
view_mode="manual",
drop_column_names=["record_available_at"],
column_cleaning_operations=[],
)
joined_view_1 = event_view.add_feature(
new_column_name="INVOICE_Mean_vector_of_item_product_ProductGroup_embedding",
feature=feat_1,
entity_column="GroceryInvoiceGuid",
)
grouped_1 = joined_view_1.groupby(
by_keys=["GroceryCustomerGuid"], category=None
).aggregate_over(
value_column="INVOICE_Mean_vector_of_item_product_ProductGroup_embedding",
method="latest",
windows=[None],
feature_names=[
"CUSTOMER_Latest_INVOICE_Mean_vector_of_item_product_ProductGroup_embedding"
],
feature_job_setting=FeatureJobSetting(
blind_spot="120s", period="3600s", offset="120s"
),
skip_fill_na=True,
offset=None,
)
feat_2 = grouped_1[
"CUSTOMER_Latest_INVOICE_Mean_vector_of_item_product_ProductGroup_embedding"
]
feat_3 = feat_2.vec.cosine_similarity(other=feat)
feat_3.name = "CUSTOMER_Mean_vector_of_item_product_ProductGroup_embedding_26w_vs_latest_invoice"
output = feat_3
output.save(_id=ObjectId("666957cd3fab5208644858b2"))