CUSTOMER Avg ProductGroup Embedding 14d vs 183d
Aggregate embedding features over time¶
Another useful way of aggregating embedding vectors is aggregation over time.
This capability lets you compute average/max embeddings over various time windows, which is useful for capturing how a grocery customer's basket changes over time and for computing similarities between the current basket and past baskets.
In [1]:
Copied!
import featurebyte as fb
# Select the "tutorial" SDK profile for this session.
fb.use_profile("tutorial")
import featurebyte as fb
# Select the "tutorial" SDK profile for this session.
fb.use_profile("tutorial")
18:57:34 | INFO | Using configuration file at: /Users/viktor/.featurebyte/config.yaml 18:57:34 | INFO | Active profile: tutorial (https://tutorials.featurebyte.com/api/v1) 18:57:34 | WARNING | Remote SDK version (0.5.1.dev70) is different from local (0.5.1.dev63). Update local SDK to avoid unexpected behavior. 18:57:34 | INFO | No catalog activated. 18:57:35 | INFO | 10 feature lists, 59 features deployed 18:57:35 | INFO | Using profile: tutorial 18:57:35 | INFO | Using configuration file at: /Users/viktor/.featurebyte/config.yaml 18:57:35 | INFO | Active profile: tutorial (https://tutorials.featurebyte.com/api/v1) 18:57:35 | WARNING | Remote SDK version (0.5.1.dev70) is different from local (0.5.1.dev63). Update local SDK to avoid unexpected behavior. 18:57:35 | INFO | No catalog activated. 18:57:35 | INFO | 10 feature lists, 59 features deployed
In [2]:
Copied!
# Activate the tutorial catalog; later cells fetch views and the observation table through `catalog`.
catalog = fb.Catalog.activate("Grocery Dataset Tutorial")
# Activate the tutorial catalog; later cells fetch views and the observation table through `catalog`.
catalog = fb.Catalog.activate("Grocery Dataset Tutorial")
18:57:36 | INFO | Catalog activated: Grocery Dataset Tutorial
Create UDF function¶
F_SBERT_EMBEDDING
is the name of a SQL function available in the data warehouse, which calls a deployed transformer model.
In [3]:
Copied!
# Register a user-defined function named "embedding" that maps to the SQL
# function F_SBERT_EMBEDDING. It takes one VARCHAR argument ("x") and is
# declared to return an ARRAY.
# NOTE(review): is_global=False presumably scopes the UDF to the active
# catalog rather than sharing it across catalogs - confirm in the SDK docs.
embedding_udf = fb.UserDefinedFunction.create(
name='embedding',
sql_function_name='F_SBERT_EMBEDDING',
function_parameters=[fb.FunctionParameter(name="x", dtype=fb.enum.DBVarType.VARCHAR)],
output_dtype=fb.enum.DBVarType.ARRAY,
is_global=False,
)
# Register a user-defined function named "embedding" that maps to the SQL
# function F_SBERT_EMBEDDING. It takes one VARCHAR argument ("x") and is
# declared to return an ARRAY.
# NOTE(review): is_global=False presumably scopes the UDF to the active
# catalog rather than sharing it across catalogs - confirm in the SDK docs.
embedding_udf = fb.UserDefinedFunction.create(
name='embedding',
sql_function_name='F_SBERT_EMBEDDING',
function_parameters=[fb.FunctionParameter(name="x", dtype=fb.enum.DBVarType.VARCHAR)],
output_dtype=fb.enum.DBVarType.ARRAY,
is_global=False,
)
Get views¶
In [4]:
Copied!
# Get a view of the GROCERYPRODUCT dimension table (source of the ProductGroup column).
groceryproduct_view = catalog.get_view("GROCERYPRODUCT")
# Get a view of the INVOICEITEMS item table (the view that is later joined and aggregated).
invoiceitems_view = catalog.get_view("INVOICEITEMS")
# Get a view of the GROCERYPRODUCT dimension table (source of the ProductGroup column).
groceryproduct_view = catalog.get_view("GROCERYPRODUCT")
# Get a view of the INVOICEITEMS item table (the view that is later joined and aggregated).
invoiceitems_view = catalog.get_view("INVOICEITEMS")
Run embedding UDF on the ProductGroup column¶
In [5]:
Copied!
# Apply the embedding UDF to the ProductGroup column and store the result
# as a new ProductGroupEmbedding column on the product view.
groceryproduct_view["ProductGroupEmbedding"] = embedding_udf(groceryproduct_view["ProductGroup"])
# Apply the embedding UDF to the ProductGroup column and store the result
# as a new ProductGroupEmbedding column on the product view.
groceryproduct_view["ProductGroupEmbedding"] = embedding_udf(groceryproduct_view["ProductGroup"])
Join views¶
In [6]:
Copied!
# Join GROCERYPRODUCT view to INVOICEITEMS view so that each invoice item
# carries its product's ProductGroupEmbedding.
# NOTE(review): rsuffix="" presumably keeps the joined column names unsuffixed - confirm.
invoiceitems_view = invoiceitems_view.join(groceryproduct_view, rsuffix="")
# Join GROCERYPRODUCT view to INVOICEITEMS view so that each invoice item
# carries its product's ProductGroupEmbedding.
# NOTE(review): rsuffix="" presumably keeps the joined column names unsuffixed - confirm.
invoiceitems_view = invoiceitems_view.join(groceryproduct_view, rsuffix="")
Create observation table¶
In [7]:
Copied!
# Build a small observation table (10 sampled rows) from the invoice items
# view. Only the Timestamp and GroceryInvoiceItemGuid columns are kept, and
# they are renamed to POINT_IN_TIME and GROCERYINVOICEITEMGUID.
# The table is stored under the name used to fetch it again for the preview.
observation_table = invoiceitems_view.create_observation_table(
name="Preview tables with Invoice Items",
sample_rows=10,
columns=["Timestamp", "GroceryInvoiceItemGuid"],
columns_rename_mapping={
"Timestamp": "POINT_IN_TIME",
"GroceryInvoiceItemGuid": "GROCERYINVOICEITEMGUID",
},
)
# Build a small observation table (10 sampled rows) from the invoice items
# view. Only the Timestamp and GroceryInvoiceItemGuid columns are kept, and
# they are renamed to POINT_IN_TIME and GROCERYINVOICEITEMGUID.
# The table is stored under the name used to fetch it again for the preview.
observation_table = invoiceitems_view.create_observation_table(
name="Preview tables with Invoice Items",
sample_rows=10,
columns=["Timestamp", "GroceryInvoiceItemGuid"],
columns_rename_mapping={
"Timestamp": "POINT_IN_TIME",
"GroceryInvoiceItemGuid": "GROCERYINVOICEITEMGUID",
},
)
Done! |████████████████████████████████████████| 100% in 9.7s (0.10%/s)
Create features from embedding column¶
In [8]:
Copied!
# Average the ProductGroupEmbedding column per GroceryCustomerGuid over two
# time windows (14 days and 183 days), producing one named feature per window.
customer_avg_product_groups = invoiceitems_view.groupby("GroceryCustomerGuid").aggregate_over(
"ProductGroupEmbedding",
method=fb.AggFunc.AVG,
feature_names=["CUSTOMER_Avg_of_ProductGroup_Embedding_14d", "CUSTOMER_Avg_of_ProductGroup_Embedding_183d"],
windows=["14d", "183d"]
)
# Derive a single feature: cosine similarity between the customer's 14-day
# and 183-day average embeddings.
customer_avg_product_groups_cosine = customer_avg_product_groups["CUSTOMER_Avg_of_ProductGroup_Embedding_14d"].vec.cosine_similarity(
customer_avg_product_groups["CUSTOMER_Avg_of_ProductGroup_Embedding_183d"]
)
# Name the derived feature; this is the column name shown in the preview output.
customer_avg_product_groups_cosine.name = "CUSTOMER_Consistency_of_Avg_of_ProductGroup_Embedding_14d_183d"
# Average the ProductGroupEmbedding column per GroceryCustomerGuid over two
# time windows (14 days and 183 days), producing one named feature per window.
customer_avg_product_groups = invoiceitems_view.groupby("GroceryCustomerGuid").aggregate_over(
"ProductGroupEmbedding",
method=fb.AggFunc.AVG,
feature_names=["CUSTOMER_Avg_of_ProductGroup_Embedding_14d", "CUSTOMER_Avg_of_ProductGroup_Embedding_183d"],
windows=["14d", "183d"]
)
# Derive a single feature: cosine similarity between the customer's 14-day
# and 183-day average embeddings.
customer_avg_product_groups_cosine = customer_avg_product_groups["CUSTOMER_Avg_of_ProductGroup_Embedding_14d"].vec.cosine_similarity(
customer_avg_product_groups["CUSTOMER_Avg_of_ProductGroup_Embedding_183d"]
)
# Name the derived feature; this is the column name shown in the preview output.
customer_avg_product_groups_cosine.name = "CUSTOMER_Consistency_of_Avg_of_ProductGroup_Embedding_14d_183d"
In [9]:
Copied!
# Fetch the observation table by the name it was created with, download it
# as a pandas DataFrame, and preview the similarity feature on those rows.
observation_table = catalog.get_observation_table("Preview tables with Invoice Items")
customer_avg_product_groups_cosine.preview(observation_table.to_pandas())
# Fetch the observation table by the name it was created with, download it
# as a pandas DataFrame, and preview the similarity feature on those rows.
observation_table = catalog.get_observation_table("Preview tables with Invoice Items")
customer_avg_product_groups_cosine.preview(observation_table.to_pandas())
Downloading table |████████████████████████████████████████| 10/10 [100%] in 0.1
Out[9]:
POINT_IN_TIME | GROCERYINVOICEITEMGUID | CUSTOMER_Consistency_of_Avg_of_ProductGroup_Embedding_14d_183d | |
---|---|---|---|
0 | 2022-02-27 12:19:06 | d307efab-fc40-4b16-be88-2d13b70d8903 | 0.812086 |
1 | 2022-03-19 13:17:52 | e42054bf-fc35-4248-a279-16dc7ac8efa5 | 0.852374 |
2 | 2022-04-13 19:50:26 | b3afcbe1-dd98-41f3-bb79-9133bd316dae | 0.965045 |
3 | 2022-09-27 12:46:45 | 48156112-a586-42ee-ab1d-fc53629f438a | 0.889810 |
4 | 2022-11-04 12:15:04 | 23563a05-76be-44e7-9fe6-6bb4e4d11d2b | 0.944870 |
5 | 2023-03-09 12:15:30 | 57c7d176-f2e3-48d4-94a8-d7f3fbc726a3 | 0.881372 |
6 | 2023-04-20 12:17:35 | 20779d9e-69c3-4135-a42b-6e7a10819136 | 0.000000 |
7 | 2023-06-12 14:14:32 | 8717d1fa-6708-4a49-b022-b21f89d5060b | 0.880983 |
8 | 2023-07-03 13:14:46 | 4b00f6c0-0913-4608-b0f2-f344ad57481b | 0.971262 |
9 | 2023-08-09 11:17:46 | c851f86c-f52e-4b55-8311-b62f55be6945 | 0.786081 |
Save feature and view definition file¶
In [10]:
Copied!
# Save the similarity feature; the description is added in the next cell.
customer_avg_product_groups_cosine.save()
# Save the similarity feature; the description is added in the next cell.
customer_avg_product_groups_cosine.save()
Done! |████████████████████████████████████████| 100% in 6.5s (0.16%/s)
In [11]:
Copied!
# Add a human-readable description to the saved feature.
customer_avg_product_groups_cosine.update_description(
"Similarity between average customer's baskets in 14 and 183 days periods"
)
# Display the feature definition file (the SDK-generated code that rebuilds this feature).
customer_avg_product_groups_cosine.definition
# Add a human-readable description to the saved feature.
customer_avg_product_groups_cosine.update_description(
"Similarity between average customer's baskets in 14 and 183 days periods"
)
# Display the feature definition file (the SDK-generated code that rebuilds this feature).
customer_avg_product_groups_cosine.definition
Out[11]:
# Generated by SDK version: 0.5.1.dev70
from bson import ObjectId
from featurebyte import DimensionTable
from featurebyte import FeatureJobSetting
from featurebyte import ItemTable
from featurebyte import UserDefinedFunction
# dimension_table name: "GROCERYPRODUCT"
dimension_table = DimensionTable.get_by_id(ObjectId("6553b0012c001f76d263e059"))
dimension_view = dimension_table.get_view(
view_mode="manual", drop_column_names=[], column_cleaning_operations=[]
)
col = dimension_view["ProductGroup"]
# udf_name: embedding, sql_function_name: F_SBERT_EMBEDDING
udf_embedding = UserDefinedFunction.get_by_id(
ObjectId("6553b511850516ee23c8b734")
)
col_1 = udf_embedding(col)
view = dimension_view.copy()
view["ProductGroupEmbedding"] = col_1
# item_table name: "INVOICEITEMS", event_table name: "GROCERYINVOICE"
item_table = ItemTable.get_by_id(ObjectId("6553b0002c001f76d263e058"))
item_view = item_table.get_view(
event_suffix=None,
view_mode="manual",
drop_column_names=["record_available_at"],
column_cleaning_operations=[],
event_drop_column_names=["record_available_at"],
event_column_cleaning_operations=[],
event_join_column_names=[
"Timestamp",
"GroceryInvoiceGuid",
"GroceryCustomerGuid",
"tz_offset",
],
)
joined_view = item_view.join(
view, on="GroceryProductGuid", how="left", rsuffix="", rprefix=""
)
grouped = joined_view.groupby(
by_keys=["GroceryCustomerGuid"], category=None
).aggregate_over(
value_column="ProductGroupEmbedding",
method="avg",
windows=["14d", "183d"],
feature_names=[
"CUSTOMER_Avg_of_ProductGroup_Embedding_14d",
"CUSTOMER_Avg_of_ProductGroup_Embedding_183d",
],
feature_job_setting=FeatureJobSetting(
blind_spot="120s", frequency="3600s", time_modulo_frequency="120s"
),
skip_fill_na=True,
)
feat = grouped["CUSTOMER_Avg_of_ProductGroup_Embedding_183d"]
feat_1 = grouped["CUSTOMER_Avg_of_ProductGroup_Embedding_14d"]
feat_2 = feat_1.vec.cosine_similarity(other=feat)
feat_2.name = "CUSTOMER_Consistency_of_Avg_of_ProductGroup_Embedding_14d_183d"
output = feat_2
output.save(_id=ObjectId("6553b52a850516ee23c8b738"))
In [ ]:
Copied!