CUSTOMER Avg ProductGroup Embedding 14d vs 183d

Aggregate embedding features over time¶

Another useful way of aggregating embedding vectors is aggregation over time.

This capability makes it possible to compute average/max embeddings over various time windows, which can be useful for capturing how a grocery customer's basket changes over time and for computing similarities between the current basket and past baskets.

In [1]:

            
                Copied!
                
import featurebyte as fb
fb.use_profile("tutorial")
import featurebyte as fb
fb.use_profile("tutorial")

18:57:34 | INFO     | Using configuration file at: /Users/viktor/.featurebyte/config.yaml
18:57:34 | INFO     | Active profile: tutorial (https://tutorials.featurebyte.com/api/v1)
18:57:34 | WARNING  | Remote SDK version (0.5.1.dev70) is different from local (0.5.1.dev63). Update local SDK to avoid unexpected behavior.
18:57:34 | INFO     | No catalog activated.
18:57:35 | INFO     | 10 feature lists, 59 features deployed
18:57:35 | INFO     | Using profile: tutorial
18:57:35 | INFO     | Using configuration file at: /Users/viktor/.featurebyte/config.yaml
18:57:35 | INFO     | Active profile: tutorial (https://tutorials.featurebyte.com/api/v1)
18:57:35 | WARNING  | Remote SDK version (0.5.1.dev70) is different from local (0.5.1.dev63). Update local SDK to avoid unexpected behavior.
18:57:35 | INFO     | No catalog activated.
18:57:35 | INFO     | 10 feature lists, 59 features deployed

In [2]:

            
                Copied!
                
catalog = fb.Catalog.activate("Grocery Dataset Tutorial")
catalog = fb.Catalog.activate("Grocery Dataset Tutorial")

18:57:36 | INFO     | Catalog activated: Grocery Dataset Tutorial

Create UDF function¶

F_SBERT_EMBEDDING is the name of a SQL function available in the data warehouse, which calls a deployed transformer model.

In [3]:

            
                Copied!
                
                    
                    
                
                

        
# Register a UDF handle named 'embedding' that is bound to the SQL function
# F_SBERT_EMBEDDING: it takes one VARCHAR parameter "x" and returns an ARRAY.
# NOTE(review): is_global=False presumably scopes the UDF to the active catalog — confirm.
embedding_udf = fb.UserDefinedFunction.create(
    name='embedding', 
    sql_function_name='F_SBERT_EMBEDDING',
    function_parameters=[fb.FunctionParameter(name="x", dtype=fb.enum.DBVarType.VARCHAR)],
    output_dtype=fb.enum.DBVarType.ARRAY,
    is_global=False,
)
# Register a UDF handle named 'embedding' that is bound to the SQL function
# F_SBERT_EMBEDDING: it takes one VARCHAR parameter "x" and returns an ARRAY.
# NOTE(review): is_global=False presumably scopes the UDF to the active catalog — confirm.
embedding_udf = fb.UserDefinedFunction.create(
    name='embedding', 
    sql_function_name='F_SBERT_EMBEDDING',
    function_parameters=[fb.FunctionParameter(name="x", dtype=fb.enum.DBVarType.VARCHAR)],
    output_dtype=fb.enum.DBVarType.ARRAY,
    is_global=False,
)

Get views¶

In [4]:

            
                Copied!
                
# Get view from GROCERYPRODUCT dimension table.
groceryproduct_view = catalog.get_view("GROCERYPRODUCT")
# Get view from INVOICEITEMS item table.
invoiceitems_view = catalog.get_view("INVOICEITEMS")
# Get view from GROCERYPRODUCT dimension table.
groceryproduct_view = catalog.get_view("GROCERYPRODUCT")
# Get view from INVOICEITEMS item table.
invoiceitems_view = catalog.get_view("INVOICEITEMS")

Run embedding UDF on the ProductGroup column¶

In [5]:

            
                Copied!
                
# Apply the embedding UDF to the ProductGroup column and store the result
# as a new view column named ProductGroupEmbedding.
groceryproduct_view["ProductGroupEmbedding"] = embedding_udf(groceryproduct_view["ProductGroup"])
# Apply the embedding UDF to the ProductGroup column and store the result
# as a new view column named ProductGroupEmbedding.
groceryproduct_view["ProductGroupEmbedding"] = embedding_udf(groceryproduct_view["ProductGroup"])

Join views¶

In [6]:

            
                Copied!
                
# Join GROCERYPRODUCT view to INVOICEITEMS view.
# This brings the product columns (including ProductGroupEmbedding) onto the
# invoice items; rsuffix="" requests no suffix on the right-hand column names.
invoiceitems_view = invoiceitems_view.join(groceryproduct_view, rsuffix="")
# Join GROCERYPRODUCT view to INVOICEITEMS view.
# This brings the product columns (including ProductGroupEmbedding) onto the
# invoice items; rsuffix="" requests no suffix on the right-hand column names.
invoiceitems_view = invoiceitems_view.join(groceryproduct_view, rsuffix="")

Create observation table¶

In [7]:

            
                Copied!
                
                    
                    
                
                

        
# Build a 10-row observation table from the invoice items view, keeping only
# the Timestamp and GroceryInvoiceItemGuid columns and renaming them to
# POINT_IN_TIME and GROCERYINVOICEITEMGUID respectively.
observation_table = invoiceitems_view.create_observation_table(
    name="Preview tables with Invoice Items",
    sample_rows=10,
    columns=["Timestamp", "GroceryInvoiceItemGuid"],
    columns_rename_mapping={
        "Timestamp": "POINT_IN_TIME",
        "GroceryInvoiceItemGuid": "GROCERYINVOICEITEMGUID",
    },
)
# Build a 10-row observation table from the invoice items view, keeping only
# the Timestamp and GroceryInvoiceItemGuid columns and renaming them to
# POINT_IN_TIME and GROCERYINVOICEITEMGUID respectively.
observation_table = invoiceitems_view.create_observation_table(
    name="Preview tables with Invoice Items",
    sample_rows=10,
    columns=["Timestamp", "GroceryInvoiceItemGuid"],
    columns_rename_mapping={
        "Timestamp": "POINT_IN_TIME",
        "GroceryInvoiceItemGuid": "GROCERYINVOICEITEMGUID",
    },
)

Done! |████████████████████████████████████████| 100% in 9.7s (0.10%/s)

Create features from embedding column¶

In [8]:

            
                Copied!
                
                    
                    
                
                

        
# Per customer (GroceryCustomerGuid), average the ProductGroupEmbedding column
# over two windows, 14 days and 183 days; feature_names and windows are
# listed in the same order.
customer_avg_product_groups = invoiceitems_view.groupby("GroceryCustomerGuid").aggregate_over(
    "ProductGroupEmbedding",
    method=fb.AggFunc.AVG,
    feature_names=["CUSTOMER_Avg_of_ProductGroup_Embedding_14d", "CUSTOMER_Avg_of_ProductGroup_Embedding_183d"], 
    windows=["14d", "183d"]
)

# Derive a single feature: cosine similarity between the 14d average embedding
# and the 183d average embedding, then give it an explicit name.
customer_avg_product_groups_cosine = customer_avg_product_groups["CUSTOMER_Avg_of_ProductGroup_Embedding_14d"].vec.cosine_similarity(
    customer_avg_product_groups["CUSTOMER_Avg_of_ProductGroup_Embedding_183d"]
)
customer_avg_product_groups_cosine.name = "CUSTOMER_Consistency_of_Avg_of_ProductGroup_Embedding_14d_183d"
# Per customer (GroceryCustomerGuid), average the ProductGroupEmbedding column
# over two windows, 14 days and 183 days; feature_names and windows are
# listed in the same order.
customer_avg_product_groups = invoiceitems_view.groupby("GroceryCustomerGuid").aggregate_over(
    "ProductGroupEmbedding",
    method=fb.AggFunc.AVG,
    feature_names=["CUSTOMER_Avg_of_ProductGroup_Embedding_14d", "CUSTOMER_Avg_of_ProductGroup_Embedding_183d"], 
    windows=["14d", "183d"]
)

# Derive a single feature: cosine similarity between the 14d average embedding
# and the 183d average embedding, then give it an explicit name.
customer_avg_product_groups_cosine = customer_avg_product_groups["CUSTOMER_Avg_of_ProductGroup_Embedding_14d"].vec.cosine_similarity(
    customer_avg_product_groups["CUSTOMER_Avg_of_ProductGroup_Embedding_183d"]
)
customer_avg_product_groups_cosine.name = "CUSTOMER_Consistency_of_Avg_of_ProductGroup_Embedding_14d_183d"

In [9]:

            
                Copied!
                
# Look up the observation table by the name it was created with, convert it
# to a pandas DataFrame, and preview the cosine-similarity feature on those rows.
observation_table = catalog.get_observation_table("Preview tables with Invoice Items")
customer_avg_product_groups_cosine.preview(observation_table.to_pandas())
# Look up the observation table by the name it was created with, convert it
# to a pandas DataFrame, and preview the cosine-similarity feature on those rows.
observation_table = catalog.get_observation_table("Preview tables with Invoice Items")
customer_avg_product_groups_cosine.preview(observation_table.to_pandas())

Downloading table |████████████████████████████████████████| 10/10 [100%] in 0.1

Out[9]:

	POINT_IN_TIME	GROCERYINVOICEITEMGUID	CUSTOMER_Consistency_of_Avg_of_ProductGroup_Embedding_14d_183d
0	2022-02-27 12:19:06	d307efab-fc40-4b16-be88-2d13b70d8903	0.812086
1	2022-03-19 13:17:52	e42054bf-fc35-4248-a279-16dc7ac8efa5	0.852374
2	2022-04-13 19:50:26	b3afcbe1-dd98-41f3-bb79-9133bd316dae	0.965045
3	2022-09-27 12:46:45	48156112-a586-42ee-ab1d-fc53629f438a	0.889810
4	2022-11-04 12:15:04	23563a05-76be-44e7-9fe6-6bb4e4d11d2b	0.944870
5	2023-03-09 12:15:30	57c7d176-f2e3-48d4-94a8-d7f3fbc726a3	0.881372
6	2023-04-20 12:17:35	20779d9e-69c3-4135-a42b-6e7a10819136	0.000000
7	2023-06-12 14:14:32	8717d1fa-6708-4a49-b022-b21f89d5060b	0.880983
8	2023-07-03 13:14:46	4b00f6c0-0913-4608-b0f2-f344ad57481b	0.971262
9	2023-08-09 11:17:46	c851f86c-f52e-4b55-8311-b62f55be6945	0.786081

Save feature and view definition file¶

In [10]:

            
                Copied!
                
customer_avg_product_groups_cosine.save()
customer_avg_product_groups_cosine.save()

Done! |████████████████████████████████████████| 100% in 6.5s (0.16%/s)

In [11]:

            
                Copied!
                
                    
                    
                
                

        
# Add description
customer_avg_product_groups_cosine.update_description(
    "Similarity between average customer's baskets in 14 and 183 days periods"
)
# See feature definition file
customer_avg_product_groups_cosine.definition
# Add description
customer_avg_product_groups_cosine.update_description(
    "Similarity between average customer's baskets in 14 and 183 days periods"
)
# See feature definition file
customer_avg_product_groups_cosine.definition

Out[11]:

# Generated by SDK version: 0.5.1.dev70
from bson import ObjectId
from featurebyte import DimensionTable
from featurebyte import FeatureJobSetting
from featurebyte import ItemTable
from featurebyte import UserDefinedFunction


# dimension_table name: "GROCERYPRODUCT"
dimension_table = DimensionTable.get_by_id(ObjectId("6553b0012c001f76d263e059"))
dimension_view = dimension_table.get_view(
    view_mode="manual", drop_column_names=[], column_cleaning_operations=[]
)
col = dimension_view["ProductGroup"]

# udf_name: embedding, sql_function_name: F_SBERT_EMBEDDING
udf_embedding = UserDefinedFunction.get_by_id(
    ObjectId("6553b511850516ee23c8b734")
)
col_1 = udf_embedding(col)
view = dimension_view.copy()
view["ProductGroupEmbedding"] = col_1

# item_table name: "INVOICEITEMS", event_table name: "GROCERYINVOICE"
item_table = ItemTable.get_by_id(ObjectId("6553b0002c001f76d263e058"))
item_view = item_table.get_view(
    event_suffix=None,
    view_mode="manual",
    drop_column_names=["record_available_at"],
    column_cleaning_operations=[],
    event_drop_column_names=["record_available_at"],
    event_column_cleaning_operations=[],
    event_join_column_names=[
        "Timestamp",
        "GroceryInvoiceGuid",
        "GroceryCustomerGuid",
        "tz_offset",
    ],
)
joined_view = item_view.join(
    view, on="GroceryProductGuid", how="left", rsuffix="", rprefix=""
)
grouped = joined_view.groupby(
    by_keys=["GroceryCustomerGuid"], category=None
).aggregate_over(
    value_column="ProductGroupEmbedding",
    method="avg",
    windows=["14d", "183d"],
    feature_names=[
        "CUSTOMER_Avg_of_ProductGroup_Embedding_14d",
        "CUSTOMER_Avg_of_ProductGroup_Embedding_183d",
    ],
    feature_job_setting=FeatureJobSetting(
        blind_spot="120s", frequency="3600s", time_modulo_frequency="120s"
    ),
    skip_fill_na=True,
)
feat = grouped["CUSTOMER_Avg_of_ProductGroup_Embedding_183d"]
feat_1 = grouped["CUSTOMER_Avg_of_ProductGroup_Embedding_14d"]
feat_2 = feat_1.vec.cosine_similarity(other=feat)
feat_2.name = "CUSTOMER_Consistency_of_Avg_of_ProductGroup_Embedding_14d_183d"
output = feat_2
output.save(_id=ObjectId("6553b52a850516ee23c8b738"))

In [ ]: