11. Derive similarity features from bucketing
Create similarity features¶
In this tutorial we will look into another way of deriving features from other features - creating similarity features.
We will create a feature that compares a customer's purchase pattern across product groups with the purchase pattern of all customers. We will first do bucketing at the customer level and at the overall level, which gives one dictionary per level (product group as key, total cost as value). Then we will compare the two dictionaries using cosine similarity.
In [1]:
Copied!
import featurebyte as fb
# Select the "tutorial" profile; the log output below reports it as the active profile.
fb.use_profile("tutorial")
# Activate the catalog used by this tutorial and keep a handle to it in `catalog`,
# which the following cells use to fetch views and observation tables.
catalog_name = "Grocery Dataset Tutorial"
catalog = fb.Catalog.activate(catalog_name)
import featurebyte as fb
# Select the "tutorial" profile; the log output below reports it as the active profile.
fb.use_profile("tutorial")
# Activate the catalog used by this tutorial and keep a handle to it in `catalog`,
# which the following cells use to fetch views and observation tables.
catalog_name = "Grocery Dataset Tutorial"
catalog = fb.Catalog.activate(catalog_name)
16:43:45 | INFO | Using configuration file at: /Users/viktor/.featurebyte/config.yaml 16:43:45 | INFO | Active profile: tutorial (https://tutorials.featurebyte.com/api/v1) 16:43:45 | INFO | SDK version: 0.6.0.dev121 16:43:45 | INFO | No catalog activated. 16:43:46 | INFO | 10 feature lists, 59 features deployed 16:43:46 | INFO | Using profile: tutorial 16:43:46 | INFO | Using configuration file at: /Users/viktor/.featurebyte/config.yaml 16:43:46 | INFO | Active profile: tutorial (https://tutorials.featurebyte.com/api/v1) 16:43:46 | INFO | SDK version: 0.6.0.dev121 16:43:46 | INFO | No catalog activated. 16:43:46 | INFO | 10 feature lists, 59 features deployed 16:43:47 | INFO | Catalog activated: Grocery Dataset Tutorial
In [2]:
Copied!
# Get a view of the GROCERYPRODUCT dimension table; it provides the ProductGroup
# column once joined to the invoice items.
groceryproduct_view = catalog.get_view("GROCERYPRODUCT")
# Get a view of the INVOICEITEMS item table; it carries the TotalCost of each item.
invoiceitems_view = catalog.get_view("INVOICEITEMS")
# Get a view of the GROCERYPRODUCT dimension table; it provides the ProductGroup
# column once joined to the invoice items.
groceryproduct_view = catalog.get_view("GROCERYPRODUCT")
# Get a view of the INVOICEITEMS item table; it carries the TotalCost of each item.
invoiceitems_view = catalog.get_view("INVOICEITEMS")
Join views¶
In [3]:
Copied!
# Join the GROCERYPRODUCT view to the INVOICEITEMS view and rebind invoiceitems_view
# to the joined result. rsuffix="" passes an empty suffix for the columns coming from
# GROCERYPRODUCT. The generated definition at the end of this tutorial shows the join
# resolved as a left join on GroceryProductGuid.
invoiceitems_view = invoiceitems_view.join(groceryproduct_view, rsuffix="")
# Join the GROCERYPRODUCT view to the INVOICEITEMS view and rebind invoiceitems_view
# to the joined result. rsuffix="" passes an empty suffix for the columns coming from
# GROCERYPRODUCT. The generated definition at the end of this tutorial shows the join
# resolved as a left join on GroceryProductGuid.
invoiceitems_view = invoiceitems_view.join(groceryproduct_view, rsuffix="")
Create distribution features across product groups¶
In [4]:
Copied!
# Group the joined INVOICEITEMS view per customer (GroceryCustomerGuid) and use
# ProductGroup as the category, so each aggregation is split by product group.
invoiceitems_view_by_customer_across_productgroup = invoiceitems_view.groupby(
    by_keys=["GroceryCustomerGuid"],
    category="ProductGroup",
)
# Group the joined INVOICEITEMS view per customer (GroceryCustomerGuid) and use
# ProductGroup as the category, so each aggregation is split by product group.
invoiceitems_view_by_customer_across_productgroup = invoiceitems_view.groupby(
    by_keys=["GroceryCustomerGuid"],
    category="ProductGroup",
)
In [5]:
Copied!
# Bucket each customer's spending: sum the item TotalCost per ProductGroup
# over the past 26 weeks.
# The feature value is a dictionary whose keys are ProductGroups and whose
# values are the customer's summed TotalCost for that group.
feature_group = invoiceitems_view_by_customer_across_productgroup.aggregate_over(
    value_column="TotalCost",
    method=fb.AggFunc.SUM,
    windows=["26w"],
    feature_names=["CUSTOMER_item_TotalCost_across_product_ProductGroups_26w"],
)
# Pull the single named feature out of the feature group.
customer_item_totalcost_across_product_productgroups_26w = feature_group[
    "CUSTOMER_item_TotalCost_across_product_ProductGroups_26w"
]
# Bucket each customer's spending: sum the item TotalCost per ProductGroup
# over the past 26 weeks.
# The feature value is a dictionary whose keys are ProductGroups and whose
# values are the customer's summed TotalCost for that group.
feature_group = invoiceitems_view_by_customer_across_productgroup.aggregate_over(
    value_column="TotalCost",
    method=fb.AggFunc.SUM,
    windows=["26w"],
    feature_names=["CUSTOMER_item_TotalCost_across_product_ProductGroups_26w"],
)
# Pull the single named feature out of the feature group.
customer_item_totalcost_across_product_productgroups_26w = feature_group[
    "CUSTOMER_item_TotalCost_across_product_ProductGroups_26w"
]
In [6]:
Copied!
# Group the joined INVOICEITEMS view with no entity key (an empty key list),
# again using ProductGroup as the category, to describe all customers together.
invoiceitems_view_by_overall_across_productgroup = invoiceitems_view.groupby(
    by_keys=[],
    category="ProductGroup",
)
# Group the joined INVOICEITEMS view with no entity key (an empty key list),
# again using ProductGroup as the category, to describe all customers together.
invoiceitems_view_by_overall_across_productgroup = invoiceitems_view.groupby(
    by_keys=[],
    category="ProductGroup",
)
In [7]:
Copied!
# Bucket the spending of ALL customers: sum the item TotalCost per ProductGroup
# over the past 26 weeks.
# The feature value is a dictionary whose keys are ProductGroups and whose
# values are the summed TotalCost for that group.
feature_group = invoiceitems_view_by_overall_across_productgroup.aggregate_over(
    value_column="TotalCost",
    method=fb.AggFunc.SUM,
    windows=["26w"],
    feature_names=["OVERALL_item_TotalCost_across_product_ProductGroups_26w"],
)
# Pull the single named feature out of the feature group.
overall_item_totalcost_across_product_productgroups_26w = feature_group[
    "OVERALL_item_TotalCost_across_product_ProductGroups_26w"
]
# Bucket the spending of ALL customers: sum the item TotalCost per ProductGroup
# over the past 26 weeks.
# The feature value is a dictionary whose keys are ProductGroups and whose
# values are the summed TotalCost for that group.
feature_group = invoiceitems_view_by_overall_across_productgroup.aggregate_over(
    value_column="TotalCost",
    method=fb.AggFunc.SUM,
    windows=["26w"],
    feature_names=["OVERALL_item_TotalCost_across_product_ProductGroups_26w"],
)
# Pull the single named feature out of the feature group.
overall_item_totalcost_across_product_productgroups_26w = feature_group[
    "OVERALL_item_TotalCost_across_product_ProductGroups_26w"
]
Derive Similarity feature across entities¶
In [8]:
Copied!
# Compare each customer's spending mix with the overall one: cosine similarity
# between the customer dictionary
# (CUSTOMER_item_TotalCost_across_product_ProductGroups_26w) and the
# all-customers dictionary
# (OVERALL_item_TotalCost_across_product_ProductGroups_26w).
customer_vs_overall_item_totalcost_across_product_productgroups_26w = (
    customer_item_totalcost_across_product_productgroups_26w.cd.cosine_similarity(
        other=overall_item_totalcost_across_product_productgroups_26w
    )
)
# Name the derived feature.
customer_vs_overall_item_totalcost_across_product_productgroups_26w.name = (
    "CUSTOMER_vs_OVERALL_item_TotalCost_across_product_ProductGroups_26w"
)
# Compare each customer's spending mix with the overall one: cosine similarity
# between the customer dictionary
# (CUSTOMER_item_TotalCost_across_product_ProductGroups_26w) and the
# all-customers dictionary
# (OVERALL_item_TotalCost_across_product_ProductGroups_26w).
customer_vs_overall_item_totalcost_across_product_productgroups_26w = (
    customer_item_totalcost_across_product_productgroups_26w.cd.cosine_similarity(
        other=overall_item_totalcost_across_product_productgroups_26w
    )
)
# Name the derived feature.
customer_vs_overall_item_totalcost_across_product_productgroups_26w.name = (
    "CUSTOMER_vs_OVERALL_item_TotalCost_across_product_ProductGroups_26w"
)
Preview feature¶
In [9]:
Copied!
# Fetch the observation table named 'Preview Table with 10 Customers' from the
# catalog and download it as a pandas DataFrame for use in the previews below.
preview_table = (
    catalog.get_observation_table("Preview Table with 10 Customers")
    .to_pandas()
)
# Fetch the observation table named 'Preview Table with 10 Customers' from the
# catalog and download it as a pandas DataFrame for use in the previews below.
preview_table = (
    catalog.get_observation_table("Preview Table with 10 Customers")
    .to_pandas()
)
Downloading table |████████████████████████████████████████| 10/10 [100%] in 0.1
In [10]:
Copied!
# Preview CUSTOMER_item_TotalCost_across_product_ProductGroups_26w for the rows of
# preview_table. Each value is a dictionary keyed by ProductGroup (see Out[10]).
customer_item_totalcost_across_product_productgroups_26w.preview(
preview_table
)
# Preview CUSTOMER_item_TotalCost_across_product_ProductGroups_26w for the rows of
# preview_table. Each value is a dictionary keyed by ProductGroup (see Out[10]).
customer_item_totalcost_across_product_productgroups_26w.preview(
preview_table
)
Out[10]:
POINT_IN_TIME | GROCERYCUSTOMERGUID | CUSTOMER_item_TotalCost_across_product_ProductGroups_26w | |
---|---|---|---|
0 | 2022-11-28 11:36:31 | d4559f7d-eb28-42c6-b47d-847de24952c2 | {\n "Chips et Tortillas": 2.240000000000000e+... |
1 | 2022-10-09 15:47:55 | 3f8c7c4c-f2c2-408e-a08e-622de3d3a0b9 | {\n "Chips et Tortillas": 8.430000000000000e+... |
2 | 2022-09-14 15:42:42 | 35390325-8443-43c1-a934-18db923d9a47 | {\n "Adoucissants et Soin du linge": 4.390000... |
3 | 2022-12-26 18:39:46 | 4eb4ee84-ee13-4eec-9c26-61b6eb4ba35b | {\n "Animalerie, Soins et Hygiène": 5.4819999... |
4 | 2022-12-06 08:47:43 | e42fa5f3-7737-4c6a-9ef4-856f113e60bd | {\n "Aide à la Pâtisserie": 2.980000000000000... |
5 | 2022-11-09 12:14:40 | 8440debb-6abc-4adc-8c6c-749928141fd0 | {\n "Aide à la Pâtisserie": 2.380000000000000... |
6 | 2022-10-12 17:32:15 | 8a54e527-e9a4-47a9-a28f-8b3c6ecc02db | {\n "Adoucissants et Soin du linge": 2.000000... |
7 | 2023-01-01 11:51:28 | cea213d4-36e4-48c3-ae8d-c7a25911e11c | {\n "Adoucissants et Soin du linge": 1.067000... |
8 | 2023-02-05 15:48:23 | 3b4f2821-b761-40e9-a32a-5f09685cc597 | {\n "Aide à la Pâtisserie": 5.180000000000000... |
9 | 2023-03-10 16:15:46 | 91a64566-e212-4e36-8f23-c1f1f324a301 | {\n "Adoucissants et Soin du linge": 1.250000... |
In [11]:
Copied!
# Preview OVERALL_item_TotalCost_across_product_ProductGroups_26w for the rows of
# preview_table. Each value is a dictionary keyed by ProductGroup (see Out[11]).
overall_item_totalcost_across_product_productgroups_26w.preview(
preview_table
)
# Preview OVERALL_item_TotalCost_across_product_ProductGroups_26w for the rows of
# preview_table. Each value is a dictionary keyed by ProductGroup (see Out[11]).
overall_item_totalcost_across_product_productgroups_26w.preview(
preview_table
)
Out[11]:
POINT_IN_TIME | GROCERYCUSTOMERGUID | OVERALL_item_TotalCost_across_product_ProductGroups_26w | |
---|---|---|---|
0 | 2022-11-28 11:36:31 | d4559f7d-eb28-42c6-b47d-847de24952c2 | {\n "Adoucissants et Soin du linge": 1.120220... |
1 | 2022-10-09 15:47:55 | 3f8c7c4c-f2c2-408e-a08e-622de3d3a0b9 | {\n "Adoucissants et Soin du linge": 1.108720... |
2 | 2022-09-14 15:42:42 | 35390325-8443-43c1-a934-18db923d9a47 | {\n "Adoucissants et Soin du linge": 1.095090... |
3 | 2022-12-26 18:39:46 | 4eb4ee84-ee13-4eec-9c26-61b6eb4ba35b | {\n "Adoucissants et Soin du linge": 1.181730... |
4 | 2022-12-06 08:47:43 | e42fa5f3-7737-4c6a-9ef4-856f113e60bd | {\n "Adoucissants et Soin du linge": 1.121930... |
5 | 2022-11-09 12:14:40 | 8440debb-6abc-4adc-8c6c-749928141fd0 | {\n "Adoucissants et Soin du linge": 1.145870... |
6 | 2022-10-12 17:32:15 | 8a54e527-e9a4-47a9-a28f-8b3c6ecc02db | {\n "Adoucissants et Soin du linge": 1.141140... |
7 | 2023-01-01 11:51:28 | cea213d4-36e4-48c3-ae8d-c7a25911e11c | {\n "Adoucissants et Soin du linge": 1.217740... |
8 | 2023-02-05 15:48:23 | 3b4f2821-b761-40e9-a32a-5f09685cc597 | {\n "Adoucissants et Soin du linge": 1.209600... |
9 | 2023-03-10 16:15:46 | 91a64566-e212-4e36-8f23-c1f1f324a301 | {\n "Adoucissants et Soin du linge": 1.231790... |
In [12]:
Copied!
# Preview CUSTOMER_vs_OVERALL_item_TotalCost_across_product_ProductGroups_26w for the
# rows of preview_table. Each value is a single similarity score (see Out[12]).
customer_vs_overall_item_totalcost_across_product_productgroups_26w.preview(
preview_table
)
# Preview CUSTOMER_vs_OVERALL_item_TotalCost_across_product_ProductGroups_26w for the
# rows of preview_table. Each value is a single similarity score (see Out[12]).
customer_vs_overall_item_totalcost_across_product_productgroups_26w.preview(
preview_table
)
Out[12]:
POINT_IN_TIME | GROCERYCUSTOMERGUID | CUSTOMER_vs_OVERALL_item_TotalCost_across_product_ProductGroups_26w | |
---|---|---|---|
0 | 2022-11-28 11:36:31 | d4559f7d-eb28-42c6-b47d-847de24952c2 | 0.466220 |
1 | 2022-10-09 15:47:55 | 3f8c7c4c-f2c2-408e-a08e-622de3d3a0b9 | 0.558564 |
2 | 2022-09-14 15:42:42 | 35390325-8443-43c1-a934-18db923d9a47 | 0.872762 |
3 | 2022-12-26 18:39:46 | 4eb4ee84-ee13-4eec-9c26-61b6eb4ba35b | 0.713610 |
4 | 2022-12-06 08:47:43 | e42fa5f3-7737-4c6a-9ef4-856f113e60bd | 0.657608 |
5 | 2022-11-09 12:14:40 | 8440debb-6abc-4adc-8c6c-749928141fd0 | 0.509610 |
6 | 2022-10-12 17:32:15 | 8a54e527-e9a4-47a9-a28f-8b3c6ecc02db | 0.753250 |
7 | 2023-01-01 11:51:28 | cea213d4-36e4-48c3-ae8d-c7a25911e11c | 0.561260 |
8 | 2023-02-05 15:48:23 | 3b4f2821-b761-40e9-a32a-5f09685cc597 | 0.761539 |
9 | 2023-03-10 16:15:46 | 91a64566-e212-4e36-8f23-c1f1f324a301 | 0.729546 |
Save feature¶
In [13]:
Copied!
# Save the derived similarity feature. It is the only feature on which this
# tutorial calls save() explicitly.
customer_vs_overall_item_totalcost_across_product_productgroups_26w.save()
# Save the derived similarity feature. It is the only feature on which this
# tutorial calls save() explicitly.
customer_vs_overall_item_totalcost_across_product_productgroups_26w.save()
Done! |████████████████████████████████████████| 100% in 6.5s (0.16%/s)
As always, add a description and view the feature definition file¶
In [14]:
Copied!
# Add a description to the saved feature. The adjacent string literals are
# concatenated by Python into a single sentence.
customer_vs_overall_item_totalcost_across_product_productgroups_26w.update_description(
"Similarity between the customer and all customers measured by the "
"Cosine Similarity between the Distribution representing the cumulative"
" TotalCost of item, categorized by their respective product's "
"ProductGroup, over 26w for both entities."
)
# Show the feature definition file: as the last expression of the cell, the
# SDK-generated definition code is displayed as the cell output (Out[14]).
customer_vs_overall_item_totalcost_across_product_productgroups_26w.definition
# Add a description to the saved feature. The adjacent string literals are
# concatenated by Python into a single sentence.
customer_vs_overall_item_totalcost_across_product_productgroups_26w.update_description(
"Similarity between the customer and all customers measured by the "
"Cosine Similarity between the Distribution representing the cumulative"
" TotalCost of item, categorized by their respective product's "
"ProductGroup, over 26w for both entities."
)
# Show the feature definition file: as the last expression of the cell, the
# SDK-generated definition code is displayed as the cell output (Out[14]).
customer_vs_overall_item_totalcost_across_product_productgroups_26w.definition
Out[14]:
# Generated by SDK version: 0.6.0.dev121
from bson import ObjectId
from featurebyte import DimensionTable
from featurebyte import FeatureJobSetting
from featurebyte import ItemTable
# item_table name: "INVOICEITEMS", event_table name: "GROCERYINVOICE"
item_table = ItemTable.get_by_id(ObjectId("6564b7ecbeba6c193e0fe3bd"))
item_view = item_table.get_view(
event_suffix=None,
view_mode="manual",
drop_column_names=["record_available_at"],
column_cleaning_operations=[],
event_drop_column_names=["record_available_at"],
event_column_cleaning_operations=[],
event_join_column_names=[
"Timestamp",
"GroceryInvoiceGuid",
"GroceryCustomerGuid",
"tz_offset",
],
)
# dimension_table name: "GROCERYPRODUCT"
dimension_table = DimensionTable.get_by_id(ObjectId("6564b7edbeba6c193e0fe3be"))
dimension_view = dimension_table.get_view(
view_mode="manual", drop_column_names=[], column_cleaning_operations=[]
)
joined_view = item_view.join(
dimension_view, on="GroceryProductGuid", how="left", rsuffix="", rprefix=""
)
grouped = joined_view.groupby(
by_keys=[], category="ProductGroup"
).aggregate_over(
value_column="TotalCost",
method="sum",
windows=["26w"],
feature_names=["OVERALL_item_TotalCost_across_product_ProductGroups_26w"],
feature_job_setting=FeatureJobSetting(
blind_spot="120s", frequency="3600s", time_modulo_frequency="120s"
),
skip_fill_na=True,
)
feat = grouped["OVERALL_item_TotalCost_across_product_ProductGroups_26w"]
grouped_1 = joined_view.groupby(
by_keys=["GroceryCustomerGuid"], category="ProductGroup"
).aggregate_over(
value_column="TotalCost",
method="sum",
windows=["26w"],
feature_names=["CUSTOMER_item_TotalCost_across_product_ProductGroups_26w"],
feature_job_setting=FeatureJobSetting(
blind_spot="120s", frequency="3600s", time_modulo_frequency="120s"
),
skip_fill_na=True,
)
feat_1 = grouped_1["CUSTOMER_item_TotalCost_across_product_ProductGroups_26w"]
feat_2 = feat_1.cd.cosine_similarity(other=feat)
feat_2.name = (
"CUSTOMER_vs_OVERALL_item_TotalCost_across_product_ProductGroups_26w"
)
output = feat_2
output.save(_id=ObjectId("6564b93614db530858940f50"))
In [ ]:
Copied!