10. Derive features from other features
Derive features from other features¶
Features can also be created based on other features.
In this tutorial, we will create a feature that measures the Z-Score of a customer's latest invoice Amount relative to that customer's invoice Amounts over a 28d period, i.e. (latest Amount - 28d average) / 28d standard deviation. This feature is derived from three features we created previously: CUSTOMER_Latest_invoice_Amount, CUSTOMER_Avg_of_invoice_Amount_28d and CUSTOMER_Std_of_invoice_Amount_28d.
In [1]:
Copied!
# Import the FeatureByte SDK
import featurebyte as fb
# Set your profile to the tutorial environment
fb.use_profile("tutorial")
# Activate the tutorial catalog; the returned `catalog` object is used by
# the following cells to look up saved features and observation tables
catalog_name = "Grocery Dataset Tutorial"
catalog = fb.Catalog.activate(catalog_name)
# Import the FeatureByte SDK
import featurebyte as fb
# Set your profile to the tutorial environment
fb.use_profile("tutorial")
# Activate the tutorial catalog; the returned `catalog` object is used by
# the following cells to look up saved features and observation tables
catalog_name = "Grocery Dataset Tutorial"
catalog = fb.Catalog.activate(catalog_name)
15:31:33 | INFO | SDK version: 1.0.2.dev46 15:31:33 | INFO | No catalog activated. 15:31:33 | INFO | Using profile: tutorial 15:31:33 | INFO | Using configuration file at: /Users/gxav/.featurebyte/config.yaml 15:31:33 | INFO | Active profile: tutorial (https://tutorials.featurebyte.com/api/v1) 15:31:33 | INFO | SDK version: 1.0.2.dev46 15:31:33 | INFO | No catalog activated. 15:31:33 | INFO | Catalog activated: Grocery Dataset Tutorial
Get features we previously created and saved¶
In [2]:
Copied!
# Look up, by name, the three previously saved features that serve as
# inputs to the z-score: latest amount, 28d average and 28d standard deviation
customer_latest_invoice_amount = catalog.get_feature("CUSTOMER_Latest_invoice_Amount")
customer_avg_of_invoice_amount_28d = catalog.get_feature("CUSTOMER_Avg_of_invoice_Amount_28d")
customer_std_of_invoice_amount_28d = catalog.get_feature("CUSTOMER_Std_of_invoice_Amount_28d")
# Look up, by name, the three previously saved features that serve as
# inputs to the z-score: latest amount, 28d average and 28d standard deviation
customer_latest_invoice_amount = catalog.get_feature("CUSTOMER_Latest_invoice_Amount")
customer_avg_of_invoice_amount_28d = catalog.get_feature("CUSTOMER_Avg_of_invoice_Amount_28d")
customer_std_of_invoice_amount_28d = catalog.get_feature("CUSTOMER_Std_of_invoice_Amount_28d")
Derive a z-score feature from saved features¶
In [3]:
Copied!
# Z-score = (latest amount - 28d average) / 28d standard deviation
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d = (
    customer_latest_invoice_amount - customer_avg_of_invoice_amount_28d
) / customer_std_of_invoice_amount_28d

# The derived feature needs a name of its own
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d.name = (
    "CUSTOMER_Latest_invoice_Amount_Z_Score_to_invoice_Amount_28d"
)
# Z-score = (latest amount - 28d average) / 28d standard deviation
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d = (
    customer_latest_invoice_amount - customer_avg_of_invoice_amount_28d
) / customer_std_of_invoice_amount_28d

# The derived feature needs a name of its own
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d.name = (
    "CUSTOMER_Latest_invoice_Amount_Z_Score_to_invoice_Amount_28d"
)
Preview feature¶
In [4]:
Copied!
# Display the primary entity of the derived feature
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d.primary_entity
# Display the primary entity of the derived feature
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d.primary_entity
Out[4]:
[<featurebyte.api.entity.Entity at 0x13eda6b40> { 'name': 'customer', 'created_at': '2024-04-26T07:28:15.128000', 'updated_at': '2024-04-26T07:28:24.021000', 'description': None, 'serving_names': [ 'GROCERYCUSTOMERGUID' ], 'catalog_name': 'Grocery Dataset Tutorial' }]
In [5]:
Copied!
# Get observation table: 'Preview Table with 10 items'
# (passed to the feature preview in the next cell)
preview_table = catalog.get_observation_table("Preview Table with 10 items")
# Get observation table: 'Preview Table with 10 items'
# (passed to the feature preview in the next cell)
preview_table = catalog.get_observation_table("Preview Table with 10 items")
In [6]:
Copied!
# Preview CUSTOMER_Latest_invoice_Amount_Z_Score_to_invoice_Amount_28d
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d.preview(preview_table)
# Preview CUSTOMER_Latest_invoice_Amount_Z_Score_to_invoice_Amount_28d
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d.preview(preview_table)
Out[6]:
| | POINT_IN_TIME | GROCERYINVOICEITEMGUID | CUSTOMER_Latest_invoice_Amount_Z_Score_to_invoice_Amount_28d |
---|---|---|---|
0 | 2022-11-03 14:39:00 | 9faf5936-d4bb-4709-a530-c1624ec003a5 | 2.039603 |
1 | 2022-10-05 14:30:03 | fc4769cf-41d6-4fb0-9cfc-fe30502cfa18 | -0.153076 |
2 | 2022-08-16 15:37:21 | 6781acc6-652d-4867-a138-2d8adb278886 | -0.849703 |
3 | 2023-03-13 21:11:43 | cfc620a2-0054-4b4b-99d5-040cf87cfe2d | -0.799217 |
4 | 2023-03-19 15:22:18 | dcdb7c98-dd62-4287-b432-bfe3a2317ebc | 0.958202 |
5 | 2022-11-18 10:07:49 | ae2ccf38-4e5d-4c76-b1e5-04f12307e45b | -0.586120 |
6 | 2022-11-09 18:07:20 | 37122b82-478b-4d9e-b236-69629b592c0b | -1.000000 |
7 | 2023-02-15 21:10:22 | 5daf8edb-8625-4653-aaa0-9ac03df92017 | -1.168187 |
8 | 2023-02-27 13:27:47 | b807e05c-ff1c-4fb3-a760-e0e8ce29c859 | -0.512459 |
9 | 2023-04-19 15:25:05 | 8a60f455-aff1-4b4e-8c63-9ab89df2715d | -1.327036 |
Save feature¶
In [7]:
Copied!
# Save the derived feature
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d.save()
# Save the derived feature
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d.save()
Done! |████████████████████████████████████████| 100% in 6.1s (0.16%/s)
Add description and view definition file¶
In [8]:
Copied!
# Add a human-readable description to the feature
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d.update_description(
"Z-Score of the Latest invoice Amount for the customer compared to customer invoices over a 28d period"
)
# See feature definition file (displayed as the cell output)
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d.definition
# Add a human-readable description to the feature
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d.update_description(
"Z-Score of the Latest invoice Amount for the customer compared to customer invoices over a 28d period"
)
# See feature definition file (displayed as the cell output)
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d.definition
Out[8]:
# Generated by SDK version: 1.0.2.dev46
from bson import ObjectId
from featurebyte import ColumnCleaningOperation
from featurebyte import DisguisedValueImputation
from featurebyte import EventTable
from featurebyte import FeatureJobSetting
from featurebyte import ValueBeyondEndpointImputation
# event_table name: "GROCERYINVOICE"
event_table = EventTable.get_by_id(ObjectId("662b5775aa13c89fa14554e1"))
event_view = event_table.get_view(
view_mode="manual",
drop_column_names=["record_available_at"],
column_cleaning_operations=[
ColumnCleaningOperation(
column_name="Amount",
cleaning_operations=[
DisguisedValueImputation(
imputed_value=None, disguised_values=[-99.0, -98.0]
),
ValueBeyondEndpointImputation(
type="less_than", end_point=0.0, imputed_value=0.0
),
ValueBeyondEndpointImputation(
type="greater_than", end_point=2000.0, imputed_value=2000.0
),
],
)
],
)
grouped = event_view.groupby(
by_keys=["GroceryCustomerGuid"], category=None
).aggregate_over(
value_column="Amount",
method="latest",
windows=[None],
feature_names=["CUSTOMER_Latest_invoice_Amount"],
feature_job_setting=FeatureJobSetting(
blind_spot="120s", frequency="3600s", time_modulo_frequency="120s"
),
skip_fill_na=True,
)
feat = grouped["CUSTOMER_Latest_invoice_Amount"]
grouped_1 = event_view.groupby(
by_keys=["GroceryCustomerGuid"], category=None
).aggregate_over(
value_column="Amount",
method="avg",
windows=["28d"],
feature_names=["CUSTOMER_Avg_of_invoice_Amount_28d"],
feature_job_setting=FeatureJobSetting(
blind_spot="120s", frequency="3600s", time_modulo_frequency="120s"
),
skip_fill_na=True,
)
feat_1 = grouped_1["CUSTOMER_Avg_of_invoice_Amount_28d"]
grouped_2 = event_view.groupby(
by_keys=["GroceryCustomerGuid"], category=None
).aggregate_over(
value_column="Amount",
method="std",
windows=["28d"],
feature_names=["CUSTOMER_Std_of_invoice_Amount_28d"],
feature_job_setting=FeatureJobSetting(
blind_spot="120s", frequency="3600s", time_modulo_frequency="120s"
),
skip_fill_na=True,
)
feat_2 = grouped_2["CUSTOMER_Std_of_invoice_Amount_28d"]
feat_3 = (feat - feat_1) / feat_2
feat_3.name = "CUSTOMER_Latest_invoice_Amount_Z_Score_to_invoice_Amount_28d"
output = feat_3
output.save(_id=ObjectId("662b5858869e92ad5091c274"))