10. Derive Features from other Features
Derive feature from other features¶
Features can also be created based on other features.
In this tutorial we will create a feature that measures the Z-Score of the Latest invoice Amount for a customer relative to that customer's invoices over a 28d period, i.e. $z = (\text{latest} - \text{avg}_{28d}) / \text{std}_{28d}$. This feature is derived from three features we created previously: CUSTOMER_Latest_invoice_Amount, CUSTOMER_Avg_of_invoice_Amount_28d and CUSTOMER_Std_of_invoice_Amount_28d.
In [1]:
Copied!
import featurebyte as fb

# Set your profile to the tutorial environment
fb.use_profile("tutorial")

# Activate the catalog that holds the previously saved tutorial features
catalog_name = "Grocery Dataset Tutorial"
catalog = fb.Catalog.activate(catalog_name)
16:08:27 | WARNING | Service endpoint is inaccessible: http://featurebyte-server:8088
16:08:27 | INFO | Using profile: tutorial
16:08:27 | INFO | Using configuration file at: /Users/gxav/.featurebyte/config.yaml
16:08:27 | INFO | Active profile: tutorial (https://tutorials.featurebyte.com/api/v1)
16:08:27 | WARNING | Remote SDK version (1.1.0.dev7) is different from local (1.1.0.dev1). Update local SDK to avoid unexpected behavior.
16:08:27 | INFO | No catalog activated.
16:08:27 | INFO | Catalog activated: Grocery Dataset Tutorial
Get features we previously created and saved¶
In [2]:
Copied!
# Retrieve the three saved features that the Z-Score is derived from
customer_latest_invoice_amount = catalog.get_feature("CUSTOMER_Latest_invoice_Amount")
customer_avg_of_invoice_amount_28d = catalog.get_feature("CUSTOMER_Avg_of_invoice_Amount_28d")
customer_std_of_invoice_amount_28d = catalog.get_feature("CUSTOMER_Std_of_invoice_Amount_28d")
Derive a z-score feature from saved features¶
In [3]:
Copied!
# Z-Score: distance of the latest invoice amount from the customer's 28d
# average, expressed in units of the customer's 28d standard deviation
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d = (
    customer_latest_invoice_amount - customer_avg_of_invoice_amount_28d
) / customer_std_of_invoice_amount_28d

# Give a name to new feature
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d.name = (
    "CUSTOMER_Latest_invoice_Amount_Z_Score_to_invoice_Amount_28d"
)
Preview feature¶
In [4]:
Copied!
# Display the primary entity of the derived feature
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d.primary_entity
Out[4]:
[<featurebyte.api.entity.Entity at 0x175ed1900> { 'name': 'customer', 'created_at': '2024-06-12T08:05:47.417000', 'updated_at': '2024-06-12T08:05:50.497000', 'description': None, 'serving_names': [ 'GROCERYCUSTOMERGUID' ], 'catalog_name': 'Grocery Dataset Tutorial' }]
In [5]:
Copied!
# Get observation table: 'Preview Table with 10 items'
preview_table = catalog.get_observation_table("Preview Table with 10 items")
In [6]:
Copied!
# Preview CUSTOMER_Latest_invoice_Amount_Z_Score_to_invoice_Amount_28d
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d.preview(preview_table)
Out[6]:
| | POINT_IN_TIME | GROCERYINVOICEITEMGUID | CUSTOMER_Latest_invoice_Amount_Z_Score_to_invoice_Amount_28d |
---|---|---|---|
0 | 2023-02-07 11:04:26 | fd1caae1-77e6-4667-8c83-df13f05bf2f5 | -0.218570 |
1 | 2023-05-28 19:27:14 | 15973b2f-2256-4caa-b65b-cbbfdff0905b | NaN |
2 | 2023-03-31 18:50:00 | 213ef7d3-c27b-43e0-bc0a-57d6c7c254b0 | NaN |
3 | 2022-09-18 18:52:36 | ac7edfb5-63ed-49fb-9b89-76b0288ed2f8 | 2.357059 |
4 | 2022-12-26 15:01:07 | 264f79fd-c24a-47cc-8a68-fe3753a4d74b | -0.153150 |
5 | 2023-04-11 17:23:57 | 6084f39f-9d2c-4111-b1cc-502e1559c0c0 | 1.000000 |
6 | 2022-08-17 19:13:52 | 40a07ca4-a991-4d21-b5cf-74ee61220f96 | 1.658670 |
7 | 2022-12-10 21:08:26 | 77d02174-f1e1-41c1-9fb9-01c6246b0009 | NaN |
8 | 2023-05-05 08:00:42 | 57ca0770-eb8b-4769-8e67-eb1b7cc0a934 | -0.508099 |
9 | 2023-03-17 11:15:09 | 1b627a25-7eb4-4f61-b243-c93db487bff0 | -0.564107 |
Save feature¶
In [7]:
Copied!
# Save the derived feature to the active catalog (call once per new feature)
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d.save()
Done! |████████████████████████████████████████| 100% in 6.1s (0.17%/s)
Add description and view definition file¶
In [8]:
Copied!
# Add description
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d.update_description(
    "Z-Score of the Latest invoice Amount for the customer compared to customer invoices over a 28d period"
)

# See feature definition file
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d.definition
Out[8]:
# Generated by SDK version: 1.1.0.dev7
from bson import ObjectId
from featurebyte import ColumnCleaningOperation
from featurebyte import DisguisedValueImputation
from featurebyte import EventTable
from featurebyte import FeatureJobSetting
from featurebyte import ValueBeyondEndpointImputation
# event_table name: "GROCERYINVOICE"
event_table = EventTable.get_by_id(ObjectId("666956c38080c62d0dc616e0"))
event_view = event_table.get_view(
    view_mode="manual",
    drop_column_names=["record_available_at"],
    column_cleaning_operations=[
        ColumnCleaningOperation(
            column_name="Amount",
            cleaning_operations=[
                DisguisedValueImputation(
                    imputed_value=None, disguised_values=[-99.0, -98.0]
                ),
                ValueBeyondEndpointImputation(
                    type="less_than", end_point=0.0, imputed_value=0.0
                ),
                ValueBeyondEndpointImputation(
                    type="greater_than", end_point=2000.0, imputed_value=2000.0
                ),
            ],
        )
    ],
)
grouped = event_view.groupby(
    by_keys=["GroceryCustomerGuid"], category=None
).aggregate_over(
    value_column="Amount",
    method="avg",
    windows=["28d"],
    feature_names=["CUSTOMER_Avg_of_invoice_Amount_28d"],
    feature_job_setting=FeatureJobSetting(
        blind_spot="120s", period="3600s", offset="120s"
    ),
    skip_fill_na=True,
    offset=None,
)
feat = grouped["CUSTOMER_Avg_of_invoice_Amount_28d"]
grouped_1 = event_view.groupby(
    by_keys=["GroceryCustomerGuid"], category=None
).aggregate_over(
    value_column="Amount",
    method="latest",
    windows=[None],
    feature_names=["CUSTOMER_Latest_invoice_Amount"],
    feature_job_setting=FeatureJobSetting(
        blind_spot="120s", period="3600s", offset="120s"
    ),
    skip_fill_na=True,
    offset=None,
)
feat_1 = grouped_1["CUSTOMER_Latest_invoice_Amount"]
grouped_2 = event_view.groupby(
    by_keys=["GroceryCustomerGuid"], category=None
).aggregate_over(
    value_column="Amount",
    method="std",
    windows=["28d"],
    feature_names=["CUSTOMER_Std_of_invoice_Amount_28d"],
    feature_job_setting=FeatureJobSetting(
        blind_spot="120s", period="3600s", offset="120s"
    ),
    skip_fill_na=True,
    offset=None,
)
feat_2 = grouped_2["CUSTOMER_Std_of_invoice_Amount_28d"]
feat_3 = (feat_1 - feat) / feat_2
feat_3.name = "CUSTOMER_Latest_invoice_Amount_Z_Score_to_invoice_Amount_28d"
output = feat_3
output.save(_id=ObjectId("6669577ca1b61f71af4710cd"))