10. Derive features from other features
Derive features from other features¶
Features can also be created based on other features.
In this tutorial we will create a feature that measures the Z-Score of the Latest invoice Amount for the customer compared to that customer's invoices over a 28d period, i.e. (latest invoice amount - 28d average invoice amount) / 28d standard deviation of invoice amount. This feature is derived from three features we created previously: CUSTOMER_Latest_invoice_Amount, CUSTOMER_Avg_of_invoice_Amount_28d and CUSTOMER_Std_of_invoice_Amount_28d.
In [1]:
Copied!
import featurebyte as fb
# Set your profile to the tutorial environment
fb.use_profile("tutorial")
catalog_name = "Grocery Dataset Tutorial"
catalog = fb.Catalog.activate(catalog_name)
import featurebyte as fb
# Set your profile to the tutorial environment
fb.use_profile("tutorial")
catalog_name = "Grocery Dataset Tutorial"
catalog = fb.Catalog.activate(catalog_name)
22:00:49 | INFO | Using configuration file at: /Users/gxav/.featurebyte/config.yaml 22:00:49 | INFO | Active profile: tutorial (https://tutorials.featurebyte.com/api/v1) 22:00:49 | WARNING | Remote SDK version (0.5.0.dev6) is different from local (0.5.0.dev1). Update local SDK to avoid unexpected behavior. 22:00:49 | INFO | No catalog activated. 22:00:49 | INFO | 6 feature lists, 31 features deployed 22:00:49 | INFO | Using profile: tutorial 22:00:50 | INFO | Using configuration file at: /Users/gxav/.featurebyte/config.yaml 22:00:50 | INFO | Active profile: tutorial (https://tutorials.featurebyte.com/api/v1) 22:00:50 | WARNING | Remote SDK version (0.5.0.dev6) is different from local (0.5.0.dev1). Update local SDK to avoid unexpected behavior. 22:00:50 | INFO | No catalog activated. 22:00:51 | INFO | 6 feature lists, 31 features deployed 22:00:51 | INFO | Catalog activated: Grocery Dataset Tutorial
Get features we previously created and saved¶
In [2]:
Copied!
customer_latest_invoice_amount = catalog.get_feature("CUSTOMER_Latest_invoice_Amount")
customer_avg_of_invoice_amount_28d = catalog.get_feature("CUSTOMER_Avg_of_invoice_Amount_28d")
customer_std_of_invoice_amount_28d = catalog.get_feature("CUSTOMER_Std_of_invoice_Amount_28d")
customer_latest_invoice_amount = catalog.get_feature("CUSTOMER_Latest_invoice_Amount")
customer_avg_of_invoice_amount_28d = catalog.get_feature("CUSTOMER_Avg_of_invoice_Amount_28d")
customer_std_of_invoice_amount_28d = catalog.get_feature("CUSTOMER_Std_of_invoice_Amount_28d")
Derive a z-score feature from saved features¶
In [3]:
Copied!
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d = (
customer_latest_invoice_amount
- customer_avg_of_invoice_amount_28d
) / customer_std_of_invoice_amount_28d
# Give a name to the new feature
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d.name = \
"CUSTOMER_Latest_invoice_Amount_Z_Score_to_invoice_Amount_28d"
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d = (
customer_latest_invoice_amount
- customer_avg_of_invoice_amount_28d
) / customer_std_of_invoice_amount_28d
# Give a name to the new feature
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d.name = \
"CUSTOMER_Latest_invoice_Amount_Z_Score_to_invoice_Amount_28d"
Preview feature¶
In [4]:
Copied!
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d.primary_entity
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d.primary_entity
Out[4]:
[<featurebyte.api.entity.Entity at 0x7f87a16c5d40> { 'name': 'customer', 'created_at': '2023-09-11T13:56:58.863000', 'updated_at': '2023-09-11T13:57:16.943000', 'description': None, 'serving_names': [ 'GROCERYCUSTOMERGUID' ], 'catalog_name': 'Grocery Dataset Tutorial' }]
In [5]:
Copied!
# Get observation table: 'Preview Table with 10 Customers'
preview_table = catalog.get_observation_table("Preview Table with 10 Customers").to_pandas()
# Get observation table: 'Preview Table with 10 Customers'
preview_table = catalog.get_observation_table("Preview Table with 10 Customers").to_pandas()
Downloading table |████████████████████████████████████████| 10/10 [100%] in 0.1
In [6]:
Copied!
# Preview CUSTOMER_Latest_invoice_Amount_Z_Score_to_invoice_Amount_28d
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d.preview(preview_table)
# Preview CUSTOMER_Latest_invoice_Amount_Z_Score_to_invoice_Amount_28d
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d.preview(preview_table)
Out[6]:
POINT_IN_TIME | GROCERYCUSTOMERGUID | CUSTOMER_Latest_invoice_Amount_Z_Score_to_invoice_Amount_28d | |
---|---|---|---|
0 | 2022-07-21 12:50:21 | dd0c74f0-9ba8-4ca9-bd84-8148095aa38a | 1.696160 |
1 | 2022-11-10 15:57:20 | 0401635c-e6ab-4525-bb5d-00aba7f6d0c4 | 1.706156 |
2 | 2022-08-14 19:00:14 | 54d86ef6-f9b8-40e2-9162-a60bd1b705db | 0.931491 |
3 | 2022-07-12 08:02:04 | 4eb4ee84-ee13-4eec-9c26-61b6eb4ba35b | 1.567570 |
4 | 2022-12-13 08:15:49 | 1e866814-e5a6-475d-87e3-b53377cc005b | -0.367178 |
5 | 2023-04-26 16:52:34 | 48072b52-39cf-452c-8531-02cc4d0fc32e | -0.521815 |
6 | 2023-03-01 11:31:00 | 081f111a-598b-43ae-a28a-3a5dc3d2a091 | -0.341443 |
7 | 2023-01-19 16:33:33 | f3415165-754c-40b6-af17-06ef952a3fa1 | -0.585856 |
8 | 2023-04-11 19:07:26 | d0ea14bf-038a-4ae5-887e-e2d4d68dd8f6 | -0.463131 |
9 | 2023-04-10 08:24:27 | 69d8718e-8c4a-4264-8edf-e0ffc1ef4737 | 1.813072 |
Save feature¶
In [7]:
Copied!
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d.save()
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d.save()
Done! |████████████████████████████████████████| 100% in 6.9s (0.15%/s)
Add description and view definition file¶
In [8]:
Copied!
# Add description
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d.update_description(
"Z-Score of the Latest invoice Amount for the customer compared to customer invoices over a 28d period"
)
# See feature definition file
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d.definition
# Add description
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d.update_description(
"Z-Score of the Latest invoice Amount for the customer compared to customer invoices over a 28d period"
)
# See feature definition file
customer_latest_invoice_amount_Z_score_to_invoice_amount_28d.definition
Out[8]:
# Generated by SDK version: 0.5.0.dev6
from bson import ObjectId
from featurebyte import ColumnCleaningOperation
from featurebyte import DisguisedValueImputation
from featurebyte import EventTable
from featurebyte import FeatureJobSetting
from featurebyte import ValueBeyondEndpointImputation
# event_table name: "GROCERYINVOICE"
event_table = EventTable.get_by_id(ObjectId("64ff1c910d5bfbfb21bce78a"))
event_view = event_table.get_view(
view_mode="manual",
drop_column_names=["record_available_at"],
column_cleaning_operations=[
ColumnCleaningOperation(
column_name="Amount",
cleaning_operations=[
DisguisedValueImputation(
imputed_value=None, disguised_values=[-99, -98]
),
ValueBeyondEndpointImputation(
type="less_than", end_point=0, imputed_value=0
),
ValueBeyondEndpointImputation(
type="greater_than", end_point=2000, imputed_value=2000
),
],
)
],
)
grouped = event_view.groupby(
by_keys=["GroceryCustomerGuid"], category=None
).aggregate_over(
value_column="Amount",
method="std",
windows=["28d"],
feature_names=["CUSTOMER_Std_of_invoice_Amount_28d"],
feature_job_setting=FeatureJobSetting(
blind_spot="120s", frequency="3600s", time_modulo_frequency="120s"
),
skip_fill_na=True,
)
feat = grouped["CUSTOMER_Std_of_invoice_Amount_28d"]
grouped_1 = event_view.groupby(
by_keys=["GroceryCustomerGuid"], category=None
).aggregate_over(
value_column="Amount",
method="latest",
windows=[None],
feature_names=["CUSTOMER_Latest_invoice_Amount"],
feature_job_setting=FeatureJobSetting(
blind_spot="120s", frequency="3600s", time_modulo_frequency="120s"
),
skip_fill_na=True,
)
feat_1 = grouped_1["CUSTOMER_Latest_invoice_Amount"]
grouped_2 = event_view.groupby(
by_keys=["GroceryCustomerGuid"], category=None
).aggregate_over(
value_column="Amount",
method="avg",
windows=["28d"],
feature_names=["CUSTOMER_Avg_of_invoice_Amount_28d"],
feature_job_setting=FeatureJobSetting(
blind_spot="120s", frequency="3600s", time_modulo_frequency="120s"
),
skip_fill_na=True,
)
feat_2 = grouped_2["CUSTOMER_Avg_of_invoice_Amount_28d"]
feat_3 = (feat_1 - feat_2) / feat
feat_3.name = "CUSTOMER_Latest_invoice_Amount_Z_Score_to_invoice_Amount_28d"
output = feat_3
output.save(_id=ObjectId("64ff1d9a2fa89ef7c7f4f5b8"))
In [ ]:
Copied!