# python-featuretools-feature-selection

import featuretools as ft
import pandas as pd

# Define the training EntitySet: one 'obs' entity backed by the `train`
# dataframe, indexed by its 'index' column and time-ordered by 'time'.
es = ft.EntitySet(id='engines').entity_from_dataframe(
    dataframe=train,
    entity_id='obs',
    index='index',
    time_index='time',
)

# Set up the relationship between the two tables: derive an 'engines' entity
# from 'obs', keyed by 'engine_no'.
es.normalize_entity(
    base_entity_id='obs',
    new_entity_id='engines',
    index='engine_no',
)

# Deep feature synthesis (DFS) is the process that generates new features.
# Here it targets the 'engines' entity with the listed aggregation and
# transform primitives, stacking at most one primitive deep.
feature_matrix, feature_names = ft.dfs(
    entityset=es,
    target_entity='engines',
    agg_primitives=['min', 'max', 'mean', 'count', 'sum'],
    trans_primitives=['cum_mean', 'cum_sum'],
    max_depth=1,
    n_jobs=-1,
    verbose=1,
)
# Test data: load the test observations. reset_index() turns the default row
# index into an 'index' column, which is used as the entity index below.
# NOTE: requires `import pandas as pd` at the top of the file.
test = pd.read_csv('../input/test_obs.csv').reset_index()

# Mirror the training EntitySet layout so the same feature definitions apply:
# an 'obs' entity plus an 'engines' entity keyed by 'engine_no'.
test_es = ft.EntitySet(id='test_engines').entity_from_dataframe(
    dataframe=test,
    entity_id='obs',
    index='index',
    time_index='time',
)
test_es.normalize_entity(
    base_entity_id='obs',
    new_entity_id='engines',
    index='engine_no',
)

# The features already exist from the training run, so DFS is not needed
# again; calculate_feature_matrix evaluates that existing list of features
# against the test EntitySet.
test_feature_matrix = ft.calculate_feature_matrix(
    feature_names,
    entityset=test_es,
    n_jobs=-1,
    verbose=1,
)
# Feature selection on the training matrix with a correlation threshold of
# 0.9. NOTE(review): `feature_selection` is defined elsewhere; presumably it
# drops highly correlated columns -- confirm against its definition.
feature_matrix = feature_selection(feature_matrix, correlation_threshold=0.9)

# Restrict the test matrix to exactly the columns that survived selection.
selected_columns = feature_matrix.columns
test_feature_matrix = test_feature_matrix[selected_columns]

# Evaluate the feature matrix (with cross-validation, per the original note);
# `evaluate` is defined elsewhere and returns predictions plus feature
# importances.
preds, fi = evaluate(
    feature_matrix,
    train_labels,
    test_feature_matrix,
    test_labels,
)

# Plot the ten most important features.
norm_fi = plot_feature_importances(fi, 10, color='red')

# You may also be interested in: (machine learning)