# python-featuretools-advanced-featuretools
# (Article title from the original post; kept as a comment so the file parses.)

from sklearn.cluster import KMeans
import featuretools as ft
import featuretools.variable_types as vtypes
from featuretools.primitives import make_agg_primitive
from tsfresh.feature_extraction.feature_calculators import (cid_ce, number_peaks, 
                                                             last_location_of_maximum, 
                                                             skewness, sample_entropy)

# Build the training EntitySet.
# `train` is expected to be a DataFrame of per-observation engine sensor
# readings with at least `index`, `time`, and `engine_no` columns — it is
# defined elsewhere, not in this file (TODO confirm schema against caller).
# NOTE(review): `entity_from_dataframe` / `normalize_entity` are the
# featuretools < 1.0 API; newer releases renamed these — verify the pinned
# featuretools version.
es = ft.EntitySet(id = 'engines')
es = es.entity_from_dataframe(dataframe = train,
                              entity_id='obs', 
                              index = 'index',
                              time_index = 'time')
# Derive the parent entity and the relationship: one row per engine,
# linked to its observations through `engine_no`.
es.normalize_entity(base_entity_id='obs',
                    new_entity_id = 'engines', 
                    index = 'engine_no')
# Mirror the exact same EntitySet structure for the test data so features
# defined on the training set can later be computed on `test` unchanged.
test_es = ft.EntitySet(id = 'test_engines')
test_es = test_es.entity_from_dataframe(dataframe = test,
                                        entity_id='obs', 
                                        index = 'index',
                                        time_index = 'time')
test_es.normalize_entity(base_entity_id='obs',
                         new_entity_id = 'engines', 
                         index = 'engine_no')
# Estimate of time-series complexity: more complex series have more
# peaks and valleys.
def cid_ce_func(x):
    """Return the CID-CE complexity of series ``x`` (normalize=False)."""
    return cid_ce(x, False)
# Peak count.
def number_peaks_func(x):
    """Return the number of peaks of support width 5 in series ``x``."""
    return number_peaks(x, 5)
# Position where the maximum last occurs.
def last_location_of_maximum_func(x):
    """Return the relative last location of the maximum of series ``x``."""
    return last_location_of_maximum(x)
# Skewness.
def skewness_func(x):
    """Return the sample skewness of series ``x``."""
    return skewness(x)
# Entropy.
def sample_entropy_func(x):
    """Return the sample entropy of series ``x``."""
    return sample_entropy(x)

# Wrap each tsfresh calculator as a Featuretools aggregation primitive so
# DFS can apply it when aggregating observations up to the engine level.
# All five take a numeric column and produce a single numeric value.
cid_ce_primitive = make_agg_primitive(cid_ce_func,
                                      input_types = [vtypes.Numeric],
                                      return_type = vtypes.Numeric,
                                      name = 'Complexity')

number_peaks_primitive = make_agg_primitive(number_peaks_func,
                                            input_types = [vtypes.Numeric],
                                            return_type = vtypes.Numeric,
                                            name = 'NumberPeaks')

# NOTE(review): "primititive" is a typo, but the name is referenced again
# below, so it is kept as-is to avoid breaking those references.
last_location_of_maximum_primititive = make_agg_primitive(last_location_of_maximum_func,
                                                          input_types = [vtypes.Numeric],
                                                          return_type = vtypes.Numeric,
                                                          name = 'LastLocationMax')

skewness_primitive = make_agg_primitive(skewness_func,
                                        input_types = [vtypes.Numeric],
                                        return_type = vtypes.Numeric,
                                        name = 'Skewness')

sample_entropy_primitive = make_agg_primitive(sample_entropy_func,
                                              input_types = [vtypes.Numeric],
                                              return_type = vtypes.Numeric,
                                              name = 'Entropy')
# Depth-1 Deep Feature Synthesis on the training set: built-in aggregations
# plus the five custom tsfresh primitives, with cumulative transforms.
# NOTE(review): despite the name, `feature_names` holds the feature
# *definitions* returned by ft.dfs — that is what makes the
# calculate_feature_matrix call below work.
feature_matrix, feature_names = ft.dfs(entityset=es, target_entity='engines',
                                       agg_primitives = ['min', 'max', 'mean', 
                                                         'count', 'sum', 'last',
                                                         'skew', 'std', 'trend',
                                                         cid_ce_primitive, number_peaks_primitive, 
                                                         last_location_of_maximum_primititive,
                                                         skewness_primitive, sample_entropy_primitive],
                                       trans_primitives = ['cum_mean', 'cum_sum'],
                                       max_depth = 1, n_jobs = -1, verbose = 1,
                                       chunk_size = 100)
# Recompute the same feature definitions on the test EntitySet so train
# and test share identical columns.
test_feature_matrix = ft.calculate_feature_matrix(entityset=test_es, 
                                                  features=feature_names,
                                                  n_jobs = -1, verbose = 1,
                                                  chunk_size = 100)
# Apply feature selection (helper defined elsewhere — presumably drops
# collinear/low-information columns; TODO confirm), then align test columns.
feature_matrix = feature_selection(feature_matrix)
test_feature_matrix = test_feature_matrix[feature_matrix.columns]
# Evaluate the selected features; `evaluate` and `plot_feature_importances`
# are project helpers defined elsewhere.
preds, fi = evaluate(feature_matrix, train_labels, test_feature_matrix, test_labels)
norm_fi = plot_feature_importances(fi)
# 增加深度
# Repeat the experiment with max_depth=2 (stacked primitives) to compare
# against the depth-1 run above.
# NOTE(review): this duplicates the depth-1 pipeline nearly verbatim except
# for max_depth and the skipped feature-selection step — a candidate for
# extraction into a parameterized helper.
feature_matrix, feature_names = ft.dfs(entityset=es, target_entity='engines',
                                       agg_primitives = ['min', 'max', 'mean', 
                                                         'count', 'sum', 'last',
                                                         'skew', 'std', 'trend',
                                                         cid_ce_primitive, number_peaks_primitive, 
                                                         last_location_of_maximum_primititive,
                                                         skewness_primitive, sample_entropy_primitive],
                                       trans_primitives = ['cum_mean', 'cum_sum'],
                                       max_depth = 2, n_jobs = -1, verbose = 1,
                                       chunk_size = 100)
# Compute the depth-2 feature definitions on the test EntitySet.
test_feature_matrix = ft.calculate_feature_matrix(entityset=test_es, 
                                                  features=feature_names,
                                                  n_jobs = -1, verbose = 1,
                                                  chunk_size = 100)
# Evaluate the depth-2 features (no feature selection applied here,
# unlike the depth-1 run).
preds, fi = evaluate(feature_matrix, train_labels, test_feature_matrix, test_labels)
norm_fi = plot_feature_importances(fi)
                                                                                                                                                                                   

# (Scraped article footer — original text: "You may also be interested in: (machine learning)";
# kept as a comment so the file parses.)