# Python Featuretools 实现自动特征工程

import featuretools as ft
from featuretools.selection import remove_low_information_features
import pandas as pd
import numpy as np

# Path of the transaction export that is handed to datashop_to_entityset();
# that function reads it as tab-separated text.
filename = 'data/ds76_tx_All_Data_74_2018_0912_070949.txt'
def datashop_to_entityset(filename):
    """Load a tab-separated transaction export and build an EntitySet.

    The file is read into a DataFrame, lightly cleaned, and registered as
    the ``transactions`` entity. Six further entities are then split off
    with ``normalize_entity``:

    * ``transactions`` -> ``problem_steps`` -> ``problems``
    * ``transactions`` -> ``sessions`` -> ``students``
    * ``transactions`` -> ``classes`` -> ``schools``

    The input must contain at least the columns ``Transaction Id``, ``Row``,
    ``Outcome``, ``Time``, ``Duration (sec)``, ``Is Last Attempt``,
    ``Attempt At Step``, ``Step Name``, ``Problem Name``, ``Session Id``,
    ``Anon Student Id``, ``Class`` and ``School``. Any column whose name
    starts with ``'KC '`` or ``'CF '`` is moved to ``problem_steps``.

    NOTE(review): ``entity_from_dataframe`` / ``variable_types`` are the
    legacy Featuretools API (presumably pre-1.0) -- confirm the pinned
    version before upgrading the dependency.

    Args:
        filename: Path (or anything ``pandas.read_csv`` accepts) of the
            tab-separated export.

    Returns:
        The populated ``featuretools.EntitySet`` named ``'Dataset'``.
    """
    # Imported locally so this function is self-contained: the module-level
    # imports provide ``ft`` but not the variable-type classes used below.
    from featuretools import variable_types as vtypes

    # Read the tab-separated export. ``sep`` is passed by keyword because
    # newer pandas releases no longer accept it positionally.
    data = pd.read_csv(filename, sep='\t', parse_dates=True)
    data.index = data['Transaction Id']
    data = data.drop(['Row'], axis=1)

    # Encode the outcome as 0/1; any other value becomes NaN via ``map``.
    data['Outcome'] = data['Outcome'].map({'INCORRECT': 0, 'CORRECT': 1})

    # end_time = start_time + duration
    data['End Time'] = (
        pd.to_datetime(data['Time'])
        + pd.to_timedelta(pd.to_numeric(data['Duration (sec)']), unit='s')
    )

    # Every column whose name starts with 'KC ' or 'CF '.
    kc_and_cf_cols = [
        col for col in data.columns if col.startswith(('KC ', 'CF '))
    ]

    # Create the EntitySet, called 'Dataset', and register the transactions.
    es = ft.EntitySet('Dataset')
    es.entity_from_dataframe(
        entity_id='transactions',
        index='Transaction Id',
        dataframe=data,
        variable_types={
            'Outcome': vtypes.Boolean,
            'Attempt At Step': vtypes.Categorical,
        },
        time_index='Time',
        # These columns are tied to 'End Time' rather than 'Time', so they
        # are registered under the secondary time index.
        secondary_time_index={
            'End Time': ['Outcome', 'Is Last Attempt', 'Duration (sec)'],
        },
    )

    es.normalize_entity(
        base_entity_id='transactions',
        new_entity_id='problem_steps',
        index='Step Name',
        additional_variables=['Problem Name'] + kc_and_cf_cols,
        make_time_index=True,
    )

    es.normalize_entity(
        base_entity_id='problem_steps',
        new_entity_id='problems',
        index='Problem Name',
        make_time_index=True,
    )

    es.normalize_entity(
        base_entity_id='transactions',
        new_entity_id='sessions',
        index='Session Id',
        additional_variables=['Anon Student Id'],
        make_time_index=True,
    )

    es.normalize_entity(
        base_entity_id='sessions',
        new_entity_id='students',
        index='Anon Student Id',
        make_time_index=True,
    )

    # Every transaction has a `class` associated to a school
    es.normalize_entity(
        base_entity_id='transactions',
        new_entity_id='classes',
        index='Class',
        additional_variables=['School'],
        make_time_index=False,
    )

    es.normalize_entity(
        base_entity_id='classes',
        new_entity_id='schools',
        index='School',
        make_time_index=False,
    )
    return es
# Build the EntitySet from the export file.
es = datashop_to_entityset(filename)

# Cutoff times, one row per transaction: the instance id, then the time up to
# which data may be used for that instance (its 'End Time').
# NOTE(review): 'Outcome' is included on the assumption that extra columns are
# carried through to the feature matrix as labels -- confirm against the
# installed Featuretools version.
cutoff_times = es['transactions'].df[['Transaction Id', 'End Time', 'Outcome']]

# Automated feature engineering (deep feature synthesis) on the transactions.
# The first 1000 cutoff rows are skipped via the slice below.
fm, features = ft.dfs(entityset=es,
                      target_entity='transactions',
                      agg_primitives=['Sum', 'Mean', 'Percent_True'],
                      trans_primitives=['Hour'],
                      max_depth=3,
                      # approximate='2m',
                      cutoff_time=cutoff_times[1000:],
                      verbose=True)

# One-hot encode the feature matrix, then fill missing values with 0.
fm_enc, f_enc = ft.encode_features(fm, features)
fm_enc = fm_enc.fillna(0)

# Drop features that carry little information.
fm_enc = remove_low_information_features(fm_enc)

# 你可能感兴趣的:(机器学习)