Data types fall into:
1. Numeric types, e.g. counts
2. Categorical types, e.g. gender, clicked-or-not, age, hour, risk level
2.1. Unordered categorical types, e.g. gender
2.2. Ordered categorical types, e.g. age, hour; these can be treated either as categorical or as numeric
3. Bucketized types, e.g. age bucket, hour bucket (see the sketch right after this list)
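For example, a bucketized feature can be derived from a numeric column with tf.feature_column.bucketized_column; a minimal sketch (the "age" column and the bucket boundaries are illustrative):

import tensorflow as tf

age = tf.feature_column.numeric_column("age")
# split continuous age into ranges, i.e. an "age bucket" feature
age_buckets = tf.feature_column.bucketized_column(
    age, boundaries=[18, 25, 30, 35, 40, 50, 60])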
Neural networks can handle the above types in two ways:
1. DenseFeatures: the corresponding feature types are transformed inside this layer; see the references below for details
2. Embedding, which itself comes in two variants:
2.1. Single-feature input (one Input and Embedding per feature)
2.2. Multiple features combined into one input (a shared Embedding)
# Imports for the snippets below (TensorFlow 2.x)
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from tensorflow.keras.layers import Input, Embedding, Reshape, Add, Dense, Dropout, Flatten, concatenate
from tensorflow.keras.models import Model
# METRICS, init_learning_rate, BATCH_SIZE, EPOCHS, class_weight, emb_dict_cnt,
# embedding_out_dim, columns are assumed to be defined as in the references.

# Convert the input DataFrame's columns into feature columns
def generate_feature_columns(df, fin_features, cate_feature):
    # fin_features: all feature column names
    # cate_feature: the categorical feature columns
    feature_columns = []
    for feature_name in fin_features:
        if feature_name != 'is_major_case' and feature_name not in cate_feature:
            # numeric columns; preprocess them by standardization or normalization
            feature_columns.append(tf.feature_column.numeric_column(feature_name))
        elif feature_name in cate_feature:
            cat_col = tf.feature_column.categorical_column_with_vocabulary_list(
                feature_name, [str(i) for i in df[feature_name].value_counts().index])
            one_hot = tf.feature_column.indicator_column(cat_col)
            feature_columns.append(one_hot)
    # feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
    return feature_columns
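To sanity-check what these columns produce, wrap them in a DenseFeatures layer and push one batch through it (an illustrative sketch; it assumes df, fin_features and cate_feature are already defined, and that the categorical columns hold strings so they match the str(i) vocabulary above):

ds = tf.data.Dataset.from_tensor_slices(dict(df)).batch(5)
demo_layer = tf.keras.layers.DenseFeatures(
    generate_feature_columns(df, fin_features, cate_feature))
for batch in ds.take(1):
    print(demo_layer(batch).numpy())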
# Min-max normalize the given columns in place; returns the (min, max) per
# column so the same transform can be replayed at inference time.
def do_column_normalization(df, column_list):
    min_max_dict = {}
    for item in column_list:
        max_tmp = np.max(np.array(df[item]))
        min_tmp = np.min(np.array(df[item]))
        if max_tmp != min_tmp:
            df[item] = df[item].apply(lambda x: (x - min_tmp) / (max_tmp - min_tmp))
            min_max_dict[item] = (min_tmp, max_tmp)
    return min_max_dict
# Standardize the given columns in place (zero mean, unit variance).
def do_column_standardization(df, column_list):
    for item in column_list:
        mean_tmp = np.mean(np.array(df[item]))
        std_tmp = np.std(np.array(df[item]))
        if std_tmp:
            df[item] = df[item].apply(lambda x: (x - mean_tmp) / std_tmp)
    return df
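Typical usage of the two helpers (illustrative; the numeric column names are derived the same way generate_feature_columns does). Keeping the returned min/max per column lets you replay exactly the same scaling at serving time:

numeric_cols = [c for c in fin_features
                if c != 'is_major_case' and c not in cate_feature]
min_max_dict = do_column_normalization(df, numeric_cols)
# or: df = do_column_standardization(df, numeric_cols)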
feature_columns = generate_feature_columns(df, fin_features, cate_feature)
def make_model(metrics=METRICS, output_bias=None, feature_layer=None):
    # output_bias: initial bias for the output layer (helps convergence on
    # imbalanced data); feature_layer: the feature columns to wrap in a
    # DenseFeatures layer
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)
    model = keras.Sequential()
    if feature_layer is not None:
        model.add(layers.DenseFeatures(feature_layer))
    model.add(layers.Dense(128,
                           activation=tf.nn.tanh,
                           # kernel_regularizer=regularizers.l1_l2(l1=0, l2=0.5e-5),
                           kernel_initializer='random_uniform'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(64,
                           activation=tf.nn.tanh,
                           kernel_regularizer=regularizers.l1_l2(l1=0, l2=0.5e-5),
                           kernel_initializer='random_uniform'))
    model.add(layers.Dropout(0.3))
    model.add(layers.Dense(32,
                           activation=tf.nn.tanh,
                           # kernel_regularizer=regularizers.l1_l2(l1=0, l2=0.5e-5),
                           kernel_initializer='random_uniform'))
    if output_bias is not None:
        model.add(layers.Dense(1, activation=tf.nn.sigmoid, bias_initializer=output_bias))
    else:
        model.add(layers.Dense(1, activation=tf.nn.sigmoid))
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=init_learning_rate),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=metrics)
    return model
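The call just below passes an initial_bias. Per the imbalanced-classification recipe in the references, initializing the output bias to log(pos/neg) speeds up early convergence on skewed labels; a minimal sketch, assuming the binary labels live in train_df["label"]:

neg, pos = np.bincount(train_df["label"])
initial_bias = np.log([pos / neg])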
model = make_model(feature_layer=feature_columns, output_bias=initial_bias)
References:
- Classify structured data with feature columns | TensorFlow Core: https://tensorflow.google.cn/tutorials/structured_data/feature_columns
- Imbalanced classification (人上人酿酒师, CSDN blog): https://blog.csdn.net/qq_24729325/article/details/120264495
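Embedding variant 2.2 comes first: all categorical columns are stacked into one tensor and share a single Embedding table. The layer expects integer indices in [0, emb_dict_cnt), and if the columns are label-encoded independently their codes collide in the shared table, so one option is to offset each column's codes by the previous columns' vocabulary sizes. A minimal, illustrative sketch (the pd.factorize encoding is an assumption, not from the original code):

offset = 0
for col in cate_feature:
    codes, uniques = pd.factorize(df[col])  # integer-encode this column
    df[col] = codes + offset                # shift past earlier vocabularies
    offset += len(uniques)
emb_dict_cnt = offset  # total vocabulary size across the shared columns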
def model_embedding(metrics=METRICS, output_bias=None):
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)
    # shape=[2]: there are 2 categorical features, gender and the recurrence flag
    input_cate = Input(shape=[2], name='cate_input')
    input_embedding = keras.layers.Embedding(input_dim=emb_dict_cnt,
                                             output_dim=embedding_out_dim,
                                             input_length=2)(input_cate)
    flatten = keras.layers.Flatten()(input_embedding)
    input_num = Input(shape=(len(columns),), name='num_input')
    combined = keras.layers.concatenate([flatten, input_num])
    layer_1 = layers.Dense(128,
                           activation=tf.nn.tanh,
                           # kernel_regularizer=regularizers.l1_l2(l1=0, l2=0.5e-5),
                           kernel_initializer='random_uniform')(combined)
    layer_1_drop = layers.Dropout(0.5)(layer_1)
    layer_2 = keras.layers.Dense(64,
                                 activation=tf.nn.tanh,
                                 kernel_regularizer=regularizers.l1_l2(l1=0, l2=0.5e-5),
                                 kernel_initializer='random_uniform')(layer_1_drop)
    layer_2_drop = keras.layers.Dropout(0.3)(layer_2)
    layer_3 = keras.layers.Dense(32,
                                 activation=tf.nn.tanh,
                                 # kernel_regularizer=regularizers.l1_l2(l1=0, l2=0.5e-5),
                                 kernel_initializer='random_uniform')(layer_2_drop)
    if output_bias is not None:
        z = keras.layers.Dense(1, activation=tf.nn.sigmoid,
                               bias_initializer=output_bias)(layer_3)
    else:
        z = keras.layers.Dense(1, activation=tf.nn.sigmoid)(layer_3)
    model = tf.keras.Model(inputs=[input_cate, input_num], outputs=z)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=init_learning_rate),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=metrics)
    return model
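The class_weight used in the fit call below follows the same imbalanced-classification recipe from the references: each class is weighted inversely to its frequency. A minimal sketch under the same train_df["label"] assumption:

neg, pos = np.bincount(train_df["label"])
total = neg + pos
class_weight = {0: (1 / neg) * (total / 2.0),
                1: (1 / pos) * (total / 2.0)}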
model = model_embedding()
model.fit(
    x=[train_df[cate_feature], train_df[columns]],
    y=train_df["label"],
    validation_data=([val_df[cate_feature], val_df[columns]], val_df["label"]),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    # callbacks=[early_stopping],
    class_weight=class_weight)
# Approach 2: one Input and one Embedding per categorical feature
input_is_bindcard = Input(shape=[1], name="is_bindcard")
input_gender = Input(shape=[1], name="gender")
em_is_bindcard = Embedding(2, 1)(input_is_bindcard)  # 2 distinct values
em_gender = Embedding(3, 1)(input_gender)            # 3 distinct values
emb_is_bindcard = Reshape(target_shape=(1,))(em_is_bindcard)
emb_gender = Reshape(target_shape=(1,))(em_gender)
emb = Add()([emb_is_bindcard, emb_gender])
emd_cate = Dense(32, activation='tanh')(emb)
cate_out = Dense(16, activation='tanh')(emd_cate)
# cate_out = Flatten()(cate_out)
cate_model = Model(inputs=[input_is_bindcard, input_gender],
                   outputs=cate_out)
# Numeric branch
num = Input(shape=(30,), name='num')  # 30 numeric features
num_concate = Dense(32, activation='tanh')(num)
num_out = Dense(16, activation='tanh')(num_concate)
# z_num = Dense(1, activation="sigmoid")(num_out)
num_model = Model(inputs=[num], outputs=num_out)
combined = concatenate([cate_model.output, num_model.output])
out = Dropout(0.2)(combined)
z = Dense(1, activation="sigmoid")(out)  # head sits on the combined branches
model = Model(inputs=[input_is_bindcard, input_gender, num], outputs=z)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.005),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=METRICS)
model.fit(x=[df['is_bindcard'], df['gender'], num_df], y=df['label'],
          shuffle=True, epochs=100, validation_split=0.2, batch_size=128)
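Inference on the two-branch model mirrors the fit inputs (illustrative; new_df and new_num_df stand in for fresh data shaped like the training frames):

preds = model.predict([new_df['is_bindcard'], new_df['gender'], new_num_df])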