Feeding categorical and numeric features into a neural network

Data types fall into:

  1. Numeric types, e.g. counts

  2. Categorical types, e.g. gender, click flag, age, hour, risk level

    2.1. Unordered (nominal) categorical types, e.g. gender

    2.2. Ordered (ordinal) categorical types, e.g. age, hour; these can be treated either as categorical or as numeric

  3. Bucketized types, e.g. age range, hour range (see the sketch after this list)
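
As a minimal sketch (my addition; the age boundaries are illustrative, not from the original post), an ordinal feature such as age can be turned into a bucketized type with tf.feature_column.bucketized_column:

import tensorflow as tf

# Hypothetical example: cut the numeric "age" column into ranges
age = tf.feature_column.numeric_column("age")
age_buckets = tf.feature_column.bucketized_column(
    age, boundaries=[18, 25, 35, 45, 60])  # illustrative boundaries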

A neural network can consume these types in two ways:

  1. DenseFeatures: the corresponding feature types are handled inside this layer (see the references below)

  2. Embedding, which again comes in two flavors:

    2.1. One input per feature

    2.2. All categorical features combined into one input

Key code

  1. DenseFeatures approach
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers

# Build the feature-column definitions from the input DataFrame
def generate_feature_columns(df, fin_features, cate_feature):
    # fin_features: names of all feature columns
    # cate_feature: names of the categorical feature columns
    feature_columns = []
    for feature_name in fin_features:
        if feature_name != 'is_major_case' and feature_name not in cate_feature:
            # numeric columns ('is_major_case' is assumed to be the label);
            # they should first be standardized or normalized
            feature_columns.append(tf.feature_column.numeric_column(feature_name))
        elif feature_name in cate_feature:
            cate_col = tf.feature_column.categorical_column_with_vocabulary_list(
                feature_name,
                [str(i) for i in df[feature_name].value_counts().index])
            one_hot = tf.feature_column.indicator_column(cate_col)
            feature_columns.append(one_hot)
    return feature_columns
    
    
# Min-max normalize the given columns in place; returns the (min, max)
# pair per column so the same scaling can be replayed on new data
def do_column_normalization(df, column_list):
    min_max_dict = {}
    for item in column_list:
        max_tmp = np.max(np.array(df[item]))
        min_tmp = np.min(np.array(df[item]))
        if max_tmp != min_tmp:
            df[item] = df[item].apply(lambda x: (x - min_tmp) / (max_tmp - min_tmp))
            min_max_dict[item] = (min_tmp, max_tmp)
    return min_max_dict
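
The returned dictionary is what lets validation or serving data be scaled consistently; a hypothetical companion helper (not in the original post) could replay it:

# Hypothetical helper: replay the stored (min, max) pairs on new data
def apply_column_normalization(df, min_max_dict):
    for item, (min_tmp, max_tmp) in min_max_dict.items():
        df[item] = (df[item] - min_tmp) / (max_tmp - min_tmp)
    return df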


# Z-score standardize the given columns in place
def do_column_standardization(df, column_list):
    for item in column_list:
        mean_tmp = np.mean(np.array(df[item]))
        std_tmp = np.std(np.array(df[item]))
        if std_tmp:
            df[item] = df[item].apply(lambda x: (x - mean_tmp) / std_tmp)
    return df
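
Note that this helper discards the fitted mean and std, so the exact transform cannot be replayed on new data; a variant sketch (my addition, not the author's code) that records them:

# Variant sketch: also return (mean, std) per column for reuse at serving time
def do_column_standardization_with_params(df, column_list):
    params = {}
    for item in column_list:
        mean_tmp = df[item].mean()
        std_tmp = df[item].std()
        if std_tmp:
            df[item] = (df[item] - mean_tmp) / std_tmp
            params[item] = (mean_tmp, std_tmp)
    return df, params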
  
  
feature_columns = generate_feature_columns(df, fin_features, cate_feature)

# METRICS, init_learning_rate and initial_bias are assumed to be defined
# elsewhere (see the imbalanced-classification reference below)
def make_model(metrics=METRICS, output_bias=None, feature_columns=None):
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)

    model = keras.Sequential()
    if feature_columns is not None:
        # DenseFeatures turns the feature-column definitions into one dense tensor
        model.add(layers.DenseFeatures(feature_columns))
    model.add(layers.Dense(128,
                           activation=tf.nn.tanh,
                           kernel_initializer='random_uniform'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(64,
                           activation=tf.nn.tanh,
                           kernel_regularizer=regularizers.l1_l2(l1=0, l2=0.5e-5),
                           kernel_initializer='random_uniform'))
    model.add(layers.Dropout(0.3))
    model.add(layers.Dense(32,
                           activation=tf.nn.tanh,
                           kernel_initializer='random_uniform'))
    if output_bias is not None:
        model.add(layers.Dense(1, activation=tf.nn.sigmoid, bias_initializer=output_bias))
    else:
        model.add(layers.Dense(1, activation=tf.nn.sigmoid))

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=init_learning_rate),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=metrics)
    return model
    
model = make_model(feature_columns=feature_columns, output_bias=initial_bias)
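
A DenseFeatures model consumes a dict mapping column names to tensors, so the DataFrame has to be wrapped in a tf.data.Dataset before calling fit(). A minimal sketch, following the TensorFlow structured-data tutorial referenced below; the label name 'is_major_case' and the train_df / EPOCHS globals are assumptions:

# Minimal sketch: wrap a DataFrame as (feature-dict, label) batches
def df_to_dataset(df, label_col='is_major_case', shuffle=True, batch_size=32):
    df = df.copy()
    labels = df.pop(label_col)
    ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(df))
    return ds.batch(batch_size)

train_ds = df_to_dataset(train_df)   # train_df assumed to exist
model.fit(train_ds, epochs=EPOCHS)   # EPOCHS assumed, as in the Embedding fit below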

References

  1. Classify structured data with feature columns | TensorFlow Core: https://tensorflow.google.cn/tutorials/structured_data/feature_columns
  2. Imbalanced classification (不平衡分类), CSDN blog: https://blog.csdn.net/qq_24729325/article/details/120264495

  2. Embedding approach
from tensorflow.keras.layers import Input

def model_embedding(metrics=METRICS, output_bias=None):
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)

    # shape=[2]: there are 2 categorical features (gender and the recurrence flag),
    # fed through ONE shared Embedding, so their integer IDs must share a single
    # index space of size emb_dict_cnt (see the encoding sketch after fit())
    input_cate = Input(shape=[2], name='cate_input')
    input_embedding = keras.layers.Embedding(input_dim=emb_dict_cnt,
                                             output_dim=embedding_out_dim,
                                             input_length=2)(input_cate)
    flatten = keras.layers.Flatten()(input_embedding)
    input_num = Input(shape=(len(columns),))  # columns: names of the numeric features
    combined = keras.layers.concatenate([flatten, input_num])

    layer_1 = layers.Dense(128,
                           activation=tf.nn.tanh,
                           kernel_initializer='random_uniform')(combined)
    layer_1_drop = layers.Dropout(0.5)(layer_1)
    layer_2 = keras.layers.Dense(64,
                                 activation=tf.nn.tanh,
                                 kernel_regularizer=regularizers.l1_l2(l1=0, l2=0.5e-5),
                                 kernel_initializer='random_uniform')(layer_1_drop)
    layer_2_drop = keras.layers.Dropout(0.3)(layer_2)
    layer_3 = keras.layers.Dense(32,
                                 activation=tf.nn.tanh,
                                 kernel_initializer='random_uniform')(layer_2_drop)
    if output_bias is not None:
        z = keras.layers.Dense(1, activation=tf.nn.sigmoid,
                               bias_initializer=output_bias)(layer_3)
    else:
        z = keras.layers.Dense(1, activation=tf.nn.sigmoid)(layer_3)
    model = tf.keras.Model(inputs=[input_cate, input_num], outputs=z)

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=init_learning_rate),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=metrics)
    return model
    
model = model_embedding()
model.fit(
    x=[train_df[cate_feature], train_df[columns]],
    y=train_df["label"],
    validation_data=([val_df[cate_feature], val_df[columns]], val_df["label"]),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    # callbacks=[early_stopping],  # optional
    class_weight=class_weight)
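
For this shared-Embedding layout, train_df[cate_feature] must already contain integer IDs drawn from one common index space of size emb_dict_cnt. A hypothetical encoding helper (my sketch, not the author's code):

# Sketch: map every categorical column into one shared 0-based ID space,
# using per-column offsets so values from different columns never collide
def encode_shared_ids(df, cate_feature):
    offset = 0
    for col in cate_feature:
        values = sorted(df[col].astype(str).unique())
        mapping = {v: offset + i for i, v in enumerate(values)}
        df[col] = df[col].astype(str).map(mapping)
        offset += len(values)
    return df, offset  # offset equals the emb_dict_cnt the model expects

train_df, emb_dict_cnt = encode_shared_ids(train_df, cate_feature)

In practice the same mapping would also have to be applied to val_df.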

from tensorflow.keras.layers import Embedding, Reshape, Add, Dense, Dropout, concatenate
from tensorflow.keras.models import Model

# Variant 2: one Input and one Embedding per categorical feature
input_is_bindcard = Input(shape=[1], name="is_bindcard")
input_gender = Input(shape=[1], name="gender")

# vocabulary sizes: is_bindcard is binary, gender has 3 values
em_is_bindcard = Embedding(2, 1)(input_is_bindcard)
em_gender = Embedding(3, 1)(input_gender)

emb_is_bindcard = Reshape(target_shape=(1,))(em_is_bindcard)
emb_gender = Reshape(target_shape=(1,))(em_gender)

# sum the per-feature embeddings into one vector
emb = Add()([emb_is_bindcard, emb_gender])
emd_cate = Dense(32, activation='tanh')(emb)
cate_out = Dense(16, activation='tanh')(emd_cate)
cate_model = Model(inputs=[input_is_bindcard, input_gender], outputs=cate_out)

# numeric branch: 30 numeric features
num = Input(shape=(30,), name='num')
num_concate = Dense(32, activation='tanh')(num)
num_out = Dense(16, activation='tanh')(num_concate)
num_model = Model(inputs=[num], outputs=num_out)

combined = concatenate([cate_model.output, num_model.output])
out = Dropout(0.2)(combined)
z = Dense(1, activation="sigmoid")(out)

model = Model(inputs=[input_is_bindcard, input_gender, num], outputs=z)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.005),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=METRICS)


model.fit(x=[df['is_bindcard'], df['gender'], num_df],
          y=df['label'],
          shuffle=True,
          epochs=100,
          validation_split=0.2,
          batch_size=128)
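
Here, by contrast, each categorical column needs its own 0-based integer encoding, sized to match the corresponding Embedding's input_dim (2 for is_bindcard, 3 for gender). A hypothetical encoding with pandas:

import pandas as pd

# Sketch: per-column integer IDs; pd.factorize assigns 0..n-1 within each column
df['is_bindcard'] = pd.factorize(df['is_bindcard'])[0]
df['gender'] = pd.factorize(df['gender'])[0]

Summing the 1-dimensional embeddings with Add() keeps the categorical branch very small; concatenate would instead preserve each feature's own embedding dimensions at the cost of a wider layer.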
