Python 3 Machine Learning Notes

  1. Sorting a dict by value in descending order
d={"a":3,"b":2,"c":2}
d={k:v for k,v in  sorted(d.items(),key=lambda x:x[1],reverse=True)}
  2. An important difference between pandas' pd.read_csv() and pd.read_table() is that their default separators differ
pd.read_csv(file_path, sep=',')    # default sep for read_csv is ','
pd.read_table(file_path, sep='\t') # default sep for read_table is '\t'
  3. After a pandas groupby, joining one of the grouped columns into a single string
corpus = (df.groupby(['file_id'])['api']
            .transform(lambda x: ' '.join(str(a) for a in x)))
# note: transform broadcasts the joined string back onto every original row;
# use .apply instead (as in item 6) to get one string per group
  4. Specifying the vocabulary for a word-vector (TF-IDF) model
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf2 = TfidfVectorizer(vocabulary=['a','b','c','d','e','f','g'],stop_words=[])
re = tfidf2.fit_transform(corpus)
  5. Computing common statistics on DataFrame data
df = pd.DataFrame({'file_id':[1,1,1,1,2,2,2],'tid':[1,1,2,2,3,3,3]})
data = df[['file_id']].drop_duplicates()
stats = ['count','nunique','max','min','median','std']  # nunique = number of distinct values per group
for stat in stats:
    data['tid_'+stat] = list(df.groupby(['file_id'])['tid'].agg(stat))
quantiles = [0.05,0.25,0.5,0.75,0.95]
for quant in quantiles:
    data['tid_qua_'+str(100*quant)] = list(df.groupby(['file_id'])['tid'].quantile(quant).values) 
  6. N-gram model (computing 2-, 3- and 4-grams here and merging them into data)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(
        ngram_range=(2, 4),#token_pattern = r'\b\w+\b', #vocabulary=vocabulary,
        stop_words=[','],decode_error="ignore",
        max_df=0.90,
        min_df=0.01
    )
df=pd.DataFrame({'file_id':[1,1,1,1,2,2,2],'tid':[1,1,2,2,3,3,3],'api':[1,2,3,2,4,3,2]})
data = df[['file_id']].drop_duplicates()
corpus = df.groupby(['file_id'])['api'].apply(lambda x: ' '.join([str(a) for a in list(x)]))
corpus = list(corpus)
ngram_counts = vectorizer.fit_transform(corpus)
ngram = pd.DataFrame(ngram_counts.todense(), columns=['n_gram_'+i for i in vectorizer.get_feature_names()])
print('there are %s n-gram features' % len(vectorizer.vocabulary_))
ngram['file_id'] = list(data['file_id'])
data = pd.merge(data, ngram, on='file_id')
  7. Handling NaN and inf values in NumPy
train=np.array([[np.nan, np.nan, 1, 2], [np.inf, np.inf, 3, 4], [1, 1, 1, 1], [2, 2, 2, 2]])
train[np.isnan(train)]=0
train[np.isinf(train)]=0
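An equivalent one-liner is np.nan_to_num with explicit replacement values (the nan/posinf/neginf keyword arguments assume NumPy 1.17 or later):
train = np.nan_to_num(train, nan=0.0, posinf=0.0, neginf=0.0)  # replace NaN and ±inf with 0 in one call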
  8. Some key points about Python garbage collection
  • Python keeps a reference count for every object, which is a clean and simple idea
  • But because of reference cycles and similar cases, garbage collection in practice is more involved
  • When a data structure is very large (e.g. a table), updating reference counts may recurse through many steps
  • del variable does not make the memory occupied by variable disappear immediately
  • gc.collect() can be called to trigger garbage collection explicitly (see the sketch below)
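A minimal sketch of the last two points (the object and its size are illustrative only):
import gc

big_list = list(range(10_000_000))
del big_list   # removes the name; the memory is released only once the reference count reaches zero
gc.collect()   # explicitly run a collection pass, which also cleans up unreachable reference cycles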
  9. One-dimensional interpolation in Python with scipy.interpolate
from scipy.interpolate import interp1d
import numpy as np
x = np.array([2,4,6,8,10])
y = np.array([38,39,21,56,77])
px = np.array([2,3,4,5,6,7,8,9,10])
# kind: 'linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic'
py = interp1d(x,y,kind='quadratic')(px)
  10. Saving and loading models with scikit-learn or the Keras framework
# serialize a Keras model to a json file
# (to_json stores the architecture only; save the weights separately with model.save_weights(),
#  or use model.save() for a single hdf5 file containing both)
def save_model(model,file_name='./model.json'):
    model_json = model.to_json()   
    with open(file_name, "w") as json_file:
        json_file.write(model_json) 

# load a Keras model from a json file
def load_model(file_name='./model.json'):
    from keras.models import model_from_json
    model = None
    with open(file_name, "r") as json_file:
        model_json=json_file.read()
        model=model_from_json(model_json)
    return model
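The helpers above target Keras. For scikit-learn-style estimators such as the random forest, LightGBM and XGBoost wrappers defined below, a common approach, assuming joblib is installed, is joblib.dump / joblib.load (the helper names and file names here are illustrative):
import joblib

def save_sklearn_model(model, file_name='./model.joblib'):
    joblib.dump(model, file_name)   # pickles the fitted estimator to disk

def load_sklearn_model(file_name='./model.joblib'):
    return joblib.load(file_name)   # restores the estimator, ready for predict()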
  11. A TextCNN example: an 8-class problem where each input text has more than 6000 tokens of varying length, simply truncated to a fixed length here (model definition only; training is shown below)
import numpy as np
import pandas as pd 
from keras import Model
from keras.models import Sequential
from keras.layers import LSTM,Dense,Conv1D,MaxPooling1D,Dropout,Input,GlobalMaxPooling1D
from keras.layers import SpatialDropout1D,GRU
from sklearn.model_selection import StratifiedKFold
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers.merge import concatenate
from keras.utils import plot_model # model visualization

def dnn_1():
    embedding_vector_length=256
    api_total_num=301
    max_seq_length=6000
    drop_rate1=0.25
    drop_rate2=0.5
    drop_rate3=0.25
    num_filters=64
    nb_classes=8
    kernel_size=[2,3,4,5]


    input_type = Input(shape=(max_seq_length,), dtype='int16')
    embd = Embedding(api_total_num, embedding_vector_length, input_length=max_seq_length, mask_zero=False)(input_type)
    embd = SpatialDropout1D(drop_rate1)(embd)
    wrappers = []
    for sizei in kernel_size:
        for dilated_rate in [1, 2, 3, 4]:
            conv1d = Conv1D(filters=num_filters, kernel_size=sizei, activation='relu', dilation_rate=dilated_rate)(embd)
            wrappers.append(GlobalMaxPooling1D()(conv1d))
    fc = concatenate(wrappers)
    fc = Dropout(drop_rate2)(fc)
    fc = Dense(256, activation='relu')(fc)
    fc = Dropout(drop_rate3)(fc)
    preds = Dense(nb_classes, activation = 'softmax')(fc)
    
    model = Model(inputs=input_type, outputs=preds)
    
    model.compile(loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'])
    print(model.summary())
    return model
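The plot_model import above can be used to visualize the resulting architecture; assuming pydot and graphviz are installed (the output file name is illustrative), a typical call is:
plot_model(model, to_file='textcnn.png', show_shapes=True)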
  12. An LSTM example, with input similar to the TextCNN above (model definition only; training is shown below)
# same imports as item 11
def lstm_1():
    embedding_vector_length=256
    api_total_num=301
    max_seq_length=6000
    nb_classes=8
    
    model = Sequential()
    model.add(Embedding(api_total_num, embedding_vector_length, input_length=max_seq_length))
    model.add(Conv1D(filters=128, kernel_size=2, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.3))
    model.add(LSTM(64))
    model.add(Dense(nb_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', 
                  optimizer='rmsprop', # root mean square prop/adam
                  metrics=['accuracy']) # correct rate
    print(model.summary())
    return model
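The training example in item 16 below works on precomputed features for the tree models; for the two Keras models above, a minimal training sketch could look as follows (assumptions: X_seqs is a list of per-sample API-id sequences and y an integer label array, neither of which is defined in this note; the hyperparameters are illustrative):
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical

# pad / truncate every sequence to the fixed length expected by the models
X = pad_sequences(X_seqs, maxlen=6000, dtype='int16', padding='post', truncating='post')
# one-hot labels, required by categorical_crossentropy
Y = to_categorical(y, num_classes=8)

model = dnn_1()  # or lstm_1()
model.fit(X, Y, batch_size=64, epochs=10, validation_split=0.1,
          callbacks=[EarlyStopping(monitor='val_loss', patience=3)])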
  13. A random forest example (model definition only; training is shown below)
def rf_1():
    '''
    # Document
    https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    '''
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(
        n_estimators=100,
        criterion='gini',# entropy
        min_samples_split=100,# The minimum number of samples required to split an internal node
        min_samples_leaf=20, # The minimum number of samples required to be at a leaf node.
        max_depth=None, # 8
        max_features='sqrt' ,
        random_state=100,
    )
    return model
  14. A LightGBM example (model definition only; training is shown below)
def lgb_1():
    '''
    # Document
    http://lightgbm.apachecn.org/cn/latest/index.html
    '''
    import lightgbm as lgb
    params = {
            'task':'train', 
            'boosting_type':'gbdt',
            'num_leaves': 31,
            'objective': 'multiclass',
            'num_class':8,
            'learning_rate': 0.05,
            'feature_fraction': 0.85,
            'subsample':0.85,
            'num_threads': 32,
            'metric':'multi_logloss',
            'seed':100
    }  
    model = lgb.LGBMClassifier(**params)  # the params define a multiclass objective, so use the classifier wrapper
    return model
  15. An XGBoost example (model definition only; training is shown below)
def xgb_1():
    from xgboost.sklearn import XGBClassifier
    ''' https://www.programcreek.com/python/example/95386/xgboost.sklearn.XGBClassifier
    ''' 
    model=XGBClassifier(
        silent=0,  # 1 suppresses runtime messages; 0 (print messages while boosting) is usually preferable
        #nthread=4,  # number of CPU threads, defaults to the maximum available
        learning_rate=0.3,  # step size shrinkage, the eta parameter
        min_child_weight=3,
        # default is 1: the minimum sum of instance weights (hessian) required in a leaf.
        # For an imbalanced 0-1 classification task with h around 0.01, min_child_weight=1
        # means a leaf must contain roughly 100 samples. This parameter strongly affects the
        # result; the smaller the allowed sum of second derivatives in a leaf, the easier it is to overfit.
        max_depth=6,  # tree depth; larger values overfit more easily
        gamma=0.1,  # minimum loss reduction required to split a leaf further; larger is more conservative, typically 0.1 or 0.2
        subsample=0.7,  # subsample ratio of the training instances
        max_delta_step=0,  # maximum delta step allowed for each tree's weight estimate
        colsample_bytree=1,  # column subsample ratio when building each tree
        reg_lambda=1,  # L2 regularization on the weights, controlling model complexity; larger values make overfitting less likely
        #reg_alpha=0,  # L1 regularization term
        #scale_pos_weight=1,  # values > 0 help convergence on imbalanced classes by balancing positive/negative weights
        objective='multi:softprob',  # multiclass problem: specifies the learning task and objective
        num_class=8,  # number of classes, used together with the multiclass objectives
        # n_estimators=100,  # number of trees
        seed=1000,  # random seed
        eval_metric='mlogloss'  # or e.g. 'auc'
    )
    return model
  16. A model-training example, using xgb as the example
import joblib  # see the scikit-learn persistence sketch in item 10

model = xgb_1()
train = np.load(train_x_path)
#test = np.load(test_x_path)
labels = np.load(label_path)
model = model.fit(train, labels)
# the Keras json helper save_model() above cannot serialize an XGBClassifier, so persist it with joblib
joblib.dump(model, 'xgb_1.joblib')
  17. A model-prediction example
test = np.load(test_x_path)
model = joblib.load('xgb_1.joblib') # load the model saved in item 16
# res = model.predict(test)  # predict() returns class labels, predict_proba() returns per-class probabilities
res = model.predict_proba(test)
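To turn the probability matrix into hard class labels, take the per-row argmax:
pred_labels = np.argmax(res, axis=1)  # index of the highest-probability class for each test sample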
  18. A K-fold training example, used to reduce overfitting; 5 or 10 folds are typical
def train_xgb_2():
    # requires: import numpy as np, import xgboost as xgb,
    # from sklearn.model_selection import StratifiedKFold.
    # dp.save_model / dp.save_submit, model_save, submit_save and the *_path
    # variables are project-specific helpers and paths not defined in this note.
    params = {
        'booster':'gbtree',
        'objective': 'multi:softprob',
        'num_class':8,
        'gamma':0.1,
        'max_depth':5,
        'lambda':2,
        'subsample':0.7,
        'colsample_bytree':0.7, 
        'min_child_weight':3, 
        'silent':0 ,
        'eta': 0.01, 
        'seed':1000,
        'eval_metric': 'mlogloss'
    }   
    train = np.load(train_x_path)
    test = np.load(test_x_path)
    labels = np.load(label_path)
    n_splits=5
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    meta_train = np.zeros(shape = (len(train),8))
    meta_test = np.zeros(shape = (len(test),8))

    for i,(train_ids,test_ids) in enumerate(skf.split(train,labels)):
        X_train,X_train_label = train[train_ids],labels[train_ids]
        X_val,X_val_label = train[test_ids],labels[test_ids]
        xgb_val = xgb.DMatrix(X_val,label=X_val_label)
        xgb_train = xgb.DMatrix(X_train, label=X_train_label)
        xgb_test = xgb.DMatrix(test)
        train_ = xgb.DMatrix(train)
        plst=list(params.items())
        num_rounds=5000
        watchlist=[(xgb_train, 'train'),(xgb_val, 'val')]
        model = xgb.train(plst , xgb_train, num_rounds , watchlist,
                      early_stopping_rounds=100)

        pred_val = model.predict(xgb_val,ntree_limit=model.best_ntree_limit)
        meta_test_ = model.predict(xgb_test,ntree_limit=model.best_ntree_limit)
        meta_train_ = model.predict(train_,ntree_limit=model.best_ntree_limit)

        meta_train[test_ids] = pred_val
        dp.save_model(model,model_save+'xgb_2_%s.m'%i)
        dp.save_submit(meta_train_,submit_save+'xgb_2_%s_train.csv'%i)
        dp.save_submit(meta_test_,submit_save+'xgb_2_%s_test.csv'%i)
        meta_test+=meta_test_
        # meta_train+=meta_train_
    
    # meta_train/=n_splits
    meta_test/=n_splits
    dp.save_submit(meta_train,submit_save+'xgb_2_train.csv')
    dp.save_submit(meta_test,submit_save+'xgb_2_test.csv')
  19. Allocating GPU memory dynamically during training (by default all GPU memory is claimed, so this is needed on a machine shared by several users)
def gpu_memory_dynamic():
    # TensorFlow 1.x-style configuration (tf.ConfigProto / tf.Session)
    import keras.backend.tensorflow_backend as KTF
    import tensorflow as tf
    # os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # do not claim all GPU memory up front; allocate on demand
    sess = tf.Session(config=config)
    KTF.set_session(sess)
