- Sort dictionary data by value in descending order
d={"a":3,"b":2,"c":2}
d={k:v for k,v in sorted(d.items(),key=lambda x:x[1],reverse=True)}
- An important difference between the two pandas functions pd.read_csv() and pd.read_table() is their default separator
pd.read_csv(file_path,sep=',')
pd.read_table(file_path,sep='\t')
- pandas: after groupby, join one of the grouped columns into a single string
# transform broadcasts the joined string back to every row of the group; use .apply (see the n-gram item below) to get one row per group
corpus = df.groupby(['file_id'])['api'].transform(lambda x: ' '.join(str(a) for a in x))
- Specify the vocabulary for TF-IDF word vectors
from sklearn.feature_extraction.text import TfidfVectorizer
# corpus is the per-file string corpus built in the previous item
tfidf2 = TfidfVectorizer(vocabulary=['a','b','c','d','e','f','g'], stop_words=[])
re = tfidf2.fit_transform(corpus)
- Compute common statistics over a DataFrame
df = pd.DataFrame({'file_id':[1,1,1,1,2,2,2],'tid':[1,1,2,2,3,3,3]})
data = df[['file_id']].drop_duplicates()   # one row per file_id, to hold the aggregated features
stats = ['count','nunique','max','min','median','std']
for stat in stats:
    data['tid_'+stat] = list(df.groupby(['file_id'])['tid'].agg(stat))
quantiles = [0.05,0.25,0.5,0.75,0.95]
for quant in quantiles:
    data['tid_qua_'+str(100*quant)] = list(df.groupby(['file_id'])['tid'].quantile(quant).values)
- N-gram model (here 2-, 3- and 4-grams are computed and merged into data)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(
    ngram_range=(2, 4),
    stop_words=[','], decode_error="ignore",
    max_df=0.90,
    min_df=0.01
)
df = pd.DataFrame({'file_id':[1,1,1,1,2,2,2],'tid':[1,1,2,2,3,3,3],'api':[1,2,3,2,4,3,2]})
data = df[['file_id']].drop_duplicates()
corpus = df.groupby(['file_id'])['api'].apply(lambda x: ' '.join(str(a) for a in x))
corpus = list(corpus)
ngram_counts = vectorizer.fit_transform(corpus)
ngram_feats = pd.DataFrame(ngram_counts.todense(), columns=['n_gram_'+i for i in vectorizer.get_feature_names()])
print('there are %s n-gram features' % len(vectorizer.vocabulary_))
ngram_feats['file_id'] = list(data['file_id'])
data = pd.merge(data, ngram_feats, on='file_id')
- Handle NaN and inf values with NumPy
train=np.array([[np.nan, np.nan, 1, 2], [np.inf, np.inf, 3, 4], [1, 1, 1, 1], [2, 2, 2, 2]])
train[np.isnan(train)]=0
train[np.isinf(train)]=0
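As a one-step alternative (a sketch, assuming NumPy >= 1.17 for the nan/posinf/neginf keywords), np.nan_to_num does the same replacement:
train = np.nan_to_num(train, nan=0.0, posinf=0.0, neginf=0.0)  # replace NaN, +inf and -inf with 0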
- Some key points about Python garbage collection
    - Python keeps a reference count for every object, which is a clean and simple idea
    - But because of things like circular references, garbage collection in practice is more involved
    - When a data structure is very large, such as a big table, updating reference counts may recurse through many steps, so del(variable) does not immediately free the memory the variable occupies
    - Call gc.collect() to trigger collection explicitly
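A minimal sketch of the last two points (the DataFrame here is just a stand-in for any large object):
import gc
import numpy as np
import pandas as pd

big = pd.DataFrame(np.random.rand(1_000_000, 10))  # some large object
del big          # drops the name; the memory may not be released right away
gc.collect()     # explicitly run the collector to reclaim unreachable objects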
- 1-D data interpolation in Python with scipy.interpolate
from scipy.interpolate import interp1d
import numpy as np
x = np.array([2,4,6,8,10])
y = np.array([38,39,21,56,77])
px = np.array([2,3,4,5,6,7,8,9,10])
py = interp1d(x,y,kind='quadratic')(px)
- Saving and loading a model; this helper is for Keras and serializes only the network architecture as JSON (sklearn-style models are persisted with joblib in the training example further below)
def save_model(model, file_name='./model.json'):
    model_json = model.to_json()
    with open(file_name, "w") as json_file:
        json_file.write(model_json)

def load_model(file_name='./model.json'):
    from keras.models import model_from_json
    model = None
    with open(file_name, "r") as json_file:
        model_json = json_file.read()
        model = model_from_json(model_json)
    return model
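Note that to_json / model_from_json cover only the architecture; the trained weights must be saved and restored separately (a small sketch, the weight file name is arbitrary):
model.save_weights('./model_weights.h5')   # after training
model = load_model('./model.json')         # rebuild the architecture from JSON
model.load_weights('./model_weights.h5')   # then restore the trained weights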
- A TextCNN example: an 8-class problem where each input text has more than 6000 tokens of varying length, simply truncated/padded here (model definition only; training is discussed further below)
import numpy as np
import pandas as pd
from keras import Model
from keras.models import Sequential
from keras.layers import LSTM,Dense,Conv1D,MaxPooling1D,Dropout,Input,GlobalMaxPooling1D
from keras.layers import SpatialDropout1D,GRU
from sklearn.model_selection import StratifiedKFold
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers.merge import concatenate
from keras.utils import plot_model
def dnn_1():
    embedding_vector_length = 256
    api_total_num = 301        # vocabulary size (number of distinct API tokens)
    max_seq_length = 6000
    drop_rate1 = 0.25
    drop_rate2 = 0.5
    drop_rate3 = 0.25
    num_filters = 64
    nb_classes = 8
    kernel_size = [2, 3, 4, 5]
    input_type = Input(shape=(max_seq_length,), dtype='int16')
    embd = Embedding(api_total_num, embedding_vector_length, input_length=max_seq_length, mask_zero=False)(input_type)
    embd = SpatialDropout1D(drop_rate1)(embd)
    wrappers = []
    # one Conv1D + global max pooling branch per (kernel size, dilation rate) pair
    for sizei in kernel_size:
        for dilated_rate in [1, 2, 3, 4]:
            conv1d = Conv1D(filters=num_filters, kernel_size=sizei, activation='relu', dilation_rate=dilated_rate)(embd)
            wrappers.append(GlobalMaxPooling1D()(conv1d))
    fc = concatenate(wrappers)
    fc = Dropout(drop_rate2)(fc)
    fc = Dense(256, activation='relu')(fc)
    fc = Dropout(drop_rate3)(fc)
    preds = Dense(nb_classes, activation='softmax')(fc)
    model = Model(inputs=input_type, outputs=preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    print(model.summary())
    return model
- An LSTM example; the input is the same kind as for the TextCNN above (model definition only; training is discussed further below)
def lstm_1():
    embedding_vector_length = 256
    api_total_num = 301
    max_seq_length = 6000
    nb_classes = 8
    model = Sequential()
    model.add(Embedding(api_total_num, embedding_vector_length, input_length=max_seq_length))
    model.add(Conv1D(filters=128, kernel_size=2, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.3))
    model.add(LSTM(64))
    model.add(Dense(nb_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    print(model.summary())
    return model
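Neither Keras definition above includes the training step its title mentions; here is a minimal hedged sketch of how dnn_1 / lstm_1 could be trained. The names seqs and labels are placeholders, and the padding and one-hot choices are assumptions:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping, ModelCheckpoint

# seqs: list of integer API-id sequences, labels: integer class ids in 0..7
x = pad_sequences(seqs, maxlen=6000, padding='post', truncating='post')
y = to_categorical(labels, num_classes=8)

model = dnn_1()   # or lstm_1()
callbacks = [EarlyStopping(monitor='val_loss', patience=3),
             ModelCheckpoint('dnn_1_best.h5', save_best_only=True)]
model.fit(x, y, batch_size=64, epochs=20, validation_split=0.1, callbacks=callbacks)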
- A random forest example (model definition only; training is shown further below)
def rf_1():
    '''
    # Document
    https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    '''
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(
        n_estimators=100,
        criterion='gini',
        min_samples_split=100,
        min_samples_leaf=20,
        max_depth=None,
        max_features='sqrt',
        random_state=100,
    )
    return model
- A LightGBM example (model definition only; training is shown further below)
def lgb_1():
    '''
    # Document
    http://lightgbm.apachecn.org/cn/latest/index.html
    '''
    import lightgbm as lgb
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'objective': 'multiclass',
        'num_class': 8,
        'learning_rate': 0.05,
        'feature_fraction': 0.85,
        'subsample': 0.85,
        'num_threads': 32,
        'metric': 'multi_logloss',
        'seed': 100
    }
    # this is an 8-class classification task, so use the classifier wrapper
    model = lgb.LGBMClassifier(**params)
    return model
- An XGBoost example (model definition only; training is shown further below)
def xgb_1():
    '''
    # Document
    https://www.programcreek.com/python/example/95386/xgboost.sklearn.XGBClassifier
    '''
    from xgboost.sklearn import XGBClassifier
    model = XGBClassifier(
        silent=0,
        learning_rate=0.3,
        min_child_weight=3,
        max_depth=6,
        gamma=0.1,
        subsample=0.7,
        max_delta_step=0,
        colsample_bytree=1,
        reg_lambda=1,
        objective='multi:softprob',
        num_class=8,
        seed=1000,
        eval_metric='mlogloss'
    )
    return model
- A model training example, using the xgb model as an illustration
model = xgb_1()
train = np.load(train_x_path)
labels = np.load(label_path)
model = model.fit(train, labels)
# the JSON helper above is Keras-specific; persist the sklearn-style XGBClassifier with joblib instead
import joblib
joblib.dump(model, 'xgb_1.joblib')
- A model prediction example
import joblib
test = np.load(test_x_path)
model = joblib.load('xgb_1.joblib')
res = model.predict_proba(test)
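predict_proba returns one probability per class and per sample (here a (n_samples, 8) array); to turn that into hard class labels, take the argmax over the class axis:
pred_labels = np.argmax(res, axis=1)  # index of the most probable class for each sample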
- A K-fold (StratifiedKFold) training example, used to reduce overfitting; 5 or 10 folds are typical
def train_xgb_2():
    import xgboost as xgb   # not imported in the block above
    params = {
        'booster': 'gbtree',
        'objective': 'multi:softprob',
        'num_class': 8,
        'gamma': 0.1,
        'max_depth': 5,
        'lambda': 2,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'min_child_weight': 3,
        'silent': 0,
        'eta': 0.01,
        'seed': 1000,
        'eval_metric': 'mlogloss'
    }
    train = np.load(train_x_path)
    test = np.load(test_x_path)
    labels = np.load(label_path)
    n_splits = 5
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    meta_train = np.zeros(shape=(len(train), 8))   # out-of-fold predictions on the training set
    meta_test = np.zeros(shape=(len(test), 8))     # averaged predictions on the test set
    for i, (train_ids, test_ids) in enumerate(skf.split(train, labels)):
        X_train, X_train_label = train[train_ids], labels[train_ids]
        X_val, X_val_label = train[test_ids], labels[test_ids]
        xgb_val = xgb.DMatrix(X_val, label=X_val_label)
        xgb_train = xgb.DMatrix(X_train, label=X_train_label)
        xgb_test = xgb.DMatrix(test)
        train_ = xgb.DMatrix(train)
        plst = list(params.items())
        num_rounds = 5000
        watchlist = [(xgb_train, 'train'), (xgb_val, 'val')]
        model = xgb.train(plst, xgb_train, num_rounds, watchlist,
                          early_stopping_rounds=100)
        pred_val = model.predict(xgb_val, ntree_limit=model.best_ntree_limit)
        meta_test_ = model.predict(xgb_test, ntree_limit=model.best_ntree_limit)
        meta_train_ = model.predict(train_, ntree_limit=model.best_ntree_limit)
        meta_train[test_ids] = pred_val
        # dp.save_model / dp.save_submit and the *_save paths are project-specific helpers
        dp.save_model(model, model_save + 'xgb_2_%s.m' % i)
        dp.save_submit(meta_train_, submit_save + 'xgb_2_%s_train.csv' % i)
        dp.save_submit(meta_test_, submit_save + 'xgb_2_%s_test.csv' % i)
        meta_test += meta_test_
    meta_test /= n_splits
    dp.save_submit(meta_train, submit_save + 'xgb_2_train.csv')
    dp.save_submit(meta_test, submit_save + 'xgb_2_test.csv')
- Let GPU memory grow dynamically during training (by default all GPU memory is allocated; this code is needed on a physical machine shared by several users)
def gpu_memory_dynamic():
    import keras.backend.tensorflow_backend as KTF
    import tensorflow as tf
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    KTF.set_session(sess)
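The snippet above targets TensorFlow 1.x with standalone Keras; for TensorFlow 2.x the equivalent setting, to the best of my knowledge, is per-GPU memory growth:
import tensorflow as tf
for gpu in tf.config.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)  # allocate memory as needed instead of grabbing it all up front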