输入:实值或离散值
输出:连续值域上的任意值
输入:实值或离散值
输出:将实例划分为两个或多个类别中的一个
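下面用一个极简示例直观对比两者(示意代码,数据为随意构造):回归模型输出连续值,分类模型输出离散类别。
# 示意:回归输出连续值,分类输出离散类别(数据为随意构造)
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
X_demo = np.array([[1.0], [2.0], [3.0], [4.0]])
y_reg = np.array([1.1, 1.9, 3.2, 3.8])  # 连续目标
y_clf = np.array([0, 0, 1, 1])          # 离散类别标签
print(LinearRegression().fit(X_demo, y_reg).predict([[2.5]]))   # 输出连续值
print(LogisticRegression().fit(X_demo, y_clf).predict([[2.5]]))  # 输出类别 0 或 1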
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings("ignore")
train_data_file = "./zhengqi_train.txt"
test_data_file = "./zhengqi_test.txt"
train_data = pd.read_csv(train_data_file,sep='\t',encoding='utf-8')
test_data = pd.read_csv(test_data_file,sep='\t',encoding='utf-8')
#1、看值
train_data.head()
test_data.head()
#2、看形状
train_data.shape
test_data.shape
#3、看训练集和测试集的占比
train_data.shape[0]/(train_data.shape[0]+test_data.shape[0])
test_data.shape[0]/(train_data.shape[0]+test_data.shape[0])
#4、查看数据基本信息:看有无缺失值(说明没有缺失值)和数据类型
train_data.info()
test_data.info()
#划分数据集:X和label,train和test[大赛给的test是没有标签的,我们需要在train数据集里面划分我们自己的train和test]
from sklearn.model_selection import train_test_split
X = train_data.drop('target',axis=1)
y = train_data.target
X.shape,y.shape
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,shuffle=True)#80%做训练集,20%做测试集
X_train.shape,X_test.shape,y_train.shape,y_test.shape
相关系数的取值区间为 [-1, 1]。
一般来说,在取绝对值后,0~0.09 为没有相关性,0.1~0.3 为弱相关,0.3~0.5 为中相关,0.5~1.0 为强相关。
代码如下
import numpy as np
X = np.array([65, 72, 78, 65, 72, 70, 65, 68])
Y = np.array([72, 69, 79, 69, 84, 75, 60, 73])
np.corrcoef(X, Y)
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
iris= load_iris()
X, y = iris.data, iris.target
chiValues = chi2(X, y)
X_new = SelectKBest(chi2, k=2).fit_transform(X, y)
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)
data_train1 = train_data.drop(['V5','V9','V11','V17','V22','V28'],axis=1)
train_corr = data_train1.corr()
train_corr
fig, ax = plt.subplots(figsize=(20, 16))  #调整画布大小
ax = sns.heatmap(train_corr, vmax=.8, square=True, annot=True)  #画热力图
#所有变量的箱型图
column = train_data.columns.tolist()[:39] #列表头
fig = plt.figure(figsize=(80,60),dpi=75) #指定绘图对象的宽度和高度
for i in range(38):
    plt.subplot(7, 8, i+1)  #7行8列子图
    sns.boxplot(data=train_data[column[i]], width=0.5)
    plt.ylabel(column[i], fontsize=36)  #纵坐标标签字号
plt.show()
#另一种画法
plt.figure(figsize=(18,10))
plt.boxplot(x=train_data.values,labels=train_data.columns)
plt.hlines([-7.5,7.5],0,40,colors='r')  #0、40 为横线的起止横坐标
plt.show()
#异常值的处理
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

def find_outliers(model, X, y, sigma=3):
    try:  #用模型来预测y值
        y_pred = pd.Series(model.predict(X), index=y.index)
    except:  #如果预测失败,先拟合模型再预测
        model.fit(X, y)
        y_pred = pd.Series(model.predict(X), index=y.index)
    resid = y - y_pred  #真实值与预测值的残差
    mean_resid = resid.mean()  #残差均值
    std_resid = resid.std()  #残差标准差
    z = (resid - mean_resid) / std_resid
    outliers = z[abs(z) > sigma].index  #异常值索引
    print('R2=', model.score(X, y))  #拟合优度 = 回归平方和 / 总平方和
    print('mse=', mean_squared_error(y, y_pred))
    print('-----------------------------------------')
    print('mean of residuals:', mean_resid)
    print('std of residuals:', std_resid)
    print('-----------------------------------------')
    print(len(outliers), 'outliers:')
    print(outliers.tolist())
    plt.figure(figsize=(15, 5))
    ax_131 = plt.subplot(1, 3, 1)
    plt.plot(y, y_pred, '.')  #正常样本的蓝点
    plt.plot(y.loc[outliers], y_pred.loc[outliers], 'ro')  #outlier 的红点
    plt.legend(['Accepted', 'Outlier'])  #顺序要和上面的绘图顺序对应
    plt.xlabel('y')
    plt.ylabel('y_pred')
    ax_132 = plt.subplot(1, 3, 2)
    plt.plot(y, y - y_pred, '.')
    plt.plot(y.loc[outliers], y.loc[outliers] - y_pred.loc[outliers], 'ro')
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('y')
    plt.ylabel('y-y_pred')
    ax_133 = plt.subplot(1, 3, 3)
    z.plot.hist(bins=50, ax=ax_133)  #残差 z 值的直方图
    z.loc[outliers].plot.hist(color='r', bins=50, ax=ax_133)
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('z')
    plt.savefig('outliers.png')
    return outliers
#用模型预测 y,残差与残差均值之差超过 sigma 倍标准差的样本视为异常值
outliers = find_outliers(Ridge(),X_train,y_train,sigma=3)
#1、删除异常值(横着删)
X_train = X_train.drop(outliers)
y_train = y_train.drop(outliers)
X_train.shape,y_train.shape
#2、删除异常值(竖着删)
#针对每一个特征向量删掉样本异常点
for f in X_train.columns:
    X_ = X_train.drop(f, axis=1)  #去掉特征 f,用其余特征作为输入
    y_ = X_train[f]  #以特征 f 作为被预测的目标
    outliers = find_outliers(Ridge(), X_, y_, sigma=4)
    X_train = X_train.drop(outliers)
    y_train = y_train.drop(outliers)
X_train.shape,y_train.shape
from sklearn.preprocessing import StandardScaler
StandardScaler().fit_transform(iris.data)
from sklearn.preprocessing import MinMaxScaler
MinMaxScaler().fit_transform(iris.data)
from sklearn.preprocessing import Normalizer
Normalizer().fit_transform(iris.data)
归一化与标准化的使用场景:标准化适合数据近似服从正态分布、后续模型基于距离或梯度(如线性回归、SVM、神经网络)的情况;归一化(Min-Max)适合需要把特征压缩到固定区间的情况,但对异常值比较敏感。
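下面用一个带异常值的小例子(示意代码,数据为随意构造)对比两种缩放方式的差异:
# 示意:含异常值时 Min-Max 归一化与标准化的差异(数据为随意构造)
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
x_demo = np.array([[1.0], [2.0], [3.0], [4.0], [100.0]])  # 100 为人为加入的异常值
print(MinMaxScaler().fit_transform(x_demo).ravel())    # 正常样本被压缩到接近 0
print(StandardScaler().fit_transform(x_demo).ravel())  # 以均值和标准差为基准进行缩放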
from sklearn.preprocessing import Binarizer
Binarizer(threshold=3).fit_transform(iris.data)
from sklearn.preprocessing import OneHotEncoder
OneHotEncoder(categories='auto').fit_transform(iris.target.reshape((-1,1)))
from numpy import vstack,array,nan
from sklearn.impute import SimpleImputer
#缺失值处理,返回值为处理缺失值后的数据
#参数 missing_values 为缺失值的表示形式,默认为 NaN
#参数 strategy 为缺失值的填充方式,默认为 mean(均值)
SimpleImputer().fit_transform(vstack((array([nan,nan,nan,nan]),iris.data)))
from sklearn.preprocessing import PolynomialFeatures
PolynomialFeatures().fit_transform(iris.data)
from numpy import log1p
from sklearn.preprocessing import FunctionTransformer
FunctionTransformer(log1p, validate=False).fit_transform(iris.data)
针对本题代码:
#(1)用sklearn归一化【建议】
from sklearn import preprocessing
features_columns = list(X_train.columns)
min_max_scaler = preprocessing.MinMaxScaler()#缩放
min_max_scaler = min_max_scaler.fit(X_train[features_columns])
X_train_scaler = pd.DataFrame(min_max_scaler.transform(X_train[features_columns]))
X_test_scaler = pd.DataFrame(min_max_scaler.transform(X_test[features_columns]))#自己分的测试集
X_test_data_scaler = pd.DataFrame(min_max_scaler.transform(test_data[features_columns]))#大赛给的测试集
#(2)标准化变换过程
#法二:使用sklearn实现Yeo-Johnson
import numpy as np
from sklearn.preprocessing import PowerTransformer
pt = PowerTransformer()#这里method默认是Yeo-Johnson
pt.fit(X_train_scaler)
X_train_s_bc = pt.transform(X_train_scaler)
X_test_s_bc = pt.transform(X_test_scaler)
X_test_data_s_bc = pt.transform(X_test_data_scaler)
X_train_s_bc = pd.DataFrame(X_train_s_bc)
X_test_s_bc = pd.DataFrame(X_test_s_bc)
X_test_data_s_bc = pd.DataFrame(X_test_data_s_bc)
#(3)查看目前的数据状况(之前也要查看滴)
#Q-Q 图(散点图):数据分位数(蓝色散点)与理论正态分布分位数(红色参考线)的对比
#直方图:数据的分布图
train_cols = 6
train_rows = len(X_train_s_bc.columns)
plt.figure(figsize=(4*train_cols,4*train_rows))
i=0
for col in X_train_s_bc.columns:
    i += 1
    ax = plt.subplot(train_rows, train_cols, i)
    sns.distplot(X_train_s_bc[col])  #直方图
    i += 1
    ax = plt.subplot(train_rows, train_cols, i)
    res = stats.probplot(X_train_s_bc[col], plot=plt)  #Q-Q 图
plt.tight_layout()
plt.show()
from sklearn.feature_selection import VarianceThreshold
from sklearn.datasets import load_iris
iris = load_iris()
#方差选择法,返回值为特征选择后的数据
#参数threshold为方差的阈值
VarianceThreshold(threshold=3).fit_transform(iris.data)
import numpy as np
from sklearn.datasets import load_iris
iris = load_iris()
from array import array
from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr
#选择K个最好的特征 返回选择特征后的数据
#第一个参数为计算评估特征的函数,该函数输入特征矩阵和目标向量,输出二元组(评分,P值)的数组,数组第i项为第i个特征的评分和P值。在此定义为计算相关系数
#参数k为选择的特征个数
SelectKBest(lambda X,Y:np.array(list(map(lambda x:pearsonr(x, Y),X.T))).T[0],k=2).fit_transform(iris.data,iris.target)
from sklearn.datasets import load_iris
iris = load_iris()
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
#选择k个最好的特征,返回选择特征后的数据
SelectKBest(chi2, k=2).fit_transform(iris.data, iris.target)
import numpy as np
from sklearn.feature_selection import SelectKBest
from minepy import MINE
#由于 MINE 的设计不是函数式的,因此需要定义 mic 方法将其转换为函数式,返回一个二元组,二元组的第2项设置成固定的P值,为0.5
def mic(x, y):
    m = MINE()
    m.compute_score(x, y)
    return (m.mic(), 0.5)
#选择k个最好的特征,返回特征选择后的数据
SelectKBest(lambda X, Y: np.array(list(map(lambda x: mic(x, Y), X.T))).T[0], k=2).fit_transform(iris.data, iris.target)
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
#递归特征消除法,返回特征选择后的数据
#参数 estimator 为基模型
#参数 n_features_to_select 为选择的特征个数
RFE(estimator=LogisticRegression(multi_class='auto', solver='lbfgs', max_iter=500), n_features_to_select=2).fit_transform(iris.data, iris.target)
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
#将带 L2 惩罚项的逻辑回归作为基模型进行特征选择
SelectFromModel(
LogisticRegression(penalty='l2',C=0.1,solver='lbfgs',multi_class='auto')).fit_transform(iris.data,iris.target)
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
#将 GBDT 作为基模型的特征选择
SelectFromModel(GradientBoostingClassifier()).fit_transform(iris.data,iris.target)
这道题的代码
#统一特征
X_train_s_bc.columns = test_data.columns
X_test_s_bc.columns = test_data.columns
X_test_data_s_bc.columns = test_data.columns
dist_cols = 6
dist_rows = len(test_data.columns)
plt.figure(figsize=(4*dist_cols,4*dist_rows))
i = 1
for col in test_data.columns:
    ax = plt.subplot(dist_rows, dist_cols, i)
    ax = sns.kdeplot(train_data[col], color='Red', shade=True)
    ax = sns.kdeplot(test_data[col], color='Blue', shade=True)
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax = ax.legend(['train', 'test'])
    i += 1
plt.show()
X_train_s_bc_t = pd.concat([X_train_s_bc,y_train],axis = 1)#带个target
X_train_s_bc_kde = X_train_s_bc.drop(['V5','V9','V11','V17','V22','V28'],axis=1)
X_test_s_bc_kde = X_test_s_bc.drop(['V5','V9','V11','V17','V22','V28'],axis=1)
X_test_data_s_bc_kde = X_test_data_s_bc.drop(['V5','V9','V11','V17','V22','V28'],axis=1)
data_train1 = X_train_s_bc_t.drop(['V5','V9','V11','V17','V22','V28'],axis=1)
#k个与target最相关特征
k = 10
cols = train_corr.nlargest(k,'target')['target'].index
cm = np.corrcoef(train_data[cols].values.T)
fig, ax = plt.subplots(figsize=(10, 10))  #调整画布大小
hm = sns.heatmap(train_data[cols].corr(), annot=True, square=True)
plt.show()
#与target相关系数大于0.5的特征变量
threshold = 0.5
corrmat = train_data.corr()
top_corr_features = corrmat.index[abs(corrmat['target'])>threshold]
plt.figure(figsize=(10,10))
g = sns.heatmap(train_data[top_corr_features].corr(),annot=True,cmap='RdYlGn')
(2)相关系数只能刻画两个变量之间的关系;如果某个变量可以被其他几个变量共同(线性)解释,仅靠相关性检验是发现不了的,这时可以用方差膨胀因子(VIF)来检验多重共线性。
from statsmodels.stats.outliers_influence import variance_inflation_factor
#多重共线性
new_numerical = X_train_s_bc_kde.columns.tolist()
X = X_train_s_bc_kde.values
VIF_list = [variance_inflation_factor(X,i) for i in range(X.shape[1])]
VIF_list
#一般0.9比较高
from sklearn.decomposition import PCA
#主成分分析法,返回降维后的数据
#参数 n_components 为主成分的数目
PCA(n_components=2).fit_transform(iris.data)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
#线性判别分析法,返回降维后的数据
#参数n_components为降维后的维数
LDA(n_components=2).fit_transform(iris.data, iris.target)
这道题的代码
#处理PCA降维
from sklearn.decomposition import PCA #主成分分析法
pca = PCA(n_components=0.95)
X_train_s_bc_kde_pca_95 = pca.fit_transform(X_train_s_bc_kde)
X_test_s_bc_kde_pca_95 = pca.transform(X_test_s_bc_kde)
X_test_data_s_bc_kde_pca_95 = pca.transform(X_test_data_s_bc_kde)
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
pca = PCA(n_components=16)
features_columns = [col for col in train_data.columns if col not in ['target']]
train_data_scaler = pd.DataFrame(min_max_scaler.transform(train_data[features_columns]))
test_data_scaler = pd.DataFrame(min_max_scaler.transform(test_data[features_columns]))
new_train_pca_16 = pca.fit_transform(train_data_scaler.iloc[:, 0:-1])
new_test_pca_16 = pca.transform(test_data_scaler.iloc[:, 0:-1])
new_train_pca_16 = pd.DataFrame(new_train_pca_16)
new_test_pca_16 = pd.DataFrame(new_test_pca_16)
new_train_pca_16['target'] = train_data['target']
线性回归调用
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
new_train_pca_16 = new_train_pca_16.fillna(0) #采用 PCA 保留 16 维特征的数据
train= new_train_pca_16[new_test_pca_16.columns]
target = new_train_pca_16['target']
train_data, test_data,train_target,test_target = train_test_split(train,target,test_size=0.2,random_state=0)
clf = LinearRegression()
clf.fit(train_data,train_target)
test_pred = clf.predict(test_data)
score = mean_squared_error(test_target,clf.predict(test_data))
print("LinearRegression: ",score)
from sklearn.neighbors import KNeighborsRegressor
clf = KNeighborsRegressor(n_neighbors=3)
clf.fit(train_data,train_target)
test_pred = clf.predict(test_data)
score = mean_squared_error(test_target,clf.predict(test_data))
print("KNeighborsRegressor: ",score)
#从 sklearn 算法库中导入决策树回归算法
from sklearn.tree import DecisionTreeRegressor
clf = DecisionTreeRegressor()
clf.fit(train_data,train_target)
test_pred = clf.predict(test_data)
score= mean_squared_error(test_target,clf.predict(test_data))
print("DecisionTreeRegressor: ", score)
#从 sklearn 算法库中导入随机森林回归树模型
from sklearn.ensemble import RandomForestRegressor
clf= RandomForestRegressor(n_estimators=200) # 200 棵树模型
clf.fit(train_data,train_target)
test_pred = clf.predict(test_data)
score= mean_squared_error(test_target,clf.predict(test_data))
print ("RandomForestRegressor: ",score)
# LGB 回归模型
clf = lgb.LGBMRegressor(learning_rate=0.01, max_depth=-1, n_estimators=5000, boosting_type='gbdt', random_state=2019, objective='regression')
#训练模型
clf.fit(X=train_data, y=train_target, eval_metric='MSE', verbose=50)
score = mean_squared_error(test_target,clf.predict(test_data))
print ("lightGbm: ", score)
数据
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LinearRegression #线性回归
from sklearn.neighbors import KNeighborsRegressor # 近邻回归
from sklearn.tree import DecisionTreeRegressor #决策树回归
from sklearn.ensemble import RandomForestRegressor #随机森林回归
from sklearn.svm import SVR #支持向量回归
import lightgbm as lgb # LightGBM 模型
from sklearn.model_selection import train_test_split #切分数据
from sklearn.metrics import mean_squared_error #评价指标
from sklearn.linear_model import SGDRegressor
#读取数据
train_data_file = "./zhengqi_train.txt"
test_data_file = "./zhengqi_test.txt"
train_data = pd.read_csv(train_data_file, sep= '\t', encoding= 'utf-8')
test_data = pd.read_csv(test_data_file, sep='\t', encoding='utf-8')
#归一化处理
from sklearn import preprocessing
features_columns = [col for col in train_data.columns if col not in ['target']]
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler = min_max_scaler.fit(train_data[features_columns])
train_data_scaler = min_max_scaler.transform(train_data[features_columns])
test_data_scaler = min_max_scaler.transform(test_data[features_columns])
train_data_scaler = pd.DataFrame(train_data_scaler)
train_data_scaler.columns= features_columns
test_data_scaler = pd.DataFrame(test_data_scaler)
test_data_scaler.columns = features_columns
train_data_scaler['target'] = train_data['target']
# PCA 方法降维
from sklearn.decomposition import PCA #主成分分析法
#保留 16 个主成分
pca= PCA(n_components=16)
new_train_pca_16 = pca.fit_transform(train_data_scaler.iloc[:,0:-1])
new_test_pca_16 = pca.transform(test_data_scaler)
new_train_pca_16 = pd.DataFrame(new_train_pca_16)
new_test_pca_16 = pd.DataFrame(new_test_pca_16)
new_train_pca_16['target'] =train_data_scaler['target']
#将数据切分为训练数据和验证数据
#保留16维特征并切分数据
new_train_pca_16 = new_train_pca_16.fillna(0)
train = new_train_pca_16[new_test_pca_16.columns]
target = new_train_pca_16['target']
#切分数据,训练数据为80% ,验证数据为20%
train_data, test_data, train_target, test_target = train_test_split(train,target, test_size=0.2, random_state=0)
对于数据
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(666)
x = np.random.uniform(-3.0,3.0,size=100)
X = x.reshape(-1,1)
y = 0.5*x**2+x+2+np.random.normal(0,1,size=100)
plt.scatter(x,y)
plt.show()
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X,y)#训练
lin_reg.score(X,y)#评估模型性能
from sklearn.metrics import mean_squared_error
y_predict = lin_reg.predict(X)#预测
display(mean_squared_error(y,y_predict))
plt.scatter(x, y)
plt.plot(np.sort(x),y_predict[np.argsort(x)] , color='r')
plt.show()
#多项式拟合
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
def PolynomialRegression(degree):
    return Pipeline([('poly', PolynomialFeatures(degree=degree)),
                     ('std_scaler', StandardScaler()),
                     ('lin_reg', LinearRegression())])
poly2_reg = PolynomialRegression(degree=2)
poly2_reg.fit(X,y)
y2_predict = poly2_reg.predict(X)
#比较真实值和预测值的均方误差
display(mean_squared_error(y,y2_predict))
#拟合结果可视化
plt.scatter(x,y)
plt.plot(np.sort(x),y2_predict[np.argsort(x)],color='r')
plt.show()
poly10_reg = PolynomialRegression(degree=10)
poly10_reg.fit(X,y)
y10_predict = poly10_reg.predict(X)
#比较真实值和预测值的均方误差
display(mean_squared_error(y,y10_predict))
plt.scatter(x,y)
plt.plot(np.sort(x),y10_predict[np.argsort(x)],color='r')
plt.show()
poly100_reg = PolynomialRegression(degree=100)
poly100_reg.fit(X,y)
y100_predict = poly100_reg.predict(X)
#比较真实值和预测值的均方误差
display(mean_squared_error(y,y100_predict))
plt.scatter(x,y)
plt.plot(np.sort(x),y100_predict[np.argsort(x)],color='r')
plt.show()
degree 越大,模型在训练集上拟合得越好:样本点是固定的,总能找到一条曲线穿过所有样本点,使训练集上的均方误差趋近于 0。但图中红色曲线并不是真正学到的拟合曲线,而只是把各训练样本的预测值按 x 排序后连线的结果,在没有数据点的区域,它和真实的拟合曲线并不一致。对未知数据做预测时,过度拟合训练集会降低模型的泛化能力,使预测偏差增大。
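为直观验证上述结论,可以把这份模拟数据切分成训练集和测试集,对比不同 degree 下两者的 MSE(示意代码,沿用前文定义的 X、y 和 PolynomialRegression):
# 示意:不同 degree 下训练集 / 测试集 MSE 的对比,观察过拟合
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=666)
for degree in [1, 2, 10, 100]:
    reg = PolynomialRegression(degree=degree).fit(X_tr, y_tr)
    print(degree,
          mean_squared_error(y_tr, reg.predict(X_tr)),   # 训练误差随 degree 增大而下降
          mean_squared_error(y_te, reg.predict(X_te)))   # 测试误差先降后升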
这道题的欠拟合情况:
#欠拟合情况
clf = SGDRegressor(max_iter=500,tol=1e-2)
clf.fit(train_data,train_target)
score_train = mean_squared_error(train_target,clf.predict(train_data))
score_test = mean_squared_error(test_target,clf.predict(test_data))
print('SGDRegressor train MSE: ',score_train)
print('SGDRegressor test MSE: ',score_test)
过拟合情况
#过拟合情况
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=5)
train_data_poly = poly.fit_transform(train_data)
test_data_poly = poly.transform(test_data)
clf = SGDRegressor(max_iter=1000,tol=1e-3)
clf.fit(train_data_poly,train_target)
score_train = mean_squared_error(train_target,clf.predict(train_data_poly))
score_test = mean_squared_error(test_target,clf.predict(test_data_poly))
print('SGDRegressor train MSE: ',score_train)
print('SGDRegressor test MSE: ',score_test)
#正常拟合情况
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=3)
train_data_poly = poly.fit_transform(train_data)
test_data_poly = poly.transform(test_data)
clf = SGDRegressor(max_iter=1000,tol=1e-3)
clf.fit(train_data_poly,train_target)
score_train = mean_squared_error(train_target,clf.predict(train_data_poly))
score_test = mean_squared_error(test_target,clf.predict(test_data_poly))
print('SGDRegressor train MSE: ',score_train)
print('SGDRegressor test MSE: ',score_test)
在正常拟合的情况下,模型在训练集和测试集上得到的 MSE 均为三种拟合情况中较小的,说明此时模型较好地表达了数据关系。
加入正则化项后的代价函数(损失函数)为
$$J(w)=\frac{1}{2}\sum_{j=1}^{N}\{y_j-w^T\sigma(x_j)\}^2+\frac{\lambda}{2}\sum_{j=1}^{N}|w_j|^q$$
L2正则化
$$J = J_0+\alpha\sum w^2$$
L1正则化
$$J = J_0+\alpha\sum |w|$$
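为直观感受两种正则化的差别,可以在一份只有少数有效特征的模拟数据上对比 Lasso(L1)与 Ridge(L2)的系数(示意代码,数据为随意构造):L1 倾向于把无用特征的系数压成 0,L2 则整体收缩系数。
# 示意:L1(Lasso)产生稀疏系数,L2(Ridge)整体收缩系数(数据为随意构造)
import numpy as np
from sklearn.linear_model import Lasso, Ridge
rng = np.random.RandomState(0)
X_demo = rng.randn(100, 5)
y_demo = 3 * X_demo[:, 0] + 0.5 * X_demo[:, 1] + rng.randn(100) * 0.1  # 只有前两个特征真正有用
print(Lasso(alpha=0.1).fit(X_demo, y_demo).coef_)  # 无用特征的系数基本为 0
print(Ridge(alpha=1.0).fit(X_demo, y_demo).coef_)  # 系数被整体压小但一般不为 0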
这道题的正则化:
#模型正则化L2
poly = PolynomialFeatures(degree=3)
train_data_poly = poly.fit_transform(train_data)
test_data_poly = poly.transform(test_data)
clf = SGDRegressor(max_iter=1000, tol=1e-3, penalty='l2', alpha=0.0001)
clf.fit(train_data_poly,train_target)
score_train = mean_squared_error(train_target,clf.predict(train_data_poly))
score_test = mean_squared_error(test_target,clf.predict(test_data_poly))
print('SGDRegressor train MSE: ',score_train)
print('SGDRegressor test MSE: ',score_test)
#模型正则化L1
poly = PolynomialFeatures(degree=3)
train_data_poly = poly.fit_transform(train_data)
test_data_poly = poly.transform(test_data)
clf = SGDRegressor(max_iter=1000, tol=1e-3, penalty='l1', alpha=0.0001)
clf.fit(train_data_poly,train_target)
score_train = mean_squared_error(train_target,clf.predict(train_data_poly))
score_test = mean_squared_error(test_target,clf.predict(test_data_poly))
print('SGDRegressor train MSE: ',score_train)
print('SGDRegressor test MSE: ',score_test)
#采用ElasticNet联合L1和L2范数加权正则化
poly = PolynomialFeatures(degree=3)
train_data_poly = poly.fit_transform(train_data)
test_data_poly = poly.transform(test_data)
clf = SGDRegressor(max_iter=1000,tol=1e-3,penalty='elasticnet',l1_ratio = 0.9,alpha=0.0001)
clf.fit(train_data_poly,train_target)
score_train = mean_squared_error(train_target,clf.predict(train_data_poly))
score_test = mean_squared_error(test_target,clf.predict(test_data_poly))
print('SGDRegressor train MSE: ',score_train)
print('SGDRegressor test MSE: ',score_test)
$$MAE = \frac{1}{n}\sum_{i=1}^{n}|f_i-y_i|=\frac{1}{n}\sum_{i=1}^{n}|e_i|$$
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test,y_pred)
$$MSE = \frac{1}{n}\sum_{i=1}^{n}(observed_i-predicted_i)^2$$
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test,y_pred)
from sklearn.metrics import mean_squared_error
Pred_Error = mean_squared_error(y_test, y_pred)
np.sqrt(Pred_Error)
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)
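一个很小的数值例子(示意性质),把 MAE、MSE、RMSE、R2 放在一起计算,便于和上面的公式对照:
# 示意:用一组很小的真实值 / 预测值同时计算四个回归评价指标
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
y_true = np.array([3.0, -0.5, 2.0, 7.0])
y_hat = np.array([2.5, 0.0, 2.0, 8.0])
print(mean_absolute_error(y_true, y_hat))           # MAE = 0.5
print(mean_squared_error(y_true, y_hat))            # MSE = 0.375
print(np.sqrt(mean_squared_error(y_true, y_hat)))   # RMSE ≈ 0.612
print(r2_score(y_true, y_hat))                      # R2 ≈ 0.949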
在某种意义下将原始数据进行分组,一部分作为训练集,另一部分作为验证集。首先用训练集对分类器进行训练,再利用验证集测试训练得到的模型,以此作为评价分类器性能的指标。
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=.4, random_state=0)
from sklearn.model_selection import KFold
kf = KFold(n_splits=10)
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
from sklearn.model_selection import LeavePOut
lpo = LeavePOut(p=5)
这道题的交叉检验
#简单交叉检验
from sklearn.model_selection import train_test_split #切分数据
#切分数据,训练数据为 80%,验证数据为 20%
train_data, test_data, train_target, test_target =train_test_split(train, target, test_size=0.2, random_state=0)
clf= SGDRegressor(max_iter=1000, tol=1e-3)
clf.fit(train_data, train_target)
score_train = mean_squared_error(train_target,clf.predict(train_data))
score_test = mean_squared_error(test_target,clf.predict(test_data))
print ("SGDRegressor train MSE: ", score_train)
print("SGDRegressor test MSE: ", score_test)
#K折交叉检验
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
for k, (train_index, test_index) in enumerate(kf.split(train)):
    train_data, test_data, train_target, test_target = train.values[train_index], train.values[test_index], target[train_index], target[test_index]
    clf = SGDRegressor(max_iter=1000, tol=1e-3)
    clf.fit(train_data, train_target)
    score_train = mean_squared_error(train_target, clf.predict(train_data))
    score_test = mean_squared_error(test_target, clf.predict(test_data))
    print(k, "折", "SGDRegressor train MSE: ", score_train)
    print(k, "折", "SGDRegressor test MSE: ", score_test, '\n')
#留一法
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
num = 100
for k, (train_index, test_index) in enumerate(loo.split(train)):
    train_data, test_data, train_target, test_target = train.values[train_index], train.values[test_index], target[train_index], target[test_index]
    clf = SGDRegressor(max_iter=1000, tol=1e-3)
    clf.fit(train_data, train_target)
    score_train = mean_squared_error(train_target, clf.predict(train_data))
    score_test = mean_squared_error(test_target, clf.predict(test_data))
    print(k, "个", "SGDRegressor train MSE: ", score_train)
    print(k, "个", "SGDRegressor test MSE: ", score_test, '\n')
    if k >= 9:
        break
#留P法交叉验证
from sklearn.model_selection import LeavePOut
lpo = LeavePOut(p=10)
num = 100
for k, (train_index, test_index) in enumerate(lpo.split(train)):
    train_data, test_data, train_target, test_target = train.values[train_index], train.values[test_index], target[train_index], target[test_index]
    clf = SGDRegressor(max_iter=1000, tol=1e-3)
    clf.fit(train_data, train_target)
    score_train = mean_squared_error(train_target, clf.predict(train_data))
    score_test = mean_squared_error(test_target, clf.predict(test_data))
    print(k, "10个", "SGDRegressor train MSE: ", score_train)
    print(k, "10个", "SGDRegressor test MSE: ", score_test, '\n')
    if k >= 9:
        break
网格搜索(Grid Search)是一种穷举搜索的调参手段:遍历给定的参数组合,用验证集或交叉验证评估每一组,选出表现最好的参数。
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
iris = load_iris()
X_train,X_test,y_train,y_test = train_test_split(iris.data,iris.target,random_state=0)
print("Size of training set:{} size of testing set:{}".format(X_train.shape[0],X_test.shape[0]))
#grid search start
best_score = 0
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        svm = SVC(gamma=gamma, C=C)  #对于每种参数可能的组合都进行一次训练
        svm.fit(X_train, y_train)
        score = svm.score(X_test, y_test)
        if score > best_score:  #找到表现最好的参数
            best_score = score
            best_parameters = {'gamma': gamma, 'C': C}
### grid search end
print("Best score:{:.2f}".format(best_score))
print("Best parameters:{}".format(best_parameters))
#穷举网格搜索
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split #切分数据
#切分数据,训练数据为80%, 验证数据为 20%
train_data,test_data,train_target,test_target =train_test_split(train,target,test_size=0.2,random_state=0)
randomForestRegressor = RandomForestRegressor()
parameters = { 'n_estimators': [50,100,200] ,'max_depth': [1,2,3]}
clf = GridSearchCV(randomForestRegressor, parameters, cv=5)
clf.fit(train_data, train_target)
score_test = mean_squared_error(test_target, clf.predict(test_data))
print("RandomForestRegressor GridSearchCV test MSE: ", score_test)
sorted(clf.cv_results_.keys())
#随机参数优化
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split #切分数据
#切分数据,训练数据为 80%,验证数据为 20%
train_data,test_data,train_target,test_target = train_test_split(train,target,test_size=0.2, random_state=0)
randomForestRegressor = RandomForestRegressor()
parameters = {
'n_estimators': [50 , 100 , 200 , 300] ,
'max_depth': [1 , 2 , 3 , 4 , 5 ]
}
clf = RandomizedSearchCV(randomForestRegressor, parameters, cv=5)
clf.fit(train_data, train_target)
score_test = mean_squared_error(test_target, clf.predict(test_data))
print("RandomForestRegressor RandomizedSearchCV test MSE: ", score_test)
sorted(clf.cv_results_.keys())
学习曲线是在训练集大小不同时,通过绘制模型训练集和交叉验证集上的准确率来观察模型在新数据上的表现,进而判断模型的方差或偏差是否过高,以及增大训练集是否可以减小过拟合。
这道题的学习曲线:
#学习曲线
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import learning_curve
plt.figure(figsize=(18,10),dpi=150)
train_data2 = pd.read_csv('./zhengqi_train.txt', sep='\t')
test_data2 = pd.read_csv('./zhengqi_test.txt', sep='\t')
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color='r')
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color='g')
    plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training score')
    plt.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Cross-validation score')
    plt.legend(loc='best')
    return plt
X = train_data2[test_data2.columns].values
y = train_data2['target'].values
title = 'LinearRegression'
cv = ShuffleSplit(n_splits=100,test_size=0.2,random_state=0)
estimator = SGDRegressor()
plot_learning_curve(estimator,title,X,y,ylim=(0.7,1.01),cv=cv,n_jobs=-1)
和学习曲线不同,验证曲线的横轴为某个超参数的一系列值,由此比较不同参数设置下(而非不同训练集大小)模型的准确率。
这道题的验证曲线
#验证曲线
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import validation_curve
X = train_data2[test_data2.columns].values
y = train_data2['target'].values
param_range = [0.1,0.01,0.001,0.0001,0.00001,0.000001]
train_scores, test_scores = validation_curve(SGDRegressor(max_iter=1000, tol=1e-3, penalty='l1'), X, y, param_name='alpha', param_range=param_range, cv=10, scoring='r2', n_jobs=1)
train_scores_mean = np.mean(train_scores,axis=1)
train_scores_std = np.std(train_scores,axis=1)
test_scores_mean = np.mean(test_scores,axis=1)
test_scores_std = np.std(test_scores,axis=1)
plt.title("Validation Curve with SGDRegressor")
plt.xlabel('alpha')
plt.ylabel('Score')
plt.ylim(0.0,1.1)
plt.semilogx(param_range, train_scores_mean, label="Training score", color= "r")
plt.fill_between(param_range, train_scores_mean - train_scores_std, train_scores_mean+ train_scores_std, alpha=0.2, color="r")
plt.semilogx(param_range,test_scores_mean, label="Cross-validation score", color="g")
plt.fill_between(param_range, test_scores_mean - test_scores_std,test_scores_mean+ test_scores_std, alpha=0.2, color="g")
plt.legend(loc="best")
plt.show()
epsilon = 1e-5
func_dict = {
'add':lambda x,y:x+y,
'min':lambda x,y:x-y,
'div':lambda x,y:x/(y+epsilon),
'multi':lambda x,y:x*y
}
def auto_features_make(train_data, test_data, func_dict, col_list):
    train_data, test_data = train_data.copy(), test_data.copy()
    for col_i in col_list:
        for col_j in col_list:
            for func_name, func in func_dict.items():
                for data in [train_data, test_data]:
                    func_features = func(data[col_i], data[col_j])
                    col_func_features = '-'.join([col_i, func_name, col_j])
                    data[col_func_features] = func_features
    return train_data, test_data
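定义好之后可以按如下方式调用(示意写法,假定 train_data、test_data 为最初读入的原始 DataFrame,col_list 取全部特征列;两两交叉后特征数量会大幅膨胀,实际使用时可只挑选部分重要特征):
# 示意:用全部特征列做两两交叉衍生;col_list 可替换为自选的特征子集
col_list = test_data.columns.tolist()
train_data_new, test_data_new = auto_features_make(train_data, test_data, func_dict, col_list)
print(train_data_new.shape, test_data_new.shape)  # 衍生后列数会大幅增加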
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import itertools
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import EnsembleVoteClassifier
from mlxtend.data import iris_data
from mlxtend.plotting import plot_decision_regions
clf1 = LogisticRegression(random_state=0,solver='lbfgs',multi_class='auto')
clf2 = RandomForestClassifier(random_state=0,n_estimators=100)
clf3 = SVC(random_state=0,probability=True,gamma='auto')
eclf = EnsembleVoteClassifier(clfs=[clf1,clf2,clf3],weights=[2,1,1],voting='soft')
X,y = iris_data()
X = X[:,[0,2]]
gs = gridspec.GridSpec(1,4)
fig = plt.figure(figsize=(16,4))
for clf, lab, grd in zip([clf1, clf2, clf3, eclf],
                         ['Logistic Regression', 'Random Forest', 'RBF kernel SVM', 'Ensemble'],
                         itertools.product([0, 1], repeat=2)):
    clf.fit(X, y)
    ax = plt.subplot(gs[0, grd[0]*2 + grd[1]])
    fig = plot_decision_regions(X=X, y=y, clf=clf, legend=2)
    plt.title(lab)
model='Ridge'
opt_models[model]=Ridge()
alph_range = np.arange(0.25,6,0.25)
param_grid = {'alpha':alph_range}
opt_models[model],cv_score,grid_results = train_model(opt_models[model], param_grid=param_grid,splits=splits, repeats=repeats)
cv_score.name = model
score_models = score_models.append(cv_score)
plt.figure()
plt.errorbar(alph_range,abs(grid_results['mean_test_score']),abs(grid_results['std_test_score'])/np.sqrt(splits*repeats))
plt.xlabel('alpha')
plt.ylabel('score')
(1)真实值(横轴:y)与模型预测值(竖轴:y_pred)的散点图,图形上方显示了相关性数值,其越接近 1 越好。对于岭回归模型,相关性数值为 0.947,预测值与真实值比较一致。
(2)交叉验证训练模型时,真实值(横轴:y)与模型预测值和真实值的残差(竖轴:y - y_pred)的散点图,图形上方显示了残差的方差,其越小说明模型越稳定。可以看到,对于岭回归模型,在真实值 y=-3 附近的预测值有较大的偏差;同时方差为 0.319,较为稳定。
(3)由标准化残差 z = (resid - mean_resid) / std_resid(横轴)落在各区间的频数(竖轴)所画的直方图,图形上方显示了残差大于三倍标准差的样本数,其越小越好,越大说明预测中有些样本的偏差很大。对于岭回归模型,残差大于三倍标准差的样本有 5 个,模型对偏差大的数据有较好的包容性。
(4) 岭回归模型的参数(横轴 :alpha) 与模型的评价指标 MSE (竖轴: score)的误差棒图。
2. Lasso 回归
model ='Lasso'
opt_models[model] = Lasso()
alph_range = np.arange(1e-4,1e-3,4e-5)
param_grid = {'alpha': alph_range}
opt_models[model],cv_score,grid_results = train_model(opt_models[model],param_grid=param_grid,splits=splits,repeats=repeats)
cv_score.name= model
score_models = score_models.append(cv_score)
plt.figure()
plt.errorbar(alph_range,abs(grid_results['mean_test_score']),abs(grid_results['std_test_score'])/np.sqrt(splits * repeats))
plt.xlabel ('alpha')
plt.ylabel ('score')
model='ElasticNet'
opt_models[model] = ElasticNet()
param_grid = {'alpha': np.arange(1e-4, 1e-3, 1e-4), 'l1_ratio': np.arange(0.1, 1.0, 0.1), 'max_iter': [100000]}
opt_models[model], cv_score, grid_results = train_model(opt_models[model],param_grid=param_grid,splits=splits,repeats=1)
cv_score.name = model
score_models = score_models.append(cv_score)
model='LinearSVR'
opt_models[model] = LinearSVR()
crange = np.arange(0.1,1.0,0.1)
param_grid = {'C':crange,'max_iter':[1000]}
opt_models[model],cv_score,grid_results = train_model(opt_models[model],param_grid=param_grid,splits=splits,repeats=repeats)
cv_score.name= model
score_models = score_models.append(cv_score)
plt.figure()
plt.errorbar(crange, abs(grid_results['mean_test_score']), abs(grid_results['std_test_score'])/np.sqrt(splits * repeats))
plt.xlabel('C')
plt.ylabel('score')
model='KNeighbors'
opt_models[model] = KNeighborsRegressor()
param_grid = {'n_neighbors': np.arange(3,11,1)}
opt_models[model], cv_score, grid_results = train_model(opt_models[model],param_grid=param_grid,splits=splits,repeats=1)
cv_score.name = model
score_models = score_models.append(cv_score)
plt.figure()
plt.errorbar(np.arange(3, 11, 1),
             abs(grid_results['mean_test_score']),
             abs(grid_results['std_test_score']) / np.sqrt(splits * 1))
plt.xlabel('n_neighbors')
plt.ylabel('score')
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
from scipy import sparse
import xgboost
import lightgbm
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
def stacking_reg(clf, train_x, train_y, test_x, clf_name, kf, label_split=None):
    train = np.zeros((train_x.shape[0], 1))
    test = np.zeros((test_x.shape[0], 1))
    test_pre = np.empty((folds, test_x.shape[0], 1))
    cv_scores = []
    for i, (train_index, test_index) in enumerate(kf.split(train_x, label_split)):
        tr_x = train_x[train_index]
        tr_y = train_y[train_index]
        te_x = train_x[test_index]
        te_y = train_y[test_index]
        if clf_name in ["rf", "ada", "gb", "et", "lr", "lsvc", "knn"]:
            clf.fit(tr_x, tr_y)
            pre = clf.predict(te_x).reshape(-1, 1)
            train[test_index] = pre
            test_pre[i, :] = clf.predict(test_x).reshape(-1, 1)
            cv_scores.append(mean_squared_error(te_y, pre))
        elif clf_name in ["xgb"]:
            train_matrix = clf.DMatrix(tr_x, label=tr_y, missing=-1)
            test_matrix = clf.DMatrix(te_x, label=te_y, missing=-1)
            z = clf.DMatrix(test_x, label=te_y, missing=-1)
            params = {'booster': 'gbtree',
                      'eval_metric': 'rmse',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.03,
                      'tree_method': 'exact',
                      'seed': 2017,
                      'nthread': 12
                      }
            num_round = 10000
            early_stopping_rounds = 100
            watchlist = [(train_matrix, 'train'), (test_matrix, 'eval')]
            if test_matrix:
                model = clf.train(params, train_matrix, num_boost_round=num_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds)
                pre = model.predict(test_matrix, ntree_limit=model.best_ntree_limit).reshape(-1, 1)
                train[test_index] = pre
                test_pre[i, :] = model.predict(z, ntree_limit=model.best_ntree_limit).reshape(-1, 1)
                cv_scores.append(mean_squared_error(te_y, pre))
        elif clf_name in ["lgb"]:
            train_matrix = clf.Dataset(tr_x, label=tr_y)
            test_matrix = clf.Dataset(te_x, label=te_y)
            params = {
                'boosting_type': 'gbdt',
                'objective': 'regression_l2',
                'metric': 'mse',
                'min_child_weight': 1.5,
                'num_leaves': 2**5,
                'lambda_l2': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'learning_rate': 0.03,
                'tree_method': 'exact',
                'seed': 2017,
                'nthread': 12,
                'silent': True,
            }
            num_round = 10000
            early_stopping_rounds = 100
            if test_matrix:
                model = clf.train(
                    params,
                    train_matrix,
                    num_round,
                    valid_sets=test_matrix,
                    early_stopping_rounds=early_stopping_rounds)
                pre = model.predict(te_x, num_iteration=model.best_iteration).reshape(-1, 1)
                train[test_index] = pre
                test_pre[i, :] = model.predict(test_x, num_iteration=model.best_iteration).reshape(-1, 1)
                cv_scores.append(mean_squared_error(te_y, pre))
        else:
            raise IOError("Please add new clf.")
        print("%s now score is:" % clf_name, cv_scores)
    test[:] = test_pre.mean(axis=0)
    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    return train.reshape(-1, 1), test.reshape(-1, 1)
def rf_reg(x_train, y_train, x_valid, kf, label_split=None):
    randomforest = RandomForestRegressor(n_estimators=600, max_depth=20, n_jobs=-1, random_state=2017, max_features="auto", verbose=1)
    rf_train, rf_test = stacking_reg(randomforest, x_train, y_train, x_valid, "rf", kf, label_split=label_split)
    return rf_train, rf_test, "rf_reg"

def ada_reg(x_train, y_train, x_valid, kf, label_split=None):
    adaboost = AdaBoostRegressor(n_estimators=30, random_state=2017, learning_rate=0.01)
    ada_train, ada_test = stacking_reg(adaboost, x_train, y_train, x_valid, "ada", kf, label_split=label_split)
    return ada_train, ada_test, "ada_reg"

def gb_reg(x_train, y_train, x_valid, kf, label_split=None):
    gbdt = GradientBoostingRegressor(learning_rate=0.04, n_estimators=100, subsample=0.8, random_state=2017, max_depth=5, verbose=1)
    gbdt_train, gbdt_test = stacking_reg(gbdt, x_train, y_train, x_valid, "gb", kf, label_split=label_split)
    return gbdt_train, gbdt_test, "gb_reg"

def et_reg(x_train, y_train, x_valid, kf, label_split=None):
    extratree = ExtraTreesRegressor(n_estimators=600, max_depth=35, max_features="auto", n_jobs=-1, random_state=2017, verbose=1)
    et_train, et_test = stacking_reg(extratree, x_train, y_train, x_valid, "et", kf, label_split=label_split)
    return et_train, et_test, "et_reg"

def lr_reg(x_train, y_train, x_valid, kf, label_split=None):
    lr_reg = LinearRegression(n_jobs=-1)
    lr_train, lr_test = stacking_reg(lr_reg, x_train, y_train, x_valid, "lr", kf, label_split=label_split)
    return lr_train, lr_test, "lr_reg"

def xgb_reg(x_train, y_train, x_valid, kf, label_split=None):
    xgb_train, xgb_test = stacking_reg(xgboost, x_train, y_train, x_valid, "xgb", kf, label_split=label_split)
    return xgb_train, xgb_test, "xgb_reg"

def lgb_reg(x_train, y_train, x_valid, kf, label_split=None):
    lgb_train, lgb_test = stacking_reg(lightgbm, x_train, y_train, x_valid, "lgb", kf, label_split=label_split)
    return lgb_train, lgb_test, "lgb_reg"
对模型融合 Stacking 的预测函数进行定义
def stacking_pred(x_train, y_train, x_valid, kf, clf_list, label_split=None, clf_fin="lgb", if_concat_origin=True):
    for k, clf_list in enumerate(clf_list):
        clf_list = [clf_list]
        column_list = []
        train_data_list = []
        test_data_list = []
        for clf in clf_list:
            train_data, test_data, clf_name = clf(x_train, y_train, x_valid, kf, label_split=label_split)
            train_data_list.append(train_data)
            test_data_list.append(test_data)
            column_list.append("clf_%s" % (clf_name))
    train = np.concatenate(train_data_list, axis=1)
    test = np.concatenate(test_data_list, axis=1)
    if if_concat_origin:
        train = np.concatenate([x_train, train], axis=1)
        test = np.concatenate([x_valid, test], axis=1)
    print(x_train.shape)
    print(train.shape)
    print(clf_name)
    print(clf_name in ["lgb"])
    if clf_fin in ["rf", "ada", "gb", "et", "lr", "lsvc", "knn"]:
        if clf_fin in ["rf"]:
            clf = RandomForestRegressor(n_estimators=600, max_depth=20, n_jobs=-1, random_state=2017, max_features="auto", verbose=1)
        elif clf_fin in ["ada"]:
            clf = AdaBoostRegressor(n_estimators=30, random_state=2017, learning_rate=0.01)
        elif clf_fin in ["gb"]:
            clf = GradientBoostingRegressor(learning_rate=0.04, n_estimators=100, subsample=0.8, random_state=2017, max_depth=5, verbose=1)
        elif clf_fin in ["et"]:
            clf = ExtraTreesRegressor(n_estimators=600, max_depth=35, max_features="auto", n_jobs=-1, random_state=2017, verbose=1)
        elif clf_fin in ["lr"]:
            clf = LinearRegression(n_jobs=-1)
        clf.fit(train, y_train)
        pre = clf.predict(test).reshape(-1, 1)
        return pre
    elif clf_fin in ["xgb"]:
        clf = xgboost
        train_matrix = clf.DMatrix(train, label=y_train, missing=-1)
        test_matrix = clf.DMatrix(train, label=y_train, missing=-1)
        params = {
            'booster': 'gbtree',
            'eval_metric': 'rmse',
            'gamma': 1,
            'min_child_weight': 1.5,
            'max_depth': 5,
            'lambda': 10,
            'subsample': 0.7,
            'colsample_bytree': 0.7,
            'colsample_bylevel': 0.7,
            'eta': 0.03,
            'tree_method': 'exact',
            'seed': 2017,
            'nthread': 12
        }
        num_round = 10000
        early_stopping_rounds = 100
        watchlist = [(train_matrix, 'train'), (test_matrix, 'eval')]
        model = clf.train(params, train_matrix, num_boost_round=num_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds)
        pre = model.predict(test, ntree_limit=model.best_ntree_limit).reshape(-1, 1)
        return pre
    elif clf_fin in ["lgb"]:
        print(clf_name)
        clf = lightgbm
        train_matrix = clf.Dataset(train, label=y_train)
        test_matrix = clf.Dataset(train, label=y_train)
        params = {
            'boosting_type': 'gbdt',
            'objective': 'regression_l2',
            'metric': 'mse',
            'min_child_weight': 1.5,
            'num_leaves': 2**5,
            'lambda_l2': 10,
            'subsample': 0.7,
            'colsample_bytree': 0.7,
            'colsample_bylevel': 0.7,
            'learning_rate': 0.03,
            'tree_method': 'exact',
            'seed': 2017,
            'nthread': 12,
            'silent': True,
        }
        num_round = 10000
        early_stopping_rounds = 100
        model = clf.train(params, train_matrix, num_round, valid_sets=test_matrix, early_stopping_rounds=early_stopping_rounds)
        print('pred')
        pre = model.predict(test, num_iteration=model.best_iteration).reshape(-1, 1)
        print(pre)
        return pre
with open("./zhengqi_train.txt") as fr:
    data_train = pd.read_table(fr, sep="\t")
with open("./zhengqi_test.txt") as fr_test:
    data_test = pd.read_table(fr_test, sep="\t")
from sklearn.model_selection import StratifiedKFold, KFold
folds = 5
seed = 1
kf = KFold(n_splits=5 , shuffle=True , random_state=0)
x_train = data_train[data_test.columns].values
x_valid = data_test[data_test.columns].values
y_train = data_train['target'].values
#clf_list 为参与融合的基模型列表,这里以 lr_reg 和 lgb_reg 为例(可按需增减)
clf_list = [lr_reg, lgb_reg]
pred = stacking_pred(x_train, y_train, x_valid, kf, clf_list, label_split=None, clf_fin="lgb", if_concat_origin=True)