scikit-learn ships with several standard datasets, for example the iris and digits datasets for classification and the Boston house prices dataset for regression.
from sklearn.datasets import load_iris
from sklearn.datasets import load_boston
iris=load_iris()
boston=load_boston()
x_iris=iris.data
y_iris=iris.target
print x_iris.shape
print y_iris.shape
Output:
(150, 4)
(150,)
from sklearn.datasets import make_regression
x,y=make_regression(n_samples=5, n_features=5, n_informative=3, n_targets=1, bias=0.0, effective_rank=None, tail_strength=0.5, noise=0.0, shuffle=True, coef=False, random_state=None)
print 1
print x
print y
print 2
x,y=make_regression(n_samples=5, n_features=3, n_informative=3, n_targets=1, bias=0.0, effective_rank=None, tail_strength=0.5, noise=0.0, shuffle=True, coef=False, random_state=None)
print x
print y
from sklearn.datasets import make_classification
x,y=make_classification(n_samples=5, n_features=5, n_informative=3)
print 3
print x
print y
Output:
1
[[ 1.3887649 -0.67264273 0.43790666 -0.17069749 0.38238322]
[ 0.47251375 -0.75166612  0.23178239 -1.661831   -0.81182998]
[ 0.03739107 -1.34200362 0.24249631 0.43151133 0.26080355]
[ 1.12646446  0.43570731 -0.21344475  0.36708684 -0.87513419]
[ 1.55191935 -0.81177223 -0.03115494 -0.80722552 -0.04601602]]
[ -9.35751369 -158.24803182 42.61616431 21.69803828 -73.08849933]
2
[[ 2.01145171  1.17357249 -1.85035062]
[-0.53409156 0.55599955 0.04779895]
[ 0.43052286 1.60386627 0.48718064]
[-1.93366169 -0.01782231 0.30727286]
[-0.46353664 -0.63285618 -0.09485492]]
[ 76.90215821 5.33297211 186.78542704 -115.69362051 -86.88520421]
3
[[ 1.53228845 -1.5708226 -0.8524258 -1.77329403 1.02937806]
[ 1.04725244  0.10639027 -1.12618235  1.40243761 -0.6630892 ]
[ 0.59248386 -0.08554676 -0.25855043 0.30970562 0.21408284]
[-0.49888926 -0.46972133 1.7796799 -2.13879689 2.21883169]
[-1.78928116  1.32412246  1.39049161  0.85772537 -0.39507264]]
[0 0 1 0 1]
import numpy as np
y_true=np.array([1,0,1,1,0])
y_pred=np.array([0,0,1,1,1])
from sklearn.metrics import accuracy_score
print accuracy_score(y_true,y_pred)
Output:
0.6
import numpy as np
data=np.loadtxt("E:/sklearn/data.txt",delimiter=",")
print data.shape
print data[:9,:]
x=data[:,:7]
y=data[:,8]
Output:
(768, 9)
[[ 6.00000000e+00 1.48000000e+02 7.20000000e+01 3.50000000e+01
0.00000000e+00 3.36000000e+01 6.27000000e-01 5.00000000e+01
1.00000000e+00]
[ 1.00000000e+00 8.50000000e+01 6.60000000e+01 2.90000000e+01
0.00000000e+00 2.66000000e+01 3.51000000e-01 3.10000000e+01
0.00000000e+00]
[ 8.00000000e+00 1.83000000e+02 6.40000000e+01 0.00000000e+00
0.00000000e+00 2.33000000e+01 6.72000000e-01 3.20000000e+01
1.00000000e+00]
[ 1.00000000e+00 8.90000000e+01 6.60000000e+01 2.30000000e+01
9.40000000e+01 2.81000000e+01 1.67000000e-01 2.10000000e+01
0.00000000e+00]
[ 0.00000000e+00 1.37000000e+02 4.00000000e+01 3.50000000e+01
1.68000000e+02 4.31000000e+01 2.28800000e+00 3.30000000e+01
1.00000000e+00]
[ 5.00000000e+00 1.16000000e+02 7.40000000e+01 0.00000000e+00
0.00000000e+00 2.56000000e+01 2.01000000e-01 3.00000000e+01
0.00000000e+00]
[ 3.00000000e+00 7.80000000e+01 5.00000000e+01 3.20000000e+01
8.80000000e+01 3.10000000e+01 2.48000000e-01 2.60000000e+01
1.00000000e+00]
[ 1.00000000e+01 1.15000000e+02 0.00000000e+00 0.00000000e+00
0.00000000e+00 3.53000000e+01 1.34000000e-01 2.90000000e+01
0.00000000e+00]
[ 2.00000000e+00 1.97000000e+02 7.00000000e+01 4.50000000e+01
5.43000000e+02 3.05000000e+01 1.58000000e-01 5.30000000e+01
1.00000000e+00]]
import numpy as np
import urllib
url="http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
raw_data = urllib.urlopen(url)
dataset = np.loadtxt(raw_data, delimiter=",")
X = dataset[:,0:7]
y = dataset[:,8]
print X
print y
Output:
[[   6.   148.    72.  ...,    0.    33.6    0.627]
 [   1.    85.    66.  ...,    0.    26.6    0.351]
 [   8.   183.    64.  ...,    0.    23.3    0.672]
 ...,
 [   5.   121.    72.  ...,  112.    26.2    0.245]
 [   1.   126.    60.  ...,    0.    30.1    0.349]
 [   1.    93.    70.  ...,    0.    30.4    0.315]]
[ 1. 0. 1. 0. 1. 0. 1. 0. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1.
0. 1. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0.
0. 1. 1. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1.
0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0.
1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0.
0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0.
0. 1. 1. 1. 0. 0. 1. 1. 1. 0. 0. 0. 1. 0. 0. 0. 1. 1.
0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0. 0.
0. 0. 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 1.
0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 0. 0. 1. 1. 0. 1. 0. 1.
1. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 1. 1. 1.
1. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0.
0. 1. 1. 1. 1. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0.
0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 1. 0. 0. 1.
1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1.
0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 1. 0. 1. 1. 0. 1. 0. 0.
1. 0. 1. 1. 0. 0. 1. 0. 1. 0. 0. 1. 0. 1. 0. 1. 1. 1.
0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 1. 0. 0.
0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1.
1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0.
1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 1. 0.
0. 1. 0. 1. 1. 0. 1. 0. 1. 0. 1. 0. 1. 1. 0. 0. 0. 0.
1. 1. 0. 1. 0. 1. 0. 0. 0. 0. 1. 1. 0. 1. 0. 1. 0. 0.
0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 1. 1. 0. 0. 1. 0.
0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 1.
0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0.
0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0.
0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1.
1. 1. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
0. 1. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 0. 1. 0.
0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 1.
1. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 1.
1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 1. 1. 0.
1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 1. 0. 1.
0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 1. 0. 0. 0. 0. 1.
1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1.
0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0.
0. 1. 1. 0. 0. 1. 0. 0. 1. 0. 1. 1. 1. 0. 0. 1. 1. 1.
0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0.]
These tools live mainly in the sklearn.preprocessing package.
Normalization:
Encoding:
Feature extraction gives us raw features, which may still have the following problems:
Different scales: the features are not measured on the same scale and cannot be compared directly. Rescaling solves this.
Redundant information: for some quantitative features only the interval they fall into carries useful information. For exam scores, if we only care about "pass" versus "fail", the raw score should be converted to 1 for pass and 0 for fail. Binarization solves this.
Categorical features cannot be used directly: some machine learning algorithms and models only accept quantitative input, so categorical features have to be converted into quantitative ones. The simplest approach is to assign a number to each category, but this is too arbitrary and adds tuning work. The usual approach is dummy (one-hot) encoding: if a feature has N distinct categories, it is expanded into N features; when the original value is the i-th category, the i-th expanded feature is set to 1 and the others to 0. Compared with assigning numbers directly, dummy encoding requires no extra tuning, and for a linear model it can even produce nonlinear effects.
Missing values: missing values need to be filled in.
Low information utilization: different machine learning algorithms and models exploit the information in the data differently. As mentioned above, dummy-encoding categorical features lets a linear model achieve nonlinear effects; similarly, polynomial or other transformations of quantitative variables can also achieve nonlinear effects.
The preprocessing library provides solutions to all of the problems above.
Rescaling brings data of different scales onto a common scale. Common methods are standardization and interval (min-max) scaling. Standardization assumes the feature values follow a normal distribution; after standardization they follow the standard normal distribution. Interval scaling uses boundary information to map the feature values into a given range such as [0, 1].
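The standardization formula itself is not reproduced in this copy; it is the usual z-score transform, where $\overline{X}$ is the column mean and $S$ the column standard deviation:

$$x' = \frac{x - \overline{X}}{S}$$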
from sklearn.datasets import load_iris
iris=load_iris()
x=iris['data']
print x[:5,:]
from sklearn.preprocessing import StandardScaler
x_stand= StandardScaler().fit_transform(iris.data)
print x_stand[:5,:]
Output:
[[ 5.1 3.5 1.4 0.2]
[ 4.9 3. 1.4 0.2]
[ 4.7 3.2 1.3 0.2]
[ 4.6 3.1 1.5 0.2]
[ 5. 3.6 1.4 0.2]]
[[-0.90068117  1.03205722 -1.3412724  -1.31297673]
[-1.14301691 -0.1249576 -1.3412724 -1.31297673]
[-1.38535265 0.33784833 -1.39813811 -1.31297673]
[-1.50652052 0.10644536 -1.2844067 -1.31297673]
[-1.02184904 1.26346019 -1.3412724 -1.31297673]]
There are several ways to do interval scaling; a common one uses the minimum and maximum values, with the following formula:
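The formula referred to above is missing here; the usual min-max form, using the column minimum and maximum, is:

$$x' = \frac{x - \min}{\max - \min}$$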
from sklearn.preprocessing import MinMaxScaler
x_MM=MinMaxScaler().fit_transform(iris.data)
print x_MM[:5,:]
Output:
[[ 0.22222222 0.625 0.06779661 0.04166667]
[ 0.16666667 0.41666667 0.06779661 0.04166667]
[ 0.11111111 0.5 0.05084746 0.04166667]
[ 0.08333333 0.45833333 0.08474576 0.04166667]
[ 0.19444444 0.66666667 0.06779661 0.04166667]]
Simply put, standardization works on the columns of the feature matrix: it converts feature values to a common scale by computing z-scores. Normalization works on the rows: its purpose is to give sample vectors a common norm when similarity is computed with dot products or other kernel functions, i.e. every sample is turned into a "unit vector".
from sklearn.preprocessing import Normalizer
x_N=Normalizer().fit_transform(iris.data)
print x_N[0:5,:]
Output:
[[ 0.80377277 0.55160877 0.22064351 0.0315205 ]
[ 0.82813287 0.50702013 0.23660939 0.03380134]
[ 0.80533308 0.54831188 0.2227517 0.03426949]
[ 0.80003025 0.53915082 0.26087943 0.03478392]
[ 0.790965 0.5694948 0.2214702 0.0316386 ]]
The core of binarizing a quantitative feature is choosing a threshold: values greater than the threshold become 1, and values less than or equal to it become 0. The formula is:
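The formula is missing from this copy; with threshold $t$ (the Binarizer threshold parameter, 0 by default), binarization is simply:

$$x' = \begin{cases} 1, & x > t \\ 0, & x \le t \end{cases}$$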
The code for binarizing the data with the preprocessing library's Binarizer class is as follows:
from sklearn.preprocessing import Binarizer
x_B=Binarizer().fit_transform(iris.data)
print x_B[:5,:]
Output:
[[ 1. 1. 1. 1.]
[ 1. 1. 1. 1.]
[ 1. 1. 1. 1.]
[ 1. 1. 1. 1.]
[ 1. 1. 1. 1.]]
One-Hot encoding, also known as one-of-N encoding, uses an N-bit state register to encode N states: every state has its own register bit, and only one bit is active at any time.
In practical machine learning tasks, features are not always continuous values; some are categorical, for example gender can be "male" or "female". For such features we usually need to convert them to numbers, as in the following example.
Suppose there are three feature attributes:
Gender: ["male","female"]
Region: ["Europe","US","Asia"]
Browser: ["Firefox","Chrome","Safari","Internet Explorer"]
For a given sample such as ["male","US","Internet Explorer"], these categorical values need to be turned into numbers. The most direct way is to index them, giving [0,1,3], but features processed this way cannot be fed directly into machine learning algorithms.
In the problem above, gender has two dimensions, region three, and browser four, so we can One-Hot encode the sample ["male","US","Internet Explorer"]: "male" corresponds to [1,0], "US" to [0,1,0], and "Internet Explorer" to [0,0,0,1]. The complete numeric feature vector is therefore [1,0,0,1,0,0,0,0,1]. One consequence is that the data becomes very sparse.
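For string-valued categories like the example above, one convenient way to obtain such an encoding is DictVectorizer from sklearn.feature_extraction. A minimal sketch (the sample dicts below are my own illustration, not from the original text):

from sklearn.feature_extraction import DictVectorizer
# each sample is a dict mapping a categorical feature to its value
samples = [
    {"gender": "male", "region": "US", "browser": "Internet Explorer"},
    {"gender": "female", "region": "Europe", "browser": "Firefox"},
]
vec = DictVectorizer(sparse=False)   # dense output for readability
print vec.fit_transform(samples)     # one-hot encoded rows
print vec.get_feature_names()        # the expanded column names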
from sklearn import preprocessing
enc = preprocessing.OneHotEncoder()
enc.fit([[0,0,3],[1,1,0],[0,2,1],[1,0,2]])
array = enc.transform([[0,1,3]]).toarray()
print array
Output:
[[ 1. 0. 0. 1. 0. 0. 0. 0. 1.]]
from sklearn.preprocessing import Imputer
import numpy as np
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
X=np.array([[1,2], [np.nan,7],[5,8]])
imp.fit(X)
print imp.transform(X)
Y=[[5,2], [6, np.nan], [7,6]]
imp.fit(Y)
print imp.transform(Y)
Output:
[[ 1. 2.]
[ 3. 7.]
[ 5. 8.]]
[[ 5. 2.]
[ 6. 4.]
[ 7. 6.]]
from numpy import log1p
from sklearn.preprocessing import FunctionTransformer
print iris.data[0:10]
print FunctionTransformer(log1p).fit_transform(iris.data)[0:10]
Output:
[[ 5.1 3.5 1.4 0.2]
[ 4.9 3. 1.4 0.2]
[ 4.7 3.2 1.3 0.2]
[ 4.6 3.1 1.5 0.2]
[ 5. 3.6 1.4 0.2]
[ 5.4 3.9 1.7 0.4]
[ 4.6 3.4 1.4 0.3]
[ 5. 3.4 1.5 0.2]
[ 4.4 2.9 1.4 0.2]
[ 4.9 3.1 1.5 0.1]]
[[ 1.80828877 1.5040774 0.87546874 0.18232156]
[ 1.77495235 1.38629436 0.87546874 0.18232156]
[ 1.74046617 1.43508453 0.83290912 0.18232156]
[ 1.7227666 1.41098697 0.91629073 0.18232156]
[ 1.79175947 1.5260563 0.87546874 0.18232156]
[ 1.85629799 1.58923521 0.99325177 0.33647224]
[ 1.7227666 1.48160454 0.87546874 0.26236426]
[ 1.79175947 1.48160454 0.91629073 0.18232156]
[ 1.68639895 1.36097655 0.87546874 0.18232156]
[ 1.77495235 1.41098697 0.91629073 0.09531018]]
Class | Function | Description
StandardScaler | Rescaling | Standardization: based on the columns of the feature matrix, transforms feature values to follow the standard normal distribution
MinMaxScaler | Rescaling | Interval scaling: based on the minimum and maximum, maps feature values into the [0, 1] interval
Normalizer | Normalization | Based on the rows of the feature matrix, turns each sample vector into a "unit vector"
Binarizer | Binarization | Splits quantitative features at a given threshold
OneHotEncoder | Dummy encoding | Encodes categorical data as quantitative data
Imputer | Missing-value imputation | Computes missing values, which can be filled with the mean, etc.
FunctionTransformer | Custom unary transformation | Transforms data with a single-argument function
Package: sklearn.feature_extraction
Feature extraction is the most important step in a data mining task; in general it affects the final result more than the data mining algorithm itself. Only after the real world has been represented as features can data mining be used to find answers. Another benefit of feature selection is that it reduces the complexity of the real world, so the model is easier to handle than reality itself.
The most commonly used feature extraction techniques are highly domain-specific. For a particular domain such as image processing, many feature extraction techniques have been developed over the years, but they transfer poorly to other domains.
Once preprocessing is done, we need to select meaningful features to feed into the machine learning algorithms and models for training. Features are usually chosen from two perspectives:
· Whether the feature varies: if a feature hardly varies, for example its variance is close to 0, the samples are essentially identical on that feature, and it is useless for telling them apart.
· Correlation between the feature and the target: this is fairly obvious; features highly correlated with the target should be preferred. Apart from the variance method, all other methods discussed here are based on correlation.
Depending on how the selection is carried out, feature selection methods fall into three categories:
· Filter: score each feature by its variance or its correlation, then keep features according to a threshold or a desired number of features.
· Wrapper: according to an objective function (usually a predictive score), select or remove several features at a time.
· Embedded: first train a machine learning algorithm or model to obtain weight coefficients for the features, then select features by these coefficients from largest to smallest. Similar to Filter, but the feature quality is determined by training.
We use sklearn's feature_selection library for feature selection.
Cross-validation
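No example accompanies this heading in this copy; as a minimal sketch (my own illustration, using the same old sklearn.cross_validation API as the rest of this article), k-fold cross-validation of a classifier looks like this:

from sklearn.cross_validation import cross_val_score
from sklearn.datasets import load_iris
from sklearn.svm import SVC
iris = load_iris()
# 5-fold cross-validated accuracy of an SVC on iris
scores = cross_val_score(SVC(), iris.data, iris.target, cv=5)
print scores
print scores.mean()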
The variance threshold method first computes each feature's variance and then keeps only the features whose variance exceeds a threshold. The code for selecting features with the feature_selection library's VarianceThreshold class is as follows:
from sklearn.preprocessing import Imputer
import numpy as np
from sklearn.datasets import load_iris
iris=load_iris()
print len(iris.data)
print len(iris.data[0])
from sklearn.feature_selection import VarianceThreshold
iris_=VarianceThreshold(threshold=0.5).fit_transform(iris.data)
print len(iris_)
print len(iris_[0])
Output:
150
4
150
3
The correlation coefficient method first computes each feature's correlation coefficient with the target and the corresponding P-value. The code for selecting features with the feature_selection library's SelectKBest class is as follows (note that this particular snippet actually uses the chi-squared score; a Pearson-based version follows below):
from sklearn.datasets import load_iris
iris=load_iris()
print len(iris.data)
print len(iris.data[0])
from sklearn.feature_selection import SelectKBest,chi2
data_K=SelectKBest(chi2,k=2).fit_transform(iris.data,iris.target)
print len(data_K)
print len(data_K[0])
Output:
150
4
150
2
Another example:
from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr
import numpy as np
# the score function must return (scores, p-values); under Python 2, map returns a list
SelectKBest(lambda X, Y: np.array(map(lambda x: pearsonr(x, Y), X.T)).T, k=2).fit_transform(iris.data, iris.target)
The classical chi-squared test measures the correlation between a categorical independent variable and a categorical dependent variable. Suppose the independent variable has N possible values and the dependent variable has M possible values. Consider, for every combination where the independent variable equals i and the dependent variable equals j, the gap between the observed sample frequency and its expected value, and build the statistic:
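The statistic is not shown in this copy; in its standard form, with $A$ the observed frequency and $E$ the expected frequency of each cell, it is:

$$\chi^2 = \sum \frac{(A - E)^2}{E}$$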
In plain terms, this statistic measures how correlated the independent variable is with the dependent variable. The code for selecting features with the feature_selection library's SelectKBest class combined with the chi-squared test is as follows:
from sklearn.datasets import load_iris
iris=load_iris()
print len(iris.data)
print len(iris.data[0])
from sklearn.feature_selection import SelectKBest,chi2
data_K=SelectKBest(chi2,k=2).fit_transform(iris.data,iris.target)
print len(data_K)
print len(data_K[0])
Output:
150
4
150
2
Classical mutual information is also a measure of the correlation between a categorical independent variable and a categorical dependent variable; it is computed as:
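The formula is missing here; the standard definition of the mutual information between discrete variables $X$ and $Y$ is:

$$I(X;Y) = \sum_{x \in X}\sum_{y \in Y} p(x, y)\,\log\frac{p(x, y)}{p(x)\,p(y)}$$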
To handle quantitative data, the maximal information coefficient (MIC) was proposed. The code for selecting features with the feature_selection library's SelectKBest class combined with MIC is as follows:
import numpy as np
from sklearn.datasets import load_iris
iris=load_iris()
print len(iris.data)
print len(iris.data[0])
from sklearn.feature_selection import SelectKBest
from minepy import MINE
def mic(x, y):
    m = MINE()
    m.compute_score(x, y)
    return (m.mic(), 0.5)
data_K=SelectKBest(lambda X, Y: np.array(map(lambda x: mic(x, Y), X.T)).T, k=2).fit_transform(iris.data,iris.target)
print len(data_K)
print len(data_K[0])
Recursive feature elimination trains a base model over several rounds; after each round, the features with the smallest weight coefficients are eliminated, and the next round is trained on the remaining feature set. The code for selecting features with the feature_selection library's RFE class is as follows:
import numpy as np
from sklearn.datasets import load_iris
iris=load_iris()
print len(iris.data)
print len(iris.data[0])
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
data_K=RFE(estimator=LogisticRegression(), n_features_to_select=2).fit_transform(iris.data,iris.target)
print len(data_K)
print len(data_K[0])
Output:
150
4
150
2
A base model with a penalty term not only selects features but also performs dimensionality reduction. The code for selecting features with the feature_selection library's SelectFromModel class combined with an L1-penalized logistic regression model is as follows:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
iris=load_iris()
print len(iris.data)
print len(iris.data[0])
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
data_K=SelectFromModel(LogisticRegression(penalty="l1", C=0.1)).fit_transform(iris.data,iris.target)
print len(data_K)
print len(data_K[0])
The reason L1-penalized dimensionality reduction works is that among several features equally correlated with the target it keeps only one, so a feature that was not selected is not necessarily unimportant. It can therefore be refined with an L2 penalty. Concretely: for a feature with a nonzero weight under L1, collect the features whose L1 weight is 0 but whose L2 weight is close to its own into one group, and split the L1 weight evenly over that group. This requires building a new logistic regression model:
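The snippet below uses a class called LR that is not defined in this copy. Here is a rough sketch of what such a combined L1/L2 logistic regression could look like, following the description above; treat it as an illustration, not the article's exact implementation:

from sklearn.linear_model import LogisticRegression

class LR(LogisticRegression):
    # L1-penalized logistic regression whose zeroed weights are re-filled with the
    # help of a companion L2-penalized model (illustrative sketch).
    def __init__(self, threshold=0.01, C=1.0):
        # threshold: how close two L2 weights must be to count as "similar"
        self.threshold = threshold
        LogisticRegression.__init__(self, penalty='l1', C=C)
        # companion L2-penalized model with the same C
        self.l2 = LogisticRegression(penalty='l2', C=C)

    def fit(self, X, y):
        # fit both the L1 model (self) and the L2 model
        super(LR, self).fit(X, y)
        self.l2.fit(X, y)
        n_rows, n_cols = self.coef_.shape
        for i in range(n_rows):
            for j in range(n_cols):
                coef = self.coef_[i][j]
                if coef != 0:
                    idx = [j]
                    coef1 = self.l2.coef_[i][j]
                    for k in range(n_cols):
                        coef2 = self.l2.coef_[i][k]
                        # feature k was dropped by L1 but has a similar L2 weight
                        if abs(coef1 - coef2) < self.threshold and self.coef_[i][k] == 0 and j != k:
                            idx.append(k)
                    # split the L1 weight evenly over the group of similar features
                    self.coef_[i][idx] = coef / len(idx)
        return self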
The code for selecting features with the feature_selection library's SelectFromModel class combined with a logistic regression model carrying both L1 and L2 penalties is as follows:
from sklearn.feature_selection import SelectFromModel

# Feature selection with a logistic regression base model combining L1 and L2 penalties
# (LR is the custom class sketched above, not sklearn's LogisticRegression);
# the threshold parameter is the tolerance for treating two weight coefficients as "close".
SelectFromModel(LR(threshold=0.5, C=0.1)).fit_transform(iris.data, iris.target)
Among tree models, GBDT can also serve as the base model for feature selection. The code for selecting features with the feature_selection library's SelectFromModel class combined with a GBDT model is as follows:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
iris=load_iris()
print len(iris.data)
print len(iris.data[0])
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
data_K=SelectFromModel(GradientBoostingClassifier()).fit_transform(iris.data,iris.target)
print len(data_K)
print len(data_K[0])
Output:
150
4
150
2
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.datasets import load_iris
X=load_iris().data
y=load_iris().target
model = ExtraTreesClassifier()
model.fit(X, y)
print(model.feature_importances_)
Output:
[ 0.11938889 0.06338136 0.42425273 0.39297703]
import numpy as np
from scipy.stats import pearsonr
np.random.seed(0)
size = 300
x = np.random.normal(0, 1, size)
print "Lower noise", pearsonr(x, x + np.random.normal(0, 1, size))
print "Higher noise", pearsonr(x, x + np.random.normal(0, 10, size))
Lower noise (0.71824836862138386, 7.3240173129992273e-49)
Higher noise (0.057964292079338148, 0.31700993885324746)
An obvious drawback of the Pearson correlation coefficient as a feature ranking mechanism is that it is only sensitive to linear relationships. If the relationship is nonlinear, the Pearson correlation can be close to 0 even when the two variables are in one-to-one correspondence.
x = np.random.uniform(-1, 1, 100000)
print pearsonr(x, x**2)[0]
-0.00230804707612
Using mutual information directly for feature selection is not very convenient: (1) it is not a metric and cannot be normalized, so results on different datasets cannot be compared; (2) it is awkward to compute for continuous variables (X and Y are sets, and x, y take discrete values), so the variables usually have to be discretized first, and the result is very sensitive to how the discretization is done.
The maximal information coefficient overcomes both problems. It first finds an optimal discretization and then converts the mutual information value into a metric ranging over [0, 1]. minepy provides the MIC functionality.
Coming back to the y = x^2 example, the MIC value comes out as 1 (the maximum).
from minepy import MINE
m = MINE()
x = np.random.uniform(-1, 1, 10000)
m.compute_score(x, x**2)
print m.mic()
1.0
The statistical power of MIC has been questioned: when the null hypothesis does not hold, MIC's statistic is affected. On some datasets this is not a problem, but on others it is.
from scipy import spatial
x=np.random.uniform(-1,1,1000)
# note: scipy's spatial.distance.correlation is the correlation distance (1 - Pearson r),
# not Szekely's "distance correlation" statistic
spatial.distance.correlation(x,x**2)
0.99133695904559382
Class | Category | Description
VarianceThreshold | Filter | Variance threshold method
SelectKBest | Filter | Can use the correlation coefficient, the chi-squared test, or the maximal information coefficient as the scoring method
RFE | Wrapper | Recursively trains a base model and removes the features with the smallest weight coefficients from the feature set
SelectFromModel | Embedded | Trains a base model and keeps the features with the largest weight coefficients
model.feature_importances_ | Model |
Pearsonr | |
IV | Model-based selection |
Distance correlation | |
The code for selecting features with the decomposition library's PCA class is as follows:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.decomposition import PCA
iris=load_iris()
print len(iris.data)
print len(iris.data[0])
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
data_K=PCA(n_components=2).fit_transform(iris.data)
print len(data_K)
print len(data_K[0])
Output:
150
4
150
2
The code for selecting features with the lda library's LDA class is as follows:
from sklearn.preprocessing import Imputer
import numpy as np
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.decomposition import PCA
from sklearn.lda import LDA
iris=load_iris()
print len(iris.data)
print len(iris.data[0])
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
data_K=LDA(n_components=2).fit_transform(iris.data, iris.target)
print len(data_K)
print len(data_K[0])
Output:
150
4
150
2
Library | Class | Description
decomposition | PCA | Principal component analysis
lda | LDA | Linear discriminant analysis
Usage example:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                   algorithm="SAMME",
                   n_estimators=200)
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
X=load_iris().data
y=load_iris().target
model = LogisticRegression()
model.fit(X, y)
print(model)
# make predictions
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected,predicted))
print(metrics.confusion_matrix(expected,predicted))
Output:
LogisticRegression(C=1.0, class_weight=None, dual=False,fit_intercept=True,
intercept_scaling=1,max_iter=100, multi_class='ovr', n_jobs=1,
penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
verbose=0, warm_start=False)
precision recall f1-score support
0 1.00 1.00 1.00 50
1 0.98 0.90 0.94 50
2 0.91 0.98 0.94 50
avg / total 0.96 0.96 0.96 150
[[50 0 0]
[ 0 45 5]
[ 0 1 49]]
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
X=load_iris().data
y=load_iris().target
#model = LogisticRegression()
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X, y)
print(model)
# make predictions
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected,predicted))
print(metrics.confusion_matrix(expected,predicted))
Output:
GaussianNB(priors=None)
precision recall f1-score support
0 1.00 1.00 1.00 50
1 0.94 0.94 0.94 50
2 0.94 0.94 0.94 50
avg / total 0.96 0.96 0.96 150
[[50 0 0]
[ 0 47 3]
[ 0 3 47]]
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
X=load_iris().data
y=load_iris().target
#model = LogisticRegression()
#from sklearn.naive_bayes import GaussianNB
#model = GaussianNB()
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(X, y)
print(model)
# make predictions
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected,predicted))
print(metrics.confusion_matrix(expected,predicted))
Output:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None,n_jobs=1, n_neighbors=5, p=2,
weights='uniform')
precision recall f1-score support
0 1.00 1.00 1.00 50
1 0.96 0.94 0.95 50
2 0.94 0.96 0.95 50
avg / total 0.97 0.97 0.97 150
[[50 0 0]
[ 0 47 3]
[ 0 2 48]]
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
X=load_iris().data
y=load_iris().target
#model = LogisticRegression()
#from sklearn.naive_bayes import GaussianNB
#model = GaussianNB()
#from sklearn.neighbors import KNeighborsClassifier
#model = KNeighborsClassifier()
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(X, y)
print(model)
# make predictions
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected,predicted))
print(metrics.confusion_matrix(expected,predicted))
Output:
DecisionTreeClassifier(class_weight=None, criterion='gini',max_depth=None,
max_features=None,max_leaf_nodes=None,
min_impurity_split=1e-07, min_samples_leaf=1,
min_samples_split=2,min_weight_fraction_leaf=0.0,
presort=False, random_state=None, splitter='best')
precision recall f1-score support
0 1.00 1.00 1.00 50
1 1.00 1.00 1.00 50
2 1.00 1.00 1.00 50
avg / total 1.00 1.00 1.00 150
[[50 0 0]
[ 0 50 0]
[ 0 0 50]]
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
X=load_iris().data
y=load_iris().target
#model = LogisticRegression()
#from sklearn.naive_bayes import GaussianNB
#model = GaussianNB()
#from sklearn.neighbors import KNeighborsClassifier
#model = KNeighborsClassifier()
#from sklearn.tree import DecisionTreeClassifier
#model = DecisionTreeClassifier()
from sklearn.svm import SVC
model = SVC()
model.fit(X, y)
print(model)
# make predictions
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected,predicted))
print(metrics.confusion_matrix(expected,predicted))
Output:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape=None,degree=3, gamma='auto', kernel='rbf',
max_iter=-1, probability=False,random_state=None, shrinking=True,
tol=0.001, verbose=False)
precision recall f1-score support
0 1.00 1.00 1.00 50
1 1.00 0.96 0.98 50
2 0.96 1.00 0.98 50
avg / total 0.99 0.99 0.99 150
[[50 0 0]
[ 0 48 2]
[ 0 0 50]]
Grid search is one of the common techniques for automated hyperparameter tuning. The grid_search package provides tools for it, including the GridSearchCV class. The code for training and tuning the composed object is as follows:
from sklearn.grid_search import GridSearchCV

# Build the grid search object.
# The first argument is the model to train (here, the composed pipeline from the original
# article; pipeline, FeatureUnionExt and ToBinary are not defined in this snippet).
# param_grid is the parameter grid to tune, as a dict: keys are parameter names
# (in the form "object name__sub-object name__parameter name"), values are candidate lists.
grid_search = GridSearchCV(pipeline, param_grid={'FeatureUnionExt__ToBinary__threshold': [1.0, 2.0, 3.0, 4.0], 'LogisticRegression__C': [0.1, 0.2, 0.4, 0.8]})
# Train and tune
grid_search.fit(iris.data, iris.target)
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.grid_search import GridSearchCV
from sklearn.datasets import load_iris
X=load_iris().data
y=load_iris().target
# prepare a range of alpha values to test
alphas = np.array([1,0.1,0.01,0.001,0.0001,0])
# create and fit a ridge regression model, testing each alpha
model = Ridge()
grid=GridSearchCV(estimator=model,param_grid=dict(alpha=alphas))
grid.fit(X, y)
print(grid)
# summarize the results of the grid search
print(grid.best_score_)
print(grid.best_estimator_.alpha)
Output:
GridSearchCV(cv=None, error_score='raise',
estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
normalize=False, random_state=None, solver='auto', tol=0.001),
fit_params={}, iid=True,n_jobs=1,
param_grid={'alpha':array([ 1.00000e+00, 1.00000e-01, 1.00000e-02, 1.00000e-03,
1.00000e-04, 0.00000e+00])},
pre_dispatch='2*n_jobs',refit=True, scoring=None, verbose=0)
0.0
1.0
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.grid_search import GridSearchCV
from sklearn.datasets import load_iris
X=load_iris().data
y=load_iris().target
import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn.linear_model import Ridge
from sklearn.grid_search import RandomizedSearchCV
# prepare a uniform distribution to sample for the alpha parameter
param_grid = {'alpha': sp_rand()}
# create and fit a ridge regression model, testing random alpha values
model = Ridge()
rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100)
rsearch.fit(X, y)
print(rsearch)
# summarize the results of the random parameter search
print(rsearch.best_score_)
print(rsearch.best_estimator_.alpha)
Output:
RandomizedSearchCV(cv=None,error_score='raise',
estimator=Ridge(alpha=1.0,copy_X=True, fit_intercept=True, max_iter=None,
normalize=False,random_state=None, solver='auto', tol=0.001),
fit_params={}, iid=True, n_iter=100,n_jobs=1,
param_distributions={'alpha':
pre_dispatch='2*n_jobs',random_state=None, refit=True,
scoring=None,verbose=0)
0.0
0.878599646028
from sklearn.learning_curve import validation_curve
from sklearn.datasets import load_digits
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np
digits=load_digits()
X=digits.data
y=digits.target
param_range=np.logspace(-6,-2.3,5)
train_loss,test_loss=validation_curve(
SVC(),X,y,param_name='gamma',param_range=param_range,cv=10,
scoring='mean_squared_error')
train_loss_mean=-np.mean(train_loss,axis=1)
test_loss_mean=-np.mean(test_loss,axis=1)
plt.plot(param_range,train_loss_mean,'o-',color="r",label="Training")
plt.plot(param_range,test_loss_mean,'o-',color="g",label="Cross_validation")
plt.xlabel("gamma")
plt.ylabel("Loss")
plt.legend(loc="best")
plt.show()
from sklearn.learning_curve import learning_curve
from sklearn.datasets import load_digits
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np
digits=load_digits()
X=digits.data
y=digits.target
param_range=np.logspace(-6,-2.3,5)
train_sizes,train_loss,test_loss=learning_curve(
SVC(gamma=0.001),X,y,cv=10,
scoring='mean_squared_error',
train_sizes=[0.1,0.25,0.5,0.75,1])
train_loss_mean=-np.mean(train_loss,axis=1)
test_loss_mean=-np.mean(test_loss,axis=1)
plt.plot(train_sizes,train_loss_mean,'o-',color="r",label="Training")
plt.plot(train_sizes,test_loss_mean,'o-',color="g",label="Cross_validation")
plt.xlabel("Training examples")
plt.ylabel("Loss")
plt.legend(loc="best")
plt.show()
from sklearn.metrics import accuracy_score
y_pred=[0,1,1,0,0]
y_true=[1,1,1,1,0]
print accuracy_score(y_true,y_pred)
Output:
0.6
>>> from sklearn.metrics import confusion_matrix
>>> y_true = [0, 0, 1, 1, 0]
>>> y_pred = [0, 0, 0, 1, 0]
>>> confusion_matrix(y_true, y_pred)
array([[3,0],
[1, 1]])
>>> from sklearn.metrics import classification_report
>>> y_true = [0, 1, 2, 2, 2]
>>> y_pred = [0, 0, 2, 2, 1]
>>> target_names = ['class 0', 'class 1', 'class 2']
>>> print(classification_report(y_true, y_pred, target_names=target_names))
precision recall f1-score support
class 0 0.50 1.00 0.67 1
class 1 0.00 0.00 0.00 1
class 2 1.00 0.67 0.80 3
avg / total 0.70 0.60 0.61 5
>>> from sklearn.metrics import precision_recall_fscore_support
>>> y_true = np.array(['cat', 'dog', 'pig','cat', 'dog', 'pig'])
>>> y_pred = np.array(['cat', 'pig', 'dog','cat', 'cat', 'dog'])
>>> precision_recall_fscore_support(y_true,y_pred, average='macro')
...
(0.22..., 0.33..., 0.26..., None)
>>> precision_recall_fscore_support(y_true,y_pred, average='micro')
...
(0.33..., 0.33..., 0.33..., None)
>>> precision_recall_fscore_support(y_true,y_pred, average='weighted')
...
(0.22..., 0.33..., 0.26..., None)
>>> import numpy as np
>>> from sklearn.metrics import jaccard_similarity_score
>>> y_pred = [0, 2, 1, 3]
>>> y_true = [0, 1, 2, 3]
>>> jaccard_similarity_score(y_true, y_pred)
0.5
>>> jaccard_similarity_score(y_true, y_pred,normalize=False)
2
>>> from sklearn.metrics import hamming_loss
>>> y_pred = [1, 2, 3, 4]
>>> y_true = [2, 2, 3, 4]
>>> hamming_loss(y_true, y_pred)
0.25
>>> from sklearn.metrics import zero_one_loss
>>> y_pred = [1, 2, 3, 4]
>>> y_true = [2, 2, 3, 4]
>>> zero_one_loss(y_true, y_pred)
0.25
>>> zero_one_loss(y_true, y_pred,normalize=False)
1
>>> from sklearn import svm
>>> from sklearn.metrics import hinge_loss
>>> X = [[0], [1]]
>>> y = [-1, 1]
>>> est = svm.LinearSVC(random_state=0)
>>> est.fit(X, y)
LinearSVC(C=1.0, class_weight=None, dual=True,fit_intercept=True,
intercept_scaling=1, loss='squared_hinge', max_iter=1000,
multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
verbose=0)
>>> pred_decision =est.decision_function([[-2], [3], [0.5]])
>>> pred_decision
array([-2.18..., 2.36..., 0.09...])
>>> hinge_loss([-1, 1, 1],pred_decision)
0.30...
Another example:
>>> X = np.array([[0], [1], [2], [3]])
>>> Y = np.array([0, 1, 2, 3])
>>> labels = np.array([0, 1, 2, 3])
>>> est = svm.LinearSVC()
>>> est.fit(X, Y)
LinearSVC(C=1.0, class_weight=None, dual=True,fit_intercept=True,
intercept_scaling=1, loss='squared_hinge', max_iter=1000,
multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
verbose=0)
>>> pred_decision =est.decision_function([[-1], [2], [3]])
>>> y_true = [0, 2, 3]
>>> hinge_loss(y_true, pred_decision,labels)
0.56...
>>> from sklearn.metrics import log_loss
>>> log_loss(["spam", "ham", "ham", "spam"],
... [[.1, .9], [.9, .1], [.8, .2], [.35, .65]])
0.21616...
Here F1 is defined per class and involves two concepts: precision and recall. Precision is the fraction of the individuals predicted to belong to a class that actually belong to it. Recall is the fraction of all individuals of that class in the dataset that are correctly predicted as belonging to it. F1 is the harmonic mean of precision and recall.
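In formula form (standard definitions, added here for reference):

$$\text{precision} = \frac{TP}{TP + FP},\qquad \text{recall} = \frac{TP}{TP + FN},\qquad F1 = \frac{2\cdot\text{precision}\cdot\text{recall}}{\text{precision} + \text{recall}}$$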
Regression metrics
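No example is given for this heading; a minimal sketch (my own illustration) of the common regression metrics in sklearn.metrics:

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
y_true = [3.0, -0.5, 2.0, 7.0]
y_pred = [2.5, 0.0, 2.0, 8.0]
print mean_absolute_error(y_true, y_pred)   # average absolute error
print mean_squared_error(y_true, y_pred)    # average squared error
print r2_score(y_true, y_pred)              # coefficient of determination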
Multilabel metrics
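Likewise no example here; a small sketch (my own illustration) using binary indicator matrices, which hamming_loss and accuracy_score accept for multilabel problems:

import numpy as np
from sklearn.metrics import hamming_loss, accuracy_score
y_true = np.array([[1, 0, 1], [0, 1, 0]])
y_pred = np.array([[1, 0, 0], [0, 1, 0]])
print hamming_loss(y_true, y_pred)     # fraction of individual labels that are wrong
print accuracy_score(y_true, y_pred)   # fraction of samples with every label correct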
Clustering metrics
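And a sketch for clustering (my own illustration): the adjusted Rand index compares a clustering against known labels, while silhouette_score only needs the data and the cluster assignments:

from sklearn.metrics import adjusted_rand_score, silhouette_score
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
iris = load_iris()
labels = KMeans(n_clusters=3).fit_predict(iris.data)
print adjusted_rand_score(iris.target, labels)   # agreement with the true labels
print silhouette_score(iris.data, labels)        # internal cluster quality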
>>> from sklearn import svm
>>> from sklearn.externals import joblib
>>> X = [[0, 0], [1, 1]]
>>> y = [0, 1]
>>> clf = svm.SVC()
>>> clf.fit(X, y)
>>> # clf.fit(train_X, train_y)  # or fit on your own training data
>>> joblib.dump(clf, "train_model.m")
>>> clf = joblib.load("train_model.m")
from sklearn.externals import joblib
joblib.dump(clf,'save/clf')
clf2=joblib.load('save/clf')
from sklearn import svm
from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
#model = svm.SVC()
model=RandomForestClassifier()
iris = datasets.load_iris()
X, y = iris.data, iris.target
print X.shape
print y.shape
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2)
model.fit(x_train, y_train)
import pickle
s = pickle.dumps(model)
model_ = pickle.loads(s)
print model_.predict(x_test)
# or
with open("save/model.pickle","wb") as f:
    pickle.dump(model,f)
with open("save/model.pickle","rb") as f:
    model2=pickle.load(f)
Feel free to add me on QQ (308747509) to discuss; this article will keep being updated to correct mistakes and improve the methods...