import numpy as np
from matplotlib import pyplot as plt
from sklearn import svm
from sklearn.datasets import make_blobs

try:
    # joblib is a standalone package; scikit-learn >= 0.23 no longer bundles it.
    import joblib
except ImportError:
    from sklearn.externals import joblib
# Generate test data: 100 samples scattered around 3 cluster centers.
X, y = make_blobs(n_samples=100, centers=3, random_state=0, cluster_std=0.8)

# Build the SVM classifier instances, one per kernel configuration.
clf_linear = svm.SVC(C=1.0, kernel='linear')
clf_poly = svm.SVC(C=1.0, kernel='poly', degree=3)
clf_rbf = svm.SVC(C=1.0, kernel='rbf', gamma=0.5)
clf_rbf2 = svm.SVC(C=1.0, kernel='rbf', gamma=0.1)

# The figure is only needed by the plotting calls that are disabled below.
plt.figure(figsize=(10, 10), dpi=144)

# clfs and titles are parallel lists: titles[i] describes clfs[i].
clfs = [clf_linear, clf_poly, clf_rbf, clf_rbf2]
titles = [
    'Linear Kernel',
    'Polynomial Kernel with Degree=3',
    'Gaussian Kernel with gamma=0.5',
    'Gaussian Kernel with gamma=0.1',
]

# Train each classifier, then report its accuracy and its predictions on
# the same data it was trained on.
for i, clf in enumerate(clfs):
    clf.fit(X, y)
    print("{}'s score:{}".format(titles[i], clf.score(X, y)))
    out = clf.predict(X)
    print("out's shape:{}, out:{}".format(out.shape, out))
    # Plotting is disabled:
    # plt.subplot(2, 2, i+1)
    # plot_hyperplane(clf, X, y, title=titles[i])

# References:
#   http://scikit-learn.org/stable/modules/model_persistence.html
#   http://sofasofa.io/forum_main_post.php?postid=1001002
# Save every trained model to "<index>.pkl" (path relative to the working
# directory).
for i, clf in enumerate(clfs):
    joblib.dump(clf, str(i) + '.pkl')

# Load each model back from its file and score it again to check that the
# persisted model behaves like the in-memory one.
# NOTE: joblib.load unpickles arbitrary objects; only load files you trust.
for i, title in enumerate(titles):
    clf = joblib.load(str(i) + '.pkl')
    print("{}'s score:{}".format(title, clf.score(X, y)))
- fit:训练
- predict:预测
- score:评估准确率
Linear Kernel's score:0.98
out's shape:(100,), out:[1 0 1 0 0 0 2 2 1 0 0 0 1 0 2 1 2 0 2 2 2 2 2 0 1 1 1 1 2 2 0 1 1 0 2 0 0
1 1 2 2 1 1 0 0 0 1 1 2 2 0 1 0 1 2 2 1 1 0 1 1 2 2 2 2 1 0 2 1 0 2 0 0 1
1 0 0 0 2 1 0 0 1 0 1 0 0 0 1 0 1 1 2 2 2 2 0 0 2 2]
Polynomial Kernel with Degree=3's score:0.95
out's shape:(100,), out:[1 0 1 0 2 0 2 2 1 0 0 0 1 0 2 1 2 2 2 2 2 2 2 2 1 1 1 1 2 2 0 1 1 0 2 0 0
1 1 2 2 1 1 0 0 0 1 1 2 2 0 1 0 1 2 2 1 1 0 1 1 2 2 2 2 1 0 2 1 0 2 0 0 1
1 0 0 0 2 1 0 0 1 0 1 0 0 0 1 0 1 1 2 2 2 2 0 0 2 2]
Gaussian Kernel with gamma=0.5's score:0.98
out's shape:(100,), out:[1 0 1 0 0 0 2 2 1 0 0 0 1 0 2 1 2 0 2 2 2 2 2 0 1 1 1 1 2 2 0 1 1 0 2 0 0
1 1 2 2 1 1 0 0 0 1 1 2 2 0 1 0 1 2 2 1 1 0 1 1 2 2 2 2 1 0 2 1 0 2 0 0 1
1 0 0 0 2 1 0 0 1 0 1 0 0 0 1 0 1 1 2 2 2 2 0 0 2 2]
Gaussian Kernel with gamma=0.1's score:0.96
out's shape:(100,), out:[1 0 1 0 0 0 2 2 1 0 0 0 1 0 2 1 2 0 2 2 2 2 2 0 1 1 1 1 2 2 0 1 1 0 2 0 0
1 1 2 2 1 1 0 0 0 1 1 2 2 0 1 0 1 2 2 1 1 0 1 1 2 2 2 2 1 0 2 1 0 2 1 2 1
1 0 0 0 2 1 0 0 1 0 1 0 0 0 1 0 1 1 2 2 2 2 0 0 2 2]
Linear Kernel's score:0.98
Polynomial Kernel with Degree=3's score:0.95
Gaussian Kernel with gamma=0.5's score:0.98
Gaussian Kernel with gamma=0.1's score:0.96
from sklearn import svm
from sklearn.datasets import make_blobs
from matplotlib import pyplot as plt
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
# Two-class toy data set; the first 350 samples train the model and the
# remaining 150 are held out for testing.
X, y = make_blobs(n_samples=500, centers=2, random_state=0, cluster_std=0.8)
n_train = 350
X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = y[:n_train], y[n_train:]

# Candidate hyper-parameter values for the grid search.
# NOTE(review): the gamma grid starts at exactly 0 -- confirm that the
# installed scikit-learn version accepts gamma=0 for the RBF kernel.
thresholds = np.linspace(0, 0.001, 100)
C_nums = np.linspace(0.1, 0.02, 5)  # only used by the alternative grids below
# param_grid = {'gamma': thresholds, 'C': C_nums}
param_grid = {'gamma': thresholds}
# param_grid = {'C': C_nums}

# 5-fold cross-validated search over gamma for an RBF-kernel SVC.
clf = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=5)
clf.fit(X_train, y_train)
print("best param: {0}\nbest score: {1}".format(clf.best_params_,
                                                clf.best_score_))

# Evaluate the search result on the held-out samples.
y_pred = clf.predict(X_test)
print("y_pred:{}".format(y_pred))
print("y_test:{}".format(y_test))
# scikit-learn metric functions take (y_true, y_pred) in that order; passing
# them reversed silently swaps precision and recall.
print("查准率:", metrics.precision_score(y_test, y_pred))
print("召回率:", metrics.recall_score(y_test, y_pred))
print("F1:", metrics.f1_score(y_test, y_pred))
best param: {'gamma': 0.00047474747474747476}
best score: 0.9857142857142858
y_pred:[0 1 1 0 1 1 0 0 0 0 0 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 0 0 0 0
0 1 1 1 0 1 1 0 0 1 1 0 0 1 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 1 0 1 0 1 0 0
1 0 0 0 0 1 0 1 1 0 1 0 1 1 0 1 1 1 0 1 1 1 1 1 1 0 1 0 1 0 0 0 1 1 0 0 1
0 0 1 1 1 1 1 1 0 1 1 1 0 1 1 0 0 1 1 0 1 1 0 1 0 0 1 1 0 0 1 0 1 1 0 1 0
1 0]
y_test:[0 1 1 0 1 1 0 0 0 1 0 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 0 0 0 0
0 1 1 1 0 1 1 0 0 1 1 0 0 1 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 1 0 1 0 1 0 0
1 0 0 0 0 1 0 1 1 0 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 0 1 0 0 0 1 1 0 1 1
0 0 1 1 1 1 1 1 0 1 1 1 0 1 1 0 0 1 1 0 1 1 0 1 0 0 1 1 0 0 1 0 1 1 0 1 0
1 0]
查准率: 0.9642857142857143
召回率: 1.0
F1: 0.9818181818181818
可以看到,搜索到的最优gamma值是0.00047474747474747476, 对应的最优分数是 0.9857142857142858。
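n_samples : int, optional (default=100)
The total number of points equally divided among clusters.
n_features : int, optional (default=2)
The number of features for each sample.
centers : int or array of shape [n_centers, n_features], optional (default=3)
The number of centers to generate, or the fixed center locations.
cluster_std : float or sequence of floats, optional (default=1.0)
The standard deviation of the clusters.
center_box : pair of floats (min, max), optional (default=(-10.0, 10.0))
The bounding box for each cluster center when centers are generated at random.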
shuffle : boolean, optional (default=True)
Shuffle the samples.
random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by np.random.
X : array of shape [n_samples, n_features]
The generated samples.
y : array of shape [n_samples]
The integer labels for cluster membership of each sample.
kernel :核函数,默认是rbf,可以是‘linear’,‘poly’,‘rbf’,‘sigmoid’,‘precomputed’
degree :多项式(poly)核函数的阶数,默认是3,选择其他核函数时会被忽略。
gamma : ‘rbf’,‘poly’ 和‘sigmoid’的核函数参数。默认是‘auto’,即取 1/n_features(scikit-learn 0.22 起默认值改为‘scale’)
coef0 :核函数的常数项。对于‘poly’和 ‘sigmoid’有用。
probability :是否采用概率估计?.默认为False
shrinking :是否采用shrinking heuristic方法,默认为true
tol :停止训练的误差值大小,默认为1e-3
cache_size :核函数cache缓存大小,默认为200
class_weight :类别的权重,字典形式传递。设置第几类的参数C为weight * C(C-SVC中的C)
verbose :允许冗余输出?
max_iter :最大迭代次数。-1为无限制。
decision_function_shape :‘ovo’, ‘ovr’ or None, default=None
random_state :数据洗牌时的种子值,int值