import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.utils.validation import column_or_1d
import numpy as np
from sklearn.model_selection import GridSearchCV
# Load the raw table and name its columns.
# header=None keeps the first line of the file as data: the previous call let
# pandas consume that line as a header (every column name was overwritten
# right afterwards anyway), which silently dropped one sample.
# NOTE(review): assumes krkopt.data has no header row - confirm.
column_names = ["wx", "wy", "wwx", "wwy", "vx", "vy", "outcome"]
original_data = pd.read_csv("krkopt.data", header=None, names=column_names)

# Format the data: file letters a-h become 1-8, the outcome "draw" becomes 1
# and the final catch-all pattern turns every value that is still a string
# into 0.
# NOTE(review): this presumably relies on the patterns being applied in dict
# order, so that "draw" is already numeric when the catch-all runs - confirm.
original_data.replace(to_replace={'^a$': 1, '^b$': 2, '^c$': 3, '^d$': 4, '^e$': 5, '^f$': 6, '^g$': 7, '^h$': 8, '^draw$': 1, "(?!draw)": 0}, regex=True, inplace=True)
original_data.head()

# Standardise the six position features in place.
# NOTE(review): the scaling statistics are computed on the whole table, i.e.
# before the train/test split performed later, so held-out rows influence the
# scaling of the training rows.
original_data[['wx', 'wy', 'wwx', 'wwy', 'vx', 'vy']] = preprocessing.scale(original_data[['wx', 'wy', 'wwx', 'wwy', 'vx', 'vy']])

# Persist the prepared table (the index is written as an extra first column).
original_data.to_csv("krkopt_fill.csv")
original_data.shape
# Recorded output before the header fix was (28055, 7); with the first row
# kept as data the expected shape is (28056, 7).
# Reload the persisted table and separate the model inputs from the label.
new_original_data = pd.read_csv("krkopt_fill.csv")

feature_columns = ['wx', 'wy', 'wwx', 'wwy', 'vx', 'vy']
label_columns = ['outcome']

original_data_x = new_original_data[feature_columns]
# Selecting with a list keeps the label as a one-column DataFrame.
original_data_y = new_original_data[label_columns]

# Peek at the first five rows of each part.
original_data_x.head(n=5)
original_data_y.head(n=5)
| | outcome |
|---|---|
| 0 | 1 |
| 1 | 1 |
| 2 | 1 |
| 3 | 1 |
| 4 | 1 |
# Hold out everything except 5000 rows for testing; random_state pins the
# shuffle so the split is reproducible.
split_parts = train_test_split(
    original_data_x,
    original_data_y,
    train_size=5000,
    random_state=0,
)
X_train, X_test, y_train, y_test = split_parts

# Inspect the sizes of the resulting pieces.
X_train.shape, X_test.shape
y_train.shape
(5000, 1)
# Flatten the single-column training label into a 1-D array; warn=False
# silences the column-vector conversion warning.
y_train = column_or_1d(
    y_train,
    warn=False,
)
y_train.shape
(5000,)
# Baseline RBF-kernel SVM with hand-picked C and gamma, fitted on the
# training split.
baseline_settings = {
    "C": 10,
    "tol": 1e-3,
    "gamma": 0.8,
    "kernel": 'rbf',
    "decision_function_shape": 'ovr',
    "probability": True,
}
clf = svm.SVC(**baseline_settings)
clf.fit(X_train, y_train)
SVC(C=10, gamma=0.8, probability=True)
# Mean accuracy of the baseline model on the held-out rows.
clf.score(X=X_test, y=y_test)
0.9888093689004555
# Exponents of the coarse search grid: every candidate is a power of two.
CScale = [-5, -3, -1, 1, 3, 5, 7, 9, 11, 13, 15]
gammaScale = [-15, -13, -11, -9, -7, -5, -3, -1, 1, 3]

# Candidate values 2**exponent for the SVM penalty C and the RBF width gamma.
C = [2 ** exponent for exponent in CScale]
gamma = [2 ** exponent for exponent in gammaScale]
C, gamma
([0.03125, 0.125, 0.5, 2, 8, 32, 128, 512, 2048, 8192, 32768],
[3.0517578125e-05,
0.0001220703125,
0.00048828125,
0.001953125,
0.0078125,
0.03125,
0.125,
0.5,
2,
8])
# Coarse grid search: 5-fold cross-validation over every (C, gamma) pair of
# the power-of-two grid, run on 5 parallel workers.
# The search has always been run on a default-constructed SVC; the separately
# configured estimator that used to be bound to `clf` first was overwritten
# before it was ever used, so that dead assignment is gone.
tuned_parameters = {"gamma": gamma, "C": C}
clf = GridSearchCV(svm.SVC(), tuned_parameters, n_jobs=5, cv=5)
clf.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated score.
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print(clf.best_score_)
print()
Best parameters set found on development set:
{'C': 128, 'gamma': 0.125}
0.9942
def _refined_range(grid, best, num=10):
    """Return `num` evenly spaced values bracketing `best` within `grid`.

    The range runs from the midpoint between `best` and its lower neighbour
    in `grid` to the midpoint between `best` and its upper neighbour. When
    `best` sits on an edge of the grid, the missing neighbour is replaced by
    `best` itself, so the range simply starts (or ends) at `best`.
    """
    position = grid.index(best)
    lower = grid[max(position - 1, 0)]
    upper = grid[min(position + 1, len(grid) - 1)]
    return np.linspace((lower + best) / 2, (best + upper) / 2, num)


# Finer grids centred on the coarse optimum. They are derived from the search
# result instead of hand-copied numbers; for the recorded optimum
# (C=128, gamma=0.125) this yields the same 80..320 and 0.078125..0.3125
# ranges as before.
newC = _refined_range(C, clf.best_params_["C"])
newGamma = _refined_range(gamma, clf.best_params_["gamma"])
newC, newGamma
(array([ 80. , 106.66666667, 133.33333333, 160. ,
186.66666667, 213.33333333, 240. , 266.66666667,
293.33333333, 320. ]),
array([0.078125 , 0.10416667, 0.13020833, 0.15625 , 0.18229167,
0.20833333, 0.234375 , 0.26041667, 0.28645833, 0.3125 ]))
# Fine grid search: 5-fold cross-validation over the refined C / gamma
# ranges, run on 5 parallel workers.
# As in the coarse search, the estimator being tuned is a default-constructed
# SVC; the configured SVC previously assigned to `clf` right before was never
# used and has been removed.
tuned_parameters = {"gamma": newGamma, "C": newC}
clf = GridSearchCV(svm.SVC(), tuned_parameters, n_jobs=5, cv=5)
clf.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated score.
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print(clf.best_score_)
print()
Best parameters set found on development set:
{'C': 106.66666666666667, 'gamma': 0.18229166666666669}
0.9945999999999999
# Train the final model with the hyper-parameters chosen by the fine grid
# search. They are read from the search result rather than pasted from its
# printed output, so the model stays in sync if the search is re-run.
# `clf` is still the fitted GridSearchCV object at this point and is rebound
# to the final estimator on the next statement.
best_params = clf.best_params_
clf = svm.SVC(
    C=best_params["C"],
    gamma=best_params["gamma"],
    tol=1e-3,
    kernel='rbf',
    decision_function_shape='ovr',
    probability=True,
)
clf.fit(X_train, y_train)

# Mean accuracy of the tuned model on the held-out rows.
clf.score(X_test, y_test)
0.9944480589893733