from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt
%matplotlib inline
digits = load_digits()
plt.imshow(digits.images[0],cmap='gray')
from sklearn.model_selection import train_test_split
dt = DecisionTreeClassifier(max_depth=10)
trainX, testX, trainY, testY = train_test_split(digits.data, digits.target)
dt.fit(trainX,trainY)
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort=False,
random_state=None, splitter='best')
dt.score(testX,testY)
0.8355555555555556
dt.score(trainX,trainY)
0.9740163325909429
dt = DecisionTreeClassifier(max_depth=7)
dt.fit(trainX,trainY)
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort=False,
random_state=None, splitter='best')
dt.score(testX,testY)
0.8155555555555556
dt.score(trainX,trainY)
0.8864142538975501
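To make the depth/over-fitting trade-off visible, here is a quick sketch (not part of the original run) that sweeps a few max_depth values on the same split and compares training and test accuracy:
from sklearn.tree import DecisionTreeClassifier
for depth in [3, 5, 7, 10, 15]:
    dt_tmp = DecisionTreeClassifier(max_depth=depth)
    dt_tmp.fit(trainX, trainY)
    # a large gap between the two scores indicates over-fitting
    print(depth, dt_tmp.score(trainX, trainY), dt_tmp.score(testX, testY))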
from sklearn.model_selection import cross_val_score
scores = cross_val_score(dt, digits.data, digits.target)
d:\Anaconda3\lib\site-packages\sklearn\model_selection\_split.py:1978: FutureWarning: The default value of cv will change from 3 to 5 in version 0.22. Specify it explicitly to silence this warning.
warnings.warn(CV_WARNING, FutureWarning)
scores
array([0.68604651, 0.8096828 , 0.74161074])
scores.mean()
0.7457800181857993
from sklearn.model_selection import cross_validate
scoring = ['precision_macro', 'recall_macro', 'accuracy']
results=cross_validate(dt, digits.data, digits.target, scoring=scoring, cv=5)
results
{'fit_time': array([0.01800108, 0.01200056, 0.01400089, 0.01300073, 0.01400089]),
'score_time': array([0.00300026, 0.00300002, 0.00200009, 0.00300002, 0.00300002]),
'test_precision_macro': array([0.7732771 , 0.71087424, 0.77524663, 0.78964348, 0.7585891 ]),
'test_recall_macro': array([0.76278636, 0.66593093, 0.76876662, 0.77198413, 0.74553688]),
'test_accuracy': array([0.76373626, 0.66574586, 0.76880223, 0.77310924, 0.74366197])}
for k, v in results.items():
    print(k, end=' ')
    print(v)
fit_time [0.01800108 0.01200056 0.01400089 0.01300073 0.01400089]
score_time [0.00300026 0.00300002 0.00200009 0.00300002 0.00300002]
test_precision_macro [0.7732771 0.71087424 0.77524663 0.78964348 0.7585891 ]
test_recall_macro [0.76278636 0.66593093 0.76876662 0.77198413 0.74553688]
test_accuracy [0.76373626 0.66574586 0.76880223 0.77310924 0.74366197]
results.keys()
dict_keys(['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro', 'test_accuracy'])
import pandas as pd
results=pd.DataFrame(results.values(),index=results.keys()).T
results
   fit_time  score_time  test_precision_macro  test_recall_macro  test_accuracy
0  0.018001       0.003              0.773277           0.762786       0.763736
1  0.012001       0.003              0.710874           0.665931       0.665746
2  0.014001       0.002              0.775247           0.768767       0.768802
3  0.013001       0.003              0.789643           0.771984       0.773109
4  0.014001       0.003              0.758589           0.745537       0.743662
import numpy as np
Y = np.append(np.ones(12),np.zeros(6))
X = np.ones((18,3))
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=3)
list(skf.split(X,Y))
[(array([ 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16, 17]),
array([ 0, 1, 2, 3, 12, 13])),
(array([ 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 16, 17]),
array([ 4, 5, 6, 7, 14, 15])),
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15]),
array([ 8, 9, 10, 11, 16, 17]))]
Y[[ 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16, 17]]
array([1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0.])
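As a sanity check (not in the original notebook), each test fold produced by StratifiedKFold should preserve the 12:6 class ratio of Y:
for train_idx, test_idx in skf.split(X, Y):
    # every test fold holds 4 ones and 2 zeros, so the class mean stays at 2/3
    print(test_idx, Y[test_idx].mean())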
Model parameters are learned from the data by the learning algorithm. Hyperparameters, on the other hand, have to be configured by hand; they depend on the data, and in practice the best values are usually found by experiment. sklearn provides GridSearchCV for this: it searches sequentially over all configured parameter values, trying every possible combination.
trainX, testX, trainY, testY = train_test_split(digits.data, digits.target)
dt = DecisionTreeClassifier()
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(dt, param_grid={'max_depth':range(5,30,5)}, cv=5)
grid_search.fit(digits.data,digits.target)
d:\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:814: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
DeprecationWarning)
GridSearchCV(cv=5, error_score='raise-deprecating',
estimator=DecisionTreeClassifier(class_weight=None,
criterion='gini', max_depth=None,
max_features=None,
max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
presort=False, random_state=None,
splitter='best'),
iid='warn', n_jobs=None, param_grid={'max_depth': range(5, 30, 5)},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=0)
grid_search.best_params_
{'max_depth': 20}
grid_search.best_score_
0.7868670005564831
grid_search.best_estimator_
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=20,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort=False,
random_state=None, splitter='best')
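GridSearchCV also records the per-candidate results in cv_results_; a small sketch (reusing pandas, imported above) to inspect them:
import pandas as pd
cv_results = pd.DataFrame(grid_search.cv_results_)
# one row per max_depth candidate, with its mean and std cross-validation score
cv_results[['param_max_depth', 'mean_test_score', 'std_test_score']]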
from time import time
# randint gives a discrete uniform distribution for sampling integers in the specified range
from scipy.stats import randint
X = digits.data
Y = digits.target
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
# specify parameters and distributions to sample from
param_dist = {"max_depth": [3, None],
"max_features": randint(1,11),
"min_samples_split": randint(2, 11),
"bootstrap": [True, False],
"criterion": ["gini", "entropy"]}
param_dist
{'max_depth': [3, None],
 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x...>,
 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x...>,
 'bootstrap': [True, False],
 'criterion': ['gini', 'entropy']}
rf = RandomForestClassifier(n_estimators=20)
n_iter_search = 20
random_search = RandomizedSearchCV(rf, param_distributions=param_dist,
n_iter=n_iter_search, cv=5)
start = time()
random_search.fit(X, Y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
" parameter settings." % ((time() - start), n_iter_search))
RandomizedSearchCV took 4.52 seconds for 20 candidates parameter settings.
d:\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:814: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
DeprecationWarning)
random_search.best_score_
0.9365609348914858
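The fitted search exposes the same attributes as GridSearchCV, so the winning sampled combination can be inspected directly:
random_search.best_params_
# the exact values depend on the 20 randomly drawn candidates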
param_grid = {"max_depth": [3, None],
"max_features": [1, 3, 10],
"min_samples_split": [2, 3, 10],
"bootstrap": [True, False],
"criterion": ["gini", "entropy"]}
# run grid search
grid_search = GridSearchCV(rf, param_grid=param_grid, cv=5)
start = time()
grid_search.fit(X, Y)
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
% (time() - start, len(grid_search.cv_results_['params'])))
GridSearchCV took 15.34 seconds for 72 candidate parameter settings.
d:\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:814: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
DeprecationWarning)
grid_search.best_score_
0.9354479688369505
trainX, testX, trainY, testY = train_test_split(X,Y)
rf.fit(trainX, trainY)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=20,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False)
rf.score(testX,testY)
0.9577777777777777
cross_val_score(rf,X,Y,cv=5)
array([0.92307692, 0.90055249, 0.93871866, 0.94677871, 0.88169014])
from sklearn.datasets import load_breast_cancer
dt = DecisionTreeClassifier()
cancer_data = load_breast_cancer()
trainX, testX, trainY, testY = train_test_split(cancer_data.data, cancer_data.target)
dt.fit(trainX,trainY)
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort=False,
random_state=None, splitter='best')
pred = dt.predict(testX)
from sklearn import metrics
metrics.accuracy_score(y_pred=pred, y_true=testY)
0.9300699300699301
confusion_result=metrics.confusion_matrix(y_pred=pred, y_true=testY, labels=[0,1])
confusion_result
array([[60, 6],
[ 4, 73]], dtype=int64)
tp=confusion_result[1][1]
tn=confusion_result[0][0]
fp=confusion_result[0][1]
fn=confusion_result[1][0]
precision_result=tp/(tp+fp)
precision_result
0.9240506329113924
metrics.precision_score(y_pred=pred, y_true=testY)
0.9240506329113924
metrics.recall_score(y_pred=pred, y_true=testY)
0.948051948051948
metrics.f1_score(y_pred=pred, y_true=testY)
0.9358974358974359
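The recall and F1 values follow from the same confusion-matrix entries; a quick manual check mirroring the precision computation above:
recall_result = tp / (tp + fn)   # recall = TP / (TP + FN)
f1_result = 2 * precision_result * recall_result / (precision_result + recall_result)   # harmonic mean of precision and recall
recall_result, f1_result
# these reproduce metrics.recall_score and metrics.f1_score above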
from sklearn.datasets import california_housing
house_data = california_housing.fetch_california_housing()
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(house_data.data, house_data.target)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
pred = lr.predict(house_data.data)
metrics.mean_squared_error(y_pred=pred, y_true=house_data.target)
0.5243209861846071
metrics.mean_absolute_error(y_pred=pred, y_true=house_data.target)
0.5311643817546461
R2 score (coefficient of determination)
metrics.r2_score(y_pred=pred, y_true=house_data.target)
0.6062326851998051
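R2 can also be reproduced by hand from its definition, one minus the ratio of the residual to the total sum of squares; a short check using numpy (imported earlier as np):
ss_res = np.sum((house_data.target - pred) ** 2)                       # residual sum of squares
ss_tot = np.sum((house_data.target - house_data.target.mean()) ** 2)   # total sum of squares
1 - ss_res / ss_tot   # should agree with metrics.r2_score above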
from sklearn.metrics.cluster import completeness_score
completeness_score( labels_true=[10,10,11,11],labels_pred=[1,1,0,0])
1.0
completeness_score( labels_true=[11,22,22,11],labels_pred=[1,0,1,1])
0.3836885465963443
print(completeness_score([10, 10, 11, 11], [0, 0, 0, 0]))
1.0
from sklearn.metrics.cluster import homogeneity_score
homogeneity_score([0, 0, 1, 1], [1, 1, 0, 0])
1.0
homogeneity_score([0, 0, 1, 1], [0, 1, 2, 3])
0.9999999999999999
homogeneity_score([0, 0, 0, 0], [1, 1, 0, 0])
1.0
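Homogeneity and completeness can be combined into a single number; sklearn's v_measure_score is their harmonic mean (an extra example, not from the original run):
from sklearn.metrics.cluster import v_measure_score
# perfect on both criteria, so the V-measure is also 1.0
v_measure_score([0, 0, 1, 1], [1, 1, 0, 0])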
from sklearn.datasets import make_blobs
X, Y = make_blobs(n_samples=500,
n_features=2,
centers=4,
cluster_std=1,
center_box=(-10.0, 10.0),
shuffle=True,
random_state=1)
plt.scatter(X[:,0],X[:,1],s=10)
range_n_clusters = [2, 3, 4, 5, 6]
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
for n_cluster in range_n_clusters:
    kmeans = KMeans(n_clusters=n_cluster)
    kmeans.fit(X)
    labels = kmeans.predict(X)
    print(n_cluster, silhouette_score(X, labels))
2 0.7049787496083262
3 0.5882004012129721
4 0.6505186632729437
5 0.5746932321727457
6 0.49417400746431644
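silhouette_score is the average of a per-sample coefficient, (b - a) / max(a, b), where a is the mean distance to points in the same cluster and b the mean distance to points in the nearest other cluster; the per-sample values can be inspected too (a sketch, not from the original run):
from sklearn.metrics import silhouette_samples
kmeans = KMeans(n_clusters=4)
labels = kmeans.fit_predict(X)
sample_scores = silhouette_samples(X, labels)
# spread of the per-sample silhouette values for the 4-cluster solution
print(sample_scores.min(), sample_scores.mean(), sample_scores.max())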
from sklearn.metrics import calinski_harabaz_score
for n_cluster in range_n_clusters:
    kmeans = KMeans(n_clusters=n_cluster)
    kmeans.fit(X)
    labels = kmeans.predict(X)
    print(n_cluster, calinski_harabaz_score(X, labels))
2 1604.112286409658
3 1809.991966958033
4 2704.4858735121097
5 2281.91411035916
6 2040.6320809618921
import pickle
s = pickle.dumps(dt)
pickle.loads(s)
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort=False,
random_state=None, splitter='best')
type(s)
bytes
from sklearn.externals import joblib
joblib.dump(dt, 'dt.joblib')
['dt.joblib']
dt = joblib.load('dt.joblib')
dt
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort=False,
random_state=None, splitter='best')
from sklearn.model_selection import validation_curve
param_range = np.arange(1, 50, 2)
train_scores, test_scores = validation_curve(RandomForestClassifier(),
digits.data,
digits.target,
param_name="n_estimators",
param_range=param_range,
cv=3,
scoring="accuracy",
n_jobs=-1)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.plot(param_range, train_mean, label="Training score", color="black")
plt.plot(param_range, test_mean, label="Cross-validation score", color="dimgrey")
plt.title("Validation Curve With Random Forest")
plt.xlabel("Number Of Trees")
plt.ylabel("Accuracy Score")
plt.tight_layout()
plt.legend(loc="best")
plt.show()
from sklearn.model_selection import learning_curve
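The import above is not used in the original output; a sketch (under the same assumptions as the validation curve) of how learning_curve could be applied to the same estimator and data:
train_sizes, train_scores, test_scores = learning_curve(RandomForestClassifier(n_estimators=20),
                                                        digits.data,
                                                        digits.target,
                                                        train_sizes=np.linspace(0.1, 1.0, 5),
                                                        cv=3,
                                                        scoring="accuracy",
                                                        n_jobs=-1)
# plot mean accuracy against the number of training samples used
plt.plot(train_sizes, np.mean(train_scores, axis=1), label="Training score", color="black")
plt.plot(train_sizes, np.mean(test_scores, axis=1), label="Cross-validation score", color="dimgrey")
plt.title("Learning Curve With Random Forest")
plt.xlabel("Training Set Size")
plt.ylabel("Accuracy Score")
plt.legend(loc="best")
plt.show()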