# Illustrative exhaustive grid search over three hyperparameters a, b and c.
# MODEL, training_data, validation_data and targets are placeholders that this
# snippet does not define; substitute a real estimator and real data.
best_accuracy = 0
best_parameters = {"a": 0, "b": 0, "c": 0}

# Every combination of a, b, c in 1..10 is tried: 10 * 10 * 10 = 1000 fits.
for a in range(1, 11):
    for b in range(1, 11):
        for c in range(1, 11):
            # Build and train a fresh model for this combination.
            model = MODEL(a, b, c)
            model.fit(training_data)

            # Score the combination on the validation data.
            preds = model.predict(validation_data)
            accuracy = metrics.accuracy_score(targets, preds)

            # Strict ">" means ties keep the earliest combination found.
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_parameters["a"] = a
                best_parameters["b"] = b
                best_parameters["c"] = c
# Keyword arguments of RandomForestClassifier as listed in this text, each
# shown with the value given here (presumably the library defaults at the time
# of writing — confirm against the installed scikit-learn version).
# NOTE(review): `min_impurity_split` and max_features='auto' appear to have
# been deprecated and later removed in newer scikit-learn releases — verify
# before copying this call verbatim.
RandomForestClassifier(
n_estimators=100,
criterion='gini',
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
min_weight_fraction_leaf=0.0,
max_features='auto',
max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
bootstrap=True,
oob_score=False,
n_jobs=None,
random_state=None,
verbose=0,
warm_start=False,
class_weight=None,
ccp_alpha=0.0,
max_samples=None,
)
# rf_grid_search.py
import numpy as np
import pandas as pd
from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection
if __name__ == "__main__":
    # Load the training data: every column except "price_range" is a feature,
    # and "price_range" is the target.
    df = pd.read_csv("./input/mobile_train.csv")
    X = df.drop("price_range", axis=1).values
    y = df.price_range.values

    # The forest itself is built with n_jobs=-1; the search below runs with
    # n_jobs=1, so parallelism happens inside each fit rather than across fits.
    classifier = ensemble.RandomForestClassifier(n_jobs=-1)

    # 6 * 6 * 2 = 72 parameter combinations in total.
    param_grid = {
        "n_estimators": [100, 200, 250, 300, 400, 500],
        "max_depth": [1, 2, 5, 7, 11, 15],
        "criterion": ["gini", "entropy"],
    }

    # Exhaustive search: each combination is scored by accuracy with 5-fold
    # cross-validation (72 * 5 = 360 fits).
    model = model_selection.GridSearchCV(
        estimator=classifier,
        param_grid=param_grid,
        scoring="accuracy",
        verbose=10,
        n_jobs=1,
        cv=5,
    )
    model.fit(X, y)

    # Report the best score and the searched parameters of the best estimator.
    print(f"Best score: {model.best_score_}")
    print("Best parameters set:")
    best_parameters = model.best_estimator_.get_params()
    for param_name in sorted(param_grid.keys()):
        print(f"\t{param_name}: {best_parameters[param_name]}")
[CV] criterion=entropy, max_depth=15, n_estimators=500, score=0.895, total= 1.0s
[CV] criterion=entropy, max_depth=15, n_estimators=500 ...............
[CV] criterion=entropy, max_depth=15, n_estimators=500, score=0.890, total= 1.1s
[CV] criterion=entropy, max_depth=15, n_estimators=500 ...............
[CV] criterion=entropy, max_depth=15, n_estimators=500, score=0.910, total= 1.1s
[CV] criterion=entropy, max_depth=15, n_estimators=500 ...............
[CV] criterion=entropy, max_depth=15, n_estimators=500, score=0.880, total= 1.1s
[CV] criterion=entropy, max_depth=15, n_estimators=500 ...............
[CV] criterion=entropy, max_depth=15, n_estimators=500, score=0.870, total= 1.1s
[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed: 3.7min finished
Best score: 0.889
Best parameters set:
    criterion: 'entropy'
    max_depth: 15
    n_estimators: 500
if __name__ == "__main__":
    # NOTE: X (features) and y (targets) are not loaded in this snippet; they
    # must already exist at module level before this block runs.
    classifier = ensemble.RandomForestClassifier(n_jobs=-1)

    # Candidate values to sample from: n_estimators in 100, 200, ..., 1400,
    # max_depth in 1..30, and both split criteria.
    param_grid = {
        "n_estimators": np.arange(100, 1500, 100),
        "max_depth": np.arange(1, 31),
        "criterion": ["gini", "entropy"],
    }

    # Randomized search: only n_iter=20 sampled combinations are evaluated,
    # each scored by accuracy with 5-fold cross-validation.
    model = model_selection.RandomizedSearchCV(
        estimator=classifier,
        param_distributions=param_grid,
        n_iter=20,
        scoring="accuracy",
        verbose=10,
        n_jobs=1,
        cv=5,
    )
    model.fit(X, y)

    # Report the best score and the searched parameters of the best estimator.
    print(f"Best score: {model.best_score_}")
    print("Best parameters set:")
    best_parameters = model.best_estimator_.get_params()
    for param_name in sorted(param_grid.keys()):
        print(f"\t{param_name}: {best_parameters[param_name]}")
我们更改了随机搜索的参数网格,结果似乎有了些许改进。
Best score: 0.8905
Best parameters set:
    criterion: entropy
    max_depth: 25
    n_estimators: 300
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn import model_selection
from sklearn import pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
def quadratic_weighted_kappa(y_true, y_pred):
    """Return Cohen's kappa between two label sequences with quadratic weights.

    Thin wrapper around ``metrics.cohen_kappa_score`` that fixes
    ``weights="quadratic"``, giving the two-argument ``(y_true, y_pred)``
    signature expected by ``metrics.make_scorer``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        The quadratic weighted kappa score.
    """
    return metrics.cohen_kappa_score(y_true, y_pred, weights="quadratic")
if __name__ == '__main__':
    # Load the data. The test set must be loaded before its columns are used
    # below; its path is assumed by analogy with the training file.
    # NOTE(review): confirm the test file name.
    train = pd.read_csv('./input/train.csv')
    test = pd.read_csv('./input/test.csv')

    # Keep the test ids, then drop the id column from both frames so it is
    # not treated as a feature.
    idx = test.id.values.astype(int)
    train = train.drop('id', axis=1)
    test = test.drop('id', axis=1)

    # Target values to predict.
    y = train.relevance.values

    # Concatenate the two text columns of every row into one string.
    traindata = list(
        train.apply(lambda x: '%s %s' % (x['text1'], x['text2']), axis=1)
    )
    testdata = list(
        test.apply(lambda x: '%s %s' % (x['text1'], x['text2']), axis=1)
    )

    # TF-IDF over word 1- to 3-grams. The idf/tf switches are given as real
    # booleans because recent scikit-learn releases validate parameter types
    # and are expected to reject the integer 1 here.
    tfv = TfidfVectorizer(
        min_df=3,
        max_features=None,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 3),
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True,
        stop_words='english',
    )

    # Fit the vocabulary on the training text only, then transform both sets.
    tfv.fit(traindata)
    X = tfv.transform(traindata)
    X_test = tfv.transform(testdata)

    # Pipeline: SVD dimensionality reduction -> standard scaling -> SVM.
    svd = TruncatedSVD()
    scl = StandardScaler()
    svm_model = SVC()
    clf = pipeline.Pipeline([
        ('svd', svd),
        ('scl', scl),
        ('svm', svm_model),
    ])

    # Pipeline parameters are addressed as "<step name>__<parameter>".
    param_grid = {
        'svd__n_components': [200, 300],
        'svm__C': [10, 12],
    }

    # Wrap the custom metric so the search can use it; higher is better.
    kappa_scorer = metrics.make_scorer(
        quadratic_weighted_kappa,
        greater_is_better=True,
    )

    model = model_selection.GridSearchCV(
        estimator=clf,
        param_grid=param_grid,
        scoring=kappa_scorer,
        verbose=10,
        n_jobs=-1,
        refit=True,
        cv=5,
    )
    model.fit(X, y)

    print("Best score: %0.3f" % model.best_score_)
    print("Best parameters set:")
    best_parameters = model.best_estimator_.get_params()
    for param_name in sorted(param_grid.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    # With refit=True the search has already refit best_estimator_ on all of
    # (X, y), so it can predict directly without another fit.
    best_model = model.best_estimator_
    preds = best_model.predict(X_test)
# rf_gp_minimize.py
import numpy as np
import pandas as pd
from functools import partial
from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection
from skopt import gp_minimize
from skopt import space
def optimize(params, param_names, x, y):
    """Return the negated mean 5-fold accuracy of a random forest.

    Intended as the objective for a minimizer: accuracy is multiplied by -1
    so that a lower return value means a better model.

    Args:
        params: Sequence of hyperparameter values, in the same order as
            ``param_names``.
        param_names: Sequence of RandomForestClassifier keyword names.
        x: Feature array, indexable by an array of row indices.
        y: Target array, indexable by an array of row indices.

    Returns:
        ``-1 * mean(fold accuracies)`` over 5 stratified folds.
    """
    # Pair each name with its value so they can be passed as keyword args.
    model_params = dict(zip(param_names, params))
    model = ensemble.RandomForestClassifier(**model_params)

    kf = model_selection.StratifiedKFold(n_splits=5)
    accuracies = []
    for train_idx, test_idx in kf.split(X=x, y=y):
        xtrain, ytrain = x[train_idx], y[train_idx]
        xtest, ytest = x[test_idx], y[test_idx]

        # Fit on the training folds and score on the held-out fold.
        model.fit(xtrain, ytrain)
        preds = model.predict(xtest)
        accuracies.append(metrics.accuracy_score(ytest, preds))

    return -1 * np.mean(accuracies)
if __name__ == "__main__":
    # Load the training data: every column except "price_range" is a feature,
    # and "price_range" is the target.
    df = pd.read_csv("./input/mobile_train.csv")
    X = df.drop("price_range", axis=1).values
    y = df.price_range.values

    # Search space: one dimension per hyperparameter.
    param_space = [
        space.Integer(3, 15, name="max_depth"),
        space.Integer(100, 1500, name="n_estimators"),
        space.Categorical(["gini", "entropy"], name="criterion"),
        space.Real(0.01, 1, prior="uniform", name="max_features"),
    ]

    # Names must be listed in the same order as the dimensions above, because
    # optimize() zips them with the values of each sampled point.
    param_names = [
        "max_depth",
        "n_estimators",
        "criterion",
        "max_features",
    ]

    # Bind everything except the parameter values, leaving a one-argument
    # objective for the minimizer.
    optimization_function = partial(
        optimize,
        param_names=param_names,
        x=X,
        y=y,
    )

    # NOTE(review): newer scikit-optimize releases appear to prefer
    # `n_initial_points` over `n_random_starts` — confirm for the installed
    # version.
    result = gp_minimize(
        optimization_function,
        dimensions=param_space,
        n_calls=15,
        n_random_starts=10,
        verbose=10,
    )

    # result.x holds the best point found, ordered like param_space.
    best_params = dict(
        zip(
            param_names,
            result.x,
        )
    )
    print(best_params)
这同样会产生大量输出,最后一部分如下所示。
Iteration No: 14 started. Searching for the next optimal point.
Iteration No: 14 ended. Search finished for the next optimal point.
Time taken: 4.7793
Function value obtained: -0.9075
Current minimum: -0.9075
Iteration No: 15 started. Searching for the next optimal point.
Iteration No: 15 ended. Search finished for the next optimal point.
Time taken: 49.4186
Function value obtained: -0.9075
Current minimum: -0.9075
{'max_depth': 12, 'n_estimators': 100, 'criterion': 'entropy', 'max_features': 1.0}
from skopt.plots import plot_convergence

# Plot the convergence of the search stored in `result`, the object returned
# by the gp_minimize call.
plot_convergence(result)
收敛图如图 2 所示。
import numpy as np
import pandas as pd
from functools import partial
from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection
from hyperopt import hp, fmin, tpe, Trials
from hyperopt.pyll.base import scope
def optimize(params, x, y):
    """Return the negated mean 5-fold accuracy of a random forest.

    Intended as the objective for a minimizer: accuracy is multiplied by -1
    so that a lower return value means a better model.

    Args:
        params: Mapping of RandomForestClassifier keyword names to values.
        x: Feature array, indexable by an array of row indices.
        y: Target array, indexable by an array of row indices.

    Returns:
        ``-1 * mean(fold accuracies)`` over 5 stratified folds.
    """
    model = ensemble.RandomForestClassifier(**params)

    kf = model_selection.StratifiedKFold(n_splits=5)
    accuracies = []
    for train_idx, test_idx in kf.split(X=x, y=y):
        xtrain, ytrain = x[train_idx], y[train_idx]
        xtest, ytest = x[test_idx], y[test_idx]

        # Fit on the training folds and score on the held-out fold.
        model.fit(xtrain, ytrain)
        preds = model.predict(xtest)
        accuracies.append(metrics.accuracy_score(ytest, preds))

    return -1 * np.mean(accuracies)
if __name__ == "__main__":
    # Load the training data: every column except "price_range" is a feature,
    # and "price_range" is the target.
    df = pd.read_csv("./input/mobile_train.csv")
    X = df.drop("price_range", axis=1).values
    y = df.price_range.values

    # Search space keyed by RandomForestClassifier keyword name. The quniform
    # dimensions are wrapped in scope.int so the model receives integers.
    # NOTE(review): max_features is sampled from [0, 1]; a value of exactly 0
    # would presumably be invalid for the forest — consider a small positive
    # lower bound.
    param_space = {
        "max_depth": scope.int(hp.quniform("max_depth", 1, 15, 1)),
        "n_estimators": scope.int(hp.quniform("n_estimators", 100, 1500, 1)),
        "criterion": hp.choice("criterion", ["gini", "entropy"]),
        "max_features": hp.uniform("max_features", 0, 1),
    }

    # Bind the data, leaving a one-argument objective that takes the sampled
    # parameter dict.
    optimization_function = partial(
        optimize,
        x=X,
        y=y,
    )

    # Trials records the history of every evaluation.
    trials = Trials()

    hopt = fmin(
        fn=optimization_function,
        space=param_space,
        algo=tpe.suggest,
        max_evals=15,
        trials=trials,
    )

    # fmin returns the raw sampled values: in the sample output, the hp.choice
    # entry appears as an index into its option list and the quniform entries
    # as floats.
    print(hopt)
❯ python rf_hyperopt.py
100%|██████████████████| 15/15 [04:38<00:00, 18.57s/trial, best loss: -0.9095000000000001]
{'criterion': 1, 'max_depth': 11.0, 'max_features': 0.821163568049807, 'n_estimators': 806.0}